meetup analytics with r and neo4j

Post on 14-Jul-2015

1.108 Views

Category:

Science

1 Downloads

Preview:

Click to see full reader

TRANSCRIPT

Exploring London NoSQL meetups using R

Mark Needham (@markhneedham)

Scraper at the ready...

Not needed :(

Lots of bits of data

● Events
● Members
● Groups
● RSVPs
● Venues
● Topics

The data model

Interesting questions to ask...

Interesting questions to ask...

● What day of the week do people go to meetups?
● Whereabouts in London are NoSQL meetups held?
● Do people sign up for multiple meetups on the same day?
● Are there common members between groups?
● What topics are people most interested in?
● In which order do people join the NoSQL groups?
● Who are the most connected people on the NoSQL scene?

The tool set

RNeo4j

Results as a data frame

Query

dplyr

ggplot2

igraph

ggmap

cluster

geosphere

When do people go to meetups?

When do people go to meetups?

(g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-

({response: 'yes'})<-[:RSVPD]-()

When do people go to meetups?

MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-

({response: 'yes'})<-[:RSVPD]-()

WHERE (event.time + event.utc_offset) < timestamp()

RETURN g.name,

event.time + event.utc_offset AS eventTime,

event.announced_at AS announcedAt,

event.name,

COUNT(*) AS rsvps

RNeo4j

install.packages("devtools")

devtools::install_github("nicolewhite/RNeo4j")

library(RNeo4j)

graph = startGraph("http://localhost:7474/db/data/")

query = "MATCH … RETURN …"

cypher(graph, query)

Grouping events by month

library(dplyr)

events %>%

group_by(month) %>%

summarise(events = n(),

count = sum(rsvps),

max = max(rsvps)) %>%

mutate(ave = count / events) %>%

arrange(desc(ave))

Grouping events by month

## month events count ave

## 1 November 55 3018 54.87273

## 2 May 52 2676 51.46154

## 3 April 58 2964 51.10345

## 4 June 47 2384 50.72340

## 5 October 71 3566 50.22535

## 6 September 59 2860 48.47458

## 7 February 43 2047 47.60465

## 8 January 34 1592 46.82353

## 9 December 24 1056 44.00000

## 10 March 39 1667 42.74359

## 11 July 48 1866 38.87500

## 12 August 34 1023 30.08824

Grouping events by day

events %>%

group_by(day) %>%

summarise(events = n(),

count = sum(rsvps),

max = max(rsvps)) %>%

mutate(ave = count / events) %>%

arrange(day)

Grouping events by day

## day events count ave

## 1 Monday 63 4034 64.03175

## 2 Tuesday 151 6696 44.34437

## 3 Wednesday 225 9481 42.13778

## 4 Thursday 104 5394 51.86538

## 5 Friday 11 378 34.36364

## 6 Saturday 10 736 73.60000

Some simple bar charts

library(ggplot2)

g1 = ggplot(aes(x = day, y = ave), data = byDay) +

geom_bar(stat="identity", fill="dark blue") +

ggtitle("Average attendees by day")

g2 = ggplot(aes(x = day, y = count), data = byDay) +

geom_bar(stat="identity", fill="dark blue") +

ggtitle("Total attendees by day")

grid.arrange(g1,g2, ncol = 1)

London hits the pub

Where do people go to meetups?

(g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-

({response: 'yes'})<-[:RSVPD]-(),

(event)-[:HELD_AT]->(venue)

Where do people go to meetups?

MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-

({response: 'yes'})<-[:RSVPD]-(), (event)-[:HELD_AT]->(venue)

WHERE (event.time + event.utc_offset) < timestamp()

RETURN g.name,

event.time + event.utc_offset AS eventTime,

event.announced_at AS announcedAt,

event.name,

venue.name AS venue,

venue.lat AS lat,

venue.lon AS lon,

COUNT(*) AS rsvps

Where do people go to meetups?

MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-

({response: 'yes'})<-[:RSVPD]-(), (event)-[:HELD_AT]->(venue)

WHERE (event.time + event.utc_offset) < timestamp()

RETURN g.name,

event.time + event.utc_offset AS eventTime,

event.announced_at AS announcedAt,

event.name,

venue.name AS venue,

venue.lat AS lat,

venue.lon AS lon,

COUNT(*) AS rsvps

Where do people go to meetups?

byVenue = events %>%

count(lat, lon, venue) %>%

ungroup() %>%

arrange(desc(n)) %>%

rename(count = n)

Where do people go to meetups?

## lat lon venue count

## 1 51.50256 -0.019379 Skyline Bar at CCT Venues Plus 1

## 2 51.53373 -0.122340 The Guardian 1

## 3 51.51289 -0.067163 Erlang Solutions 3

## 4 51.49146 -0.219424 Novotel - W6 8DR 1

## 5 51.49311 -0.146531 Google HQ 1

## 6 51.52655 -0.084219 Look Mum No Hands! 22

## 7 51.51976 -0.097270 Vibrant Media, 3rd Floor 1

## 8 51.52303 -0.085178 Mind Candy HQ 2

## 9 51.51786 -0.109260 ThoughtWorks UK Office 2

## 10 51.51575 -0.097978 BT Centre 1

Where do people go to meetups?

library(ggmap)

map = get_map(location = 'London', zoom = 12)

ggmap(map) +

geom_point(aes(x = lon, y = lat, size = count),

data = byVenue,

col = "red",

alpha = 0.8)

library(geosphere)

library(cluster)

clusteramounts = 40

distance.matrix = byVenue %>% select(lon, lat) %>% distm

clustersx <- as.hclust(agnes(distance.matrix, diss = T))

byVenue$group <- cutree(clustersx, k=clusteramounts)

byVenueClustered = byVenue %>%

group_by(group) %>%

summarise(meanLat = mean(lat),

meanLon = mean(lon),

total = sum(count),

venues = paste(venue, collapse = ","))

Spatial clustering

## group meanLat meanLon total

## 1 3 51.52349 -0.08506461 123

## 2 1 51.52443 -0.09919280 89

## 3 2 51.50547 -0.10325925 62

## 4 4 51.50794 -0.12714600 55

## 5 8 51.51671 -0.10028908 19

## 6 6 51.53655 -0.13798514 18

## 7 7 51.52159 -0.10934720 18

## 8 5 51.51155 -0.07004417 13

## 9 12 51.51459 -0.12314650 13

## 10 14 51.52129 -0.07588867 10

Spatial clustering

ggmap(map) +

geom_point(aes(x = meanLon, y = meanLat, size = total),

data = byVenueClustered,

col = "red",

alpha = 0.8)

Spatial clustering

byVenue %>%

filter(group == byVenueClustered$group[1])

What’s going on in Shoreditch?

Meetup Group Member Overlap

● Why would we want to know this?
  ○ Perhaps for joint meetups
  ○ Topics for future meetups

Extracting the data

MATCH (group1:Group), (group2:Group)

WHERE group1 <> group2

OPTIONAL MATCH p = (group1)<-[:MEMBER_OF]-()-[:MEMBER_OF]->(group2)

WITH group1, group2, COLLECT(p) AS paths

RETURN group1.name, group2.name,

LENGTH(paths) as commonMembers

ORDER BY group1.name, group2.name

MATCH (group1:Group), (group2:Group)

WHERE group1 <> group2

OPTIONAL MATCH (group1)<-[:MEMBER_OF]-(member)

WITH group1, group2, COLLECT(member) AS group1Members

WITH group1, group2, group1Members, LENGTH(group1Members) AS numberOfGroup1Members

UNWIND group1Members AS member

OPTIONAL MATCH path = (member)-[:MEMBER_OF]->(group2)

WITH group1, group2, COLLECT(path) AS paths, numberOfGroup1Members

WITH group1, group2, LENGTH(paths) as commonMembers, numberOfGroup1Members

RETURN group1.name, group2.name,

toInt(round(100.0 * commonMembers / numberOfGroup1Members)) AS percentage

ORDER BY group1.name, group2.name

Finding overlap as a percentage

How many groups are people part of?

MATCH (p:MeetupProfile)-[:MEMBER_OF]->()

RETURN ID(p), COUNT(*) AS groups

ORDER BY groups DESC

How many groups are people part of?

ggplot(aes(x = groups, y = n),

data = group_count %>% count(groups)) +

geom_bar(stat="identity", fill="dark blue") +

scale_y_sqrt() +

scale_x_continuous(

breaks = round(seq(min(group_count$groups), max(group_count$groups), by = 1),1)) +

ggtitle("Number of groups people are members of")

Who’s the most connected?

● i.e. the person who had the chance to meet the most people in the community

● Betweenness Centrality
● Page Rank

Who’s the most connected?

Betweenness Centrality

Calculates the number of shortest paths that go through a particular node

Betweenness Centrality

library(igraph)

nodes_query = "MATCH (p:MeetupProfile)-[:RSVPD]->({response: 'yes'})-[:TO]->(event)

RETURN DISTINCT ID(p) AS id, p.id AS name, p.name AS fullName"

nodes = cypher(graph, nodes_query)

edges_query = "MATCH (p:MeetupProfile)-[:RSVPD]->({response: 'yes'})-[:TO]->(event),

(event)<-[:TO]-({response:'yes'})<-[:RSVPD]-(other)

RETURN ID(p) AS source, ID(other) AS target, COUNT(*) AS weight"

edges = cypher(graph, edges_query)

g = graph.data.frame(edges, directed = T, nodes)

bwGraph = betweenness(g)

bwDf = data.frame(id = names(bwGraph), score = bwGraph)

Betweenness Centrality

bwDf %>% arrange(desc(score)) %>% head(5)

merge(nodes, bwDf, by.x = "name", by.y = "id") %>%

arrange(desc(score)) %>%

head(5)

Page Rank

PageRank works by counting the number and quality of links to a page to determine a rough estimate of how important the website is. The underlying assumption is that more important websites are likely to receive more links from other websites.

Page Rank

PageRank works by counting the number and quality of links to a person to determine a rough estimate of how important the person is. The underlying assumption is that more important people are likely to receive more links from other people.

Page Rank

pr = page.rank(g)$vector

prDf = data.frame(name = names(pr), rank = pr)

data.frame(merge(nodes, prDf, by.x = "name", by.y = "name")) %>%

arrange(desc(rank)) %>%

head(10)

Blending back into the graph

query = "MATCH (p:MeetupProfile {id: {id}}) SET p.betweenness = {score}"

tx = newTransaction(graph)

for(i in 1:nrow(bwDf)) {

if(i %% 1000 == 0) {

commit(tx)

print(paste("Batch", i / 1000, "committed."))

tx = newTransaction(graph)

}

id = bwDf[i, "id"]

score = bwDf[i, "score"]

appendCypher(tx, query, id = id, score = as.double(score))

}

commit(tx)

Blending back into the graph

query = "MATCH (p:MeetupProfile {id: {id}}) SET p.pageRank = {score}"

tx = newTransaction(graph)

for(i in 1:nrow(prDf)) {

if(i %% 1000 == 0) {

commit(tx)

print(paste("Batch", i / 1000, "committed."))

tx = newTransaction(graph)

}

name = prDf[i, "name"]

rank = prDf[i, "rank"]

appendCypher(tx, query, id = name, score = as.double(rank))

}

commit(tx)

Are they in the Neo4j group?

MATCH (p:MeetupProfile)

WITH p

ORDER BY p.pageRank DESC

LIMIT 20

OPTIONAL MATCH member = (p)-[m:MEMBER_OF]->(g:Group)

WHERE g.name = "Neo4j - London User Group"

RETURN p.name, p.id, p.pageRank, NOT m is null AS isMember

ORDER BY p.pageRank DESC

Are they in the Neo4j group?

blended_data = cypher(graph, query)

Have they been to any events?

Have they been to any events?

MATCH (p:MeetupProfile)

WITH p

ORDER BY p.pageRank DESC

LIMIT 20

OPTIONAL MATCH member = (p)-[m:MEMBER_OF]->(g:Group) WHERE g.name = "Neo4j - London User Group"

WITH p, NOT m is null AS isMember, g

OPTIONAL MATCH event= (p)-[:RSVPD]-({response:'yes'})-[:TO]->()<-[:HOSTED_EVENT]-(g)

WITH p, isMember, COLLECT(event) as events

RETURN p.name, p.id, p.pageRank, isMember, LENGTH(events) AS events

ORDER BY p.pageRank DESC

Have they been to any events?

blended_data = cypher(graph, query)

Take Aways

● ggplot => visualisations with minimal code
● dplyr => easy data manipulation for people from other languages
● igraph => find the influencers in a network
● graphs => flexible way of modelling data that allows querying across multiple dimensions

And one final take away...

http://github.com/mneedham/neo4j-meetup

Get the code

top related