
Twitter Communities Reporting

Those of you who know me know that I'm a fan of the #PPCChat hashtag on Twitter, although I haven't participated recently due to my change in circumstances.

I have created a #PPCChat community dashboard which (should) update weekly. This blog post shows how the dashboard is made.

For a while now I have been gathering every tweet on the #PPCChat hashtag using the following Ruby script:

#!/usr/bin/ruby
require 'tweetstream'
require 'csv'

TweetStream.configure do |config|
  config.consumer_key       = 'SECRET'
  config.consumer_secret    = 'SECRET'
  config.oauth_token        = 'SECRET'
  config.oauth_token_secret = 'SECRET'
  config.auth_method        = :oauth
end

@client = TweetStream::Client.new

# Back off for five minutes when Twitter rate limits the stream
@client.on_limit do |skip_count|
  puts "Rate limited. Waiting for 5 minutes"
  sleep 300
end

# Same backoff for Twitter's "enhance your calm" responses
@client.on_enhance_your_calm do
  puts "Enhancing calm. Waiting for 5 minutes"
  sleep 300
end

@client.track('ppcchat') do |status|
  today = Time.new.strftime("%Y-%m-%d")
  row = [status.created_at,
         status.id,
         status.text.dump,
         status.source,
         status.truncated,
         status.in_reply_to_status_id,
         status.in_reply_to_user_id,
         status.in_reply_to_screen_name,
         status.user.id,
         status.user.screen_name,
         status.user.name
         ]
  # Append the row to a TSV file named after the current day
  CSV.open("data/"+today+".tsv", "a", {:col_sep => "\t"}) do |csv|
    csv << row
  end
end

The script uses the Twitter streaming API via the excellent and easy-to-use TweetStream gem; you will have to register your own Twitter application in order to get the OAuth tokens and secrets needed to run it.

The output is a directory full of TSV files, each named after the day of data it contains.

We can use the statistical programming language R to analyse the mention networks in these tweets.

First we load some libraries and read in the tweets. The variable "location" is the path to a file containing all the collected tweets.

Then we create a column "time" holding the day on which each tweet occurred.

Finally we define a function "filterByDate" which does exactly what you might think.

library(lubridate)

raw <- read.csv(location, header=FALSE, sep='\t', stringsAsFactors=FALSE)

raw$time <- ymd(substring(raw$V1,1,10))

filterByDate <- function(startdate, enddate) {
  return(subset(raw, time >= startdate & time < enddate))
}
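
The code above assumes that "location" points at a single file containing every tweet. Since the collector writes one TSV per day, an alternative (not part of the original setup, just a sketch) is to read the whole data directory and stack the daily files:

# Read every daily TSV written by the Ruby collector and stack them into one data frame
# The "data" directory matches the path used by the collector above
files <- list.files("data", pattern="\\.tsv$", full.names=TRUE)
raw <- do.call(rbind, lapply(files, read.csv, header=FALSE, sep="\t",
                             stringsAsFactors=FALSE))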

Then we start messing about with the igraph library, which can be used to analyse networks.

library(igraph)
library(stringr)
library(xtable)

graphByDate <- function(startdate, enddate) {
  tweets <- filterByDate(startdate,enddate)
  edges = c()

  for (i in 1:length(tweets$V3)) {
    mentions = unlist(str_extract_all(tolower(tweets$V3[i]),"@[a-z0-9_]{2,15}"))
    if (length(mentions)!=0) {
      for (j in 1:length(mentions)) {
        if(tweets$V10[i]!="" && substring(mentions[j],2)!="") { #needed for when parser borks
          edges=c(edges,c(tolower(tweets$V10[i]),substring(mentions[j],2)))
        }
      }
    }
  }

  edgematrix <- t(matrix(edges,nrow=2))

  g <- graph.edgelist(edgematrix)

  #remove self links
  for (i in 1:length(g[,1])){
    g[i,i] = 0
  }

  V(g)$indegree <- degree(g, v=V(g), mode="in")
  g.noweirdos <- delete.vertices(g,V(g)[ indegree == 0 ])
  return(g.noweirdos)
}

pageRankByDate <- function(startdate,enddate) {
  g <- graphByDate(startdate,enddate)
  pr <- page.rank(g,directed=TRUE)
  return(pr)
}

pr0 <- pageRankByDate(now()-weeks(1),now())
pr0 <- data.frame(pr0$vector)
pr0$name<-row.names(pr0)
pr1 <- pageRankByDate(now()-weeks(2),now()-weeks(1))
pr1 <- data.frame(pr1$vector)
pr1$name<-row.names(pr1)
prlist <- merge(pr0,pr1,all=TRUE)
prlist$pr1.vector[is.na(prlist$pr1.vector)] <- 0
prlist$pr0.vector[is.na(prlist$pr0.vector)] <- 0

prlist <- data.frame(Name=prlist$name, "Last week"=prlist$pr0.vector, "Week before"=prlist$pr1.vector)

top20pr <- prlist[ order(-prlist$Last.week), ][1:20,]
colnames(top20pr)<-c("Name","PageRank Last Week","PageRank Preceding Week")
print(xtable(top20pr), type="html", include.rownames = FALSE)
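
To get this table onto the dashboard it needs to be written to disk rather than printed to the console. print.xtable can write straight to a file; the filename below is just an illustration, not necessarily the one the dashboard uses:

# Write the top-20 PageRank table as an HTML fragment (illustrative filename)
print(xtable(top20pr), type="html", include.rownames=FALSE, file="top20pr.html")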

We can also draw a picture of these relationships:

library(ggplot2)
library(scales)
radian.rescale <- function(x, start=0, direction=1) {
  c.rotate <- function(x) (x + start) %% (2 * pi) * direction
  c.rotate(rescale(x, c(0, 2 * pi), range(x)))
}

g <- graphByDate(now()-weeks(1),now())
pr <- page.rank(g,directed=TRUE)
V(g)$page.rank <- pr$vector
V(g)$label <- V(g)$name
cutoff <- -1*sort(-pr$vector)[20]
g.top20 <- delete.vertices(g,V(g)[ page.rank < cutoff ])
layout <- layout.circle(g.top20)
lab.locs <- radian.rescale(x=1:20, direction=-1, start=0)
plot(g.top20,
     layout=layout,
     vertex.size=300*V(g.top20)$page.rank,
     vertex.label.dist=1,
     vertex.label.degree=lab.locs,
     vertex.frame.color=NA,
     edge.arrow.size=0.5)
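
Similarly, for the dashboard the plot has to end up in an image file rather than on screen; a minimal sketch using the standard png device (again, the filename is only an illustration):

# Render the same plot to a PNG file instead of the default device
png("top20-graph.png", width=800, height=800)
plot(g.top20,
     layout=layout,
     vertex.size=300*V(g.top20)$page.rank,
     vertex.label.dist=1,
     vertex.label.degree=lab.locs,
     vertex.frame.color=NA,
     edge.arrow.size=0.5)
dev.off()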

Another interesting thing to look at is which links are the most popular:

library(RCurl)
library(XML)

extractLinks <- function(tweettext) {
  links<-c()
  content<-c()
  urls <- str_extract_all(tweettext,"http://t.co/[a-zA-Z0-9]+")
  if (length(urls[[1]])==0) {
    return(data.frame(links=NA,content=NA))
  }
  else {
    for (i in 1:length(urls[[1]])) {
        url <- urls[[1]][i]
        h <- getCurlHandle()
        n <- tryCatch({
                getURL(url, curl=h, followlocation=T)
             },
             error=function(e){ return("") }
             )
        info <- getCurlInfo(h) # the effective URL is the final destination after redirects
        links <- c(links, info$effective.url)
        content <- c(content, n)
    }
    return(data.frame(links=links,content=content,stringsAsFactors=FALSE))
  }
}

getLinksByDate <- function(startdate,enddate) {
  acc <- NULL
  tweets <- filterByDate(startdate,enddate)
  for (i in 1:length(tweets$V3)) {
    l<-extractLinks(tweets$V3[i])
    acc<-rbind(acc,l)
  }
  return(subset(acc,!is.na(links)))
}

getTitle <- function(htmlsrc) {
  tryCatch({
    doc <- htmlParse(htmlsrc)
    title <- xpathApply(doc, "//title[1]", xmlValue)
    return(title[[1]])
    },
    error=function(e) {
      return(NA)
    })
}

links<-getLinksByDate(now()-weeks(1),now())
links$title <- NA
for (i in 1:length(links$content)) {
  links$title[i] <- getTitle(links$content[i])
}
links$count <- 1
keys <- unique(links$links)
vals <- sapply(keys, function(x) { sum(links[links$links==x,]$count) })
toptenlinks <- names(sort(-vals)[1:10])
toptentitles <- c()
for (i in 1:10) {
  title <- subset(links, links == toptenlinks[i])[1,]$title
  toptentitles <- c(toptentitles,title)
}
linked <- ifelse(is.na(toptentitles),
                 paste("<a href=\"", toptenlinks, "\">", toptenlinks, "</a>", sep=""),
                 paste("<a href=\"", toptenlinks, "\">", toptentitles, "</a>", sep="")
                )
final <- data.frame(link=linked)
print(xtable(final), type="html", sanitize.text.function=force,include.rownames=F,include.colnames=F)

And, of course, what kind of tweet analysis would be complete without a wordcloud?

library(tm)
library(wordcloud)

tweets <- filterByDate(now()-weeks(1),now())$V3
tweets <- str_replace_all(tweets,"http://t.co/[a-zA-Z0-9]+","")
corp <- Corpus(VectorSource(tweets))
corp <- tm_map(corp, stripWhitespace)
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeWords, stopwords("english"))
corp <- tm_map(corp, removeWords, c("#ppcchat", "ppcchat", "http://t.co/"))
corp <- tm_map(corp, removePunctuation)
wordcloud(corp,colors=brewer.pal(8,"Set2"),rot.per=0.35,scale=c(6,1),max.words=80)

Authored by Richard Fergie