1
# EnsurePackage(x) - Installs (if necessary) and attaches a package
#   x - the package name; it is coerced with as.character()
#
# Returns - invisibly, TRUE when the package ends up attached and FALSE
#   otherwise. A warning is raised when the package still cannot be
#   loaded after the installation attempt, so failures are not silent.
EnsurePackage <- function(x) {
  x <- as.character(x)

  # require() reports success as a logical instead of raising an error
  loaded <- require(x, character.only = TRUE)

  if (!loaded) {
    # Not available locally: fetch it from CRAN, then try once more
    install.packages(pkgs = x, repos = "http://cran.r-project.org")
    loaded <- require(x, character.only = TRUE)

    if (!loaded) {
      warning("Package '", x, "' could not be installed or loaded",
              call. = FALSE)
    }
  }

  invisible(loaded)
}
11
+
12
# PrepareTwitter() - Load packages for working with twitteR
# Calls EnsurePackage() once per package, in the order listed below,
# so anything missing is installed before it is attached.
#
# Returns - NULL, invisibly; the function is called for its side effects
PrepareTwitter <- function() {
  # Package names must match CRAN exactly (no surrounding whitespace)
  twitterPackages <- c("bitops", "RCurl", "RJSONIO", "twitteR")

  for (pkg in twitterPackages) {
    EnsurePackage(pkg)
  }

  invisible(NULL)
}
20
+
21
# TweetFrame() - Return a dataframe based on a search of Twitter
#   searchTerm - the query handed to searchTwitter()
#   maxTweets  - handed to searchTwitter() as its n argument
#
# Returns - a data frame with one row per tweet, ordered by the
#   `created` column; an empty data frame when the search returns nothing
TweetFrame <- function(searchTerm, maxTweets) {
  tweetList <- searchTwitter(searchTerm, n = maxTweets)

  # With no results rbind() would produce NULL and the sort below would
  # fail, so hand back an empty data frame instead
  if (length(tweetList) == 0) {
    return(data.frame())
  }

  # as.data.frame() coerces each list element into a row,
  # lapply() applies that to every element of tweetList, and
  # do.call() gives rbind() all the rows as individual arguments
  tweetDF <- do.call("rbind", lapply(tweetList, as.data.frame))

  # This last step sorts the tweets in arrival order
  tweetDF[order(as.integer(tweetDF$created)), ]
}
35
+
36
# CleanTweets() - Takes the junk out of a vector of tweet texts
#   tweets - a vector of tweet texts
#
# Returns - a character vector of the same length in which runs of
#   spaces are collapsed and t.co links, the first retweet header,
#   hashtags and @mentions are removed. Only base R regex functions are
#   used, so no extra package has to be attached.
# NOTE(review): the character classes contain a literal comma and no
#   digits or underscores, so "@user_1" is only partly removed - confirm
#   whether that is intended.
CleanTweets <- function(tweets) {
  # Remove redundant spaces: any run of two or more becomes one
  tweets <- gsub(" {2,}", " ", tweets)

  # Get rid of URLs
  tweets <- gsub("http://t.co/[a-z,A-Z,0-9]{8}", "", tweets)

  # Take out retweet header, there is only one (hence sub, not gsub)
  tweets <- sub("RT @[a-z,A-Z]*: ", "", tweets)

  # Strip hashtags, then @mentions
  tweets <- gsub("#[a-z,A-Z]*", "", tweets)
  gsub("@[a-z,A-Z]*", "", tweets)
}
49
+
50
# ArrivalProbability - Given a list of arrival times
# calculates the delays between them with lagged differences
# then computes a list of cumulative probabilities of arrival
# for a list of time increments
#   times     - A sorted, ascending list of arrival times in POSIXct
#   increment - the time increment (in seconds) for each new probability
#   max       - the highest time increment (in seconds)
#
# Returns - an ordered list of probabilities in a numeric vector
#   suitable for plotting with plot(), or NULL when increment > max
ArrivalProbability <- function(times, increment, max) {
  # May not be necessary, but checks for input mistake
  if (increment > max) {
    return(NULL)
  }

  # Probability is defined over the size of this sample
  # of arrival times
  timeLen <- length(times)

  # diff() requires a sorted list of times and yields the delays between
  # neighbouring times. For POSIXct input the result is a difftime whose
  # units are picked automatically (secs, mins, hours, ...), so force
  # seconds before dropping the class.
  gaps <- diff(times)
  if (inherits(gaps, "difftime")) {
    gaps <- as.numeric(gaps, units = "secs")
  }
  gaps <- as.integer(gaps)

  # For each threshold, the logical test gives TRUEs and FALSEs and
  # sum() counts the TRUEs; vapply() avoids growing a vector in a loop
  thresholds <- seq(increment, max, by = increment)
  vapply(
    thresholds,
    function(limit) sum(gaps < limit) / timeLen,
    numeric(1)
  )
}
82
+
83
# DelayProbability - Like ArrivalProbability, but works with an
# unsorted list of delay times
#   delays    - a vector of delays between arrivals
#   increment - the step between successive thresholds
#   max       - the highest threshold
#
# Returns - a numeric vector holding, for each threshold, the share of
#   delays that are less than or equal to it; NULL when increment > max
DelayProbability <- function(delays, increment, max) {
  # May not be necessary, but checks for input mistake
  if (increment > max) {
    return(NULL)
  }

  # Probability is defined over the size of this sample of delays
  delayLen <- length(delays)

  # The logical test <= limit provides TRUEs and FALSEs of
  # length = delayLen, then sum() counts the TRUEs
  thresholds <- seq(increment, max, by = increment)
  vapply(
    thresholds,
    function(limit) sum(delays <= limit) / delayLen,
    numeric(1)
  )
}
105
+
106
# CompareTweets - Run poisson.test() on rate ratio for two tweet streams
#   search1   - the first hashtag or search term to look for
#   search2   - the second search term or hashtag to look for
#   numEvents - the number of events to sample for each search
#
# Returns - the result of poisson.test() comparing, for both streams, how
#   many delays are no longer than the (rounded) mean delay of stream 1
CompareTweets <- function(search1, search2, numEvents) {
  # Delays between consecutive tweets of one search, in whole seconds
  delaysFor <- function(term) {
    tweetDF <- TweetFrame(term, numEvents)
    created <- tweetDF$created[order(as.integer(tweetDF$created))]

    # diff() on POSIXct picks its units automatically; force seconds so
    # both streams are measured on the same scale
    gaps <- diff(created)
    if (inherits(gaps, "difftime")) {
      gaps <- as.numeric(gaps, units = "secs")
    }
    as.integer(gaps)
  }

  eventDelays1 <- delaysFor(search1)
  eventDelays2 <- delaysFor(search2)

  # The mean delay of the first stream is the yardstick for both
  meanDelays1 <- round(mean(eventDelays1))

  eventCount1 <- sum(eventDelays1 <= meanDelays1)
  eventCount2 <- sum(eventDelays2 <= meanDelays1)

  poisson.test(c(eventCount1, eventCount2), c(numEvents, numEvents))
}
0 commit comments