Skip to content

Commit c124aef

Browse files
committed
Initial R code files
R Code Files for Introduction to Data Science
1 parent 5748ff9 commit c124aef

File tree

3 files changed

+189
-0
lines changed

3 files changed

+189
-0
lines changed

MappingScripts.R

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Mapping scripts
2+
# EnsurePackage(x) - Installs (when missing) and attaches a package
#   x - the package name; coerced to a character string
#
# Returns - TRUE, invisibly; the function is called for its side effect of
#           putting the package on the search path
# Stops with an error when the package still cannot be loaded after the
# installation attempt, rather than silently reporting FALSE.
EnsurePackage <- function(x) {
  x <- as.character(x)

  # requireNamespace() only probes availability; it attaches nothing
  if (!requireNamespace(x, quietly = TRUE)) {
    install.packages(pkgs = x, repos = "https://cran.r-project.org")
  }

  # library() raises an error for an unavailable package, so a failed
  # installation is reported here and not at the first missing function
  library(x, character.only = TRUE)
  invisible(TRUE)
}
12+
13+
14+
# MakeGeoURL(address) - Format a request URL for the Google Geocode API
#   address - free-form address text; coerced to character
#
# Returns - the request URL as a character string, with the address
#           percent-encoded inside the "address" query parameter
MakeGeoURL <- function(address) {
  root <- "http://maps.google.com/maps/api/geocode/"

  # Encode the address by itself, reserved characters included. Encoding the
  # assembled URL would leave "&", "#", "=" and "+" from the address as they
  # are, letting them cut short or corrupt the query string.
  encodedAddress <- URLencode(as.character(address), reserved = TRUE)

  paste0(root, "json?address=", encodedAddress, "&sensor=false")
}
24+
25+
# Addr2latlng(address) - Look up the latitude and longitude of one address
#   address - the address text handed to MakeGeoURL()
#
# Returns - a vector of exactly two elements, c(lat, lng), read from
#           results[[1]]$geometry$location of the parsed response; a
#           coordinate that is absent or not a single number becomes NA
#
# NOTE(review): getURL() and fromJSON() are not defined in this file -
# presumably RCurl and RJSONIO; the caller has to attach them first.
Addr2latlng <- function(address) {
  url <- MakeGeoURL(address)
  apiResult <- getURL(url)
  geoStruct <- fromJSON(apiResult, simplify = FALSE)

  # An empty result set makes results[[1]] fail; treat that as "no location"
  location <- tryCatch(
    geoStruct$results[[1]]$geometry$location,
    error = function(e) NULL
  )

  # Always hand back one value per coordinate so the result keeps length 2;
  # a missing field would otherwise be NULL and vanish inside c()
  coordinate <- function(field) {
    value <- if (is.list(location)) location[[field]] else NULL
    if (is.numeric(value) && length(value) == 1L) value else NA_real_
  }

  c(coordinate("lat"), coordinate("lng"))
}
42+
43+
# ProcessAddrList(addrList) - Geocode a whole list of addresses
#   addrList - the addresses to look up, one Addr2latlng() call per element
#
# Returns - a data frame with one row per address and the columns
#           atext (the address), X (longitude), Y (latitude) and
#           EID (running number starting at 1); zero rows for empty input
ProcessAddrList <- function(addrList) {
  # Typed, empty frame: it is the result for empty input
  template <- data.frame(
    atext = character(), X = numeric(), Y = numeric(), EID = numeric()
  )

  # Collect the rows first and bind them once at the end; binding inside
  # the loop copies the growing frame on every iteration
  rows <- vector("list", length(addrList))
  i <- 0

  for (addr in addrList) {
    i <- i + 1
    latlng <- Addr2latlng(addr)

    # Fewer than two coordinates cannot be told apart - record both as NA
    if (length(latlng) < 2) {
      latlng <- c(NA_real_, NA_real_)
    }

    # Addr2latlng() yields c(lat, lng), so X takes element 2, Y element 1
    rows[[i]] <- data.frame(
      atext = addr, X = latlng[[2]], Y = latlng[[1]], EID = i
    )
  }

  do.call(rbind, c(list(template), rows))
}

MyMode.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# MyMode(myVector) - Statistical mode of a vector
#   myVector - the values to examine
#
# Returns - the value that occurs most often; when several values share the
#           highest count, the one that appears first in myVector
MyMode <- function(myVector) {
  # Distinct values, in order of first appearance
  candidates <- unique(myVector)

  # Replace every element by the position of its value among the candidates,
  # then count how often each position occurs
  positions <- match(myVector, candidates)
  tally <- tabulate(positions)

  # which.max() picks the first of several equal maxima
  winner <- which.max(tally)
  candidates[winner]
}

twitterSupport.R

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# EnsurePackage(x) - Installs (when missing) and attaches a package
#   x - the package name; coerced to a character string
#
# Returns - TRUE, invisibly; the function is called for its side effect of
#           putting the package on the search path
# Stops with an error when the package still cannot be loaded after the
# installation attempt, rather than silently reporting FALSE.
EnsurePackage <- function(x) {
  x <- as.character(x)

  # requireNamespace() only probes availability; it attaches nothing
  if (!requireNamespace(x, quietly = TRUE)) {
    install.packages(pkgs = x, repos = "https://cran.r-project.org")
  }

  # library() raises an error for an unavailable package, so a failed
  # installation is reported here and not at the first missing function
  library(x, character.only = TRUE)
  invisible(TRUE)
}
11+
12+
# PrepareTwitter() - Make the packages used for Twitter work available
#
# Hands bitops, RCurl, RJSONIO and twitteR, in that order, to
# EnsurePackage(). The result of the final call is the function's value.
PrepareTwitter <- function() {
  # Supporting packages first
  for (supportPackage in c("bitops", "RCurl", "RJSONIO")) {
    EnsurePackage(supportPackage)
  }

  # twitteR goes last
  EnsurePackage("twitteR")
}
20+
21+
# TweetFrame() - Return a data frame based on a search of Twitter
#   searchTerm - the search string passed to searchTwitter()
#   maxTweets  - passed to searchTwitter() as n
#
# Returns - a data frame with one row per element of the search result,
#           sorted ascending by the "created" column; a data frame without
#           rows when the search returns nothing
#
# NOTE(review): searchTwitter() is not defined in this file - presumably
# from twitteR, which PrepareTwitter() loads.
TweetFrame <- function(searchTerm, maxTweets) {
  tweetList <- searchTwitter(searchTerm, n = maxTweets)

  # rbind() over an empty list gives NULL, and NULL cannot be indexed by
  # [rows, ] - so an empty search is answered with an empty frame
  if (length(tweetList) == 0) {
    return(data.frame())
  }

  # as.data.frame() turns each list element into a one-row frame and
  # do.call() passes all of those rows to a single rbind() call
  tweetDF <- do.call("rbind", lapply(tweetList, as.data.frame))

  # Sort into arrival order; drop = FALSE keeps a one-column frame a frame
  tweetDF[order(as.integer(tweetDF$created)), , drop = FALSE]
}
35+
36+
# CleanTweets() - Takes the junk out of a vector of tweet texts
#   tweets - character vector of tweet texts
#
# Returns - a character vector of the same length in which
#           * runs of two or more spaces are collapsed to a single space,
#           * http and https t.co links are removed,
#           * the first "RT @name: " retweet header is removed,
#           * hashtags and @mentions are removed.
# The space collapse runs first, so gaps left behind by the later removals
# stay in the text. Uses base R only.
CleanTweets <- function(tweets) {
  # Remove redundant spaces
  tweets <- gsub(" {2,}", " ", tweets)

  # Get rid of shortened URLs, whatever the length of the link token
  tweets <- gsub("https?://t\\.co/[A-Za-z0-9]+", "", tweets)

  # Take out the retweet header; there is only one, hence sub() not gsub()
  tweets <- sub("RT @[A-Za-z0-9_]*: ", "", tweets)

  # Hashtags and mentions: letters, digits and underscores belong to the
  # name, commas and other punctuation do not
  tweets <- gsub("#[A-Za-z0-9_]*", "", tweets)
  tweets <- gsub("@[A-Za-z0-9_]*", "", tweets)

  tweets
}
49+
50+
# ArrivalProbability - Given a list of arrival times, calculates the
# delays between them with lagged differences, then computes a list of
# cumulative probabilities of arrival for a list of time increments
#   times     - a sorted, ascending list of arrival times in POSIXct
#               (plain numbers work too and are used as they are)
#   increment - the time increment for each new probability; in seconds
#               when times holds date-times
#   max       - the highest time increment
#
# Returns - an ordered list of probabilities in a numeric vector, suitable
#           for plotting with plot(); NULL when increment exceeds max
#
# Each probability is the number of delays strictly below the threshold
# divided by the number of arrival times (not by the number of delays).
ArrivalProbability <- function(times, increment, max) {
  # May not be necessary, but checks for input mistake
  if (increment > max) {
    return(NULL)
  }

  # Probability is defined over the size of this sample of arrival times
  timeLen <- length(times)

  # diff() requires sorted times and yields the delays between neighbours.
  # For date-times it picks secs, mins, hours or days by itself, so the
  # unit is pinned to seconds before the numbers are taken out.
  delays <- diff(times)
  if (inherits(delays, "difftime")) {
    delays <- as.numeric(delays, units = "secs")
  }
  delays <- as.integer(delays)

  # One probability per threshold; the delays are computed only once
  vapply(
    seq(increment, max, by = increment),
    function(threshold) sum(delays < threshold) / timeLen,
    numeric(1)
  )
}
82+
83+
# DelayProbability - Cumulative probabilities for a list of delay times,
# which does not have to be sorted
#   delays    - the delay times
#   increment - the step between successive thresholds
#   max       - the highest threshold
#
# Returns - a numeric vector with one entry per threshold in
#           seq(increment, max, by = increment): the share of delays that
#           are less than or equal to that threshold; NULL when increment
#           exceeds max
DelayProbability <- function(delays, increment, max) {
  # The share is taken over the size of this sample of delays
  sampleSize <- length(delays)

  # Guard against swapped or mistaken arguments
  if (increment > max) {
    return(NULL)
  }

  thresholds <- seq(increment, max, by = increment)

  # delays <= threshold gives one TRUE/FALSE per delay; sum() counts the
  # TRUEs
  shareAtOrBelow <- function(threshold) {
    sum(delays <= threshold) / sampleSize
  }

  vapply(thresholds, shareAtOrBelow, numeric(1))
}
105+
106+
# CompareTweets - Run poisson.test() on the rate ratio of two tweet streams
#   search1   - the first hashtag or search term to look for
#   search2   - the second search term or hashtag to look for
#   numEvents - the number of events to sample for each search
#
# Returns - the result of poisson.test() on two event counts, each taken
#           against numEvents. An event is a delay between consecutive
#           tweets that is no longer than the rounded mean delay of the
#           first stream.
CompareTweets <- function(search1, search2, numEvents) {
  # Delays, in whole seconds, between consecutive tweets of one search
  delaysInSeconds <- function(searchTerm) {
    tweetDF <- TweetFrame(searchTerm, numEvents)
    created <- tweetDF$created[order(as.integer(tweetDF$created))]

    # diff() on date-times chooses secs, mins, hours or days by itself;
    # pin the unit so both streams are measured on the same scale
    gaps <- diff(created)
    if (inherits(gaps, "difftime")) {
      gaps <- as.numeric(gaps, units = "secs")
    }
    as.integer(gaps)
  }

  eventDelays1 <- delaysInSeconds(search1)
  eventDelays2 <- delaysInSeconds(search2)

  # The first stream's mean delay is the yardstick for both streams
  meanDelays1 <- round(mean(eventDelays1))

  eventCount1 <- sum(eventDelays1 <= meanDelays1)
  eventCount2 <- sum(eventDelays2 <= meanDelays1)

  poisson.test(c(eventCount1, eventCount2), c(numEvents, numEvents))
}

0 commit comments

Comments
 (0)