RではじめるTwitter解析

R Twitter
R 2011 (2011/11/26)
@a_bicky

• Takeshi Arabiki
‣

‣ Twitter & : @a_bicky & id:a_bicky

•
R

•
http://d.hatena.ne.jp/a_bicky/

R
Osaka.R #4 Tokyo.R #16 Tsukuba.R #9

http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090

http://twilog.org/ http://twitraq.userlocal.jp/

http://whotwi.com/
http://tweetstats.com/

http://twilog.org/ http://twitraq.userlocal.jp/

R

http://whotwi.com/
http://tweetstats.com/

Twitter

•
• reshape2
• ggplot2
•

twitteR
twitteR
> library(twitteR) # twitteR
> # (twitteR 0.99.15 )
> Sys.setlocale("LC_TIME", "C")
[1] "C"
> # @a_bicky 3,200 RT
> statuses <- userTimeline("a_bicky", n = 3200)

status
> # R5
> ls.str(statuses[[1]])
created : POSIXct[1:1], format: "2011-11-23 22:16:24"
favorited : logi FALSE ↑ UTC
id : chr "139467359571296256"
initFields : Formal class 'refMethodDef' [package "methods"]
with 5 slots
initialize : Formal class 'refMethodDef' [package "methods"]
with 5 slots
replyToSID : chr(0)
replyToSN : chr(0)
replyToUID : chr(0)
screenName : chr "a_bicky" ! Twitter
statusSource : chr "<a href="http://sites.google.com/site/
yorufukurou/" rel="nofollow">YoruFukurou</a>"
text : chr " "
truncated : logi FALSE ↑

> statusDF <- twListToDF(statuses)
> str(statusDF, vec.len = 1)
'data.frame': 3159 obs. of 10 variables:
$ text : chr "
" ... ↑

$ favorited : logi FALSE ...
$ replyToSN : logi NA ...
$ created : POSIXct, format: "2011-11-23 22:16:24" ...
$ truncated : logi FALSE ... ↑ UTC
$ replyToSID : logi NA ...
$ id : chr "139467359571296256" ...
$ replyToUID : logi NA ...
$ statusSource: chr "<a href="http://sites.google.com/
site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ...
$ screenName : chr "a_bicky" ...

> wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
> statusDF <- within(statusDF, {
+ attr(created, "tzone") <- "Asia/Tokyo" # JST
+ statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1",
statusSource)) # strip the HTML anchor, keep the client name
+ date <- factor(format(created, "%Y-%m-%d")) #
+ hour <- NULL; month <- NULL; year <- NULL; wday <- NULL
+ with(as.POSIXlt(created), {
+ hour <<- factor(hour) #
+ month <<- factor(mon + 1) #
+ year <<- factor(year + 1900) #
+ wday <<- factor((wday + 6) %% 7, labels = wday.abb) #
+ })
+ textLength <- nchar(text) #
+ # , URL,
+ cleanText <- removeSpecialStr(text)
+ cleanTextLength <- nchar(cleanText) # URL
+ })

> # Twitter
> topSources <- names(head(sort(table(statusDF$statusSource),
decreasing = TRUE), 5))
> statusDF <- within(statusDF, {
+ statusSource <- as.character(statusSource)
+ statusSource[!statusSource %in% topSources] <- "other"
+ #
+ statusSource <- factor(statusSource, levels = names(sort(table
(statusSource), dec = TRUE)))
+ })

Excel

9 11 ”Twitter for iPhone”, ”YoruFukurou”
Sat Mon 12 23

reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+ measure.vars = c("textLength")),
+ month + statusSource ~ wday, mean,
+ subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+ & month %in% 9:11 & hour %in% 12:23
+ & wday %in% c("Mon", "Sat", "Sun")))
Mon Sat Sun
9_YoruFukurou 43 42.13333 54.76471
9_Twitter for iPhone 16 27.70000 20.50000
10_YoruFukurou 61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou 35 41.08197 57.32609
11_Twitter for iPhone NaN NaN 32.00000

reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+ measure.vars = c("textLength")),
+ month + statusSource ~ wday, mean,
+ subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+ & month %in% 9:11 & hour %in% 12:23
+ & wday %in% c("Mon", "Sat", "Sun")))
Mon Sat Sun
9_YoruFukurou 43 42.13333 54.76471
9_Twitter for iPhone 16 27.70000 20.50000
10_YoruFukurou 61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou 35 41.08197 57.32609
11_Twitter for iPhone NaN NaN 32.00000

R

reshape2 melt
melt cast
melt
cast
> mstatus <- melt(statusDF,
+ id.vars = c("statusSource", "wday", "year", "month", "hour", "date"),
+ measure.vars = c("textLength", "cleanTextLength"))
> mstatus[3157:3162, ]
statusSource wday year month hour date variable value
3157 web Sun 2011 3 20 2011-03-13 textLength 72
3160 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 87

id

reshape2 cast
cast
formula fun.aggregate
> args(acast) # array acast
function (data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data))
NULL
> args(dcast) # data.frame dcast
function (data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data))
NULL

formula
...
.
acast hoge ~ fuga ~ piyo
※dcast 1 hoge ~ fuga + piyo

> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
↑ cleanTextLength

> #
Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>

> #
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))

> #
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))
0 65 69 26 46 46 49 40
1 48 19 11 15 27 44 37
2 31 24 6 16 17 23 17
3 27 19 4 11 14 17 10
4 4 15 1 7 4 5 7
5 5 11 1 4 3 4 5
6 4 14 3 6 9 8 1

> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))

> #
> #
"textLength"))
Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> #
> #
"textLength"))
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))

> #
> #
"textLength"))
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> # 3
"textLength"))
, , 3

0 3 4 1 0 1 6 4
1 0 1 3 0 0 0 1

> #
> #
"textLength"))
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> # 3
"textLength"))
, , 3 3

0 3 4 1 0 1 6 4
1 0 1 3 0 0 0 1

Twitter
reshape2 1

> #
> dcast(mstatus, statusSource ~ .,
+ function(x) list(c(mean = mean(x), sd = sd(x))),
+ fill = list(c(mean = NaN, sd = NA)), ←
+ subset = .(variable == "textLength"))

Twitter
reshape2 1

> #
> dcast(mstatus, statusSource ~ .,
+ function(x) list(c(mean = mean(x), sd = sd(x))),
+ fill = list(c(mean = NaN, sd = NA)), ←
+ subset = .(variable == "textLength"))
statusSource NA
1 YoruFukurou 47.51462, 32.57973
2 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5 Hatena 80.00000, 25.94212
6 other 52.58621, 33.12180
>

> # t
> pc <- unlist(subset(statusDF,
+ statusSource %in% c("YoruFukurou", "web"),
+ textLength))
> sp <- unlist(subset(statusDF,
+ grepl("(iPhone|Android)", statusSource),
+ textLength))
> t.test(sp, pc, var.equal = FALSE)

Welch Two Sample t-test
!!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-19.85334 -15.46645
sample estimates:
mean of x mean of y
31.83945 49.49935

> extractScreenNames <- function(text, strict = TRUE) {
+ if (strict) {
+ # match only what Twitter itself treats as a screen_name
+ regex <- "(?:(?<!\\w)([@\uFF20])((?>\\w+))(?![@\uFF20])|[\\s\\S])"
+ } else {
+ # also pick up the "@example" part of strings like hoge@example.com
+ regex <- "(?:([@\uFF20])(\\w+)|[\\s\\S])"
+ }
+ screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE)
+ unique(unlist(strsplit(substring(screenNames, 2), "[@\uFF20]")))
+ }
> screenNames <- unlist(lapply(statusDF$text, extractScreenNames))
> head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10
screenNames
naopr __gfx__ hirota_inoue mandy_44 ask_a_lie
105 85 51 47 40
ken_nishi nokuno yokkuns JinJin0613 kanon19_rie
39 39 33 20 20

ggplot2
plot(statusDF$wday, col = "blue")
ggplot2

qplot(wday, data = statusDF, fill = I("blue"),
alpha = I(0.7), xlab = "", ylab = "")

ggplot2

qplot(wday, data = statusDF, fill = statusSource,
xlab = "", ylab = "")

ggplot2
qplot(wday, data = statusDF, facets = ~ statusSource,
fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")

xlab = "", ylab = "")

qplot
ggplot2
> args(qplot)
function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins =
FALSE,
geom = "auto", stat = list(NULL), position = list(NULL),
xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
asp = NA)
NULL

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin",
fill = statusSource, xlab = "", ylab = "", binwidth = 1)

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(wday, data = statusDF, geom = "bar", stat = "bin",
fill = statusSource, xlab = "", ylab = "")

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin",
colour = statusSource, xlab = "", ylab = "", binwidth = 1)

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(wday, data = statusDF, geom = "point", stat = "bin",
colour = statusSource, xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "dodge", xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "fill", xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "jitter", xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "stack", xlab = "", ylab = "")

qplot facets
facets geom
~ :
1 ~ 2: 1, 2
※reshape2 1 ~ 2 + 3

qplot facets
facets geom
~ :
1 ~ 2: 1, 2
※reshape2 1 ~ 2 + 3

qplot(wday, data = statusDF, xlab = "", ylab = "",
facets = ~ statusSource)

qplot facets
facets geom
~ :
1 ~ 2: 1, 2
※reshape2 1 ~ 2 + 3

facets = month ~ statusSource)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

colour, fill, linetype statusSource
fill = I("blue") I (AsIs)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

alpha = as.integer(wday))

qplot
alpha :
colour (color) :
fill :
linetype :
size :

colour = statusSource)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

fill = statusSource)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

linetype = statusSource, colour = statusSource)

whotwi

http://whotwi.com/

whotwi
> # Twitter
> # melt cast xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
hour wday statusSource Freq
1 0 Mon YoruFukurou 48

whotwi
> # Twitter
> # melt cast xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
hour wday statusSource Freq
> freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) {
+ #
+ freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE)
[1]])
+ cbind(df[1, c("hour", "wday")], freqSource)
+ })
> freqSources <- do.call(rbind, freqSources)
> head(freqSources, 3)
hour wday freqSource
1 0 Mon YoruFukurou
2 1 Mon YoruFukurou
3 2 Mon YoruFukurou

whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
hour wday Freq
1 0 Mon 65
2 1 Mon 48
3 2 Mon 31

whotwi
> #
> head(cntSum, 3)
hour wday Freq
1 0 Mon 65
2 1 Mon 48
3 2 Mon 31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)

whotwi
> #
> head(cntSum, 3)
hour wday Freq
1 0 Mon 65
2 1 Mon 48
3 2 Mon 31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+ geom = "point", colour = freqSource, size = Freq)
> p # print(p)

whotwi
> # whotwi theme
> theme_whotwi <- function() {
+ opts( #
+ panel.background = theme_rect(fill = NA, colour = NA),
+ #
+ legend.key = theme_rect(fill = NA, colour = NA),
+ #
+ axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2

RMeCab
MeCab R

> library(RMeCab)
> (docDF(data.frame(" "), column = 1, type = 1))
number of extracted terms = 5
now making a data frame. wait a while!

TERM POS1 POS2 Row1
1 1
2 1
3 1
4 2
5 2

http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html
: : :1
: : :0.999995
: : :0.999979
: : :0.999979
: : :0.999645
: : :0.999486
: : :0.999314
...

> #
> pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic",
+ sep = ":",
+ col.names = c("term", "kana", "pos", "value"),
+ colClasses = c("character", "character", "factor",
"numeric"),
+ fileEncoding = "Shift_JIS")
> #
> #
> pndic2 <- aggregate(value ~ term + pos, pndic, mean)

> # pndic
> pos <- unique(pndic2$pos)
> tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos)
number of extracted terms = 7164
now making a data frame. wait a while!

> tweetDF[2900:2904, 1:5]
TERM POS1 POS2 Row1 Row2
2900 0 0
2901 0 0
2902 0 0
2903 0 0
2904 0 0
> # pndic
> tweetDF <- subset(tweetDF, TERM %in% pndic2$term)
> #
> tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c
("term", "pos"))

> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117

> #
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765

> #
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277

> table(ifelse(pndic$value > 0, "positive",
+ ifelse(pndic$value == 0, "neutral", "negative")))

negative neutral positive
49983 20 5122

> m <- mean(score)
> #
> tweetType <- factor(ifelse(score > m, "positive",
+ ifelse(score == m, "neutral", "negative")),
+ levels = c("positive", "neutral", "negative"))
> table(tweetType)
tweetType
positive neutral negative
1912 0 1247

> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+ geom = "bar", fill = tweetType, position = "fill")

twitteR
• RJSONIO
•
• ID status ID
• fav favorited TRUE
• truncated TRUE
• DM
• status
character factor

OAuth ” ” twitteR -

• twitteR
• reshape2 R

• ggplot2

• RMeCab R

• twitteR
• reshape2 R

• ggplot2

• RMeCab R

• PC
•

https://github.com/abicky/rjpusers2011_abicky

status
> statuses[[1]]$text
[1] " "
> statuses[[1]]$getText() #
[1] " "
> #
> statuses[[1]]$text <- " "
> statuses[[1]]$getText()
[1] " "
> statuses[[1]]$setText("ggrks") #
> statuses[[1]]$getText()
[1] "ggrks"
> #
> statuses[[1]]$getCreated()
[1] "2011-11-23 22:16:24 UTC"

removeSpecialStr

# Strip Twitter-specific tokens from tweet text.
#
# Applies the three sibling filters in a fixed order: screen names first,
# then hashtags, then URLs. Each helper is called with its default arguments.
#
#   text  value handed to removeScreenName() (the helpers pass it to gsub()).
#
# Returns the result of removeURL() on the filtered text.
removeSpecialStr <- function(text) {
  withoutNames <- removeScreenName(text)
  withoutTags <- removeHashTag(withoutNames)
  removeURL(withoutTags)
}

removeScreenName

# Remove Twitter screen names (@mentions) from text.
#
#   text    character vector to clean.
#   strict  TRUE (default): remove a mention only when its sigil is not
#           preceded by a word character and the name is not immediately
#           followed by another sigil, so "hoge@example.com" is left alone.
#           FALSE: remove every sigil followed by word characters.
#
# Returns a character vector of the same length as `text`.
removeScreenName <- function(text, strict = TRUE) {
  # Accept both the ASCII "@" and the full-width commercial at (U+FF20).
  at <- "[@\uFF20]"
  if (strict) {
    # (?>...) is atomic: the name cannot be shortened by backtracking just to
    # satisfy the trailing negative look-ahead.
    regex <- paste0("(?<!\\w)", at, "(?>\\w+)(?!", at, ")")
  } else {
    regex <- paste0(at, "\\w+")
  }
  gsub(regex, "", text, perl = TRUE)
}

removeURL

# Remove http/https URLs from text.
#
#   text    character vector to clean.
#   strict  TRUE (default): require a host made of dot-separated labels plus
#           an alphabetic top-level domain, and skip URLs glued to a preceding
#           word character or one of - . # @ = ! ' " / (e.g. inside an HTML
#           attribute value).
#           FALSE: remove "http(s)://" followed by any run of URL characters.
#
# Returns a character vector of the same length as `text`.
removeURL <- function(text, strict = TRUE) {
  if (strict) {
    # Built from pieces so the literal contains no line breaks and every
    # backslash escape is explicit.
    regex <- paste0(
      "(?<![-.\\w#@=!'\"/])",                    # must not follow these chars
      "https?://",
      "(?:[^:]+:.+@)?",                          # optional "user:password@"
      "(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+",  # host labels, none ending in "-"
      "[A-Za-z]+",                               # top-level domain
      "(?:/[-\\w#%=+,.?!&~]*)*"                  # optional path segments
    )
  } else {
    regex <- "https?://[-\\w#%=+,.?!&~/]+"
  }
  gsub(regex, "", text, perl = TRUE)
}

removeHashTag

# Remove hashtags from text.
#
#   text    character vector to clean.
#   strict  TRUE (default): a hashtag must start at the beginning of the
#           string or right after a delimiter; the delimiter itself is kept.
#           Tags consisting only of ASCII digits (e.g. "#123") are kept.
#           FALSE: remove the sigil plus everything up to the next delimiter.
#
# Returns a character vector of the same length as `text`.
removeHashTag <- function(text, strict = TRUE) {
  # Accept both the ASCII "#" and the full-width number sign (U+FF03).
  hash <- "[#\uFF03]"
  # Character-class body: whitespace, comma, period, U+3000-U+3002, U+FF01
  # and U+FF1F.
  delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F"
  # Extra (non-\w) code-point ranges accepted inside a hashtag.
  # cf. http://nobu666.com/2011/07/13/914.html
  validJa <- paste0(
    "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC",
    "\u3400-\uD7A3\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A",
    "\uFF66-\uFF9E"
  )
  if (strict) {
    # Group 1 = leading delimiter (or empty at string start).
    # Group 2 = digits-only tag, captured so the replacement puts it back.
    regex <- paste0(
      "(^|[", delimiters, "])",
      "(?:(", hash, "(?>[0-9]+)(?!\\w))",
      "|", hash, "[\\w", validJa, "]+)"
    )
    replacement <- "\\1\\2"
  } else {
    # No capture groups in this pattern, so the replacement is simply empty.
    regex <- paste0(hash, "[^", delimiters, "]+")
    replacement <- ""
  }
  gsub(regex, replacement, text, perl = TRUE)
}

RではじめるTwitter解析

More Related Content

What's hot

Viewers also liked

Similar to RではじめるTwitter解析

More from Takeshi Arabiki

Recently uploaded

RではじめるTwitter解析