R       Twitter
    R      2011 (2011/11/26)
                  @a_bicky
• Takeshi Arabiki
    ‣

    ‣ Twitter &          : @a_bicky & id:a_bicky

•
                                R

•
                  http://d.hatena.ne.jp/a_bicky/
R
           Osaka.R #4                               Tokyo.R #16                               Tsukuba.R #9




http://www.slideshare.net/abicky/twitterr   http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090
※
Twitter
Mentionmapp
Mentionmapp
Mentionmapp
http://twilog.org/   http://twitraq.userlocal.jp/




http://whotwi.com/
                                 http://tweetstats.com/
http://twilog.org/   http://twitraq.userlocal.jp/




    R

http://whotwi.com/
                                 http://tweetstats.com/
Twitter


•
•             reshape2
•               ggplot2
•
Twitter


•
•             reshape2
•               ggplot2
•
twitteR
      twitteR
> library(twitteR) # twitteR
> #                          (twitteR 0.99.15     )
> Sys.setlocale("LC_TIME", "C")
[1] "C"
> # @a_bicky         3,200          RT
> statuses <- userTimeline("a_bicky", n = 3200)
status
> #             R5
> ls.str(statuses[[1]])
created : POSIXct[1:1], format: "2011-11-23 22:16:24"
favorited : logi FALSE           ↑            UTC
id : chr "139467359571296256"
initFields : Formal class 'refMethodDef' [package "methods"]
with 5 slots
initialize : Formal class 'refMethodDef' [package "methods"]
with 5 slots
replyToSID : chr(0)
replyToSN : chr(0)
replyToUID : chr(0)
screenName : chr "a_bicky"     ! Twitter
statusSource : chr "<a href="http://sites.google.com/site/
yorufukurou/" rel="nofollow">YoruFukurou</a>"
text : chr "                                               "
truncated :   logi FALSE             ↑
> statusDF <- twListToDF(statuses)
> str(statusDF, vec.len = 1)
'data.frame':	 3159 obs. of 10 variables:
 $ text        : chr "
         " ...                     ↑

 $ favorited   : logi FALSE ...
 $ replyToSN   : logi NA ...
 $ created     : POSIXct, format: "2011-11-23 22:16:24" ...
 $ truncated   : logi FALSE ...      ↑           UTC
 $ replyToSID : logi NA ...
 $ id          : chr "139467359571296256" ...
 $ replyToUID : logi NA ...
 $ statusSource: chr "<a href="http://sites.google.com/
site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ...
 $ screenName : chr "a_bicky" ...
> wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
> statusDF <- within(statusDF, {
+     attr(created, "tzone") <- "Asia/Tokyo" # JST
+     statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1",
statusSource)) # strip the HTML anchor tag, keep the client name
+     date <- factor(format(created, "%Y-%m-%d")) #
+     hour <- NULL; month <- NULL; year <- NULL; wday <- NULL
+     with(as.POSIXlt(created), {
+         hour <<- factor(hour)         #
+         month <<- factor(mon + 1)     #
+         year <<- factor(year + 1900) #
+         wday <<- factor((wday + 6) %% 7, labels = wday.abb) #
+     })
+     textLength <- nchar(text) #
+     #        , URL,
+     cleanText <- removeSpecialStr(text)
+     cleanTextLength <- nchar(cleanText) # URL
+ })
> #                  Twitter
> topSources <- names(head(sort(table(statusDF$statusSource),
decreasing = TRUE), 5))
> statusDF <- within(statusDF, {
+     statusSource <- as.character(statusSource)
+     statusSource[!statusSource %in% topSources] <- "other"
+     #
+     statusSource <- factor(statusSource, levels = names(sort(table
(statusSource), dec = TRUE)))
+ })
Twitter


•
•             reshape2
•               ggplot2
•
reshape2
Excel




9   11   ”Twitter for iPhone”, ”YoruFukurou”
    Sat Mon 12         23
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000




         R
reshape2                                 melt
  melt                                    cast
   melt
cast
> mstatus <- melt(statusDF,
+    id.vars = c("statusSource", "wday", "year", "month", "hour", "date"),
+    measure.vars = c("textLength", "cleanTextLength"))
> mstatus[3157:3162, ]
      statusSource wday year month hour       date        variable value
3157           web Sun 2011      3   20 2011-03-13      textLength    72
3158           web Sun 2011      3   16 2011-03-13      textLength    24
3159           web Sun 2011      3   14 2011-03-13      textLength    82
3160 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    87
3161 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    14
3162 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    21



              id
reshape2                                    cast
      cast
formula                                     fun.aggregate
> args(acast) #         array                       acast
function (data, formula, fun.aggregate    = NULL, ..., margins = NULL,
     subset = NULL, fill = NULL, drop =   TRUE, value_var = guess_value(data))
NULL
> args(dcast) #         data.frame                          dcast
function (data, formula, fun.aggregate    = NULL, ..., margins = NULL,
     subset = NULL, fill = NULL, drop =   TRUE, value_var = guess_value(data))
NULL


formula
...
.
acast     hoge ~ fuga ~ piyo
※dcast       1                            hoge ~ fuga + piyo
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
                                    ↑            cleanTextLength
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>
> #
> acast(mstatus,   . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed   Thu Fri Sat Sun
[1,] 408 360 258   294 334 801 704
> #
> acast(mstatus,   hour ~ wday, length, subset = .(variable ==
"textLength"))
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))
   Mon Tue Wed Thu Fri Sat Sun
0   65 69 26 46 46 49 40
1   48 19 11 15 27 44 37
2   31 24    6 16 17 23 17
3   27 19    4 11 14 17 10
4    4 15    1   7   4   5   7
5    5 11    1   4   3   4   5
6    4 14    3   6   9   8   1
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
, , 3

    Mon Tue Wed Thu Fri Sat Sun
0     3   4   1   0   1   6   4
1     0   1   3   0   0   0   1
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
, , 3          3

    Mon Tue Wed Thu Fri Sat Sun
0     3   4   1   0   1   6   4
1     0   1   3   0   0   0   1
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
         statusSource                 NA
1         YoruFukurou 47.51462, 32.57973
2                 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5              Hatena 80.00000, 25.94212
6               other 52.58621, 33.12180
>
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
         statusSource                 NA
1         YoruFukurou 47.51462, 32.57973
2                 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5              Hatena 80.00000, 25.94212
6               other 52.58621, 33.12180
>
>   #                     t
>   pc <- unlist(subset(statusDF,
+                       statusSource %in% c("YoruFukurou", "web"),
+                       textLength))
>   sp <- unlist(subset(statusDF,
+                       grepl("(iPhone|Android)", statusSource),
+                       textLength))
>   t.test(sp, pc, var.equal = FALSE)

	       Welch Two Sample t-test
                                        !!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.85334 -15.46645
sample estimates:
mean of x mean of y
 31.83945 49.49935
>   #                     t
>   pc <- unlist(subset(statusDF,
+                       statusSource %in% c("YoruFukurou", "web"),
+                       textLength))
>   sp <- unlist(subset(statusDF,
+                       grepl("(iPhone|Android)", statusSource),
+                       textLength))
>   t.test(sp, pc, var.equal = FALSE)

	       Welch Two Sample t-test
                                        !!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.85334 -15.46645
sample estimates:
mean of x mean of y
 31.83945 49.49935
> extractScreenNames <- function(text, strict = TRUE) {
+     if (strict) {
+         # extract only strings that Twitter treats as a screen_name
+         regex <- "(?:(?<!\\w)([@＠])((?>\\w+))(?![@＠])|[\\s\\S])"
+     } else {
+         # also extract from strings such as hoge@example.com
+         regex <- "(?:([@＠])(\\w+)|[\\s\\S])"
+     }
+     screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE)
+     unique(unlist(strsplit(substring(screenNames, 2), "[@＠]")))
+ }
> screenNames <- unlist(lapply(statusDF$text, extractScreenNames))
> head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10
screenNames
        naopr     __gfx__ hirota_inoue     mandy_44    ask_a_lie
          105          85           51           47           40
    ken_nishi      nokuno      yokkuns   JinJin0613 kanon19_rie
           39          39           33           20           20
Twitter


•
•             reshape2
•               ggplot2
•
ggplot2
ggplot2
plot(statusDF$wday, col = "blue")
                                                                ggplot2




                                qplot(wday, data = statusDF, fill = I("blue"),
                                      alpha = I(0.7), xlab = "", ylab = "")
ggplot2




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
ggplot2
                qplot(wday, data = statusDF, facets = ~ statusSource,
                      fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
ggplot2
                qplot(wday, data = statusDF, facets = ~ statusSource,
                      fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
qplot
      ggplot2
> args(qplot)
function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins =
FALSE,
     geom = "auto", stat = list(NULL), position = list(NULL),
     xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
     xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
     asp = NA)
NULL
qplot   geom
       geom

area:
bar:
histogram:
line:
point:
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin",
        fill = statusSource, xlab = "", ylab = "", binwidth = 1)
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(wday, data = statusDF, geom = "bar", stat = "bin",
        fill = statusSource, xlab = "", ylab = "")
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin",
        colour = statusSource, xlab = "", ylab = "", binwidth = 1)
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(wday, data = statusDF, geom = "point", stat = "bin",
        colour = statusSource, xlab = "", ylab = "")
qplot            position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "dodge", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "fill", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "jitter", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "stack", xlab = "", ylab = "")
qplot                           facets
    facets      geom
~           :
        1 ~       2:         1,         2
※reshape2              1 ~        2 +   3
qplot                                  facets
     facets       geom
~           :
         1 ~         2:            1,            2
※reshape2                 1 ~           2 +      3




    qplot(wday, data = statusDF, xlab = "", ylab = "",
          facets = ~ statusSource)
qplot                                  facets
     facets       geom
~           :
         1 ~         2:            1,            2
※reshape2                 1 ~           2 +      3




    qplot(wday, data = statusDF, xlab = "", ylab = "",
          facets = month ~ statusSource)
qplot
alpha               :
colour (color) :
fill                :
linetype            :
size                :



colour, fill, linetype           statusSource
                        fill = I("blue")        I   (AsIs)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        alpha = as.integer(wday))
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        colour = statusSource)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        fill = statusSource)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        linetype = statusSource, colour = statusSource)
whotwi




         http://whotwi.com/
whotwi




         http://whotwi.com/
whotwi
>   #         Twitter
>   #       melt     cast               xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
  hour wday statusSource Freq
1    0 Mon YoruFukurou     48
2    1 Mon YoruFukurou     38
3    2 Mon YoruFukurou     25
whotwi
>   #           Twitter
>   #         melt     cast               xtabs
>   cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
>   head(cnt, 3)
    hour wday statusSource Freq
1      0 Mon YoruFukurou     48
2      1 Mon YoruFukurou     38
3      2 Mon YoruFukurou     25
>   freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) {
+      #
+      freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE)
[1]])
+      cbind(df[1, c("hour", "wday")], freqSource)
+ })
> freqSources <- do.call(rbind, freqSources)
> head(freqSources, 3)
  hour wday freqSource
1     0 Mon YoruFukurou
2     1 Mon YoruFukurou
3     2 Mon YoruFukurou
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+            geom = "point", colour = freqSource, size = Freq)
> p #             print(p)
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+            geom = "point", colour = freqSource, size = Freq)
> p #             print(p)
whotwi
whotwi
whotwi
> # whotwi theme
> theme_whotwi <- function() {
+     opts( #
+          panel.background = theme_rect(fill = NA, colour = NA),
+           #
+          legend.key = theme_rect(fill = NA, colour = NA),
+           #
+          axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2
whotwi
> # whotwi theme
> theme_whotwi <- function() {
+     opts( #
+          panel.background = theme_rect(fill = NA, colour = NA),
+           #
+          legend.key = theme_rect(fill = NA, colour = NA),
+           #
+          axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2
whotwi
whotwi
whotwi
whotwi




         PC
whotwi



PC



         PC
Twitter


•
•             reshape2
•               ggplot2
•
TweetSentiments
TweetSentiments

R
1. RMeCab

2.

3.
RMeCab
    MeCab                      R

> library(RMeCab)
> (docDF(data.frame("                    "), column = 1, type = 1))
number of extracted terms = 5
now making a data frame. wait a while!

     TERM POS1   POS2 Row1
1                      1
2                       1
3                       1
4                       2
5                       2
http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html
          :               :        :1
      :           :           :0.999995
      :               :           :0.999979
          :           :           :0.999979
              :               :         :0.999645
      :               :            :0.999486
      :           :           :0.999314
...
> #
> pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic",
+                     sep = ":",
+                     col.names = c("term", "kana", "pos", "value"),
+                     colClasses = c("character", "character", "factor",
"numeric"),
+                     fileEncoding = "Shift_JIS")
> #
> #
> pndic2 <- aggregate(value ~ term + pos, pndic, mean)
> # pndic
> pos <- unique(pndic2$pos)
> tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos)
number of extracted terms = 7164
now making a data frame. wait a while!

> tweetDF[2900:2904, 1:5]
         TERM   POS1 POS2 Row1 Row2
2900                      0    0
2901                         0       0
2902                     0       0
2903                         0       0
2904                         0       0
> # pndic
> tweetDF <- subset(tweetDF, TERM %in% pndic2$term)
> #
> tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c
("term", "pos"))
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> table(ifelse(pndic$value > 0, "positive",
+              ifelse(pndic$value == 0, "neutral", "negative")))

negative   neutral positive
   49983        20     5122
> table(ifelse(pndic$value > 0, "positive",
+              ifelse(pndic$value == 0, "neutral", "negative")))

negative   neutral positive
   49983        20     5122
> m <- mean(score)
> #
> tweetType <- factor(ifelse(score > m, "positive",
+                     ifelse(score == m, "neutral", "negative")),
+                     levels = c("positive", "neutral", "negative"))
> table(tweetType)
tweetType
positive neutral negative
    1912        0     1247
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
twitteR
•                                 RJSONIO
•
•             ID   status ID
• fav               favorited   TRUE
• truncated   TRUE
• DM
• status
  character   factor
twitteR
•                                 RJSONIO
•
•             ID   status ID
• fav               favorited   TRUE
• truncated   TRUE
• DM
• status
  character   factor
OAuth   ”   ”   twitteR   -
• twitteR
• reshape2       R


• ggplot2


• RMeCab     R
• twitteR
• reshape2       R


• ggplot2


• RMeCab     R

• PC
•
https://github.com/abicky/rjpusers2011_abicky
status
> statuses[[1]]$text
[1] "                                    "
> statuses[[1]]$getText() #
[1] "                                    "
> #
> statuses[[1]]$text <- "                    "
> statuses[[1]]$getText()
[1] "                                "
> statuses[[1]]$setText("ggrks") #
> statuses[[1]]$getText()
[1] "ggrks"
> #
> statuses[[1]]$getCreated()
[1] "2011-11-23 22:16:24 UTC"
removeSpecialStr

# Strip screen names, hashtags and URLs from tweet text, in that order.
#
# Args:
#   text: character vector of tweet texts.
#
# Returns:
#   The result of passing `text` through removeScreenName(), then
#   removeHashTag(), then removeURL(), each with its default arguments.
removeSpecialStr <- function(text) {
    withoutScreenNames <- removeScreenName(text)
    withoutHashTags <- removeHashTag(withoutScreenNames)
    removeURL(withoutHashTags)
}
removeScreenName

# Remove @screen_name mentions from text.
#
# Args:
#   text:   character vector of tweet texts.
#   strict: if TRUE (default), remove an at-sign followed by word characters
#           only when the at-sign is not preceded by a word character and the
#           name is not followed by another at-sign, so the "@example" in
#           "hoge@example.com" is kept. If FALSE, remove every at-sign that is
#           followed by word characters.
#
# Returns:
#   A character vector the same length as `text` with the matches deleted.
removeScreenName <- function(text, strict = TRUE) {
    # ASCII at-sign plus the full-width at-sign (U+FF20).
    # NOTE(review): the full-width variant is restored on the assumption that
    # the blank inside the original character class was a dropped non-ASCII
    # character; a literal space there would delete ordinary words.
    at <- "[@\uFF20]"
    if (strict) {
        # (?>\\w+) is atomic so the name cannot be shortened by backtracking
        # to satisfy the trailing negative lookahead.
        regex <- paste0("(?<!\\w)", at, "(?>\\w+)(?!", at, ")")
    } else {
        regex <- paste0(at, "\\w+")
    }
    gsub(regex, "", text, perl = TRUE)
}
removeURL

# Remove http/https URLs from text.
#
# Args:
#   text:   character vector of tweet texts.
#   strict: if TRUE (default), match only a URL that is not immediately
#           preceded by one of - . # @ = ! ' " / or a word character, with an
#           optional "user:password@" part, a dotted host name ending in
#           ASCII letters, and optional "/..." path segments. If FALSE,
#           match "http://" or "https://" followed by any run of URL-ish
#           characters.
#
# Returns:
#   A character vector the same length as `text` with the matches deleted.
removeURL <- function(text, strict = TRUE) {
    if (strict) {
        regex <- paste0(
            # Not in the middle of a token or inside a quoted attribute.
            "(?<![-.\\w#@=!'\"/])https?://",
            # Optional userinfo; restricted to non-space characters so it
            # cannot run on to a later ':' ... '@' elsewhere in the text.
            "(?:[^\\s:/@]+:[^\\s/@]+@)?",
            # Host labels (no trailing '-') and an alphabetic last label.
            # [A-Za-z], not [A-za-z]: the latter also matches [ \ ] ^ _ `.
            "(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+[A-Za-z]+",
            # Zero or more path segments.
            "(?:/[-\\w#%=+,.?!&~]*)*"
        )
    } else {
        regex <- "https?://[-\\w#%=+,.?!&~/]+"
    }
    gsub(regex, "", text, perl = TRUE)
}
removeHashTag

# Remove hashtags from text.
#
# Args:
#   text:   character vector of tweet texts.
#   strict: if TRUE (default), remove a hash sign followed by word / validJa
#           characters only when it is at the start of the string or right
#           after a delimiter; the delimiter is kept, and a purely numeric tag
#           such as "#123" (not followed by a word character) is kept as well.
#           If FALSE, remove every hash sign together with the following run
#           of non-delimiter characters.
#
# Returns:
#   A character vector the same length as `text` with the matches deleted.
removeHashTag <- function(text, strict = TRUE) {
    # ASCII hash plus the full-width hash (U+FF03).
    # NOTE(review): the full-width variant is restored on the assumption that
    # the blank inside the original character class was a dropped non-ASCII
    # character.
    hash <- "[#\uFF03]"
    # Characters that may precede / terminate a hashtag: whitespace, comma,
    # period, U+3000-U+3002, U+FF01 and U+FF1F.
    delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F"
    # cf. http://nobu666.com/2011/07/13/914.html
    # Extra code-point ranges accepted inside a hashtag besides \w
    # (presumably Japanese scripts and full-width alphanumerics, going by the
    # name and the reference above).
    validJa <- paste0(
        "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA",
        "\u30FC\u3400-\uD7A3\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A",
        "\uFF66-\uFF9E"
    )
    if (strict) {
        # Group 1: leading delimiter (or start of string), always kept.
        # Group 2: digits-only tag, kept. Anything matched by the second
        # alternative is in neither group and is therefore dropped.
        regex <- paste0(
            sprintf("(^|[%s])", delimiters),
            "(?:(", hash, "(?>[0-9]+)(?!\\w))|",
            hash, sprintf("[\\w%s]+)", validJa)
        )
        replacement <- "\\1\\2"
    } else {
        regex <- paste0(hash, sprintf("[^%s]+", delimiters))
        # This pattern has no capture groups, so delete the match outright
        # instead of referring to groups that do not exist.
        replacement <- ""
    }
    gsub(regex, replacement, text, perl = TRUE)
}

RではじめるTwitter解析

  • 1.
    R Twitter R 2011 (2011/11/26) @a_bicky
  • 2.
    • Takeshi Arabiki ‣ ‣ Twitter & : @a_bicky & id:a_bicky • R • http://d.hatena.ne.jp/a_bicky/
  • 3.
    R Osaka.R #4 Tokyo.R #16 Tsukuba.R #9 http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090
  • 4.
  • 6.
  • 7.
  • 8.
  • 9.
  • 10.
    http://twilog.org/ http://twitraq.userlocal.jp/ http://whotwi.com/ http://tweetstats.com/
  • 11.
    http://twilog.org/ http://twitraq.userlocal.jp/ R http://whotwi.com/ http://tweetstats.com/
  • 12.
    Twitter • • reshape2 • ggplot2 •
  • 13.
    Twitter • • reshape2 • ggplot2 •
  • 15.
    twitteR twitteR > library(twitteR) # twitteR > # (twitteR 0.99.15 ) > Sys.setlocale("LC_TIME", "C") [1] "C" > # @a_bicky 3,200 RT > statuses <- userTimeline("a_bicky", n = 3200)
  • 16.
    status > # R5 > ls.str(statuses[[1]]) created : POSIXct[1:1], format: "2011-11-23 22:16:24" favorited : logi FALSE ↑ UTC id : chr "139467359571296256" initFields : Formal class 'refMethodDef' [package "methods"] with 5 slots initialize : Formal class 'refMethodDef' [package "methods"] with 5 slots replyToSID : chr(0) replyToSN : chr(0) replyToUID : chr(0) screenName : chr "a_bicky" ! Twitter statusSource : chr "<a href="http://sites.google.com/site/ yorufukurou/" rel="nofollow">YoruFukurou</a>" text : chr " " truncated : logi FALSE ↑
  • 17.
    > statusDF <-twListToDF(statuses) > str(statusDF, vec.len = 1) 'data.frame': 3159 obs. of 10 variables: $ text : chr " " ... ↑ $ favorited : logi FALSE ... $ replyToSN : logi NA ... $ created : POSIXct, format: "2011-11-23 22:16:24" ... $ truncated : logi FALSE ... ↑ UTC $ replyToSID : logi NA ... $ id : chr "139467359571296256" ... $ replyToUID : logi NA ... $ statusSource: chr "<a href="http://sites.google.com/ site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ... $ screenName : chr "a_bicky" ...
  • 18.
    > wday.abb <-c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") > statusDF <- within(statusDF, { + attr(created, "tzone") <- "Asia/Tokyo" # JST + statusSource <- factor(gsub("<a .*?>(.*?)</a>", "1", statusSource)) # HTML + date <- factor(format(created, "%Y-%m-%d")) # + hour <- NULL; month <- NULL; year <- NULL; wday <- NULL + with(as.POSIXlt(created), { + hour <<- factor(hour) # + month <<- factor(mon + 1) # + year <<- factor(year + 1900) # + wday <<- factor((wday + 6) %% 7, labels = wday.abb) # + }) + textLength <- nchar(text) # + # , URL, + cleanText <- removeSpecialStr(text) + cleanTextLength <- nchar(cleanText) # URL + })
  • 19.
    > # Twitter > topSources <- names(head(sort(table(statusDF$statusSource), decreasing = TRUE), 5)) > statusDF <- within(statusDF, { + statusSource <- as.character(statusSource) + statusSource[!statusSource %in% topSources] <- "other" + # + statusSource <- factor(statusSource, levels = names(sort(table (statusSource), dec = TRUE))) + })
  • 20.
    Twitter • • reshape2 • ggplot2 •
  • 21.
  • 23.
    Excel 9 11 ”Twitter for iPhone”, ”YoruFukurou” Sat Mon 12 23
  • 24.
    reshape2 > library(reshape2) > acast(melt(statusDF,id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000
  • 25.
    reshape2 > library(reshape2) > acast(melt(statusDF,id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000
  • 26.
    reshape2 > library(reshape2) > acast(melt(statusDF,id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000 R
  • 27.
    reshape2 melt melt cast melt cast > mstatus <- melt(statusDF, + id.vars = c("statusSource", "wday", "year", "month", "hour", "date"), + measure.vars = c("textLength", "cleanTextLength")) > mstatus[3157:3162, ] statusSource wday year month hour date variable value 3157 web Sun 2011 3 20 2011-03-13 textLength 72 3158 web Sun 2011 3 16 2011-03-13 textLength 24 3159 web Sun 2011 3 14 2011-03-13 textLength 82 3160 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 87 3161 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 14 3162 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 21 id
  • 28.
    reshape2 cast cast formula fun.aggregate > args(acast) # array acast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL > args(dcast) # data.frame dcast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL formula ... . acast hoge ~ fuga ~ piyo ※dcast 1 hoge ~ fuga + piyo
  • 29.
    > # > acast(mstatus,. ~ wday, length, subset = .(variable == "textLength")) ↑ cleanTextLength
  • 30.
    > # > acast(mstatus,. ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 >
  • 31.
    > # > acast(mstatus,. ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 >
  • 32.
    > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 > # > acast(mstatus, hour ~ wday, length, subset = .(variable == "textLength"))
  • 33.
    > # > acast(mstatus,. ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 > # > acast(mstatus, hour ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun 0 65 69 26 46 46 49 40 1 48 19 11 15 27 44 37 2 31 24 6 16 17 23 17 3 27 19 4 11 14 17 10 4 4 15 1 7 4 5 7 5 5 11 1 4 3 4 5 6 4 14 3 6 9 8 1
  • 34.
    > # > # >acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength"))
  • 35.
    > # > # >acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0
  • 36.
    > # > # >acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength"))
  • 37.
    > # > # >acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength")) , , 3 Mon Tue Wed Thu Fri Sat Sun 0 3 4 1 0 1 6 4 1 0 1 3 0 0 0 1
  • 38.
    > # > # >acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength")) , , 3 3 Mon Tue Wed Thu Fri Sat Sun 0 3 4 1 0 1 6 4 1 0 1 3 0 0 0 1
  • 42.
    Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength"))
  • 43.
    Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength")) statusSource NA 1 YoruFukurou 47.51462, 32.57973 2 web 57.02720, 36.33534 3 Twitter for iPhone 33.42342, 23.06466 4 Twitter for Android 28.49048, 20.08457 5 Hatena 80.00000, 25.94212 6 other 52.58621, 33.12180 >
  • 44.
    Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength")) statusSource NA 1 YoruFukurou 47.51462, 32.57973 2 web 57.02720, 36.33534 3 Twitter for iPhone 33.42342, 23.06466 4 Twitter for Android 28.49048, 20.08457 5 Hatena 80.00000, 25.94212 6 other 52.58621, 33.12180 >
  • 45.
    > # t > pc <- unlist(subset(statusDF, + statusSource %in% c("YoruFukurou", "web"), + textLength)) > sp <- unlist(subset(statusDF, + grepl("(iPhone|Android)", statusSource), + textLength)) > t.test(sp, pc, var.equal = FALSE) Welch Two Sample t-test !! data: sp and pc t = -15.7921, df = 1588.246, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -19.85334 -15.46645 sample estimates: mean of x mean of y 31.83945 49.49935
  • 46.
    > # t > pc <- unlist(subset(statusDF, + statusSource %in% c("YoruFukurou", "web"), + textLength)) > sp <- unlist(subset(statusDF, + grepl("(iPhone|Android)", statusSource), + textLength)) > t.test(sp, pc, var.equal = FALSE) Welch Two Sample t-test !! data: sp and pc t = -15.7921, df = 1588.246, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -19.85334 -15.46645 sample estimates: mean of x mean of y 31.83945 49.49935
  • 47.
    > extractScreenNames <- function(text, strict = TRUE) { + if (strict) { + # Twitter screen_name + regex <- "(?:(?<!\\w)([@＠])((?>\\w+))(?![@＠])|[\\s\\S])" + } else { + # [email protected] + regex <- "(?:([@＠])(\\w+)|[\\s\\S])" + } + screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE) + unique(unlist(strsplit(substring(screenNames, 2), "[@＠]"))) + } > screenNames <- unlist(lapply(statusDF$text, extractScreenNames)) > head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10 screenNames naopr __gfx__ hirota_inoue mandy_44 ask_a_lie 105 85 51 47 40 ken_nishi nokuno yokkuns JinJin0613 kanon19_rie 39 39 33 20 20
  • 48.
    Twitter • • reshape2 • ggplot2 •
  • 49.
  • 50.
    ggplot2 plot(statusDF$wday, col ="blue") ggplot2 qplot(wday, data = statusDF, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")
  • 51.
    ggplot2 qplot(wday, data =statusDF, fill = statusSource, xlab = "", ylab = "")
  • 52.
    ggplot2 qplot(wday, data = statusDF, facets = ~ statusSource, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "") qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 53.
    ggplot2 qplot(wday, data = statusDF, facets = ~ statusSource, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "") qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 54.
    qplot ggplot2 > args(qplot) function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins = FALSE, geom = "auto", stat = list(NULL), position = list(NULL), xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL, xlab = deparse(substitute(x)), ylab = deparse(substitute(y)), asp = NA) NULL
  • 55.
    qplot geom geom area: bar: histogram: line: point:
  • 56.
    qplot geom geom area: bar: histogram: line: point: qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin", fill = statusSource, xlab = "", ylab = "", binwidth = 1)
  • 57.
    qplot geom geom area: bar: histogram: line: point: qplot(wday, data = statusDF, geom = "bar", stat = "bin", fill = statusSource, xlab = "", ylab = "")
  • 58.
    qplot geom geom area: bar: histogram: line: point: qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin", colour = statusSource, xlab = "", ylab = "", binwidth = 1)
  • 59.
    qplot geom geom area: bar: histogram: line: point: qplot(wday, data = statusDF, geom = "point", stat = "bin", colour = statusSource, xlab = "", ylab = "")
  • 60.
    qplot position position geom dodge : fill : 1 jitter : stack :
  • 61.
    qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "dodge", xlab = "", ylab = "")
  • 62.
    qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "fill", xlab = "", ylab = "")
  • 63.
    qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "jitter", xlab = "", ylab = "")
  • 64.
    qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "stack", xlab = "", ylab = "")
  • 65.
    qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3
  • 66.
    qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3 qplot(wday, data = statusDF, xlab = "", ylab = "", facets = ~ statusSource)
  • 67.
    qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3 qplot(wday, data = statusDF, xlab = "", ylab = "", facets = month ~ statusSource)
  • 68.
    qplot alpha : colour (color) : fill : linetype : size : colour, fill, linetype statusSource fill = I("blue") I (AsIs)
  • 69.
    qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", alpha = as.integer(wday))
  • 70.
    qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", colour = statusSource)
  • 71.
    qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", fill = statusSource)
  • 72.
    qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", linetype = statusSource, colour = statusSource)
  • 75.
    whotwi http://whotwi.com/
  • 76.
    whotwi http://whotwi.com/
  • 77.
    whotwi > # Twitter > # melt cast xtabs > cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF)) > head(cnt, 3) hour wday statusSource Freq 1 0 Mon YoruFukurou 48 2 1 Mon YoruFukurou 38 3 2 Mon YoruFukurou 25
  • 78.
    whotwi > # Twitter > # melt cast xtabs > cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF)) > head(cnt, 3) hour wday statusSource Freq 1 0 Mon YoruFukurou 48 2 1 Mon YoruFukurou 38 3 2 Mon YoruFukurou 25 > freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) { + # + freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE) [1]]) + cbind(df[1, c("hour", "wday")], freqSource) + }) > freqSources <- do.call(rbind, freqSources) > head(freqSources, 3) hour wday freqSource 1 0 Mon YoruFukurou 2 1 Mon YoruFukurou 3 2 Mon YoruFukurou
  • 79.
    whotwi > # > cntSum<- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31
  • 80.
    whotwi > # > cntSum<- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq)
  • 81.
    whotwi > # > cntSum<- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq) > p <- qplot(hour, wday, data = data, xlab = "", ylab = "", + geom = "point", colour = freqSource, size = Freq) > p # print(p)
  • 82.
    whotwi > # > cntSum<- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq) > p <- qplot(hour, wday, data = data, xlab = "", ylab = "", + geom = "point", colour = freqSource, size = Freq) > p # print(p)
  • 83.
  • 84.
  • 85.
    whotwi > # whotwitheme > theme_whotwi <- function() { + opts( # + panel.background = theme_rect(fill = NA, colour = NA), + # + legend.key = theme_rect(fill = NA, colour = NA), + # + axis.ticks = theme_segment(colour = NA)) + } > p2 <- p + theme_whotwi() + scale_size(legend = FALSE) + scale_colour_hue(name = "") > p2
  • 86.
    whotwi > # whotwitheme > theme_whotwi <- function() { + opts( # + panel.background = theme_rect(fill = NA, colour = NA), + # + legend.key = theme_rect(fill = NA, colour = NA), + # + axis.ticks = theme_segment(colour = NA)) + } > p2 <- p + theme_whotwi() + scale_size(legend = FALSE) + scale_colour_hue(name = "") > p2
  • 87.
  • 88.
  • 89.
  • 90.
  • 91.
  • 92.
    Twitter • • reshape2 • ggplot2 •
  • 95.
  • 96.
  • 97.
  • 98.
    RMeCab MeCab R > library(RMeCab) > (docDF(data.frame(" "), column = 1, type = 1)) number of extracted terms = 5 now making a data frame. wait a while! TERM POS1 POS2 Row1 1 1 2 1 3 1 4 2 5 2
  • 99.
    http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html : : :1 : : :0.999995 : : :0.999979 : : :0.999979 : : :0.999645 : : :0.999486 : : :0.999314 ...
  • 100.
    > # > pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic", + sep = ":", + col.names = c("term", "kana", "pos", "value"), + colClasses = c("character", "character", "factor", "numeric"), + fileEncoding = "Shift_JIS") > # > # > pndic2 <- aggregate(value ~ term + pos, pndic, mean)
  • 101.
    > # pndic >pos <- unique(pndic2$pos) > tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos) number of extracted terms = 7164 now making a data frame. wait a while! > tweetDF[2900:2904, 1:5] TERM POS1 POS2 Row1 Row2 2900 0 0 2901 0 0 2902 0 0 2903 0 0 2904 0 0 > # pndic > tweetDF <- subset(tweetDF, TERM %in% pndic2$term) > # > tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c ("term", "pos"))
  • 102.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117
  • 103.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117
  • 104.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765
  • 105.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765
  • 106.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 107.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 108.
    > # > score<- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 109.
    > table(ifelse(pndic$value >0, "positive", + ifelse(pndic$value == 0, "neutral", "negative"))) negative neutral positive 49983 20 5122
  • 110.
    > table(ifelse(pndic$value >0, "positive", + ifelse(pndic$value == 0, "neutral", "negative"))) negative neutral positive 49983 20 5122
  • 111.
    > m <-mean(score) > # > tweetType <- factor(ifelse(score > m, "positive", + ifelse(score == m, "neutral", "negative")), + levels = c("positive", "neutral", "negative")) > table(tweetType) tweetType positive neutral negative 1912 0 1247
  • 112.
    > statusDF$tweetType <-droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 113.
    > statusDF$tweetType <-droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 114.
    > statusDF$tweetType <-droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 115.
    > statusDF$tweetType <-droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 118.
    twitteR • RJSONIO • • ID status ID • fav favorited TRUE • truncated TRUE • DM • status character factor
  • 119.
    twitteR • RJSONIO • • ID status ID • fav favorited TRUE • truncated TRUE • DM • status character factor
  • 120.
    OAuth ” ” twitteR -
  • 122.
    • twitteR • reshape2 R • ggplot2 • RMeCab R
  • 123.
    • twitteR • reshape2 R • ggplot2 • RMeCab R • PC •
  • 126.
  • 127.
    status > statuses[[1]]$text [1] " " > statuses[[1]]$getText() # [1] " " > # > statuses[[1]]$text <- " " > statuses[[1]]$getText() [1] " " > statuses[[1]]$setText("ggrks") # > statuses[[1]]$getText() [1] "ggrks" > # > statuses[[1]]$getCreated() [1] "2011-11-23 22:16:24 UTC"
  • 128.
    # removeSpecialStr ----
    #' Strip screen names, hashtags, and URLs from tweet text.
    #'
    #' Thin composition of the three removers defined on the following slides.
    #' Each is called with its default arguments.
    #'
    #' @param text Character vector of tweet texts.
    #' @return The value returned by `removeURL()` after `removeScreenName()`
    #'   and `removeHashTag()` have been applied to `text`.
    removeSpecialStr <- function(text) {
      # Same order as the original nested call (innermost first):
      # screen names, then hashtags, then URLs.
      withoutNames <- removeScreenName(text)
      withoutTags <- removeHashTag(withoutNames)
      removeURL(withoutTags)
    }
  • 129.
    # removeScreenName ----
    #' Strip Twitter screen names (@mentions) from text.
    #'
    #' @param text Character vector of tweet texts.
    #' @param strict If `TRUE` (default), an at sign directly preceded by a word
    #'   character (as in an e-mail address) is ignored, and so is a name that
    #'   is directly followed by another at sign. If `FALSE`, every at sign
    #'   followed by word characters is removed.
    #' @return `text` with every matched screen name deleted.
    removeScreenName <- function(text, strict = TRUE) {
      # NOTE(review): the slide text shows "[@ ]"; the second class member was
      # lost when non-ASCII characters were stripped. It is restored here as
      # the fullwidth commercial at (U+FF20) -- confirm against the original.
      atSign <- "[@\uFF20]"
      regex <- if (strict) {
        # (?<!\\w)  : not glued to a preceding word character
        # (?>\\w+)  : atomic, so the name cannot be shortened to satisfy the
        #             trailing lookahead
        paste0("(?<!\\w)", atSign, "(?>\\w+)(?!", atSign, ")")
      } else {
        paste0(atSign, "\\w+")
      }
      gsub(regex, "", text, perl = TRUE)
    }
  • 130.
    # removeURL ----
    #' Strip http/https URLs from text.
    #'
    #' @param text Character vector of tweet texts.
    #' @param strict If `TRUE` (default), use a structured pattern (optional
    #'   `user:password@`, dot-separated host labels, alphabetic top-level
    #'   domain, optional path) that is skipped when the URL is directly
    #'   preceded by one of `- . # @ = ! ' " /` or a word character. If
    #'   `FALSE`, remove `http(s)://` followed by any run of URL-ish characters.
    #' @return `text` with every matched URL deleted.
    removeURL <- function(text, strict = TRUE) {
      if (strict) {
        regex <- paste0(
          # Must not be glued to a preceding token.
          "(?<![-.\\w#@=!'\"/])",
          "https?://",
          # Optional user:password@ part.
          "(?:[^:]+:.+@)?",
          # One or more host labels, each ending in a literal dot; a label may
          # not end with a hyphen.
          "(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+",
          # Top-level domain. The slide had "[A-za-z]", whose A-z range also
          # matches [ \ ] ^ _ and backtick; narrowed to letters only.
          "[A-Za-z]+",
          # Zero or more path segments.
          "(?:/[-\\w#%=+,.?!&~]*)*"
        )
      } else {
        regex <- "https?://[-\\w#%=+,.?!&~/]+"
      }
      gsub(regex, "", text, perl = TRUE)
    }
  • 131.
    # removeHashTag ----
    #' Strip hashtags from text.
    #'
    #' @param text Character vector of tweet texts.
    #' @param strict If `TRUE` (default), a hashtag must start the string or
    #'   follow a delimiter, and its body may contain word characters and the
    #'   Japanese ranges in `validJa`. A tag made only of ASCII digits that is
    #'   not followed by a word character is kept. If `FALSE`, a hash sign plus
    #'   everything up to the next delimiter is removed.
    #' @return `text` with the matched hashtags deleted; in strict mode the
    #'   delimiter in front of a removed tag is preserved.
    removeHashTag <- function(text, strict = TRUE) {
      # Whitespace, comma, full stop, U+3000-U+3002, U+FF01 and U+FF1F.
      delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F"
      # cf. http://nobu666.com/2011/07/13/914.html
      validJa <- paste0(
        "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC\u3400-\uD7A3",
        "\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFF9E"
      )
      # NOTE(review): the slide text shows "[# ]"; the second class member was
      # lost when non-ASCII characters were stripped. It is restored here as
      # the fullwidth number sign (U+FF03) -- confirm against the original.
      hashSign <- "[#\uFF03]"
      if (strict) {
        # Group 1: leading delimiter (or start of string), always put back.
        # Group 2: a digits-only tag, put back so it is effectively kept; the
        #          atomic group stops "#123abc" from being read as "#12".
        # The slide passed validJa to sprintf() twice although the format had
        # only one placeholder for it; the surplus argument is dropped.
        regex <- sprintf(
          "(^|[%s])(?:(%s(?>[0-9]+)(?!\\w))|%s[\\w%s]+)",
          delimiters, hashSign, hashSign, validJa
        )
        replacement <- "\\1\\2"
      } else {
        # This pattern has no capture groups, so replace with the empty string
        # rather than with back-references to groups that do not exist.
        regex <- sprintf("%s[^%s]+", hashSign, delimiters)
        replacement <- ""
      }
      gsub(regex, replacement, text, perl = TRUE)
    }