# Produced by the EasyShu team; more articles on the WeChat account [EasyShu].
# For corrections or further study, contact WeChat: EasyCharts

library(tm)
library(wordcloud)

# ---- Read the two source documents -------------------------------------
# Each TXT file is read token-by-token, then collapsed into one long string.
Paper1 <- paste(scan("Paper1.txt", what = character(0), sep = ""), collapse = " ")
Paper2 <- paste(scan("Paper2.txt", what = character(0), sep = ""), collapse = " ")

tmpText <- data.frame(c(Paper1, Paper2), row.names = c("Text1", "Text2"))

# DataframeSource requires a data frame whose first column is `doc_id`
# and whose second column is `text`.
df_title <- data.frame(doc_id = row.names(tmpText),
                       text   = tmpText$c.Paper1..Paper2.)
ds <- DataframeSource(df_title)

# ---- Build and clean the corpus ----------------------------------------
corp <- VCorpus(ds)
corp <- tm_map(corp, removePunctuation)         # strip punctuation
corp <- tm_map(corp, PlainTextDocument)         # ensure plain-text documents
corp <- tm_map(corp, removeNumbers)             # strip digits
# Pass removeWords directly (with its stop-word list as an extra argument):
# wrapping it in a bare anonymous function corrupts a VCorpus in tm >= 0.6.
corp <- tm_map(corp, removeWords, stopwords())  # drop English stop words

# ---- Term-document matrix ----------------------------------------------
# Tokenize the cleaned corpus into a term x document frequency matrix.
term.matrix <- TermDocumentMatrix(corp)
term.matrix <- as.matrix(term.matrix)           # raw term frequencies
colnames(term.matrix) <- c("Paper1", "Paper2")  # consistent capitalization

df <- data.frame(term.matrix)
write.csv(df, "term_matrix.csv")                # export frequency results

# ---- Reload the exported data ------------------------------------------
df <- read.csv("term_matrix.csv", header = TRUE, row.names = 1)

# ---- Compare the two documents -----------------------------------------
comparison.cloud(df, max.words = 300, random.order = FALSE, rot.per = 0.15,
                 scale = c(4, 0.4), title.size = 1.4)

comparison.cloud(df, max.words = 300, random.order = FALSE,
                 colors = c("#00B2FF", "red"))
commonality.cloud(df, max.words = 100, random.order = FALSE, color = "#E7298A")

# Comparison cloud with a larger vocabulary and per-document colors.
comparison.cloud(df, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC"),
                 title.size = 1.5, max.words = 500)

# ---- Single-document word cloud ----------------------------------------
# Colors <- colorRampPalette(rev(brewer.pal(9, "RdBu")))(length(df$Paper1 > 10))
wordcloud(row.names(df), df$Paper1, min.freq = 10,
          col = brewer.pal(8, "Dark2"), rot.per = 0.3)
0 commit comments