Skip to content

Commit 0ec1080

Browse files
committed
开源文本分类模块,新增情感分析示例
1 parent 874d0d7 commit 0ec1080

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+3255
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ HanLP: Han Language Processing
5050
* 基于神经网络的高性能依存句法分析器
5151
* MaxEnt依存句法分析
5252
* CRF依存句法分析
53+
> * 文本分类
5354
> * 语料库工具
5455
* 分词语料预处理
5556
* 词频词性词典制作
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/*
2+
* <summary></summary>
3+
* <author>He Han</author>
4+
* <email>[email protected]</email>
5+
* <create-date>2016/1/29 18:00</create-date>
6+
*
7+
* <copyright file="AbstractClassifier.java" company="码农场">
8+
* Copyright (c) 2008-2016, 码农场. All Rights Reserved, http://www.hankcs.com/
9+
* This source is subject to Hankcs. Please contact Hankcs to get more information.
10+
* </copyright>
11+
*/
12+
package com.hankcs.hanlp.classification.classifiers;
13+
14+
import com.hankcs.hanlp.classification.corpus.Document;
15+
import com.hankcs.hanlp.classification.corpus.IDataSet;
16+
import com.hankcs.hanlp.classification.corpus.MemoryDataSet;
17+
import com.hankcs.hanlp.classification.models.AbstractModel;
18+
import com.hankcs.hanlp.classification.utilities.CollectionUtility;
19+
import com.hankcs.hanlp.classification.utilities.MathUtility;
20+
21+
import java.io.IOException;
22+
import java.util.Map;
23+
import java.util.TreeMap;
24+
25+
import static com.hankcs.hanlp.classification.utilities.Predefine.logger;
26+
27+
/**
28+
* @author hankcs
29+
*/
30+
public abstract class AbstractClassifier implements IClassifier
31+
{
32+
@Override
33+
public IClassifier enableProbability(boolean enable)
34+
{
35+
return this;
36+
}
37+
38+
/**
39+
* 是否计算概率
40+
*/
41+
boolean configProbabilityEnabled = true;
42+
43+
/**
44+
* 使用一个训练出来的分类器来预测分类
45+
*
46+
* @param text
47+
* @return
48+
* @throws IllegalArgumentException
49+
* @throws IllegalStateException
50+
*/
51+
@Override
52+
public String classify(String text) throws IllegalArgumentException, IllegalStateException
53+
{
54+
Map<String, Double> scoreMap = predict(text);
55+
56+
return CollectionUtility.max(scoreMap);
57+
}
58+
59+
@Override
60+
public String classify(Document document) throws IllegalArgumentException, IllegalStateException
61+
{
62+
Map<String, Double> scoreMap = predict(document);
63+
64+
return CollectionUtility.max(scoreMap);
65+
}
66+
67+
@Override
68+
public void train(String folderPath, String charsetName) throws IOException
69+
{
70+
IDataSet dataSet = new MemoryDataSet();
71+
dataSet.load(folderPath, charsetName);
72+
train(dataSet);
73+
}
74+
75+
@Override
76+
public void train(Map<String, String[]> trainingDataSet) throws IllegalArgumentException
77+
{
78+
IDataSet dataSet = new MemoryDataSet();
79+
logger.start("正在构造训练数据集...");
80+
int total = trainingDataSet.size();
81+
int cur = 0;
82+
for (Map.Entry<String, String[]> entry : trainingDataSet.entrySet())
83+
{
84+
String category = entry.getKey();
85+
logger.out("[%s]...", category);
86+
for (String doc : entry.getValue())
87+
{
88+
dataSet.add(category, doc);
89+
}
90+
++cur;
91+
logger.out("%.2f%%...", MathUtility.percentage(cur, total));
92+
}
93+
logger.finish(" 加载完毕\n");
94+
train(dataSet);
95+
}
96+
97+
@Override
98+
public void train(String folderPath) throws IOException
99+
{
100+
train(folderPath, "UTF-8");
101+
}
102+
103+
@Override
104+
public Map<String, Double> predict(Document document)
105+
{
106+
AbstractModel model = getModel();
107+
if (model == null)
108+
{
109+
throw new IllegalStateException("未训练模型!无法执行预测!");
110+
}
111+
if (document == null)
112+
{
113+
throw new IllegalArgumentException("参数 text == null");
114+
}
115+
116+
double[] probs = categorize(document);
117+
Map<String, Double> scoreMap = new TreeMap<String, Double>();
118+
for (int i = 0; i < probs.length; i++)
119+
{
120+
scoreMap.put(model.catalog[i], probs[i]);
121+
}
122+
return scoreMap;
123+
}
124+
125+
@Override
126+
public int label(Document document) throws IllegalArgumentException, IllegalStateException
127+
{
128+
AbstractModel model = getModel();
129+
if (model == null)
130+
{
131+
throw new IllegalStateException("未训练模型!无法执行预测!");
132+
}
133+
if (document == null)
134+
{
135+
throw new IllegalArgumentException("参数 text == null");
136+
}
137+
138+
double[] probs = categorize(document);
139+
double max = Double.NEGATIVE_INFINITY;
140+
int best = -1;
141+
for (int i = 0; i < probs.length; i++)
142+
{
143+
if (probs[i] > max)
144+
{
145+
max = probs[i];
146+
best = i;
147+
}
148+
}
149+
return best;
150+
}
151+
}
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
/*
2+
* <summary></summary>
3+
* <author>He Han</author>
4+
* <email>[email protected]</email>
5+
* <create-date>2016/1/29 17:59</create-date>
6+
*
7+
* <copyright file="IClassifier.java" company="码农场">
8+
* Copyright (c) 2008-2016, 码农场. All Rights Reserved, http://www.hankcs.com/
9+
* This source is subject to Hankcs. Please contact Hankcs to get more information.
10+
* </copyright>
11+
*/
12+
package com.hankcs.hanlp.classification.classifiers;
13+
14+
import com.hankcs.hanlp.classification.corpus.Document;
15+
import com.hankcs.hanlp.classification.corpus.IDataSet;
16+
import com.hankcs.hanlp.classification.models.AbstractModel;
17+
18+
import java.io.IOException;
19+
import java.util.Map;
20+
21+
/**
22+
* 文本分类器接口
23+
*
24+
* @author hankcs
25+
*/
26+
public interface IClassifier
27+
{
28+
/**
29+
* 是否归一化分值为概率
30+
*
31+
* @param enable
32+
* @return
33+
*/
34+
IClassifier enableProbability(boolean enable);
35+
36+
/**
37+
* 预测分类
38+
*
39+
* @param text 文本
40+
* @return 所有分类对应的分值(或概率, 需要enableProbability)
41+
* @throws IllegalArgumentException 参数错误
42+
* @throws IllegalStateException 未训练模型
43+
*/
44+
Map<String, Double> predict(String text) throws IllegalArgumentException, IllegalStateException;
45+
46+
/**
47+
* 预测分类
48+
* @param document
49+
* @return
50+
*/
51+
Map<String, Double> predict(Document document) throws IllegalArgumentException, IllegalStateException;
52+
53+
/**
54+
* 预测分类
55+
* @param document
56+
* @return
57+
* @throws IllegalArgumentException
58+
* @throws IllegalStateException
59+
*/
60+
double[] categorize(Document document) throws IllegalArgumentException, IllegalStateException;
61+
62+
/**
63+
* 预测最可能的分类
64+
* @param document
65+
* @return
66+
* @throws IllegalArgumentException
67+
* @throws IllegalStateException
68+
*/
69+
int label(Document document) throws IllegalArgumentException, IllegalStateException;
70+
71+
/**
72+
* 预测最可能的分类
73+
* @param text 文本
74+
* @return 最可能的分类
75+
* @throws IllegalArgumentException
76+
* @throws IllegalStateException
77+
*/
78+
String classify(String text) throws IllegalArgumentException, IllegalStateException;
79+
80+
/**
81+
* 预测最可能的分类
82+
* @param document 一个结构化的文档(注意!这是一个底层数据结构,请谨慎操作)
83+
* @return 最可能的分类
84+
* @throws IllegalArgumentException
85+
* @throws IllegalStateException
86+
*/
87+
String classify(Document document) throws IllegalArgumentException, IllegalStateException;
88+
89+
/**
90+
* 训练模型
91+
*
92+
* @param trainingDataSet 训练数据集,用Map储存.键是分类名,值是一个数组,数组中每个元素都是一篇文档的内容.
93+
*/
94+
void train(Map<String, String[]> trainingDataSet) throws IllegalArgumentException;
95+
96+
/**
97+
* 训练模型
98+
*
99+
* @param folderPath 分类语料的根目录.目录必须满足如下结构:<br>
100+
* 根目录<br>
101+
* ├── 分类A<br>
102+
* │ └── 1.txt<br>
103+
* │ └── 2.txt<br>
104+
* │ └── 3.txt<br>
105+
* ├── 分类B<br>
106+
* │ └── 1.txt<br>
107+
* │ └── ...<br>
108+
* └── ...<br>
109+
* 文件不一定需要用数字命名,也不需要以txt作为后缀名,但一定需要是文本文件.
110+
* @param charsetName 文件编码
111+
* @throws IOException 任何可能的IO异常
112+
*/
113+
void train(String folderPath, String charsetName) throws IOException;
114+
115+
/**
116+
* 用UTF-8编码的语料训练模型
117+
*
118+
* @param folderPath 用UTF-8编码的分类语料的根目录.目录必须满足如下结构:<br>
119+
* 根目录<br>
120+
* ├── 分类A<br>
121+
* │ └── 1.txt<br>
122+
* │ └── 2.txt<br>
123+
* │ └── 3.txt<br>
124+
* ├── 分类B<br>
125+
* │ └── 1.txt<br>
126+
* │ └── ...<br>
127+
* └── ...<br>
128+
* 文件不一定需要用数字命名,也不需要以txt作为后缀名,但一定需要是文本文件.
129+
* @throws IOException 任何可能的IO异常
130+
*/
131+
void train(String folderPath) throws IOException;
132+
133+
/**
134+
* 训练模型
135+
* @param dataSet 训练数据集
136+
* @throws IllegalArgumentException 当数据集为空时,将抛出此异常
137+
*/
138+
void train(IDataSet dataSet) throws IllegalArgumentException;
139+
140+
/**
141+
* 获取训练后的模型,可用于序列化保存或预测.
142+
* @return 模型,null表示未训练
143+
*/
144+
AbstractModel getModel();
145+
}

0 commit comments

Comments
 (0)