Skip to content

Commit c33c0b4

Browse files
committed
重构EnumItemDictionary
1 parent 3d07aff commit c33c0b4

File tree

6 files changed

+162
-444
lines changed

6 files changed

+162
-444
lines changed

src/main/java/com/hankcs/hanlp/dictionary/common/CommonDictionary.java

Lines changed: 56 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,17 @@
1212
package com.hankcs.hanlp.dictionary.common;
1313

1414
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
15+
import com.hankcs.hanlp.corpus.io.ByteArray;
1516
import com.hankcs.hanlp.corpus.io.IOUtil;
16-
import com.hankcs.hanlp.dictionary.BaseSearcher;
17+
import com.hankcs.hanlp.utility.TextUtility;
1718

18-
import java.io.*;
19+
import java.io.BufferedReader;
20+
import java.io.DataOutputStream;
21+
import java.io.IOException;
22+
import java.io.InputStreamReader;
1923
import java.util.*;
2024

25+
import static com.hankcs.hanlp.utility.Predefine.BIN_EXT;
2126
import static com.hankcs.hanlp.utility.Predefine.logger;
2227

2328
/**
@@ -29,68 +34,84 @@ public abstract class CommonDictionary<V>
2934
{
3035
DoubleArrayTrie<V> trie;
3136

37+
/**
 * Reads the value array from the head of a binary cache.
 * <p>
 * Called by {@code loadDat} before the trie itself is read from the same
 * {@code byteArray}; the array returned here is handed to {@code trie.load}
 * together with that {@code byteArray}.
 *
 * @param byteArray the binary cache to read from
 * @return the values, or {@code null} to signal failure, in which case
 *         {@code loadDat} returns {@code false}
 */
protected abstract V[] loadValueArray(ByteArray byteArray);
38+
3239
public boolean load(String path)
3340
{
3441
trie = new DoubleArrayTrie<V>();
3542
long start = System.currentTimeMillis();
36-
V[] valueArray = onLoadValue(path);
37-
if (valueArray == null)
38-
{
39-
logger.info("加载值" + path + ".value.dat失败,耗时" + (System.currentTimeMillis() - start) + "ms");
40-
return false;
41-
}
42-
logger.info("加载值" + path + ".value.dat成功,耗时" + (System.currentTimeMillis() - start) + "ms");
43-
start = System.currentTimeMillis();
44-
if (loadDat(path + ".trie.dat", valueArray))
43+
if (loadDat(ByteArray.createByteArray(path + BIN_EXT)))
4544
{
46-
logger.info("加载键" + path + ".trie.dat成功,耗时" + (System.currentTimeMillis() - start) + "ms");
4745
return true;
4846
}
49-
List<String> keyList = new ArrayList<String>(valueArray.length);
47+
TreeMap<String, V> map = new TreeMap<String, V>();
5048
try
5149
{
5250
BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
5351
String line;
5452
while ((line = br.readLine()) != null)
5553
{
5654
String[] paramArray = line.split("\\s");
57-
keyList.add(paramArray[0]);
55+
map.put(paramArray[0], createValue(paramArray));
5856
}
5957
br.close();
6058
}
6159
catch (Exception e)
6260
{
6361
logger.warning("读取" + path + "失败" + e);
6462
}
65-
int resultCode = trie.build(keyList, valueArray);
63+
Set<Map.Entry<String, V>> entrySet = map.entrySet();
64+
List<String> keyList = new ArrayList<String>(entrySet.size());
65+
List<V> valueList = new ArrayList<V>(entrySet.size());
66+
for (Map.Entry<String, V> entry : entrySet)
67+
{
68+
keyList.add(entry.getKey());
69+
valueList.add(entry.getValue());
70+
}
71+
int resultCode = trie.build(keyList, valueList);
6672
if (resultCode != 0)
6773
{
68-
logger.warning("trie建立失败" + resultCode + ",正在尝试排序后重载");
69-
TreeMap<String, V> map = new TreeMap<String, V>();
70-
for (int i = 0; i < valueArray.length; ++i)
71-
{
72-
map.put(keyList.get(i), valueArray[i]);
73-
}
74-
trie = new DoubleArrayTrie<V>();
75-
trie.build(map);
76-
int i = 0;
77-
for (V v : map.values())
78-
{
79-
valueArray[i++] = v;
80-
}
74+
logger.warning("trie建立失败");
75+
return false;
8176
}
82-
trie.save(path + ".trie.dat");
83-
onSaveValue(valueArray, path);
84-
logger.info(path + "加载成功");
77+
logger.info(path + "加载成功,耗时" + (System.currentTimeMillis() - start) + "ms");
78+
saveDat(path + BIN_EXT, valueList);
8579
return true;
8680
}
8781

88-
private boolean loadDat(String path, V[] valueArray)
82+
protected boolean loadDat(ByteArray byteArray)
83+
{
84+
V[] valueArray = loadValueArray(byteArray);
85+
if (valueArray == null)
86+
{
87+
return false;
88+
}
89+
return trie.load(byteArray, valueArray);
90+
}
91+
92+
protected boolean saveDat(String path, List<V> valueArray)
8993
{
90-
if (trie.load(path, valueArray)) return true;
91-
return false;
94+
try
95+
{
96+
DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(path));
97+
out.writeInt(valueArray.size());
98+
for (V item : valueArray)
99+
{
100+
saveValue(item, out);
101+
}
102+
trie.save(out);
103+
out.close();
104+
}
105+
catch (Exception e)
106+
{
107+
logger.warning("保存失败" + TextUtility.exceptionToString(e));
108+
return false;
109+
}
110+
return true;
92111
}
93112

113+
/**
 * Serializes a single value into the binary cache.
 * <p>
 * {@code saveDat} calls this once per value, after it has written the number
 * of values and before it writes the trie.
 *
 * @param value the value to write
 * @param out   the cache output stream
 * @throws IOException if writing fails
 */
protected abstract void saveValue(V value, DataOutputStream out) throws IOException;
114+
94115
/**
95116
* 查询一个单词
96117
*
@@ -123,103 +144,5 @@ public int size()
123144
return trie.size();
124145
}
125146

126-
/**
127-
* 排序这个词典
128-
*
129-
* @param path
130-
* @return
131-
*/
132-
public static boolean sort(String path)
133-
{
134-
TreeMap<String, String> map = new TreeMap<String, String>();
135-
try
136-
{
137-
BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
138-
String line;
139-
while ((line = br.readLine()) != null)
140-
{
141-
String[] argArray = line.split("\\s");
142-
map.put(argArray[0], line);
143-
}
144-
br.close();
145-
// 输出它们
146-
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(IOUtil.newOutputStream(path)));
147-
for (Map.Entry<String, String> entry : map.entrySet())
148-
{
149-
bw.write(entry.getValue());
150-
bw.newLine();
151-
}
152-
bw.close();
153-
}
154-
catch (Exception e)
155-
{
156-
logger.warning("读取" + path + "失败" + e);
157-
return false;
158-
}
159-
return true;
160-
}
161-
162-
/**
163-
* 实现此方法来加载值
164-
*
165-
* @param path
166-
* @return
167-
*/
168-
protected abstract V[] onLoadValue(String path);
169-
170-
protected abstract boolean onSaveValue(V[] valueArray, String path);
171-
172-
public BaseSearcher getSearcher(String text)
173-
{
174-
return new Searcher(text);
175-
}
176-
177-
/**
178-
* 前缀搜索,长短都可匹配
179-
*/
180-
public class Searcher extends BaseSearcher<V>
181-
{
182-
/**
183-
* 分词从何处开始,这是一个状态
184-
*/
185-
int begin;
186-
187-
private List<Map.Entry<String, V>> entryList;
188-
189-
protected Searcher(char[] c)
190-
{
191-
super(c);
192-
}
193-
194-
protected Searcher(String text)
195-
{
196-
super(text);
197-
entryList = new LinkedList<Map.Entry<String, V>>();
198-
}
199-
200-
@Override
201-
public Map.Entry<String, V> next()
202-
{
203-
// 保证首次调用找到一个词语
204-
while (entryList.size() == 0 && begin < c.length)
205-
{
206-
entryList = trie.commonPrefixSearchWithValue(c, begin);
207-
++begin;
208-
}
209-
// 之后调用仅在缓存用完的时候调用一次
210-
if (entryList.size() == 0 && begin < c.length)
211-
{
212-
entryList = trie.commonPrefixSearchWithValue(c, begin);
213-
++begin;
214-
}
215-
if (entryList.size() == 0)
216-
{
217-
return null;
218-
}
219-
Map.Entry<String, V> result = entryList.get(0);
220-
entryList.remove(0);
221-
offset = begin - 1;
222-
return result;
223-
}
224-
}
147+
/**
 * Creates the value for one line of the text dictionary.
 *
 * @param params the line split on whitespace; {@code load} uses
 *               {@code params[0]} as the key
 * @return the value to store under that key
 */
protected abstract V createValue(String[] params);
225148
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* <author>Hankcs</author>
3+
* <email>[email protected]</email>
4+
* <create-date>2017-11-14 下午8:32</create-date>
5+
*
6+
* <copyright file="EnumItemDictionary.java" company="码农场">
7+
* Copyright (c) 2017, 码农场. All Right Reserved, http://www.hankcs.com/
8+
* This source is subject to Hankcs. Please contact Hankcs to get more information.
9+
* </copyright>
10+
*/
11+
package com.hankcs.hanlp.dictionary.common;
12+
13+
import com.hankcs.hanlp.corpus.dictionary.item.EnumItem;
14+
import com.hankcs.hanlp.corpus.io.ByteArray;
15+
16+
import java.io.DataOutputStream;
17+
import java.io.IOException;
18+
import java.util.Map;
19+
20+
/**
21+
* @author hankcs
22+
*/
23+
public abstract class EnumItemDictionary<E extends Enum<E>> extends CommonDictionary<EnumItem<E>>
24+
{
25+
@Override
26+
protected EnumItem<E> createValue(String[] params)
27+
{
28+
Map.Entry<String, Map.Entry<String, Integer>[]> args = EnumItem.create(params);
29+
EnumItem<E> nrEnumItem = new EnumItem<E>();
30+
for (Map.Entry<String, Integer> e : args.getValue())
31+
{
32+
nrEnumItem.labelMap.put(valueOf(e.getKey()), e.getValue());
33+
}
34+
return nrEnumItem;
35+
}
36+
37+
protected abstract E valueOf(String name);
38+
39+
protected abstract E[] values();
40+
41+
protected abstract EnumItem<E> newItem();
42+
43+
@Override
44+
final protected EnumItem<E>[] loadValueArray(ByteArray byteArray)
45+
{
46+
if (byteArray == null)
47+
{
48+
return null;
49+
}
50+
E[] nrArray = values();
51+
int size = byteArray.nextInt();
52+
EnumItem<E>[] valueArray = new EnumItem[size];
53+
for (int i = 0; i < size; ++i)
54+
{
55+
int currentSize = byteArray.nextInt();
56+
EnumItem<E> item = newItem();
57+
for (int j = 0; j < currentSize; ++j)
58+
{
59+
E nr = nrArray[byteArray.nextInt()];
60+
int frequency = byteArray.nextInt();
61+
item.labelMap.put(nr, frequency);
62+
}
63+
valueArray[i] = item;
64+
}
65+
return valueArray;
66+
}
67+
68+
@Override
69+
protected void saveValue(EnumItem<E> item, DataOutputStream out) throws IOException
70+
{
71+
out.writeInt(item.labelMap.size());
72+
for (Map.Entry<E, Integer> entry : item.labelMap.entrySet())
73+
{
74+
out.writeInt(entry.getKey().ordinal());
75+
out.writeInt(entry.getValue());
76+
}
77+
}
78+
}

0 commit comments

Comments
 (0)