Skip to content

Commit 576f45f

Browse files
committed
Add normalisation for Romanian and Vietnamese from original library
1 parent 437fa12 commit 576f45f

File tree

4 files changed

+220
-32
lines changed

4 files changed

+220
-32
lines changed

src/main/java/org/xbib/elasticsearch/common/langdetect/LangdetectService.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ public String getProfile() {
214214
}
215215

216216
public List<Language> detectAll(String text) throws LanguageDetectionException {
217+
text = NGram.normalizeVietnamese(text);
217218
if (!isStarted) {
218219
load(settings);
219220
init();

src/main/java/org/xbib/elasticsearch/common/langdetect/NGram.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import java.lang.Character.UnicodeBlock;
44
import java.util.HashMap;
55
import java.util.Map;
6+
import java.util.regex.Matcher;
7+
import java.util.regex.Pattern;
68

79
public class NGram {
810

@@ -72,9 +74,19 @@ public static char normalize(char ch) {
7274
if (LATIN1_EXCLUDED.indexOf(ch) >= 0) {
7375
ch = ' ';
7476
}
77+
} else if (block == UnicodeBlock.LATIN_EXTENDED_B) {
78+
// Normalization for Romanian
79+
if (ch == '\u0219') {
80+
// Small S with comma below => with cedilla
81+
ch = '\u015f';
82+
} else if (ch == '\u021b') {
83+
// Small T with comma below => with cedilla
84+
ch = '\u0163';
85+
}
7586
} else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
7687
ch = ' ';
7788
} else if (block == UnicodeBlock.ARABIC) {
89+
// Farsi yeh => Arabic yeh
7890
if (ch == '\u06cc') {
7991
ch = '\u064a';
8092
}
@@ -98,6 +110,37 @@ public static char normalize(char ch) {
98110
return ch;
99111
}
100112

113+
private static final String[] VI_NORMALIZED_CHARS = {
114+
"\u00C0\u00C8\u00CC\u00D2\u00D9\u1EF2\u00E0\u00E8\u00EC\u00F2\u00F9\u1EF3\u1EA6\u1EC0\u1ED2\u1EA7\u1EC1\u1ED3\u1EB0\u1EB1\u1EDC\u1EDD\u1EEA\u1EEB",
115+
"\u00C1\u00C9\u00CD\u00D3\u00DA\u00DD\u00E1\u00E9\u00ED\u00F3\u00FA\u00FD\u1EA4\u1EBE\u1ED0\u1EA5\u1EBF\u1ED1\u1EAE\u1EAF\u1EDA\u1EDB\u1EE8\u1EE9",
116+
"\u00C3\u1EBC\u0128\u00D5\u0168\u1EF8\u00E3\u1EBD\u0129\u00F5\u0169\u1EF9\u1EAA\u1EC4\u1ED6\u1EAB\u1EC5\u1ED7\u1EB4\u1EB5\u1EE0\u1EE1\u1EEE\u1EEF",
117+
"\u1EA2\u1EBA\u1EC8\u1ECE\u1EE6\u1EF6\u1EA3\u1EBB\u1EC9\u1ECF\u1EE7\u1EF7\u1EA8\u1EC2\u1ED4\u1EA9\u1EC3\u1ED5\u1EB2\u1EB3\u1EDE\u1EDF\u1EEC\u1EED",
118+
"\u1EA0\u1EB8\u1ECA\u1ECC\u1EE4\u1EF4\u1EA1\u1EB9\u1ECB\u1ECD\u1EE5\u1EF5\u1EAC\u1EC6\u1ED8\u1EAD\u1EC7\u1ED9\u1EB6\u1EB7\u1EE2\u1EE3\u1EF0\u1EF1"
119+
};
120+
private static final String VI_CHARS = "AEIOUYaeiouy\u00c2\u00ca\u00d4\u00e2\u00ea\u00f4\u0102\u0103\u01a0\u01a1\u01af\u01b0";
121+
private static final String VI_DIACRITICS = "\u0300\u0301\u0303\u0309\u0323";
122+
private static final Pattern VI_CHARS_WITH_DIACRITIC_PATTERN = Pattern.compile("([" + VI_CHARS + "])([" + VI_DIACRITICS + "])");
123+
124+
/**
125+
* Normalize Vietnamese letter + diacritical mark (U+03xx) to a single character (U+1Exx).
126+
*/
127+
public static String normalizeVietnamese(String text) {
128+
Matcher matcher = VI_CHARS_WITH_DIACRITIC_PATTERN.matcher(text);
129+
StringBuffer buf = new StringBuffer();
130+
while (matcher.find()) {
131+
int charIndex = VI_CHARS.indexOf(matcher.group(1));
132+
matcher.appendReplacement(
133+
buf,
134+
VI_NORMALIZED_CHARS[VI_DIACRITICS.indexOf(matcher.group(2))].substring(charIndex, charIndex + 1)
135+
);
136+
}
137+
if (buf.length() == 0) {
138+
return text;
139+
}
140+
matcher.appendTail(buf);
141+
return buf.toString();
142+
}
143+
101144
static final String[] CJK_CLASS = {
102145
"\u4F7C\u6934",
103146
"\u88CF\u95B2",

src/test/java/org/xbib/elasticsearch/index/mapper/langdetect/NGramTest.java

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,148 @@ public final void testNGram() {
119119
assertEquals(ngram.get(3), null);
120120

121121
}
122+
123+
/**
124+
* Test method for {@link NGram#normalize(char)} with Romanian characters
125+
*/
126+
@Test
127+
public final void testNormalizeForRomanian() {
128+
assertEquals(NGram.normalize('\u015f'), '\u015f');
129+
assertEquals(NGram.normalize('\u0163'), '\u0163');
130+
assertEquals(NGram.normalize('\u0219'), '\u015f');
131+
assertEquals(NGram.normalize('\u021b'), '\u0163');
132+
}
133+
134+
@Test
135+
public final void testNormalizeVietnamese() {
136+
assertEquals(NGram.normalizeVietnamese(""), "");
137+
assertEquals(NGram.normalizeVietnamese("ABC"), "ABC");
138+
assertEquals(NGram.normalizeVietnamese("012"), "012");
139+
assertEquals(NGram.normalizeVietnamese("\u00c0"), "\u00c0");
140+
141+
assertEquals(NGram.normalizeVietnamese("\u0041\u0300"), "\u00C0");
142+
assertEquals(NGram.normalizeVietnamese("\u0045\u0300"), "\u00C8");
143+
assertEquals(NGram.normalizeVietnamese("\u0049\u0300"), "\u00CC");
144+
assertEquals(NGram.normalizeVietnamese("\u004F\u0300"), "\u00D2");
145+
assertEquals(NGram.normalizeVietnamese("\u0055\u0300"), "\u00D9");
146+
assertEquals(NGram.normalizeVietnamese("\u0059\u0300"), "\u1EF2");
147+
assertEquals(NGram.normalizeVietnamese("\u0061\u0300"), "\u00E0");
148+
assertEquals(NGram.normalizeVietnamese("\u0065\u0300"), "\u00E8");
149+
assertEquals(NGram.normalizeVietnamese("\u0069\u0300"), "\u00EC");
150+
assertEquals(NGram.normalizeVietnamese("\u006F\u0300"), "\u00F2");
151+
assertEquals(NGram.normalizeVietnamese("\u0075\u0300"), "\u00F9");
152+
assertEquals(NGram.normalizeVietnamese("\u0079\u0300"), "\u1EF3");
153+
assertEquals(NGram.normalizeVietnamese("\u00C2\u0300"), "\u1EA6");
154+
assertEquals(NGram.normalizeVietnamese("\u00CA\u0300"), "\u1EC0");
155+
assertEquals(NGram.normalizeVietnamese("\u00D4\u0300"), "\u1ED2");
156+
assertEquals(NGram.normalizeVietnamese("\u00E2\u0300"), "\u1EA7");
157+
assertEquals(NGram.normalizeVietnamese("\u00EA\u0300"), "\u1EC1");
158+
assertEquals(NGram.normalizeVietnamese("\u00F4\u0300"), "\u1ED3");
159+
assertEquals(NGram.normalizeVietnamese("\u0102\u0300"), "\u1EB0");
160+
assertEquals(NGram.normalizeVietnamese("\u0103\u0300"), "\u1EB1");
161+
assertEquals(NGram.normalizeVietnamese("\u01A0\u0300"), "\u1EDC");
162+
assertEquals(NGram.normalizeVietnamese("\u01A1\u0300"), "\u1EDD");
163+
assertEquals(NGram.normalizeVietnamese("\u01AF\u0300"), "\u1EEA");
164+
assertEquals(NGram.normalizeVietnamese("\u01B0\u0300"), "\u1EEB");
165+
166+
assertEquals(NGram.normalizeVietnamese("\u0041\u0301"), "\u00C1");
167+
assertEquals(NGram.normalizeVietnamese("\u0045\u0301"), "\u00C9");
168+
assertEquals(NGram.normalizeVietnamese("\u0049\u0301"), "\u00CD");
169+
assertEquals(NGram.normalizeVietnamese("\u004F\u0301"), "\u00D3");
170+
assertEquals(NGram.normalizeVietnamese("\u0055\u0301"), "\u00DA");
171+
assertEquals(NGram.normalizeVietnamese("\u0059\u0301"), "\u00DD");
172+
assertEquals(NGram.normalizeVietnamese("\u0061\u0301"), "\u00E1");
173+
assertEquals(NGram.normalizeVietnamese("\u0065\u0301"), "\u00E9");
174+
assertEquals(NGram.normalizeVietnamese("\u0069\u0301"), "\u00ED");
175+
assertEquals(NGram.normalizeVietnamese("\u006F\u0301"), "\u00F3");
176+
assertEquals(NGram.normalizeVietnamese("\u0075\u0301"), "\u00FA");
177+
assertEquals(NGram.normalizeVietnamese("\u0079\u0301"), "\u00FD");
178+
assertEquals(NGram.normalizeVietnamese("\u00C2\u0301"), "\u1EA4");
179+
assertEquals(NGram.normalizeVietnamese("\u00CA\u0301"), "\u1EBE");
180+
assertEquals(NGram.normalizeVietnamese("\u00D4\u0301"), "\u1ED0");
181+
assertEquals(NGram.normalizeVietnamese("\u00E2\u0301"), "\u1EA5");
182+
assertEquals(NGram.normalizeVietnamese("\u00EA\u0301"), "\u1EBF");
183+
assertEquals(NGram.normalizeVietnamese("\u00F4\u0301"), "\u1ED1");
184+
assertEquals(NGram.normalizeVietnamese("\u0102\u0301"), "\u1EAE");
185+
assertEquals(NGram.normalizeVietnamese("\u0103\u0301"), "\u1EAF");
186+
assertEquals(NGram.normalizeVietnamese("\u01A0\u0301"), "\u1EDA");
187+
assertEquals(NGram.normalizeVietnamese("\u01A1\u0301"), "\u1EDB");
188+
assertEquals(NGram.normalizeVietnamese("\u01AF\u0301"), "\u1EE8");
189+
assertEquals(NGram.normalizeVietnamese("\u01B0\u0301"), "\u1EE9");
190+
191+
assertEquals(NGram.normalizeVietnamese("\u0041\u0303"), "\u00C3");
192+
assertEquals(NGram.normalizeVietnamese("\u0045\u0303"), "\u1EBC");
193+
assertEquals(NGram.normalizeVietnamese("\u0049\u0303"), "\u0128");
194+
assertEquals(NGram.normalizeVietnamese("\u004F\u0303"), "\u00D5");
195+
assertEquals(NGram.normalizeVietnamese("\u0055\u0303"), "\u0168");
196+
assertEquals(NGram.normalizeVietnamese("\u0059\u0303"), "\u1EF8");
197+
assertEquals(NGram.normalizeVietnamese("\u0061\u0303"), "\u00E3");
198+
assertEquals(NGram.normalizeVietnamese("\u0065\u0303"), "\u1EBD");
199+
assertEquals(NGram.normalizeVietnamese("\u0069\u0303"), "\u0129");
200+
assertEquals(NGram.normalizeVietnamese("\u006F\u0303"), "\u00F5");
201+
assertEquals(NGram.normalizeVietnamese("\u0075\u0303"), "\u0169");
202+
assertEquals(NGram.normalizeVietnamese("\u0079\u0303"), "\u1EF9");
203+
assertEquals(NGram.normalizeVietnamese("\u00C2\u0303"), "\u1EAA");
204+
assertEquals(NGram.normalizeVietnamese("\u00CA\u0303"), "\u1EC4");
205+
assertEquals(NGram.normalizeVietnamese("\u00D4\u0303"), "\u1ED6");
206+
assertEquals(NGram.normalizeVietnamese("\u00E2\u0303"), "\u1EAB");
207+
assertEquals(NGram.normalizeVietnamese("\u00EA\u0303"), "\u1EC5");
208+
assertEquals(NGram.normalizeVietnamese("\u00F4\u0303"), "\u1ED7");
209+
assertEquals(NGram.normalizeVietnamese("\u0102\u0303"), "\u1EB4");
210+
assertEquals(NGram.normalizeVietnamese("\u0103\u0303"), "\u1EB5");
211+
assertEquals(NGram.normalizeVietnamese("\u01A0\u0303"), "\u1EE0");
212+
assertEquals(NGram.normalizeVietnamese("\u01A1\u0303"), "\u1EE1");
213+
assertEquals(NGram.normalizeVietnamese("\u01AF\u0303"), "\u1EEE");
214+
assertEquals(NGram.normalizeVietnamese("\u01B0\u0303"), "\u1EEF");
215+
216+
assertEquals(NGram.normalizeVietnamese("\u0041\u0309"), "\u1EA2");
217+
assertEquals(NGram.normalizeVietnamese("\u0045\u0309"), "\u1EBA");
218+
assertEquals(NGram.normalizeVietnamese("\u0049\u0309"), "\u1EC8");
219+
assertEquals(NGram.normalizeVietnamese("\u004F\u0309"), "\u1ECE");
220+
assertEquals(NGram.normalizeVietnamese("\u0055\u0309"), "\u1EE6");
221+
assertEquals(NGram.normalizeVietnamese("\u0059\u0309"), "\u1EF6");
222+
assertEquals(NGram.normalizeVietnamese("\u0061\u0309"), "\u1EA3");
223+
assertEquals(NGram.normalizeVietnamese("\u0065\u0309"), "\u1EBB");
224+
assertEquals(NGram.normalizeVietnamese("\u0069\u0309"), "\u1EC9");
225+
assertEquals(NGram.normalizeVietnamese("\u006F\u0309"), "\u1ECF");
226+
assertEquals(NGram.normalizeVietnamese("\u0075\u0309"), "\u1EE7");
227+
assertEquals(NGram.normalizeVietnamese("\u0079\u0309"), "\u1EF7");
228+
assertEquals(NGram.normalizeVietnamese("\u00C2\u0309"), "\u1EA8");
229+
assertEquals(NGram.normalizeVietnamese("\u00CA\u0309"), "\u1EC2");
230+
assertEquals(NGram.normalizeVietnamese("\u00D4\u0309"), "\u1ED4");
231+
assertEquals(NGram.normalizeVietnamese("\u00E2\u0309"), "\u1EA9");
232+
assertEquals(NGram.normalizeVietnamese("\u00EA\u0309"), "\u1EC3");
233+
assertEquals(NGram.normalizeVietnamese("\u00F4\u0309"), "\u1ED5");
234+
assertEquals(NGram.normalizeVietnamese("\u0102\u0309"), "\u1EB2");
235+
assertEquals(NGram.normalizeVietnamese("\u0103\u0309"), "\u1EB3");
236+
assertEquals(NGram.normalizeVietnamese("\u01A0\u0309"), "\u1EDE");
237+
assertEquals(NGram.normalizeVietnamese("\u01A1\u0309"), "\u1EDF");
238+
assertEquals(NGram.normalizeVietnamese("\u01AF\u0309"), "\u1EEC");
239+
assertEquals(NGram.normalizeVietnamese("\u01B0\u0309"), "\u1EED");
240+
241+
assertEquals(NGram.normalizeVietnamese("\u0041\u0323"), "\u1EA0");
242+
assertEquals(NGram.normalizeVietnamese("\u0045\u0323"), "\u1EB8");
243+
assertEquals(NGram.normalizeVietnamese("\u0049\u0323"), "\u1ECA");
244+
assertEquals(NGram.normalizeVietnamese("\u004F\u0323"), "\u1ECC");
245+
assertEquals(NGram.normalizeVietnamese("\u0055\u0323"), "\u1EE4");
246+
assertEquals(NGram.normalizeVietnamese("\u0059\u0323"), "\u1EF4");
247+
assertEquals(NGram.normalizeVietnamese("\u0061\u0323"), "\u1EA1");
248+
assertEquals(NGram.normalizeVietnamese("\u0065\u0323"), "\u1EB9");
249+
assertEquals(NGram.normalizeVietnamese("\u0069\u0323"), "\u1ECB");
250+
assertEquals(NGram.normalizeVietnamese("\u006F\u0323"), "\u1ECD");
251+
assertEquals(NGram.normalizeVietnamese("\u0075\u0323"), "\u1EE5");
252+
assertEquals(NGram.normalizeVietnamese("\u0079\u0323"), "\u1EF5");
253+
assertEquals(NGram.normalizeVietnamese("\u00C2\u0323"), "\u1EAC");
254+
assertEquals(NGram.normalizeVietnamese("\u00CA\u0323"), "\u1EC6");
255+
assertEquals(NGram.normalizeVietnamese("\u00D4\u0323"), "\u1ED8");
256+
assertEquals(NGram.normalizeVietnamese("\u00E2\u0323"), "\u1EAD");
257+
assertEquals(NGram.normalizeVietnamese("\u00EA\u0323"), "\u1EC7");
258+
assertEquals(NGram.normalizeVietnamese("\u00F4\u0323"), "\u1ED9");
259+
assertEquals(NGram.normalizeVietnamese("\u0102\u0323"), "\u1EB6");
260+
assertEquals(NGram.normalizeVietnamese("\u0103\u0323"), "\u1EB7");
261+
assertEquals(NGram.normalizeVietnamese("\u01A0\u0323"), "\u1EE2");
262+
assertEquals(NGram.normalizeVietnamese("\u01A1\u0323"), "\u1EE3");
263+
assertEquals(NGram.normalizeVietnamese("\u01AF\u0323"), "\u1EF0");
264+
assertEquals(NGram.normalizeVietnamese("\u01B0\u0323"), "\u1EF1");
265+
}
122266
}

0 commit comments

Comments
 (0)