Skip to content

Commit 1dc99f9

Browse files
committed
update to Elasticsearch 2.0, more examples, settings documented
1 parent b9ca07a commit 1dc99f9

File tree

6 files changed

+198
-93
lines changed

6 files changed

+198
-93
lines changed

README.md

Lines changed: 165 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ zh-tw
8181

8282
| Elasticsearch | Plugin | Release date |
8383
| -------------- | -------------- | ------------ |
84+
| 2.0.0 | 2.0.0.0 | Nov 12, 2015 |
8485
| 2.0.0-beta2 | 2.0.0-beta2.0 | Sep 19, 2015 |
8586
| 1.6.0 | 1.6.0.0 | Jul 1, 2015 |
8687
| 1.4.0 | 1.4.4.2 | Apr 3, 2015 |
@@ -98,7 +99,7 @@ zh-tw
9899

99100
## Installation Elasticsearch 2.x
100101

101-
./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-langdetect/2.0.0-beta2.0/elasticsearch-langdetect-2.0.0-beta2.0-plugin.zip
102+
./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-langdetect/2.0.0.0/elasticsearch-langdetect-2.0.0.0-plugin.zip
102103

103104
Do not forget to restart the node after installing.
104105

@@ -112,75 +113,155 @@ All feedback is welcome! If you find issues, please post them at [Github](https:
112113

113114
# Examples
114115

115-
## Language detection mapping example
116+
## A simple language detection example
116117

117-
curl -XDELETE 'localhost:9200/test'
118+
In this example, we create a simple detector field, and write text to it for detection.
118119

119-
curl -XPUT 'localhost:9200/test'
120+
curl -XDELETE 'localhost:9200/test'
120121

121-
curl -XPOST 'localhost:9200/test/article/_mapping' -d '
122-
{
123-
"article" : {
124-
"properties" : {
125-
"content" : { "type" : "langdetect" }
126-
}
127-
}
128-
}
129-
'
122+
curl -XPUT 'localhost:9200/test'
130123

131-
curl -XPUT 'localhost:9200/test/article/1' -d '
132-
{
133-
"title" : "Some title",
134-
"content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?"
124+
curl -XPOST 'localhost:9200/test/article/_mapping' -d '
125+
{
126+
"article" : {
127+
"properties" : {
128+
"content" : { "type" : "langdetect" }
135129
}
136-
'
130+
}
131+
}
132+
'
137133

138-
curl -XPUT 'localhost:9200/test/article/2' -d '
139-
{
140-
"title" : "Ein Titel",
141-
"content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!"
142-
}
143-
'
134+
curl -XPUT 'localhost:9200/test/article/1' -d '
135+
{
136+
"title" : "Some title",
137+
"content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?"
138+
}
139+
'
144140

145-
curl -XPUT 'localhost:9200/test/article/3' -d '
146-
{
147-
"title" : "Un titre",
148-
"content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!"
149-
}
150-
'
141+
curl -XPUT 'localhost:9200/test/article/2' -d '
142+
{
143+
"title" : "Ein Titel",
144+
"content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!"
145+
}
146+
'
151147

152-
curl -XGET 'localhost:9200/test/_refresh'
148+
curl -XPUT 'localhost:9200/test/article/3' -d '
149+
{
150+
"title" : "Un titre",
151+
"content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!"
152+
}
153+
'
153154

154-
curl -XPOST 'localhost:9200/test/_search' -d '
155-
{
156-
"query" : {
157-
"term" : {
158-
"content" : "en"
159-
}
155+
A search for the detected language codes is a simple term query, like this:
156+
157+
curl -XGET 'localhost:9200/test/_refresh'
158+
159+
curl -XPOST 'localhost:9200/test/_search' -d '
160+
{
161+
"query" : {
162+
"term" : {
163+
"content" : "en"
160164
}
161-
}
162-
'
163-
curl -XPOST 'localhost:9200/test/_search' -d '
164-
{
165-
"query" : {
166-
"term" : {
167-
"content" : "de"
168-
}
165+
}
166+
}
167+
'
168+
curl -XPOST 'localhost:9200/test/_search' -d '
169+
{
170+
"query" : {
171+
"term" : {
172+
"content" : "de"
169173
}
170-
}
171-
'
174+
}
175+
}
176+
'
172177

173-
curl -XPOST 'localhost:9200/test/_search' -d '
174-
{
175-
"query" : {
176-
"term" : {
177-
"content" : "fr"
178+
curl -XPOST 'localhost:9200/test/_search' -d '
179+
{
180+
"query" : {
181+
"term" : {
182+
"content" : "fr"
183+
}
184+
}
185+
}
186+
'
187+
188+
## Show stored language codes
189+
190+
Using multifields, it is possible to store the text alongside the detected language(s).
191+
Here, we use another (short nonsense) example text for demonstration,
192+
which has more than one detected language code.
193+
194+
curl -XDELETE 'localhost:9200/test'
195+
196+
curl -XPUT 'localhost:9200/test'
197+
198+
curl -XPOST 'localhost:9200/test/article/_mapping' -d '
199+
{
200+
"article" : {
201+
"properties" : {
202+
"content" : {
203+
"type" : "multi_field",
204+
"fields" : {
205+
"content" : {
206+
"type" : "string"
207+
},
208+
"language" : {
209+
"type": "langdetect",
210+
"store" : true
211+
}
178212
}
179213
}
180214
}
181-
'
215+
}
216+
}
217+
'
218+
219+
curl -XPUT 'localhost:9200/test/article/1' -d '
220+
{
221+
"content" : "watt datt"
222+
}
223+
'
182224

183-
## Language detection with attachment mapper plugin example
225+
curl -XGET 'localhost:9200/test/_refresh'
226+
227+
curl -XPOST 'localhost:9200/test/_search?pretty' -d '
228+
{
229+
"fields" : "content.language",
230+
"query" : {
231+
"match" : {
232+
"content" : "watt datt"
233+
}
234+
}
235+
}
236+
'
237+
238+
The result is
239+
240+
{
241+
"took" : 2,
242+
"timed_out" : false,
243+
"_shards" : {
244+
"total" : 5,
245+
"successful" : 5,
246+
"failed" : 0
247+
},
248+
"hits" : {
249+
"total" : 1,
250+
"max_score" : 0.51623213,
251+
"hits" : [ {
252+
"_index" : "test",
253+
"_type" : "article",
254+
"_id" : "1",
255+
"_score" : 0.51623213,
256+
"fields" : {
257+
"content.language" : [ "sv", "it", "nl" ]
258+
}
259+
} ]
260+
}
261+
}
262+
263+
264+
## Language detection with attachment mapper plugin
184265

185266
curl -XDELETE 'localhost:9200/test'
186267

@@ -289,6 +370,36 @@ All feedback is welcome! If you find issues, please post them at [Github](https:
289370
} ]
290371
}
291372

373+
374+
# Settings
375+
376+
These settings can be used in `elasticsearch.yml` to modify language detection.
377+
378+
Use with caution. You don't need to modify settings. This list is just for the sake of completeness.
379+
For successful modification of the model parameters, you should study the source code
380+
and be familiar with probabilistic matching using naive Bayes with character n-grams.
381+
See also Ted Dunning,
382+
[Statistical Identification of Language](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.1958), 1994.
383+
384+
`langdetect.languages` - a comma-separated list of language codes used to restrict the detection
385+
386+
`langdetect.map.<code>` - a substitution code for a language code
387+
388+
`langdetect.number_of_trials` - number of trials, affects CPU usage, default: 7
389+
390+
`langdetect.alpha` - additional smoothing parameter, default: 0.5
391+
392+
`langdetect.alpha_width` - the width of smoothing, default: 0.05
393+
394+
`langdetect.iteration_limit` - safeguard to break loop, default: 10000
395+
396+
`langdetect.prob_threshold` - default: 0.1
397+
398+
`langdetect.conv_threshold` - detection is terminated when normalized probability exceeds
399+
this threshold, default: 0.99999
400+
401+
`langdetect.base_freq` - default: 10000
402+
292403
# Credits
293404

294405
Thanks to Alexander Reelsen for his OpenNLP plugin, from where I have copied and

pom.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
<groupId>org.xbib.elasticsearch.plugin</groupId>
99
<artifactId>elasticsearch-langdetect</artifactId>
10-
<version>2.0.0-beta2.0</version>
10+
<version>2.0.0.0</version>
1111

1212
<packaging>jar</packaging>
1313

@@ -69,7 +69,7 @@
6969
<github.global.server>github</github.global.server>
7070
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
7171
<java.compiler.version>1.7</java.compiler.version>
72-
<elasticsearch.version>2.0.0-beta2</elasticsearch.version>
72+
<elasticsearch.version>2.0.0</elasticsearch.version>
7373
<jackson.version>2.5.3</jackson.version>
7474
</properties>
7575

@@ -110,14 +110,14 @@
110110
<dependency>
111111
<groupId>org.apache.logging.log4j</groupId>
112112
<artifactId>log4j-slf4j-impl</artifactId>
113-
<version>2.2</version>
113+
<version>2.4.1</version>
114114
<scope>test</scope>
115115
</dependency>
116116

117117
<dependency>
118118
<groupId>org.apache.logging.log4j</groupId>
119119
<artifactId>log4j-core</artifactId>
120-
<version>2.2</version>
120+
<version>2.4.1</version>
121121
<scope>test</scope>
122122
</dependency>
123123

src/main/java/org/xbib/elasticsearch/module/langdetect/LangdetectService.java

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ public Settings getSettings() {
139139
private void load(Settings settings) {
140140
try {
141141
String[] keys = DEFAULT_LANGUAGES;
142-
if (settings.get("languages") != null) {
143-
keys = settings.get("languages").split(",");
142+
if (settings.get("langdetect.languages") != null) {
143+
keys = settings.get("langdetect.languages").split(",");
144144
}
145145
int index = 0;
146146
int size = keys.length;
@@ -157,13 +157,13 @@ private void load(Settings settings) {
157157
try {
158158
// map by settings
159159
Settings map = Settings.EMPTY;
160-
if (settings.getByPrefix("map.") != null) {
161-
map = Settings.settingsBuilder().put(settings.getByPrefix("map.")).build();
160+
if (settings.getByPrefix("langdetect.map.") != null) {
161+
map = Settings.settingsBuilder().put(settings.getByPrefix("langdetect.map.")).build();
162162
}
163163
if (map.getAsMap().isEmpty()) {
164164
// is in "map" a resource name?
165-
String s = settings.get("map") != null ?
166-
settings.get("map") : this.profile + "language.json";
165+
String s = settings.get("langdetect.map") != null ?
166+
settings.get("langdetect.map") : this.profile + "language.json";
167167
InputStream in = getClass().getResourceAsStream(s);
168168
if (in != null) {
169169
map = Settings.settingsBuilder().loadFromStream(s, in).build();
@@ -178,15 +178,15 @@ private void load(Settings settings) {
178178

179179
private void init() {
180180
this.priorMap = null;
181-
this.n_trial = settings.getAsInt("number_of_trials", 7);
182-
this.alpha = settings.getAsDouble("alpha", 0.5);
183-
this.alpha_width = settings.getAsDouble("alpha_width", 0.05);
184-
this.iteration_limit = settings.getAsInt("iteration_limit", 10000);
185-
this.prob_threshold = settings.getAsDouble("prob_threshold", 0.1);
186-
this.conv_threshold = settings.getAsDouble("conv_threshold", 0.99999);
187-
this.base_freq = settings.getAsInt("base_freq", 10000);
188-
this.filterPattern = settings.get("pattern") != null ?
189-
Pattern.compile(settings.get("pattern"),Pattern.UNICODE_CHARACTER_CLASS) : null;
181+
this.n_trial = settings.getAsInt("langdetect.number_of_trials", 7);
182+
this.alpha = settings.getAsDouble("langdetect.alpha", 0.5);
183+
this.alpha_width = settings.getAsDouble("langdetect.alpha_width", 0.05);
184+
this.iteration_limit = settings.getAsInt("langdetect.iteration_limit", 10000);
185+
this.prob_threshold = settings.getAsDouble("langdetect.prob_threshold", 0.1);
186+
this.conv_threshold = settings.getAsDouble("langdetect.conv_threshold", 0.99999);
187+
this.base_freq = settings.getAsInt("langdetect.base_freq", 10000);
188+
this.filterPattern = settings.get("langdetect.pattern") != null ?
189+
Pattern.compile(settings.get("langdetect.pattern"),Pattern.UNICODE_CHARACTER_CLASS) : null;
190190
isStarted = true;
191191
}
192192

@@ -240,7 +240,7 @@ public List<Language> detectAll(String text) throws LanguageDetectionException {
240240
}
241241
List<String> list = new ArrayList<>();
242242
languages = sortProbability(languages, detectBlock(list, text));
243-
return languages.subList(0, Math.min(languages.size(), settings.getAsInt("max", languages.size())));
243+
return languages.subList(0, Math.min(languages.size(), settings.getAsInt("langdetect.max", languages.size())));
244244
}
245245

246246
private double[] detectBlock(List<String> list, String text) throws LanguageDetectionException {
Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
classname=org.xbib.elasticsearch.plugin.langdetect.LangdetectPlugin
2-
name=langdetect
3-
version=${project.version}
4-
elasticsearch.version=${elasticsearch.version}
52
jvm=true
6-
java.version=1.7
73
site=false
84
isolated=true
5+
name=langdetect
96
description=Language detection plugin
10-
hash=${buildNumber}
11-
timestamp=${timestamp}
7+
version=${project.version}
8+
elasticsearch.version=${elasticsearch.version}
9+
java.version=${java.compiler.version}

src/test/java/org/xbib/elasticsearch/index/mapper/langdetect/DetectorTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ public class DetectorTests extends Assert {
2323
public static void setUp() throws Exception {
2424

2525
Settings settings = Settings.settingsBuilder()
26-
.put("languages", "")
26+
.put("langdetect.languages", "")
2727
.build();
2828
detect = new LangdetectService(settings);
2929
detect.start();

0 commit comments

Comments
 (0)