update to Elasticsearch 2.0, more examples, settings documented

jprante · jprante · commit 1dc99f9a5bd8 · 2015-11-12T23:05:58.000+01:00
diff --git a/README.md b/README.md
@@ -81,6 +81,7 @@ zh-tw
 
 | Elasticsearch  | Plugin         | Release date |
 | -------------- | -------------- | ------------ |
+| 2.0.0          | 2.0.0.0        | Nov 12, 2015 |
 | 2.0.0-beta2    | 2.0.0-beta2.0  | Sep 19, 2015 |
 | 1.6.0          | 1.6.0.0        | Jul  1, 2015 |
 | 1.4.0          | 1.4.4.2        | Apr  3, 2015 |
@@ -98,7 +99,7 @@ zh-tw
 
 ## Installation Elasticsearch 2.x
 
-    ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-langdetect/2.0.0-beta2.0/elasticsearch-langdetect-2.0.0-beta2.0-plugin.zip
+    ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-langdetect/2.0.0.0/elasticsearch-langdetect-2.0.0.0-plugin.zip
 
 Do not forget to restart the node after installing.
 
@@ -112,75 +113,155 @@ All feedback is welcome! If you find issues, please post them at [Github](https:
 
 # Examples
 
-## Language detection mapping example
+## A simple language detection example
 
-        curl -XDELETE 'localhost:9200/test'
+In this example, we create a simple detector field, and write text to it for detection.
 
-        curl -XPUT 'localhost:9200/test'
+    curl -XDELETE 'localhost:9200/test'
 
-        curl -XPOST 'localhost:9200/test/article/_mapping' -d '
-        {
-          "article" : {
-            "properties" : {
-               "content" : { "type" : "langdetect" }
-            }
-          }
-        }
-        '
+    curl -XPUT 'localhost:9200/test'
 
-        curl -XPUT 'localhost:9200/test/article/1' -d '
-        {
-          "title" : "Some title",
-          "content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?"
+    curl -XPOST 'localhost:9200/test/article/_mapping' -d '
+    {
+      "article" : {
+        "properties" : {
+           "content" : { "type" : "langdetect" }
         }
-        '
+      }
+    }
+    '
 
-        curl -XPUT 'localhost:9200/test/article/2' -d '
-        {
-          "title" : "Ein Titel",
-          "content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!"
-        }
-        '
+    curl -XPUT 'localhost:9200/test/article/1' -d '
+    {
+      "title" : "Some title",
+      "content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?"
+    }
+    '
 
-        curl -XPUT 'localhost:9200/test/article/3' -d '
-        {
-          "title" : "Un titre",
-          "content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!"
-        }
-        '
+    curl -XPUT 'localhost:9200/test/article/2' -d '
+    {
+      "title" : "Ein Titel",
+      "content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!"
+    }
+    '
 
-        curl -XGET 'localhost:9200/test/_refresh'
+    curl -XPUT 'localhost:9200/test/article/3' -d '
+    {
+      "title" : "Un titre",
+      "content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!"
+    }
+    '
 
-        curl -XPOST 'localhost:9200/test/_search' -d '
-        {
-           "query" : {
-               "term" : {
-                    "content" : "en"
-               }
+A search for the detected language codes is a simple term query, like this:
+
+    curl -XGET 'localhost:9200/test/_refresh'
+
+    curl -XPOST 'localhost:9200/test/_search' -d '
+    {
+       "query" : {
+           "term" : {
+                "content" : "en"
            }
-        }
-        '
-        curl -XPOST 'localhost:9200/test/_search' -d '
-        {
-           "query" : {
-               "term" : {
-                    "content" : "de"
-               }
+       }
+    }
+    '
+    curl -XPOST 'localhost:9200/test/_search' -d '
+    {
+       "query" : {
+           "term" : {
+                "content" : "de"
            }
-        }
-        '
+       }
+    }
+    '
 
-        curl -XPOST 'localhost:9200/test/_search' -d '
-        {
-           "query" : {
-               "term" : {
-                    "content" : "fr"
+    curl -XPOST 'localhost:9200/test/_search' -d '
+    {
+       "query" : {
+           "term" : {
+                "content" : "fr"
+           }
+       }
+    }
+    '
+
+## Show stored language codes 
+ 
+Using multifields, it is possible to store the text alongside the detected language(s).
+Here, we use another (short nonsense) example text for demonstration,
+which has more than one detected language code.
+
+    curl -XDELETE 'localhost:9200/test'
+
+    curl -XPUT 'localhost:9200/test'
+
+    curl -XPOST 'localhost:9200/test/article/_mapping' -d '
+    {
+      "article" : {
+        "properties" : {
+           "content" : { 
+             "type" : "multi_field",
+               "fields" : {
+                   "content" : {
+                       "type" : "string"
+                   },
+                   "language" : {
+                         "type": "langdetect",
+                         "store" : true
+                   }
                }
            }
         }
-        '
+      }
+    }
+    '
+
+    curl -XPUT 'localhost:9200/test/article/1' -d '
+    {
+      "content" : "watt datt"
+    }
+    '
 
-## Language detection with attachment mapper plugin example
+    curl -XGET 'localhost:9200/test/_refresh'
+
+    curl -XPOST 'localhost:9200/test/_search?pretty' -d '
+    {
+       "fields" : "content.language",
+       "query" : {
+           "match" : {
+                "content" : "watt datt"
+           }
+       }
+    }
+    '
+
+The result is 
+
+    {
+     "took" : 2,
+     "timed_out" : false,
+     "_shards" : {
+       "total" : 5,
+       "successful" : 5,
+       "failed" : 0
+     },
+     "hits" : {
+       "total" : 1,
+       "max_score" : 0.51623213,
+       "hits" : [ {
+         "_index" : "test",
+         "_type" : "article",
+         "_id" : "1",
+         "_score" : 0.51623213,
+         "fields" : {
+           "content.language" : [ "sv", "it", "nl" ]
+         }
+       } ]
+     }
+    }
+
+
+## Language detection with attachment mapper plugin
 
 	curl -XDELETE 'localhost:9200/test'
 
@@ -289,6 +370,36 @@ All feedback is welcome! If you find issues, please post them at [Github](https:
       } ]
     }
 
+
+# Settings
+
+These settings can be used in `elasticsearch.yml` to modify language detection.
+
+Use with caution. You don't need to modify settings. This list is just for the sake of completeness.
+For successful modification of the model parameters, you should study the source code
+and be familiar with probabilistic matching using naive Bayes with character n-grams.
+See also Ted Dunning,
+[Statistical Identification of Language](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.1958), 1994.
+
+`langdetect.languages` - a comma-separated list of language codes used to restrict the detection
+
+`langdetect.map.<code>` - a substitution code for a language code
+
+`langdetect.number_of_trials` - number of trials, affects CPU usage, default: 7
+
+`langdetect.alpha` - additional smoothing parameter, default: 0.5
+
+`langdetect.alpha_width` - the width of smoothing, default: 0.05
+
+`langdetect.iteration_limit` - safeguard to break loop, default: 10000
+
+`langdetect.prob_threshold` - default: 0.1
+
+`langdetect.conv_threshold` - detection is terminated when normalized probability exceeds 
+this threshold, default: 0.99999
+
+`langdetect.base_freq` - default: 10000
+
 # Credits
 
 Thanks to Alexander Reelsen for his OpenNLP plugin, from where I have copied and 
diff --git a/pom.xml b/pom.xml
@@ -7,7 +7,7 @@
 
     <groupId>org.xbib.elasticsearch.plugin</groupId>
     <artifactId>elasticsearch-langdetect</artifactId>
-    <version>2.0.0-beta2.0</version>
+    <version>2.0.0.0</version>
 
     <packaging>jar</packaging>
 
@@ -69,7 +69,7 @@
         <github.global.server>github</github.global.server>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <java.compiler.version>1.7</java.compiler.version>
-        <elasticsearch.version>2.0.0-beta2</elasticsearch.version>
+        <elasticsearch.version>2.0.0</elasticsearch.version>
         <jackson.version>2.5.3</jackson.version>
     </properties>
 
@@ -110,14 +110,14 @@
         <dependency>
             <groupId>org.apache.logging.log4j</groupId>
             <artifactId>log4j-slf4j-impl</artifactId>
-            <version>2.2</version>
+            <version>2.4.1</version>
             <scope>test</scope>
         </dependency>
 
         <dependency>
             <groupId>org.apache.logging.log4j</groupId>
             <artifactId>log4j-core</artifactId>
-            <version>2.2</version>
+            <version>2.4.1</version>
             <scope>test</scope>
         </dependency>
 
diff --git a/src/main/java/org/xbib/elasticsearch/module/langdetect/LangdetectService.java b/src/main/java/org/xbib/elasticsearch/module/langdetect/LangdetectService.java
@@ -139,8 +139,8 @@ public Settings getSettings() {
     private void load(Settings settings) {
         try {
             String[] keys = DEFAULT_LANGUAGES;
-            if (settings.get("languages") != null) {
-                keys = settings.get("languages").split(",");
+            if (settings.get("langdetect.languages") != null) {
+                keys = settings.get("langdetect.languages").split(",");
             }
             int index = 0;
             int size = keys.length;
@@ -157,13 +157,13 @@ private void load(Settings settings) {
         try {
             // map by settings
             Settings map = Settings.EMPTY;
-            if (settings.getByPrefix("map.") != null) {
-                map = Settings.settingsBuilder().put(settings.getByPrefix("map.")).build();
+            if (settings.getByPrefix("langdetect.map.") != null) {
+                map = Settings.settingsBuilder().put(settings.getByPrefix("langdetect.map.")).build();
             }
             if (map.getAsMap().isEmpty()) {
                 // is in "map" a resource name?
-                String s = settings.get("map") != null ?
-                        settings.get("map") : this.profile + "language.json";
+                String s = settings.get("langdetect.map") != null ?
+                        settings.get("langdetect.map") : this.profile + "language.json";
                 InputStream in = getClass().getResourceAsStream(s);
                 if (in != null) {
                     map = Settings.settingsBuilder().loadFromStream(s, in).build();
@@ -178,15 +178,15 @@ private void load(Settings settings) {
 
     private void init() {
         this.priorMap = null;
-        this.n_trial = settings.getAsInt("number_of_trials", 7);
-        this.alpha = settings.getAsDouble("alpha", 0.5);
-        this.alpha_width = settings.getAsDouble("alpha_width", 0.05);
-        this.iteration_limit = settings.getAsInt("iteration_limit", 10000);
-        this.prob_threshold = settings.getAsDouble("prob_threshold", 0.1);
-        this.conv_threshold = settings.getAsDouble("conv_threshold",  0.99999);
-        this.base_freq = settings.getAsInt("base_freq", 10000);
-        this.filterPattern = settings.get("pattern") != null ?
-                Pattern.compile(settings.get("pattern"),Pattern.UNICODE_CHARACTER_CLASS) : null;
+        this.n_trial = settings.getAsInt("langdetect.number_of_trials", 7);
+        this.alpha = settings.getAsDouble("langdetect.alpha", 0.5);
+        this.alpha_width = settings.getAsDouble("langdetect.alpha_width", 0.05);
+        this.iteration_limit = settings.getAsInt("langdetect.iteration_limit", 10000);
+        this.prob_threshold = settings.getAsDouble("langdetect.prob_threshold", 0.1);
+        this.conv_threshold = settings.getAsDouble("langdetect.conv_threshold",  0.99999);
+        this.base_freq = settings.getAsInt("langdetect.base_freq", 10000);
+        this.filterPattern = settings.get("langdetect.pattern") != null ?
+                Pattern.compile(settings.get("langdetect.pattern"),Pattern.UNICODE_CHARACTER_CLASS) : null;
         isStarted = true;
     }
 
@@ -240,7 +240,7 @@ public List<Language> detectAll(String text) throws LanguageDetectionException {
         }
         List<String> list = new ArrayList<>();
         languages = sortProbability(languages, detectBlock(list, text));
-        return languages.subList(0, Math.min(languages.size(), settings.getAsInt("max", languages.size())));
+        return languages.subList(0, Math.min(languages.size(), settings.getAsInt("langdetect.max", languages.size())));
     }
 
     private double[] detectBlock(List<String> list, String text) throws LanguageDetectionException {
diff --git a/src/main/resources/plugin-descriptor.properties b/src/main/resources/plugin-descriptor.properties
@@ -1,11 +1,9 @@
 classname=org.xbib.elasticsearch.plugin.langdetect.LangdetectPlugin
-name=langdetect
-version=${project.version}
-elasticsearch.version=${elasticsearch.version}
 jvm=true
-java.version=1.7
 site=false
 isolated=true
+name=langdetect
 description=Language detection plugin
-hash=${buildNumber}
-timestamp=${timestamp}
+version=${project.version}
+elasticsearch.version=${elasticsearch.version}
+java.version=${java.compiler.version}
diff --git a/src/test/java/org/xbib/elasticsearch/index/mapper/langdetect/DetectorTests.java b/src/test/java/org/xbib/elasticsearch/index/mapper/langdetect/DetectorTests.java
@@ -23,7 +23,7 @@ public class DetectorTests extends Assert {
     public static void setUp() throws Exception {
 
         Settings settings = Settings.settingsBuilder()
-                .put("languages", "")
+                .put("langdetect.languages", "")
                 .build();
         detect = new LangdetectService(settings);
         detect.start();
diff --git a/src/test/java/org/xbib/elasticsearch/index/mapper/langdetect/LangdetectMappingTests.java b/src/test/java/org/xbib/elasticsearch/index/mapper/langdetect/LangdetectMappingTests.java