try rule (+5 squashed commits)

Philipp Dowling · phdowling · commit 7be892ee2ba9 · 2016-03-04T17:31:03.000+01:00
Squashed commits: [58ce825] try rule [07d3791] local fixes [df907a9] local fixes [8e8a88b] local fixes [d72002a] local fixes
diff --git a/prepare.sh b/prepare.sh
@@ -1,13 +1,45 @@
 #!/bin/bash
 #+------------------------------------------------------------------------------------------------------------------------------+
 #| Idio Wiki2Vec                                                                      |                                                                                                     |
+#+------------------------------------------------------------------------------------------------------------------------------+
+
+# Creates Wiki2Vec corpora out of a wikipedia dump
+
+# $1 Locale (en_US)
+# $2 Target Folder (Output Folder)
+# $3 Stemmer language (optional; defaults to the language part of $1)
+
+WIKI2VEC_VERSION="1.0"
+
+# Print the script name, an example invocation and a one-line description.
+usage () {
+     printf '%s\n' "prepare.sh" \
+         "usage: ./prepare.sh en_US /data/word2vec/ [StemmerLanguage]" \
+         "Creates a wikipedia corpus which can be fed into word2vec creation tools"
+}
+
+shift $((OPTIND - 1))  # NOTE(review): no getopts call is visible above, so this looks like a no-op - confirm
+# Locale ($1) and target folder ($2) are mandatory. Inside [ ] a bare < is a redirection, so compare with -lt.
+if [ "$#" -lt 2 ]
+then
+    usage
+    exit 1  # signal the usage error to the caller
+fi
+
+BASE_DIR=$(pwd)  # directory the script is invoked from
+TARGET_DIR="$2"  # output folder, taken from the second argument
+LANGUAGE=`echo $1 | sed "s/_.*//g"`  # locale with everything from the first "_" stripped (en_US -> en)
+WDIR="$BASE_DIR/working"  # working directory below BASE_DIR
+SPARK_PATH="$WDIR/spark-1.2.0-bin-hadoop2.4"  # Spark 1.2.0 distribution inside the working directory
+JAR_PATH="$BASE_DIR/target/scala-2.10/wiki2vec-assembly-${WIKI2VEC_VERSION}.jar"  # wiki2vec assembly jar handed to spark-submit
+READABLEWIKI="$TARGET_DIR/${LANGUAGE}wiki-latest.lines"  # path passed as first argument to the Word2VecCorpus job
 SPLIT_OUTPUT_CORPUS="$WDIR/${LANGUAGE}wiki"
 OUTPUTCORPUS="$TARGET_DIR/${LANGUAGE}wiki.corpus"
 
-if [ ! -z "$3" ]; then 
-        STEMMERNAME="$3"
-else 
-        STEMMERNAME="$LANGUAGE"
+if [ ! -z "$3" ]; then
+	STEMMERNAME="$3"
+else
+	STEMMERNAME="$LANGUAGE"
 fi
 
 echo "Language: $LANGUAGE"
@@ -51,7 +83,7 @@ echo "Creating Readable Wiki.."
 
 # Create Wiki2Vec Corpus
 echo "Creating Word2vec Corpus"
-$SPARK_PATH/bin/spark-submit --driver-memory 20g --class org.idio.wikipedia.word2vec.Word2VecCorpus $JAR_PATH $READABLEWIKI $BASE_DIR/fakePathToRedirect/file.nt $SPLIT_OUTPUT_CORPUS $STEMMERNAME
+"$SPARK_PATH/bin/spark-submit" --driver-memory 15g --num-executors 4 --class org.idio.wikipedia.word2vec.Word2VecCorpus "$JAR_PATH" "$READABLEWIKI" "$BASE_DIR/fakePathToRedirect/file.nt" "$SPLIT_OUTPUT_CORPUS" "$STEMMERNAME"  # quoted so paths containing spaces are not word-split
 
 # joining split files
 echo "Joining corpus.."
diff --git a/resources/gensim/gensim_word2vec.py b/resources/gensim/gensim_word2vec.py
@@ -32,7 +32,12 @@
 def read_corpus(path_to_corpus, output_path, min_count=10, size=500, window=10):
     workers = multiprocessing.cpu_count()
     sentences = gensim.models.word2vec.LineSentence(path_to_corpus)
-    model = gensim.models.Word2Vec(None, min_count=min_count, size=size, window=window, sg=1, workers=workers)
+    def rule(word, count, min_count):
+        # Trim rule: always keep "DBPEDIA_ID/" tokens; all other words use the default min_count filtering.
+        if word.startswith("DBPEDIA_ID/"):
+            return gensim.utils.RULE_KEEP
+        return gensim.utils.RULE_DEFAULT
+    model = gensim.models.Word2Vec(None, min_count=min_count, size=size, window=window, sg=1, workers=workers, trim_rule=rule)
     model.build_vocab(sentences)
     model.train(sentences)
     model.save(output_path)
diff --git a/src/main/scala/org/idio/wikipedia/redirects/RedirectStore.scala b/src/main/scala/org/idio/wikipedia/redirects/RedirectStore.scala
@@ -55,8 +55,12 @@ class RedisRedirectStore(redisHost:String, redisPort:Int, redirects:Iterator[(St
 object RedirectStore{
 
   def readFile(pathToRedirectFile: String) = {
+     var i = 0
      Source.fromFile(pathToRedirectFile, "UTF-8").getLines().map{
           line =>
+            i += 1
+            if (i % 100000 == 0)
+                println(i)
             val entityRegex = "<http://dbpedia.org/resource/([^ >]+)>".r
             val matches = entityRegex.findAllIn(line).matchData
             if(matches.hasNext){