Skip to content
This repository was archived by the owner on Feb 22, 2021. It is now read-only.

Commit 7be892e

Browse files
Philipp Dowling (phdowling)
Philipp Dowling
authored and committed
try rule (+5 squashed commits)
Squashed commits: [58ce825] try rule [07d3791] local fixes [df907a9] local fixes [8e8a88b] local fixes [d72002a] local fixes
1 parent 864a7e9 commit 7be892e

File tree

3 files changed

+47
-6
lines changed

3 files changed

+47
-6
lines changed

prepare.sh

100644 → 100755
Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,45 @@
11
#!/bin/bash
22
#+------------------------------------------------------------------------------------------------------------------------------+
33
#| Idio Wiki2Vec | |
4+
#+------------------------------------------------------------------------------------------------------------------------------+
5+
6+
# Creates Wiki2Vec corpora out of a wikipedia dump
7+
8+
# $1 Locale (en_US)
9+
# $2 Target Folder (Output Folder)
10+
# $3 Stemmer
11+
12+
WIKI2VEC_VERSION="1.0"
13+
14+
# Prints the script name, an example invocation and a one-line description to stdout.
# Takes no arguments; the bracketed StemmerLanguage in the example marks the third argument as optional.
usage ()
15+
{
16+
echo "prepare.sh"
17+
echo "usage: ./prepare.sh en_US /data/word2vec/ [StemmerLanguage]"
18+
echo "Creates a wikipedia corpus which can be fed into word2vec creation tools"
19+
}
20+
21+
shift $((OPTIND - 1))
22+
23+
# Require at least the locale ($1) and the target folder ($2). Inside [ ] a bare `<` is an input redirection (from a file named "2"), so the count is compared numerically with -lt. A usage error exits with a non-zero status.
if [ $# -lt 2 ]
24+
then
25+
usage
26+
exit 1
27+
fi
28+
29+
BASE_DIR=$(pwd)
30+
TARGET_DIR="$2"
31+
LANGUAGE=`echo $1 | sed "s/_.*//g"`
32+
WDIR="$BASE_DIR/working"
33+
SPARK_PATH="$WDIR/spark-1.2.0-bin-hadoop2.4"
34+
JAR_PATH="$BASE_DIR/target/scala-2.10/wiki2vec-assembly-${WIKI2VEC_VERSION}.jar"
35+
READABLEWIKI="$TARGET_DIR/${LANGUAGE}wiki-latest.lines"
436
SPLIT_OUTPUT_CORPUS="$WDIR/${LANGUAGE}wiki"
537
OUTPUTCORPUS="$TARGET_DIR/${LANGUAGE}wiki.corpus"
638

7-
if [ ! -z "$3" ]; then
8-
STEMMERNAME="$3"
9-
else
10-
STEMMERNAME="$LANGUAGE"
39+
if [ ! -z "$3" ]; then
40+
STEMMERNAME="$3"
41+
else
42+
STEMMERNAME="$LANGUAGE"
1143
fi
1244

1345
echo "Language: $LANGUAGE"
@@ -51,7 +83,7 @@ echo "Creating Readable Wiki.."
5183

5284
# Create Wiki2Vec Corpus
5385
echo "Creating Word2vec Corpus"
54-
$SPARK_PATH/bin/spark-submit --driver-memory 20g --class org.idio.wikipedia.word2vec.Word2VecCorpus $JAR_PATH $READABLEWIKI $BASE_DIR/fakePathToRedirect/file.nt $SPLIT_OUTPUT_CORPUS $STEMMERNAME
86+
$SPARK_PATH/bin/spark-submit --driver-memory 15g --num-executors 4 --class org.idio.wikipedia.word2vec.Word2VecCorpus $JAR_PATH $READABLEWIKI $BASE_DIR/fakePathToRedirect/file.nt $SPLIT_OUTPUT_CORPUS $STEMMERNAME
5587

5688
# joining split files
5789
echo "Joining corpus.."

resources/gensim/gensim_word2vec.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,12 @@
3232
def read_corpus(path_to_corpus, output_path, min_count=10, size=500, window=10):
3333
workers = multiprocessing.cpu_count()
3434
sentences = gensim.models.word2vec.LineSentence(path_to_corpus)
35-
model = gensim.models.Word2Vec(None, min_count=min_count, size=size, window=window, sg=1, workers=workers)
35+
def rule(word, count, min_count):
36+
if word.startswith("DBPEDIA_ID/"):
37+
return 2
38+
else:
39+
return 0
40+
model = gensim.models.Word2Vec(None, min_count=min_count, size=size, window=window, sg=1, workers=workers, trim_rule=rule)
3641
model.build_vocab(sentences)
3742
model.train(sentences)
3843
model.save(output_path)

src/main/scala/org/idio/wikipedia/redirects/RedirectStore.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,12 @@ class RedisRedirectStore(redisHost:String, redisPort:Int, redirects:Iterator[(St
5555
object RedirectStore{
5656

5757
def readFile(pathToRedirectFile: String) = {
58+
var i = 0
5859
Source.fromFile(pathToRedirectFile, "UTF-8").getLines().map{
5960
line =>
61+
i += 1
62+
if (i % 100000 == 0)
63+
println(i)
6064
val entityRegex = "<http://dbpedia.org/resource/([^ >]+)>".r
6165
val matches = entityRegex.findAllIn(line).matchData
6266
if(matches.hasNext){

0 commit comments

Comments
 (0)