Skip to content

Commit 9e51f11

Browse files
committed
Refactoring Wikipedia collection classes to make them more consistent with other collection-processing classes.
1 parent 811aea5 commit 9e51f11

File tree

10 files changed

+294
-282
lines changed

10 files changed

+294
-282
lines changed

docs/content/wikipedia.html

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,8 @@ <h2>Sequentially-Numbered Docnos</h2>
186186
docids and docnos by the following command:</p>
187187

188188
<pre class="code">
189-
etc/hadoop-cluster.sh edu.umd.cloud9.collection.wikipedia.BuildWikipediaDocnoMapping \
190-
-input /shared/collections/wikipedia/raw/enwiki-20130503-pages-articles.xml -output_path tmp \
189+
etc/hadoop-cluster.sh edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder \
190+
-input /shared/collections/wikipedia/raw/enwiki-20130503-pages-articles.xml \
191191
-output_file enwiki-20130503-docno.dat -wiki_language en -keep_all
192192
</pre>
193193

@@ -201,7 +201,7 @@ <h2>Sequentially-Numbered Docnos</h2>
201201
fast lookup), using article titles as docids would be expensive (lots
202202
of strings to store in memory).</p>
203203

204-
<p>Note that by default, <code>BuildWikipediaDocnoMapping</code>
204+
<p>Note that by default, <code>WikipediaDocnoMappingBuilder</code>
205205
discards non-articles (e.g., disambiguation pages, redirects,
206206
etc.). To retain all pages, use the <code>-keep_all</code> option.</p>
207207

@@ -288,9 +288,8 @@ <h2>Supporting Random Access</h2>
288288
index:</p>
289289

290290
<pre class="code">
291-
etc/hadoop-cluster.sh edu.umd.cloud9.collection.wikipedia.BuildWikipediaForwardIndex \
292-
-input /user/jimmylin/enwiki-20130503.block \
293-
-output tmp -index_file enwiki-20130503.findex.dat
291+
etc/hadoop-cluster.sh edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndexBuilder \
292+
-input /user/jimmylin/enwiki-20130503.block -index_file enwiki-20130503.findex.dat
294293
</pre>
295294

296295
<p><b>Note:</b> It mostly doesn't matter elsewhere, but

etc/TODO.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
- Wikipedia examples: eliminate the -tmp option
1+
- write test cases for Bidimap
2+

src/dist/edu/umd/cloud9/collection/trec/TrecDocnoMappingBuilder.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
*/
4949
public class TrecDocnoMappingBuilder extends Configured implements Tool, DocnoMapping.Builder {
5050
private static final Logger LOG = Logger.getLogger(TrecDocnoMappingBuilder.class);
51-
private static final Random random = new Random();
51+
private static final Random RANDOM = new Random();
5252
private static enum Count { DOCS };
5353

5454
private static class MyMapper extends Mapper<LongWritable, TrecDocument, Text, IntWritable> {
@@ -93,12 +93,12 @@ public int build(Path src, Path dest, Configuration conf) throws IOException {
9393
*/
9494
public int run(String[] args) throws IOException {
9595
DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
96-
if ( options == null) {
96+
if (options == null) {
9797
return -1;
9898
}
9999

100100
// Temp directory.
101-
String tmpDir = "tmp-" + TrecDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);
101+
String tmpDir = "tmp-" + TrecDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
102102

103103
LOG.info("Tool name: " + TrecDocnoMappingBuilder.class.getCanonicalName());
104104
LOG.info(" - input path: " + options.collection);

src/dist/edu/umd/cloud9/collection/wikipedia/BuildWikipediaForwardIndex.java

Lines changed: 0 additions & 239 deletions
This file was deleted.

src/dist/edu/umd/cloud9/collection/wikipedia/LookupWikipediaArticle.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,6 @@
3737
* etc/hadoop-cluster.sh edu.umd.cloud9.collection.wikipedia.LookupWikipediaArticle \
3838
* enwiki-20130503.findex.dat enwiki-20130503-docno.dat
3939
* </pre></blockquote>
40-
*
41-
* <p>
42-
* Note, you'll have to build a jar that contains the contents of bliki-core-3.0.15.jar and
43-
* commons-lang-2.5.jar, since -libjars won't work for this program (since it's not a MapReduce
44-
* job).
45-
* </p>
4640
*
4741
* @author Jimmy Lin
4842
*/

src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaDocnoMapping.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ public void loadMapping(Path p, FileSystem fs) throws IOException {
7777

7878
/**
7979
* Creates a mappings file from the contents of a flat text file containing docid to docno
80-
* mappings. This method is used by {@link BuildWikipediaDocnoMapping} internally.
80+
* mappings. This method is used by {@link WikipediaDocnoMappingBuilder} internally.
8181
*
8282
* @param inputFile flat text file containing docid to docno mappings
8383
* @param outputFile output mappings file
@@ -133,7 +133,7 @@ static public int[] readDocnoMappingData(Path p, FileSystem fs) throws IOExcepti
133133

134134
@Override
135135
public Builder getBuilder() {
136-
throw new UnsupportedOperationException();
136+
return new WikipediaDocnoMappingBuilder();
137137
}
138138

139139
/**

0 commit comments

Comments
 (0)