implement reproduce as another mining pipeline.

gwenzek · gwenzek · commit 6e114039ba3b · 2020-05-11T19:04:24.000+02:00
diff --git a/Makefile b/Makefile
@@ -185,10 +185,10 @@ test:
 
 test2:
 	python -m cc_net mine --config config/test_segment.json
-	python -m cc_net mine --config config/test_segment.json -p fetch_metadata split
+	python -m cc_net mine --config config/test_reproduce.json
 	diff \
 		<(zcat test_data/mined/2019-09/fr_head_0000.json.gz | jq -c 'select(.cc_segment == "crawl-data/CC-MAIN-2019-09/segments/1550247479101.30/wet/CC-MAIN-20190215183319-20190215205319-00000.warc.wet.gz") | {url, perplexity}' | sort) \
-		<(zcat test_data2/mined/2019-09/CC-MAIN-20190215183319-20190215205319-00000.json.gz | jq -c 'select(.bucket == "head" and .language == "fr") | {url, perplexity}' | sort) \
+		<(zcat test_data2/mined_by_segment/2019-09/CC-MAIN-20190215183319-20190215205319-00000.json.gz | jq -c 'select(.bucket == "head" and .language == "fr") | {url, perplexity}' | sort) \
 		| head
 
 	diff \
diff --git a/cc_net/__main__.py b/cc_net/__main__.py
@@ -12,11 +12,7 @@
 
 
 def main():
-    parser = func_argparse.multi_argparser(
-        mine=cc_net.mine.get_main_parser("mine"),
-        reproduce=cc_net.mine.get_main_parser("reproduce"),
-    )
-    func_argparse.parse_and_call(parser)
+    func_argparse.parse_and_call(cc_net.mine.get_main_parser())
 
 
 if __name__ == "__main__":
diff --git a/cc_net/mine.py b/cc_net/mine.py
@@ -41,7 +41,7 @@
     "lm",
     "pp_bucket",
     "drop",
-    "split",
+    "split_by_lang",
 ]
 
 
@@ -156,7 +156,7 @@ def _get_dir(self, name: str, regroup: bool = False) -> Path:
             return self.output_dir / f"{name}_split" / self.dump
         return self.output_dir / name / self.dump
 
-    def get_mined_dir(self) -> Path:
-        return self._get_dir(self.mined_dir)
+    def get_mined_dir(self, regroup_dir: bool = False) -> Path:
+        return self._get_dir(self.mined_dir, regroup=regroup_dir)
 
 
@@ -168,6 +168,13 @@ def get_mined_dir(self) -> Path:
     pipeline=list(BASE_CONFIG.pipeline[:-1]) + ["split_by_lang"],
 )
 
+REPRODUCE_CONFIG = Config(
+    config_name="reproduce",
+    mined_dir="reproduce",
+    pipeline=["fetch_metadata", "split_by_lang"],
+    execution="local",
+)
+
 TEST_CONFIG = BASE_CONFIG._replace(
     config_name="test",
     dump="2019-09",
@@ -189,6 +196,7 @@ def get_mined_dir(self) -> Path:
     "test": TEST_CONFIG,
     "test_slurm": TEST_CONFIG._replace(execution="slurm,partition=dev"),
     "debug": TEST_CONFIG._replace(config_name="debug", mine_num_processes=0),
+    "reproduce": REPRODUCE_CONFIG,
     "augment": BASE_CONFIG._replace(
         config_name="augment", dump="2019-13", lang_blacklist=["en"]
     ),
@@ -260,7 +268,7 @@ def _hashes_shard(conf: Config, shard: int, output: Path):
 
 def mine(conf: Config) -> List[Path]:
     """Remove dups, run LID and LMs, and split by lang and quality."""
-    mined_dir = conf.get_mined_dir() / conf.dump
+    mined_dir = conf.get_mined_dir()
     if conf.will_split:
         # Give a directories when splitting
         outputs = [mined_dir / f"{shard:04d}" for shard in range(conf.num_shards)]
@@ -380,13 +388,19 @@ def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> s
     steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
     steps["drop"] = perplexity.DropKeys(tok_field)
 
+    if "fetch_metadata" in conf.pipeline:
+        # TODO: better default
+        assert conf.metadata is not None
+        steps["fetch_metadata"] = minify.MetadataFetcher(f"{conf.metadata}/{conf.dump}")
+
+    steps["minify"] = minify.Minifier()
+
     pattern = str(tmp_output / "{language}_{bucket}.json.gz")
     steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)
 
     steps["split_by_segment"] = jsonql.split(
         split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True
     )
-    steps["minify"] = minify.Minifier()
 
     pipeline = filter(None, (steps[s] for s in conf.pipeline))
 
@@ -404,8 +418,8 @@ def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> s
 
 def regroup(conf: Config, before: Callable[[Config], List[Path]], dirname: str) -> Path:
     """Reshards each language/quality after 'mine'."""
-    mined_dir = conf.output_dir / f"{dirname}_split" / conf.dump
-    regroup_dir = conf.output_dir / dirname / conf.dump
+    mined_dir = conf.output_dir / f"{conf.mined_dir}_split" / conf.dump
+    regroup_dir = conf.output_dir / conf.mined_dir / conf.dump
 
     if mined_dir.exists():
         all_files = list(mined_dir.glob("????/*.json.gz"))
@@ -656,8 +670,7 @@ def main(entry_point: str, config: str = "base", **config_as_dict: Any) -> None:
 
     print(f"Will run cc_net.mine.{entry_point} with the following config:", conf)
     first_stage = {"mine": mine, "reproduce": reproduce}[entry_point]
-    dir_name = entry_point
-    regroup_dir = conf._get_dir(dir_name, regroup=True)
+    dir_name = conf.mined_dir
 
     if "split_by_lang" in conf.pipeline:
         # Only try regrouping if we split the shards.
@@ -669,6 +682,7 @@ def main(entry_point: str, config: str = "base", **config_as_dict: Any) -> None:
         first_stage(conf)
 
     if conf.config_name == "test":
+        regroup_dir = conf._get_dir(dir_name, regroup=True)
         _validate_test(conf, regroup_dir)
 
 
diff --git a/config/test_reproduce.json b/config/test_reproduce.json
@@ -0,0 +1,16 @@
+{
+    "hash_in_mem": 2,
+    "dump": "2019-09",
+    "num_shards": 4,
+    "num_segments_per_shard": 1,
+    "pipeline": [
+        "fetch_metadata",
+        "split_by_lang"
+    ],
+    "metadata": "test_data2/mined_by_segment",
+    "execution": "debug",
+    "output_dir": "test_data2",
+    "mined_dir": "reproduce",
+    "target_size": "32M",
+    "cache_dir": "test_data/wet_cache"
+}
diff --git a/config/test_segment.json b/config/test_segment.json
@@ -0,0 +1,23 @@
+{
+    "hash_in_mem": 2,
+    "dump": "2019-09",
+    "num_shards": 4,
+    "num_segments_per_shard": 1,
+    "mine_num_processes": 0,
+    "lang_whitelist": ["de", "it", "fr"],
+    "pipeline": [
+        "dedup",
+        "lid",
+        "keep_lang",
+        "sp",
+        "lm",
+        "pp_bucket",
+        "minify",
+        "split_by_segment"
+    ],
+    "execution": "debug",
+    "output_dir": "test_data2",
+    "mined_dir": "mined_by_segment",
+    "target_size": "32M",
+    "cache_dir": "test_data/wet_cache"
+}