41
41
"lm" ,
42
42
"pp_bucket" ,
43
43
"drop" ,
44
- "split " ,
44
+ "split_by_lang " ,
45
45
]
46
46
47
47
@@ -156,7 +156,7 @@ def _get_dir(self, name: str, regroup: bool = False) -> Path:
156
156
return self .output_dir / f"{ name } _split" / self .dump
157
157
return self .output_dir / name / self .dump
158
158
159
- def get_mined_dir (self ) -> Path :
159
+ def get_mined_dir (self , regroup_dir : bool = False ) -> Path :
160
160
return self ._get_dir (self .mined_dir )
161
161
162
162
@@ -168,6 +168,13 @@ def get_mined_dir(self) -> Path:
168
168
pipeline = list (BASE_CONFIG .pipeline [:- 1 ]) + ["split_by_lang" ],
169
169
)
170
170
171
+ REPRODUCE_CONFIG = Config (
172
+ config_name = "reproduce" ,
173
+ mined_dir = "reproduce" ,
174
+ pipeline = ["fetch_metadata" , "split_by_lang" ],
175
+ execution = "local" ,
176
+ )
177
+
171
178
TEST_CONFIG = BASE_CONFIG ._replace (
172
179
config_name = "test" ,
173
180
dump = "2019-09" ,
@@ -189,6 +196,7 @@ def get_mined_dir(self) -> Path:
189
196
"test" : TEST_CONFIG ,
190
197
"test_slurm" : TEST_CONFIG ._replace (execution = "slurm,partition=dev" ),
191
198
"debug" : TEST_CONFIG ._replace (config_name = "debug" , mine_num_processes = 0 ),
199
+ "reproduce" : REPRODUCE_CONFIG ,
192
200
"augment" : BASE_CONFIG ._replace (
193
201
config_name = "augment" , dump = "2019-13" , lang_blacklist = ["en" ]
194
202
),
@@ -260,7 +268,7 @@ def _hashes_shard(conf: Config, shard: int, output: Path):
260
268
261
269
def mine (conf : Config ) -> List [Path ]:
262
270
"""Remove dups, run LID and LMs, and split by lang and quality."""
263
- mined_dir = conf .get_mined_dir () / conf . dump
271
+ mined_dir = conf .get_mined_dir ()
264
272
if conf .will_split :
265
273
# Give one output directory per shard when splitting
266
274
outputs = [mined_dir / f"{ shard :04d} " for shard in range (conf .num_shards )]
@@ -380,13 +388,19 @@ def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> s
380
388
steps ["pp_bucket" ] = perplexity .PerplexityBucket (CUTOFF_CSV )
381
389
steps ["drop" ] = perplexity .DropKeys (tok_field )
382
390
391
+ if "fetch_metadata" in conf .pipeline :
392
+ # TODO: better default
393
+ assert conf .metadata is not None
394
+ steps ["fetch_metadata" ] = minify .MetadataFetcher (f"{ conf .metadata } /{ conf .dump } " )
395
+
396
+ steps ["minify" ] = minify .Minifier ()
397
+
383
398
pattern = str (tmp_output / "{language}_{bucket}.json.gz" )
384
399
steps ["split_by_lang" ] = jsonql .split (pattern = str (pattern ), mkdir = True )
385
400
386
401
steps ["split_by_segment" ] = jsonql .split (
387
402
split_fn = lambda doc : _get_segment (tmp_output , doc ), mkdir = True
388
403
)
389
- steps ["minify" ] = minify .Minifier ()
390
404
391
405
pipeline = filter (None , (steps [s ] for s in conf .pipeline ))
392
406
@@ -404,8 +418,8 @@ def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> s
404
418
405
419
def regroup (conf : Config , before : Callable [[Config ], List [Path ]], dirname : str ) -> Path :
406
420
"""Reshards each language/quality after 'mine'."""
407
- mined_dir = conf .output_dir / f"{ dirname } _split" / conf .dump
408
- regroup_dir = conf .output_dir / dirname / conf .dump
421
+ mined_dir = conf .output_dir / f"{ conf . mined_dir } _split" / conf .dump
422
+ regroup_dir = conf .output_dir / conf . mined_dir / conf .dump
409
423
410
424
if mined_dir .exists ():
411
425
all_files = list (mined_dir .glob ("????/*.json.gz" ))
@@ -656,8 +670,7 @@ def main(entry_point: str, config: str = "base", **config_as_dict: Any) -> None:
656
670
657
671
print (f"Will run cc_net.mine.{ entry_point } with the following config:" , conf )
658
672
first_stage = {"mine" : mine , "reproduce" : reproduce }[entry_point ]
659
- dir_name = entry_point
660
- regroup_dir = conf ._get_dir (dir_name , regroup = True )
673
+ dir_name = conf .mined_dir
661
674
662
675
if "split_by_lang" in conf .pipeline :
663
676
# Only try regrouping if we split the shards.
@@ -669,6 +682,7 @@ def main(entry_point: str, config: str = "base", **config_as_dict: Any) -> None:
669
682
first_stage (conf )
670
683
671
684
if conf .config_name == "test" :
685
+ regroup_dir = conf ._get_dir (dir_name , regroup = True )
672
686
_validate_test (conf , regroup_dir )
673
687
674
688
0 commit comments