@@ -51,7 +51,8 @@ class Config(NamedTuple):
51
51
52
52
config_name
53
53
dump: CC dump id
54
- output_dir: where to write the dataset
54
+ output_dir: working directory
55
+ mined_dir: name of the destination folder, full path will be {output_dir}/{mined_dir}/{dump_id}
55
56
execution: choose how to parallelize the execution
56
57
num_shards: number of shards to split the dump
57
58
num_segments_per_shard: allows downloading only a small portion of CC (e.g. for tests)
@@ -74,6 +75,7 @@ class Config(NamedTuple):
74
75
config_name : str = "base"
75
76
dump : str = "2017-51"
76
77
output_dir : Path = Path ("data" )
78
+ mined_dir : str = "mined"
77
79
execution : str = "slurm"
78
80
num_shards : int = 1600
79
81
num_segments_per_shard : int = - 1
@@ -135,7 +137,7 @@ def from_json(cls, json_file: Path) -> "Config":
135
137
136
138
@property
137
139
def will_split (self ) -> bool :
138
- return "split " in self .pipeline or "split_by_segment" in self .pipeline
140
+ return "split_by_lang " in self .pipeline or "split_by_segment" in self .pipeline
139
141
140
142
def get_lm_languages (self ) -> Sequence [str ]:
141
143
if self .lm_languages is not None :
@@ -154,9 +156,18 @@ def _get_dir(self, name: str, regroup: bool = False) -> Path:
154
156
return self .output_dir / f"{ name } _split" / self .dump
155
157
return self .output_dir / name / self .dump
156
158
159
+ def get_mined_dir (self ) -> Path :
160
+ return self ._get_dir (self .mined_dir )
161
+
157
162
158
163
BASE_CONFIG = Config ()
159
164
165
+ BYLANG_CONFIG = Config (
166
+ config_name = "by_lang" ,
167
+ mined_dir = "mined_by_lang" ,
168
+ pipeline = list (BASE_CONFIG .pipeline [:- 1 ]) + ["split_by_lang" ],
169
+ )
170
+
160
171
TEST_CONFIG = BASE_CONFIG ._replace (
161
172
config_name = "test" ,
162
173
dump = "2019-09" ,
@@ -174,6 +185,7 @@ def _get_dir(self, name: str, regroup: bool = False) -> Path:
174
185
175
186
PREDEF_CONFIGS = {
176
187
"base" : BASE_CONFIG ,
188
+ "by_lang" : BYLANG_CONFIG ,
177
189
"test" : TEST_CONFIG ,
178
190
"test_slurm" : TEST_CONFIG ._replace (execution = "slurm,partition=dev" ),
179
191
"debug" : TEST_CONFIG ._replace (config_name = "debug" , mine_num_processes = 0 ),
@@ -248,13 +260,12 @@ def _hashes_shard(conf: Config, shard: int, output: Path):
248
260
249
261
def mine (conf : Config ) -> List [Path ]:
250
262
"""Remove dups, run LID and LMs, and split by lang and quality."""
263
+ mined_dir = conf .get_mined_dir () / conf .dump
251
264
if conf .will_split :
252
265
# Give directories when splitting
253
- mined_dir = conf .output_dir / "mined_split" / conf .dump
254
266
outputs = [mined_dir / f"{ shard :04d} " for shard in range (conf .num_shards )]
255
267
else :
256
268
# Files otherwise
257
- mined_dir = conf .output_dir / "mined" / conf .dump
258
269
outputs = [
259
270
mined_dir / f"{ shard :04d} .json.gz" for shard in range (conf .num_shards )
260
271
]
@@ -370,7 +381,7 @@ def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> s
370
381
steps ["drop" ] = perplexity .DropKeys (tok_field )
371
382
372
383
pattern = str (tmp_output / "{language}_{bucket}.json.gz" )
373
- steps ["split " ] = jsonql .split (pattern = str (pattern ), mkdir = True )
384
+ steps ["split_by_lang " ] = jsonql .split (pattern = str (pattern ), mkdir = True )
374
385
375
386
steps ["split_by_segment" ] = jsonql .split (
376
387
split_fn = lambda doc : _get_segment (tmp_output , doc ), mkdir = True
@@ -391,54 +402,6 @@ def _mine_shard(conf: Config, hashes: List[Path], shard: int, output: Path) -> s
391
402
return f"Mined { output } "
392
403
393
404
394
- def reproduce (conf : Config ) -> List [Path ]:
395
- reproduce_dir = conf ._get_dir ("reproduce" )
396
- reproduce_dir .mkdir (parents = True , exist_ok = True )
397
- if conf .will_split :
398
- # Givedirectories en splitting
399
- outputs = [reproduce_dir / f"{ shard :04d} " for shard in range (conf .num_shards )]
400
- else :
401
- # Files otherwise
402
- outputs = [
403
- reproduce_dir / f"{ shard :04d} .json.gz" for shard in range (conf .num_shards )
404
- ]
405
- missing_outputs = [(shard , o ) for shard , o in enumerate (outputs ) if not o .exists ()]
406
- if not missing_outputs :
407
- return outputs
408
-
409
- ex = conf .get_executor ("reproduce" , timeout_hour = 2 , mem_gb = 2 , cpus = 2 )
410
- ex (_reproduce_shard , repeat (conf ), * _transpose (missing_outputs ))
411
- return outputs
412
-
413
-
414
- def _reproduce_shard (conf : Config , shard : int , output : Path ) -> str :
415
- from cc_net import transpose
416
-
417
- assert conf .metadata is not None
418
- tmp_output = tmp (output )
419
- cc = process_wet_file .CCShardReader (
420
- conf .dump ,
421
- shard ,
422
- num_shards = conf .num_shards ,
423
- num_segments_per_shard = conf .num_segments_per_shard ,
424
- cache_dir = conf .cache_dir ,
425
- )
426
-
427
- unminifier = transpose .LinearUnminifier (conf .metadata / conf .dump )
428
- # TODO: we should look at the conf to see how to split
429
- pipeline : List [jsonql .Transformer ] = [unminifier ]
430
-
431
- if conf .will_split :
432
- pattern = str (tmp_output / "{language}_{bucket}.json.gz" )
433
- pipeline .append (jsonql .split (pattern = str (pattern ), mkdir = True ))
434
-
435
- jsonql .run_pipes (
436
- * pipeline , file = cc , output = tmp_output if not conf .will_split else None
437
- )
438
- tmp_output .rename (output )
439
- return f"Unminified { output } "
440
-
441
-
442
405
def regroup (conf : Config , before : Callable [[Config ], List [Path ]], dirname : str ) -> Path :
443
406
"""Reshards each language/quality after 'mine'."""
444
407
mined_dir = conf .output_dir / f"{ dirname } _split" / conf .dump
@@ -450,7 +413,6 @@ def regroup(conf: Config, before: Callable[[Config], List[Path]], dirname: str)
450
413
print (f"No files found in { mined_dir } for regroup. Exiting." )
451
414
return regroup_dir
452
415
453
- # check that mining is over.
454
416
all_files = [f for d in before (conf ) for f in d .glob ("*.json.gz" )]
455
417
assert all_files , f"No files found inside mined dir: { mined_dir } "
456
418
@@ -535,7 +497,7 @@ def move_segments(conf: Config, first_stage: Callable, dirname: str) -> Path:
535
497
regroup_dir .mkdir (exist_ok = True )
536
498
ex = conf .get_executor (f"moveseg_{ conf .dump } " , mem_gb = 1 , timeout_hour = 1 , cpus = 2 )
537
499
538
- def _move_segments (subdir : Path , regroup_dir : Path ) -> Optional [ str ] :
500
+ def _move_segments (subdir : Path , regroup_dir : Path ) -> str :
539
501
n = 0
540
502
for f in subdir .iterdir ():
541
503
if not f .is_file () or f .is_symlink ():
@@ -549,7 +511,7 @@ def _move_segments(subdir: Path, regroup_dir: Path) -> Optional[str]:
549
511
f .symlink_to (target )
550
512
551
513
if n == 0 :
552
- return None
514
+ return ""
553
515
554
516
return f"Moved { n } .json.gz files from { subdir } to { regroup_dir } "
555
517
@@ -615,19 +577,70 @@ def dump(x):
615
577
616
578
617
579
def get_main_parser () -> ArgumentParser :
618
- # Generates the 'main' parser by patching a 'Config' parser
619
- p = func_argparse .func_argparser (Config )
580
+ def _parser (entry_point : str ) -> ArgumentParser :
581
+ # Generates the 'main' parser by patching a 'Config' parser
582
+ p = func_argparse .func_argparser (Config )
583
+
584
+ # Override defaults value to None, so we know what was set by the user.
585
+ # Note that it will keep the original default values in the help message.
586
+ p .set_defaults (** {f : None for f in Config ._fields })
587
+
588
+ p .add_argument ("--config" , type = str , default = "base" )
589
+ p .set_defaults (__command = main )
590
+ p .set_defaults (entry_point = entry_point )
591
+ return p
592
+
593
+ return func_argparse .multi_argparser (
594
+ mine = _parser ("mine" ),
595
+ # TODO: we should hide parameters not used in `reproduce`
596
+ reproduce = _parser ("reproduce" ),
597
+ )
620
598
621
- # Override defaults value to None, so we know what was set by the user.
622
- # Note that it will keep the original default values in the help message.
623
- p .set_defaults (** {f : None for f in Config ._fields })
624
599
625
- p .add_argument ("--config" , type = str , default = "base" )
626
- p .set_defaults (__command = main )
627
- return p
600
+ def reproduce (conf : Config ) -> List [Path ]:
601
+ reproduce_dir = conf ._get_dir ("reproduce" )
602
+ reproduce_dir .mkdir (parents = True , exist_ok = True )
603
+ if conf .will_split :
604
+ # Give directories when splitting
605
+ outputs = [reproduce_dir / f"{ shard :04d} " for shard in range (conf .num_shards )]
606
+ else :
607
+ # Files otherwise
608
+ outputs = [
609
+ reproduce_dir / f"{ shard :04d} .json.gz" for shard in range (conf .num_shards )
610
+ ]
611
+ missing_outputs = [(shard , o ) for shard , o in enumerate (outputs ) if not o .exists ()]
612
+ if not missing_outputs :
613
+ return outputs
628
614
615
+ ex = conf .get_executor ("reproduce" , timeout_hour = 2 , mem_gb = 2 , cpus = 2 )
616
+ ex (_reproduce_shard , repeat (conf ), * _transpose (missing_outputs ))
617
+ return outputs
629
618
630
- def main (config : str = "base" , ** config_as_dict : Any ) -> None :
619
+
620
+ def _reproduce_shard (conf : Config , shard : int , output : Path ) -> str :
621
+ metadata = conf .metadata
622
+ if metadata is None and (conf .output_dir / "mined" ).exists ():
623
+ # TODO: better default
624
+ metadata = conf .output_dir / "mined"
625
+ print (f"Will use { metadata } as metadata source" )
626
+ assert metadata is not None , "Need to set 'metadata' for reproduce"
627
+ cc = conf .get_cc_shard (shard )
628
+
629
+ unminifier = minify .MetadataFetcher (metadata / conf .dump )
630
+ # TODO: we should look at the conf to see how to split
631
+ pipeline : List [jsonql .Transformer ] = [unminifier ]
632
+
633
+ tmp_output = tmp (output )
634
+ if conf .will_split :
635
+ pattern = str (tmp (output ) / "{language}_{bucket}.json.gz" )
636
+ pipeline .append (jsonql .split (pattern = str (pattern ), mkdir = True ))
637
+
638
+ jsonql .run_pipes (* pipeline , file = cc , output = None if conf .will_split else tmp_output )
639
+ tmp_output .rename (output )
640
+ return f"Unminified { output } "
641
+
642
+
643
+ def main (entry_point : str , config : str = "base" , ** config_as_dict : Any ) -> None :
631
644
# Use the given 'config' as default value.
632
645
config_base = config
633
646
if config_base in PREDEF_CONFIGS :
@@ -640,32 +653,22 @@ def main(config: str = "base", **config_as_dict: Any) -> None:
640
653
f"Choose from ({ ', ' .join (PREDEF_CONFIGS )} ) or give an existing .json file."
641
654
)
642
655
conf = conf ._replace (** {k : v for (k , v ) in config_as_dict .items () if v is not None })
643
- print ("Will run mine.py with the following config:" , conf )
644
-
645
- # Decide if we need to mine or if we have metadata available
646
-
647
- if conf .metadata :
648
- conf = conf ._replace (pipeline = ["split" ])
649
- # this is not very clean. We should either:
650
- # - move back to the reproduce command
651
- # - add an 'unminify' step that read conf.metadata
652
656
653
- print (f"Will use pre-computed metadata from { conf .metadata } " )
654
- first_stage = reproduce
655
- dir_name = "reproduce"
656
-
657
- else :
658
- first_stage = mine
659
- dir_name = "mined"
657
+ print (f"Will run cc_net.mine.{ entry_point } with the following config:" , conf )
658
+ first_stage = {"mine" : mine , "reproduce" : reproduce }[entry_point ]
659
+ dir_name = entry_point
660
660
regroup_dir = conf ._get_dir (dir_name , regroup = True )
661
- if "split" in conf .pipeline :
662
- # Only regroup if we split the shards.
661
+
662
+ if "split_by_lang" in conf .pipeline :
663
+ # Only try regrouping if we split the shards.
663
664
regroup (conf , first_stage , dir_name )
664
665
elif "split_by_segment" in conf .pipeline :
665
666
# If we split by segment then regrouping is trivial, since segments appear in only one shard.
666
667
move_segments (conf , first_stage , dir_name )
668
+ else :
669
+ first_stage (conf )
667
670
668
- if config_base == "test" :
671
+ if conf . config_name == "test" :
669
672
_validate_test (conf , regroup_dir )
670
673
671
674
0 commit comments