This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit 9c11097

add CC-100 download script
1 parent 242e10d commit 9c11097

File tree

2 files changed: +230, -0 lines


README.md

Lines changed: 24 additions & 0 deletions
@@ -70,6 +70,30 @@ You can reconstruct the corpus used in the paper by using:
python -m cc_net --conf reproduce --dump 2019-09
```

## Extract XLM-R data

The models in [Unsupervised Cross-lingual Representation Learning at Scale (XLM-RoBERTa)](https://arxiv.org/pdf/1911.02116.pdf)
were trained on data extracted by an internal version of cc_net.

Because that data uses a slightly different format, please use the following commands instead:

```sh
python cc_net/tools/dl_cc_100.py --help
python cc_net/tools/dl_cc_100.py --outdir data_cc100 --processes 8
```

If you use this version of the data, please also consider citing:

```bibtex
@article{conneau2019unsupervised,
  title={Unsupervised Cross-lingual Representation Learning at Scale},
  author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
  journal={arXiv preprint arXiv:1911.02116},
  year={2019}
}
```

## Adapting to your infrastructure

Given the computation cost of running the full pipeline we distributed the computation
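
As a point of reference for the commands above: `dl_cc_100.py` (added below) writes one gzipped text file per language, snapshot and shard, at `<outdir>/<lang>/snap_<snapshot>_batch_<shard>.gz`, with one paragraph per line. Here is a minimal sketch for reading the result back; the `iter_paragraphs` helper is only illustrative, and `data_cc100` / `it_IT` are example values matching the script's default output directory and one of its language codes:

```python
import gzip
from pathlib import Path
from typing import Iterator


def iter_paragraphs(outdir: Path, lang: str) -> Iterator[str]:
    """Yield paragraphs of one language from every downloaded shard file."""
    for shard_file in sorted((outdir / lang).glob("snap_*_batch_*.gz")):
        with gzip.open(shard_file, "rt") as f:
            for line in f:
                yield line.rstrip("\n")


# Example: count the Italian paragraphs downloaded so far.
n = sum(1 for _ in iter_paragraphs(Path("data_cc100"), "it_IT"))
print(f"it_IT paragraphs: {n:_d}")
```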

cc_net/tools/dl_cc_100.py

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import contextlib
import functools
import gzip
import logging
import multiprocessing
from collections import defaultdict
from pathlib import Path
from typing import Callable, Dict, Iterator, List, NamedTuple, Optional, Tuple

import cc_net
from cc_net import jsonql
from cc_net.process_wet_file import CCSegmentsReader

# Set this to a directory to use as cache for intermediary files.
# This helps for debugging.
WET_CACHE = None
# WET_CACHE = Path("wet_cache")

S3_BUCKET = "https://dl.fbaipublicfiles.com/cc100"
VERSION = "1.0.0"

CC_100_SNAPSHOTS = [
    "2018-05",
    "2018-09",
    "2018-13",
    "2018-17",
    "2018-22",
    "2018-26",
    "2018-30",
    "2018-34",
    "2018-39",
    "2018-43",
    "2018-47",
    "2018-51",
]

BIG_LANGUAGES = {
    "es_XX",
    "fr_XX",
    "de_DE",
    "ja_XX",
    "ru_RU",
    "zh_CN",
    "en_XX",
    "it_IT",
    "ar_AR",
    "nl_XX",
    "pl_PL",
    "pt_XX",
    "tr_TR",
    "zh_TW",
}


class Paragraph(NamedTuple):
    lang: str
    text: str
    lm_score: float


def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
    """
    Download metadata from a shard.

    Sample metadata:

    {
        "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
        "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
        "url": "http://personals.gearplay.com/ads/DRJONES.htm",
        "line_ids": [10],
        "languages": ["en_XX"],
        "lm_scores": [-2.658],
    }
    """
    name = f"snap_{snapshot.replace('-', '_')}_batch_{shard}.json.gz"
    url = "/".join([S3_BUCKET, VERSION, name])
    shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
    try:
        cache_file: Optional[Path] = None
        if WET_CACHE is not None:
            cache_file = WET_CACHE / name
        metadata_file = jsonql.open_remote_file(url, cache_file)
    except Exception:
        logging.warning(f"Couldn't open {url}")
        return

    for meta in jsonql.read_jsons(metadata_file):
        shard_metadata[meta["cc_segment"]][meta["digest"]] = meta

    found_pars, missed_pars = 0, 0
    for seg, segment_metadata in shard_metadata.items():
        for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
            if doc["digest"] not in segment_metadata:
                continue

            meta = segment_metadata[doc["digest"]]
            full_pars = [doc["title"]] + doc["raw_content"].split("\n")

            assert len(meta["line_ids"]) == len(meta["languages"])
            assert len(meta["line_ids"]) == len(meta["lm_scores"])
            for i, lang, score in zip(
                meta["line_ids"], meta["languages"], meta["lm_scores"]
            ):
                if snapshot != "2018-51" and lang in BIG_LANGUAGES:
                    # Big languages only come from the "2018-51" snapshot.
                    continue
                if i >= len(full_pars):
                    # This is because CC100 was created by saving only urls.
                    # Some urls appear in different snapshots with slightly
                    # different versions, but we don't know which one is correct.
                    # Here we read both versions, but some indices may end up
                    # being incorrect.
                    # This impacts ~3% of documents.
                    missed_pars += 1
                    continue

                yield Paragraph(lang, full_pars[i], score)
                found_pars += 1
    if missed_pars > 0:
        logging.warning(
            f"Missed {missed_pars} ({missed_pars / max(found_pars, 1):%}) paragraphs."
        )


def _split_by_par(
    paragraphs: Iterator[Paragraph], snapshot: str, shard: int, outdir: Path
) -> int:
    outdir.mkdir(exist_ok=True)
    outfiles = {}
    num_pars = 0
    try:
        for par in paragraphs:
            # MODIFY ME: filter paragraphs if needed (languages, score, ...)
            if par.lang not in outfiles:
                (outdir / par.lang).mkdir(exist_ok=True)
                outfile = outdir / par.lang / f"snap_{snapshot}_batch_{shard}.gz"
                outfiles[par.lang] = gzip.open(outfile, "wt")

            print(par.text, file=outfiles[par.lang])
            num_pars += 1
    finally:
        for o in outfiles.values():
            o.close()

    logging.info(f"Extracted {num_pars:_d} paragraphs from shard {snapshot}_{shard}")
    return num_pars


def dl_shard(snapshot: str, shard: int, outdir: Path) -> int:
    return _split_by_par(_dl_shard(snapshot, shard), snapshot, shard, outdir)


@contextlib.contextmanager
def unordered_map(processes: int):
    if processes == 0:
        yield map
        return

    with multiprocessing.Pool(processes) as pool:
        yield pool.imap_unordered


def dl_snapshot(snapshot: str, outdir: Path, processes: int = 1) -> None:
    _dl_shard = functools.partial(dl_shard, snapshot, outdir=outdir)

    with unordered_map(processes) as umap:
        num_pars = sum(umap(_dl_shard, range(500)))

    logging.info(f"Extracted {num_pars:_d} paragraphs from snapshot {snapshot}.")


def dl(
    snapshot: Optional[str] = None,
    outdir: Path = Path("data_cc100"),
    processes: int = 1,
) -> None:
    """
    Download the CC100 corpus.
    Will create one text file per language and CC snapshot.

    - snapshot: restrict to one snapshot. Useful for parallelization.
    - outdir: output directory
    - processes: number of processes to use
    """
    if snapshot is None:
        snapshots = CC_100_SNAPSHOTS
    else:
        snapshots = snapshot.split(",")

    invalids = [s for s in snapshots if s not in CC_100_SNAPSHOTS]
    assert not invalids, f"Invalid snapshots {invalids}, choose from {CC_100_SNAPSHOTS}"

    for snapshot in snapshots:
        dl_snapshot(snapshot, outdir, processes)


if __name__ == "__main__":
    import func_argparse

    func_argparse.single_main(dl)
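
The `# MODIFY ME` comment in `_split_by_par` marks where paragraphs can be filtered before they are written out. Below is one possible sketch of such a filter, meant to live in `dl_cc_100.py` itself (where `Paragraph` is defined); `KEPT_LANGUAGES`, `MIN_LM_SCORE` and the direction of the score comparison are illustrative assumptions, not values used for CC-100:

```python
# Illustrative filter for the "# MODIFY ME" spot in _split_by_par.
# The language subset and score threshold are made-up example values,
# and whether higher or lower lm_score is better depends on how the
# scores were produced.
KEPT_LANGUAGES = {"it_IT", "nl_XX"}
MIN_LM_SCORE = -4.0


def keep(par: Paragraph) -> bool:
    """Keep only selected languages whose LM score clears the cut-off."""
    return par.lang in KEPT_LANGUAGES and par.lm_score >= MIN_LM_SCORE
```

Inside `_split_by_par`, the loop body would then start with `if not keep(par): continue`. Each snapshot can also be downloaded as a separate job via the `--snapshot` flag, which the `dl` docstring suggests for parallelization.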
