This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit de44e77
support line_ids in minify/unminify
1 parent: 67381a5

5 files changed: +58 -30 lines

cc_net/dedup.py
Lines changed: 6 additions & 2 deletions

@@ -370,7 +370,7 @@ class DuplicatesRemover(jsonql.Transformer):
     # The hashes can't be pickled so they will have to be read back from disk.
     warn_when_pickling = True

-    def __init__(self, field: str, hashes_files: List[Path], collect: bool=False):
+    def __init__(self, field: str, hashes_files: List[Path], collect: bool = False):
         """
         Remove duplicates
         """
@@ -410,7 +410,11 @@ def do(self, doc: dict) -> Optional[dict]:
         doc_hashes = compute_hashes(content)

         assert self.duplicates is not None
-        seen = self.duplicates.add(doc_hashes) if self.collect else self.duplicates[doc_hashes]
+        seen = (
+            self.duplicates.add(doc_hashes)
+            if self.collect
+            else self.duplicates[doc_hashes]
+        )
         keep = seen < True
         kept = keep.sum()
         if kept == 0:

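For context, a toy illustration of the two modes the reformatted expression distinguishes: with collect=True the document's hashes are inserted and their previous membership is returned, otherwise membership is only looked up. ToyHashSet is a set-based stand-in invented for this sketch, not the real hash-set class used by cc_net:

    import numpy as np

    class ToyHashSet:
        # Illustration only: records hashes and reports which were already seen.
        def __init__(self):
            self.seen = set()

        def add(self, hashes):
            out = np.array([h in self.seen for h in hashes])
            self.seen.update(hashes)
            return out

        def __getitem__(self, hashes):
            # Read-only membership test, used when collect is False.
            return np.array([h in self.seen for h in hashes])

    duplicates, collect = ToyHashSet(), True
    doc_hashes = [11, 22, 33]
    seen = duplicates.add(doc_hashes) if collect else duplicates[doc_hashes]
    keep = seen < True  # lines whose hash had not been seen before
    assert keep.sum() == 3  # nothing seen yet, so all three lines are kept
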
cc_net/jsonql.py
Lines changed: 6 additions & 2 deletions

@@ -485,12 +485,16 @@ def json_stdin() -> Iterable[dict]:
     return JsonReader().map(sys.stdin)


-def read_jsons(file: Union[FileDescriptor, Iterable[str], Iterable[dict]], strict=False) -> Iterator[dict]:
+def read_jsons(
+    file: Union[FileDescriptor, Iterable[str], Iterable[dict]], strict=False
+) -> Iterator[dict]:
     with smart_open(file) as f:
         yield from _read_jsons(f)


-def _read_jsons(lines: Union[Iterable[str], Iterable[dict]], strict=False) -> Iterator[dict]:
+def _read_jsons(
+    lines: Union[Iterable[str], Iterable[dict]], strict=False
+) -> Iterator[dict]:
     reader = JsonReader(strict=strict)

     for line in reader.map(lines):

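A minimal usage sketch of the reformatted reader. The file name is hypothetical, and it assumes smart_open resolves a Path to a newline-delimited JSON file, as the signature above suggests:

    from pathlib import Path

    from cc_net import jsonql

    # Stream one parsed dict per JSON line.
    for doc in jsonql.read_jsons(Path("sample.json")):
        print(doc.get("url"), doc.get("language"))
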
cc_net/minify.py
Lines changed: 3 additions & 3 deletions

@@ -9,7 +9,7 @@
 import hashlib
 import itertools
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Set, Union
+from typing import Dict, Iterable, List, Optional, Sequence, Set, Union

 import numpy as np

@@ -61,14 +61,14 @@ def decode_hashes(compact: str) -> List[bytes]:
     return res


-def encode_line_ids(line_ids: Iterable[int]) -> str:
+def encode_line_ids(line_ids: Sequence[int]) -> str:
     arr = np.array(line_ids, dtype="<u2")
     return base64.b64encode(arr.tobytes()).decode("ascii")


 def decode_line_ids(compact: str) -> List[int]:
     ids_bytes = bytearray(base64.b64decode(compact))
-    return np.ndarray(dtype="<i2", buffer=ids_bytes)
+    return np.ndarray(len(ids_bytes) // 2, dtype="<i2", buffer=ids_bytes)


 def get_doc_key(digest: str) -> int:

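For intuition, a small round trip through the two helpers changed above. This is a sketch, not part of the commit; the expected string is the same "AAAEAA==" value used in the test below, and decode_line_ids actually returns a numpy array, so we compare via tolist():

    from cc_net.minify import decode_line_ids, encode_line_ids

    # Line indices 0 and 4 are packed as little-endian uint16 and base64-encoded.
    compact = encode_line_ids([0, 4])
    assert compact == "AAAEAA=="

    # The fix above passes the element count (bytes // 2) as the shape argument
    # that np.ndarray requires, so decoding returns a 1-D array of the same ids.
    assert decode_line_ids(compact).tolist() == [0, 4]
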
pyproject.toml
Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ skip = ["third_party", "data"]

 [mypy]
 python_version = 3.7
+check_untyped_defs = true

 [mypy-numpy]
 ignore_missing_imports = true

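For reference (not part of the commit): check_untyped_defs makes mypy also analyse the bodies of functions without annotations, which it would otherwise skip. An illustrative example:

    def total_length(docs):    # unannotated, so the body is normally skipped
        count = len(docs)
        return count + "0"     # flagged once check_untyped_defs = true
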
tests/test_minify.py
Lines changed: 42 additions & 23 deletions

@@ -15,8 +15,8 @@
 from cc_net.minify import (
     HASH_SIZE,
     decode_hashes,
-    encode_as_hashes,
     encode_hashes,
+    encode_line_ids,
     get_hashes,
 )

@@ -41,9 +41,9 @@ def test_minify():
         "raw_content": "Hello world !\nIs everyone happy in here ?",
         "language": "en",
         "perplexity": 120.0,
+        "line_ids": [0, 4],
     }
-    expected = {"hashes": "fApSnZA0cQg=", "language": "en", "perplexity": 120.0}
-
+    expected = {"line_ids": "AAAEAA==", "language": "en", "perplexity": 120.0}
     minifier = minify.Minifier()
     assert expected == minifier(doc)

@@ -60,41 +60,62 @@ def read_sample_file(url: str, n_retry: int = 3) -> bytes:


 def test_unminify(http_from_disk):
-    # same quotes minus the "Education: ..." one
-    quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
+    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
+Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
 Facts are stubborn things, but statistics are more pliable.
 Fiction is obliged to stick to possibilities. Truth isn't."""
+    # We don't need no education.
+    chosen_quotes = "\n".join(
+        l for l in full_quotes.splitlines() if "Education" not in l
+    )

     cc_doc = {
         "url": "http://sample_english.com",
         "date_download": "2019-03-18T00:00:00Z",
         "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
         "source_domain": "sample_english.com",
         "title": "Famous Mark Twain Quotes",
-        "raw_content": quotes,
+        "raw_content": full_quotes,
         "cc_segment": "crawl-data/sample.warc.txt",
-        "nlines": 3,
-        "length": len(quotes),
-        "original_nlines": 4,
-        "original_length": 353,
+        "nlines": 4,
+        "length": 353,
     }
-    metadata = {
+
+    ccnet_metadata = {
         "language": "en",
         "language_score": 0.99,
         "perplexity": 151.5,
         "bucket": "head",
+        "raw_content": chosen_quotes,
+        "nlines": 3,
+        "length": len(chosen_quotes),
+        "original_nlines": 4,
+        "original_length": 353,
+        "line_ids": [0, 2, 3],
     }
-    full_doc = dict(**cc_doc, **metadata)
-
-    # make a copy of doc since minifier operates in place
-    mini = minify.Minifier()(full_doc)
-
-    assert mini != cc_doc
-    assert {k: mini[k] for k in metadata} == metadata
+    ccnet_doc = dict(cc_doc, **ccnet_metadata)
+    mini = minify.Minifier()(ccnet_doc.copy())
+    assert mini is not ccnet_doc
+
+    important_fields = [
+        "url",
+        "digest",
+        "cc_segment",
+        "language",
+        "language_score",
+        "perplexity",
+        "bucket",
+        "line_ids",
+    ]
+    expected = {k: ccnet_doc[k] for k in important_fields}
+    expected["line_ids"] = encode_line_ids(expected["line_ids"])  # type: ignore
+    assert expected == mini

     unminifier = minify.Unminifier()
     unminifier.look_for([mini])
-    assert full_doc == unminifier(cc_doc)
+    # line_ids is removed when unminifying
+    ccnet_doc.pop("line_ids")
+    assert ccnet_doc == unminifier(cc_doc)


 def test_unminify_hit_mem_cache(http_from_disk):
@@ -103,15 +124,13 @@ def test_unminify_hit_mem_cache(http_from_disk):
             "url": "http://sample_english.com",
             "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
             "cc_segment": "crawl-data/sample.warc.txt",
-            "hashes": encode_as_hashes(
-                ["Facts are stubborn things, but statistics are more pliable."]
-            ),
+            "line_ids": encode_line_ids([2]),
         },
         {
             "url": "http://sample_chinese.com",
             "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
             "cc_segment": "crawl-data/sample.warc.txt",
-            "hashes": encode_as_hashes(["事實是固執的東西,但統計數字卻比較柔和。"]),
+            "line_ids": encode_line_ids([2]),
         },
     ]
     unminifier = minify.Unminifier()

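A rough sketch of the reconstruction exercised by test_unminify: the minified document keeps only the encoded line_ids, and unminifying re-reads the original CommonCrawl text and keeps just those lines. This is inferred from the test rather than copied from Unminifier, and restore_content is an illustrative name:

    from cc_net.minify import decode_line_ids

    def restore_content(cc_raw_content: str, compact_line_ids: str) -> str:
        # Keep only the lines of the original document whose index was retained.
        lines = cc_raw_content.split("\n")
        return "\n".join(lines[i] for i in decode_line_ids(compact_line_ids))

    # With the Mark Twain sample above, line_ids [0, 2, 3] drop the
    # "Education: ..." quote and reproduce chosen_quotes.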