This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit de44e77
support line_ids in minify/unminify
1 parent: 67381a5

5 files changed: +58 -30 lines

cc_net/dedup.py
Lines changed: 6 additions & 2 deletions

@@ -370,7 +370,7 @@ class DuplicatesRemover(jsonql.Transformer):
     # The hashes can't be pickled so they will have to be read back from disk.
     warn_when_pickling = True

-    def __init__(self, field: str, hashes_files: List[Path], collect: bool=False):
+    def __init__(self, field: str, hashes_files: List[Path], collect: bool = False):
         """
         Remove duplicates
         """
@@ -410,7 +410,11 @@ def do(self, doc: dict) -> Optional[dict]:
         doc_hashes = compute_hashes(content)

         assert self.duplicates is not None
-        seen = self.duplicates.add(doc_hashes) if self.collect else self.duplicates[doc_hashes]
+        seen = (
+            self.duplicates.add(doc_hashes)
+            if self.collect
+            else self.duplicates[doc_hashes]
+        )
         keep = seen < True
         kept = keep.sum()
         if kept == 0:

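For context, a toy illustration of the two modes the reformatted expression distinguishes: with collect=True the document's hashes are inserted and their previous membership is returned, otherwise membership is only looked up. ToyHashSet is a set-based stand-in invented for this sketch, not the real hash-set class used by cc_net:

    import numpy as np

    class ToyHashSet:
        # Illustration only: records hashes and reports which were already seen.
        def __init__(self):
            self.seen = set()

        def add(self, hashes):
            out = np.array([h in self.seen for h in hashes])
            self.seen.update(hashes)
            return out

        def __getitem__(self, hashes):
            # Read-only membership test, used when collect is False.
            return np.array([h in self.seen for h in hashes])

    duplicates, collect = ToyHashSet(), True
    doc_hashes = [11, 22, 33]
    seen = duplicates.add(doc_hashes) if collect else duplicates[doc_hashes]
    keep = seen < True  # lines whose hash had not been seen before
    assert keep.sum() == 3  # nothing seen yet, so all three lines are kept
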
cc_net/jsonql.py
Lines changed: 6 additions & 2 deletions

@@ -485,12 +485,16 @@ def json_stdin() -> Iterable[dict]:
     return JsonReader().map(sys.stdin)


-def read_jsons(file: Union[FileDescriptor, Iterable[str], Iterable[dict]], strict=False) -> Iterator[dict]:
+def read_jsons(
+    file: Union[FileDescriptor, Iterable[str], Iterable[dict]], strict=False
+) -> Iterator[dict]:
     with smart_open(file) as f:
         yield from _read_jsons(f)


-def _read_jsons(lines: Union[Iterable[str], Iterable[dict]], strict=False) -> Iterator[dict]:
+def _read_jsons(
+    lines: Union[Iterable[str], Iterable[dict]], strict=False
+) -> Iterator[dict]:
     reader = JsonReader(strict=strict)

     for line in reader.map(lines):

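A minimal usage sketch of the reformatted reader. The file name is hypothetical, and it assumes smart_open resolves a Path to a newline-delimited JSON file, as the signature above suggests:

    from pathlib import Path

    from cc_net import jsonql

    # Stream one parsed dict per JSON line.
    for doc in jsonql.read_jsons(Path("sample.json")):
        print(doc.get("url"), doc.get("language"))
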
cc_net/minify.py
Lines changed: 3 additions & 3 deletions

@@ -9,7 +9,7 @@
 import hashlib
 import itertools
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Set, Union
+from typing import Dict, Iterable, List, Optional, Sequence, Set, Union

 import numpy as np

@@ -61,14 +61,14 @@ def decode_hashes(compact: str) -> List[bytes]:
     return res


-def encode_line_ids(line_ids: Iterable[int]) -> str:
+def encode_line_ids(line_ids: Sequence[int]) -> str:
     arr = np.array(line_ids, dtype="<u2")
     return base64.b64encode(arr.tobytes()).decode("ascii")


 def decode_line_ids(compact: str) -> List[int]:
     ids_bytes = bytearray(base64.b64decode(compact))
-    return np.ndarray(dtype="<i2", buffer=ids_bytes)
+    return np.ndarray(len(ids_bytes) // 2, dtype="<i2", buffer=ids_bytes)


 def get_doc_key(digest: str) -> int:

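For intuition, a small round trip through the two helpers changed above. This is a sketch, not part of the commit; the expected string is the same "AAAEAA==" value used in the test below, and decode_line_ids actually returns a numpy array, so we compare via tolist():

    from cc_net.minify import decode_line_ids, encode_line_ids

    # Line indices 0 and 4 are packed as little-endian uint16 and base64-encoded.
    compact = encode_line_ids([0, 4])
    assert compact == "AAAEAA=="

    # The fix above passes the element count (bytes // 2) as the shape argument
    # that np.ndarray requires, so decoding returns a 1-D array of the same ids.
    assert decode_line_ids(compact).tolist() == [0, 4]
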
pyproject.toml
Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ skip = ["third_party", "data"]

 [mypy]
 python_version = 3.7
+check_untyped_defs = true

 [mypy-numpy]
 ignore_missing_imports = true

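For reference (not part of the commit): check_untyped_defs makes mypy also analyse the bodies of functions without annotations, which it would otherwise skip. An illustrative example:

    def total_length(docs):    # unannotated, so the body is normally skipped
        count = len(docs)
        return count + "0"     # flagged once check_untyped_defs = true
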
tests/test_minify.py
Lines changed: 42 additions & 23 deletions

@@ -15,8 +15,8 @@
 from cc_net.minify import (
     HASH_SIZE,
     decode_hashes,
-    encode_as_hashes,
     encode_hashes,
+    encode_line_ids,
     get_hashes,
 )

@@ -41,9 +41,9 @@ def test_minify():
         "raw_content": "Hello world !\nIs everyone happy in here ?",
         "language": "en",
         "perplexity": 120.0,
+        "line_ids": [0, 4],
     }
-    expected = {"hashes": "fApSnZA0cQg=", "language": "en", "perplexity": 120.0}
-
+    expected = {"line_ids": "AAAEAA==", "language": "en", "perplexity": 120.0}
     minifier = minify.Minifier()
     assert expected == minifier(doc)

@@ -60,41 +60,62 @@ def read_sample_file(url: str, n_retry: int = 3) -> bytes:


 def test_unminify(http_from_disk):
-    # same quotes minus the "Education: ..." one
-    quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
+    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
+Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
 Facts are stubborn things, but statistics are more pliable.
 Fiction is obliged to stick to possibilities. Truth isn't."""
+    # We don't need no education.
+    chosen_quotes = "\n".join(
+        l for l in full_quotes.splitlines() if "Education" not in l
+    )

     cc_doc = {
         "url": "http://sample_english.com",
         "date_download": "2019-03-18T00:00:00Z",
         "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
         "source_domain": "sample_english.com",
         "title": "Famous Mark Twain Quotes",
-        "raw_content": quotes,
+        "raw_content": full_quotes,
         "cc_segment": "crawl-data/sample.warc.txt",
-        "nlines": 3,
-        "length": len(quotes),
-        "original_nlines": 4,
-        "original_length": 353,
+        "nlines": 4,
+        "length": 353,
     }
-    metadata = {
+
+    ccnet_metadata = {
         "language": "en",
         "language_score": 0.99,
         "perplexity": 151.5,
         "bucket": "head",
+        "raw_content": chosen_quotes,
+        "nlines": 3,
+        "length": len(chosen_quotes),
+        "original_nlines": 4,
+        "original_length": 353,
+        "line_ids": [0, 2, 3],
     }
-    full_doc = dict(**cc_doc, **metadata)
-
-    # make a copy of doc since minifier operates in place
-    mini = minify.Minifier()(full_doc)
-
-    assert mini != cc_doc
-    assert {k: mini[k] for k in metadata} == metadata
+    ccnet_doc = dict(cc_doc, **ccnet_metadata)
+    mini = minify.Minifier()(ccnet_doc.copy())
+    assert mini is not ccnet_doc
+
+    important_fields = [
+        "url",
+        "digest",
+        "cc_segment",
+        "language",
+        "language_score",
+        "perplexity",
+        "bucket",
+        "line_ids",
+    ]
+    expected = {k: ccnet_doc[k] for k in important_fields}
+    expected["line_ids"] = encode_line_ids(expected["line_ids"])  # type: ignore
+    assert expected == mini

     unminifier = minify.Unminifier()
     unminifier.look_for([mini])
-    assert full_doc == unminifier(cc_doc)
+    # line_ids is removed when unminifying
+    ccnet_doc.pop("line_ids")
+    assert ccnet_doc == unminifier(cc_doc)


 def test_unminify_hit_mem_cache(http_from_disk):
@@ -103,15 +124,13 @@ def test_unminify_hit_mem_cache(http_from_disk):
             "url": "http://sample_english.com",
             "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
             "cc_segment": "crawl-data/sample.warc.txt",
-            "hashes": encode_as_hashes(
-                ["Facts are stubborn things, but statistics are more pliable."]
-            ),
+            "line_ids": encode_line_ids([2]),
         },
         {
             "url": "http://sample_chinese.com",
             "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
             "cc_segment": "crawl-data/sample.warc.txt",
-            "hashes": encode_as_hashes(["事實是固執的東西,但統計數字卻比較柔和。"]),
+            "line_ids": encode_line_ids([2]),
         },
     ]
     unminifier = minify.Unminifier()

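A rough sketch of the reconstruction exercised by test_unminify: the minified document keeps only the encoded line_ids, and unminifying re-reads the original CommonCrawl text and keeps just those lines. This is inferred from the test rather than copied from Unminifier, and restore_content is an illustrative name:

    from cc_net.minify import decode_line_ids

    def restore_content(cc_raw_content: str, compact_line_ids: str) -> str:
        # Keep only the lines of the original document whose index was retained.
        lines = cc_raw_content.split("\n")
        return "\n".join(lines[i] for i in decode_line_ids(compact_line_ids))

    # With the Mark Twain sample above, line_ids [0, 2, 3] drop the
    # "Education: ..." quote and reproduce chosen_quotes.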