15
15
from cc_net .minify import (
16
16
HASH_SIZE ,
17
17
decode_hashes ,
18
- encode_as_hashes ,
19
18
encode_hashes ,
19
+ encode_line_ids ,
20
20
get_hashes ,
21
21
)
22
22
@@ -41,9 +41,9 @@ def test_minify():
41
41
"raw_content" : "Hello world !\n Is everyone happy in here ?" ,
42
42
"language" : "en" ,
43
43
"perplexity" : 120.0 ,
44
+ "line_ids" : [0 , 4 ],
44
45
}
45
- expected = {"hashes" : "fApSnZA0cQg=" , "language" : "en" , "perplexity" : 120.0 }
46
-
46
+ expected = {"line_ids" : "AAAEAA==" , "language" : "en" , "perplexity" : 120.0 }
47
47
minifier = minify .Minifier ()
48
48
assert expected == minifier (doc )
49
49
@@ -60,41 +60,62 @@ def read_sample_file(url: str, n_retry: int = 3) -> bytes:
60
60
61
61
62
62
def test_unminify (http_from_disk ):
63
- # same quotes minus the "Education: ..." one
64
- quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live .
63
+ full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
64
+ Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge .
65
65
Facts are stubborn things, but statistics are more pliable.
66
66
Fiction is obliged to stick to possibilities. Truth isn't."""
67
+ # We don't need no education.
68
+ chosen_quotes = "\n " .join (
69
+ l for l in full_quotes .splitlines () if "Education" not in l
70
+ )
67
71
68
72
cc_doc = {
69
73
"url" : "http://sample_english.com" ,
70
74
"date_download" : "2019-03-18T00:00:00Z" ,
71
75
"digest" : "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER" ,
72
76
"source_domain" : "sample_english.com" ,
73
77
"title" : "Famous Mark Twain Quotes" ,
74
- "raw_content" : quotes ,
78
+ "raw_content" : full_quotes ,
75
79
"cc_segment" : "crawl-data/sample.warc.txt" ,
76
- "nlines" : 3 ,
77
- "length" : len (quotes ),
78
- "original_nlines" : 4 ,
79
- "original_length" : 353 ,
80
+ "nlines" : 4 ,
81
+ "length" : 353 ,
80
82
}
81
- metadata = {
83
+
84
+ ccnet_metadata = {
82
85
"language" : "en" ,
83
86
"language_score" : 0.99 ,
84
87
"perplexity" : 151.5 ,
85
88
"bucket" : "head" ,
89
+ "raw_content" : chosen_quotes ,
90
+ "nlines" : 3 ,
91
+ "length" : len (chosen_quotes ),
92
+ "original_nlines" : 4 ,
93
+ "original_length" : 353 ,
94
+ "line_ids" : [0 , 2 , 3 ],
86
95
}
87
- full_doc = dict (** cc_doc , ** metadata )
88
-
89
- # make a copy of doc since minifier operates in place
90
- mini = minify .Minifier ()(full_doc )
91
-
92
- assert mini != cc_doc
93
- assert {k : mini [k ] for k in metadata } == metadata
96
+ ccnet_doc = dict (cc_doc , ** ccnet_metadata )
97
+ mini = minify .Minifier ()(ccnet_doc .copy ())
98
+ assert mini is not ccnet_doc
99
+
100
+ important_fields = [
101
+ "url" ,
102
+ "digest" ,
103
+ "cc_segment" ,
104
+ "language" ,
105
+ "language_score" ,
106
+ "perplexity" ,
107
+ "bucket" ,
108
+ "line_ids" ,
109
+ ]
110
+ expected = {k : ccnet_doc [k ] for k in important_fields }
111
+ expected ["line_ids" ] = encode_line_ids (expected ["line_ids" ]) # type: ignore
112
+ assert expected == mini
94
113
95
114
unminifier = minify .Unminifier ()
96
115
unminifier .look_for ([mini ])
97
- assert full_doc == unminifier (cc_doc )
116
+ # line_ids is removed when unminifying
117
+ ccnet_doc .pop ("line_ids" )
118
+ assert ccnet_doc == unminifier (cc_doc )
98
119
99
120
100
121
def test_unminify_hit_mem_cache (http_from_disk ):
@@ -103,15 +124,13 @@ def test_unminify_hit_mem_cache(http_from_disk):
103
124
"url" : "http://sample_english.com" ,
104
125
"digest" : "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER" ,
105
126
"cc_segment" : "crawl-data/sample.warc.txt" ,
106
- "hashes" : encode_as_hashes (
107
- ["Facts are stubborn things, but statistics are more pliable." ]
108
- ),
127
+ "line_ids" : encode_line_ids ([2 ]),
109
128
},
110
129
{
111
130
"url" : "http://sample_chinese.com" ,
112
131
"digest" : "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ" ,
113
132
"cc_segment" : "crawl-data/sample.warc.txt" ,
114
- "hashes " : encode_as_hashes ([ "事實是固執的東西,但統計數字卻比較柔和。" ]),
133
+ "line_ids " : encode_line_ids ([ 2 ]),
115
134
},
116
135
]
117
136
unminifier = minify .Unminifier ()
0 commit comments