Skip to content
This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit 5debe48

Browse files
committed
only skip empty lines at end of documents
1 parent ad655db commit 5debe48

File tree

1 file changed: 7 additions and 3 deletions.

cc_net/process_wet_file.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,12 @@ def parse_doc(headers: List[str], doc: List[str]) -> Optional[dict]:
82 82           logger.warning("Can't parse header:", e, headers, doc)
83 83           return None
84 84
85    -     title, doc = doc[0], doc[1:]
   85 +     # Docs are separated by two empty lines.
   86 +     last = None
   87 +     if not doc[-1] and not doc[-2]:
   88 +         last = -2
   89 +     title, doc = doc[0], doc[1:last]
   90 +
86 91       return {
87 92           "url": url,
88 93           "date_download": date,
@@ -113,8 +118,7 @@ def group_by_docs(warc_lines: Iterable[str]) -> Iterable[dict]:
113 118               headers, doc, read_headers = [warc], [], True
114 119               continue
115 120
116     -         if warc:
117     -             doc.append(warc)
    121 +         doc.append(warc)
118 122
119 123       # Return the last document
120 124       if doc:

0 commit comments

Comments
 (0)