ForkLab
diff --git a/README.rst b/README.rst
Lines changed: 4 additions & 1 deletion
diff --git a/extractors/run_html2text.py b/extractors/run_html2text.py
Lines changed: 26 additions & 0 deletions
@@ -13,7 +13,8 @@ and open-source libraries
 `html-text <https://github.com/TeamHG-Memex/html-text>`_,
 `trafilatura <https://github.com/adbar/trafilatura>`_,
 `go-readability <https://github.com/go-shiori/go-readability>`_,
-`Readability.js <https://github.com/mozilla/readability>`_.
+`Readability.js <https://github.com/mozilla/readability>`_,
+`html2text <https://github.com/Alir3z4/html2text>`_.
 We release evaluation datasets and scripts,
 and provide more details in a whitepaper.
 
@@ -44,6 +45,7 @@ Result of packages added after original evaluation::
     trafilatura          precision=0.925 ± 0.011  recall=0.966 ± 0.009  F1=0.945 ± 0.009 accuracy=0.221 ± 0.031
     go_readability       precision=0.912 ± 0.009  recall=0.975 ± 0.007  F1=0.943 ± 0.007 accuracy=0.210 ± 0.030
     readability_js       precision=0.853 ± 0.013  recall=0.924 ± 0.012  F1=0.887 ± 0.012 accuracy=0.149 ± 0.026
+    html2text            precision=0.499 ± 0.017  recall=0.983 ± 0.002  F1=0.662 ± 0.015 accuracy=0.000 ± 0.000
 
 Below you can find more details about the packages and result reproduction.
 
@@ -102,6 +104,7 @@ or external resources:
   at https://github.com/scrapinghub/article-extraction-benchmark/pull/4
 - go-readability: https://github.com/go-shiori/go-readability
 - Readability.js: https://github.com/mozilla/readability
+- html2text: https://github.com/Alir3z4/html2text
 
 Output from these libraries is already present in the repo in ``output/*.json`` files.
 They were generated with ``extractors/run_*.py`` files.
 
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+import gzip
+import json
+from pathlib import Path
+
+from html2text import HTML2Text
+
+
+def main():
+    """Convert every html/*.html.gz page and save output/html2text.json."""
+    results = {}
+    for gz_path in Path('html').glob('*.html.gz'):
+        # Stem of 'ID.html.gz' is 'ID.html'; keep the part before the dot.
+        key, _, _ = gz_path.stem.partition('.')
+        converter = HTML2Text()  # a fresh converter for each page
+        converter.ignore_links = converter.ignore_images = True
+        with gzip.open(gz_path, 'rt', encoding='utf8') as page:
+            markup = page.read()
+        results[key] = {'articleBody': converter.handle(markup)}
+    dumped = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
+    target = Path('output') / 'html2text.json'
+    target.write_text(dumped, encoding='utf8')
+
+
+if __name__ == '__main__':  # run only when executed as a script
+    main()