html5lib · hugovk · Jan 11, 2021 · May 24, 2021 · May 24, 2021 · May 24, 2021
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -1,27 +1,21 @@
 # To activate, change the Appveyor settings to use `.appveyor.yml`.
 environment:
   global:
-    PATH: "C:\\Python27\\Scripts\\;%PATH%"
+    PATH: "C:\\Python38\\Scripts\\;%PATH%"
   matrix:
-    - TOXENV: py27-base
-    - TOXENV: py27-optional
-    - TOXENV: py35-base
-    - TOXENV: py35-optional
-    - TOXENV: py36-base
-    - TOXENV: py36-optional
     - TOXENV: py37-base
     - TOXENV: py37-optional
     - TOXENV: py38-base
     - TOXENV: py38-optional
 
 install:
   - git submodule update --init --recursive
-  - python -m pip install tox
+  - C:\Python38\python.exe -m pip install tox
 
 build: off
 
 test_script:
   - tox
 
 after_test:
-  - python debug-info.py
+  - C:\Python38\python.exe debug-info.py
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
@@ -6,13 +6,14 @@ jobs:
     if: github.event.push || github.event.pull_request.head.repo.full_name != github.repository
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
-        python: [2.7, 3.5, 3.6, 3.7, 3.8, pypy-2.7, pypy3]
+        python: [3.7, 3.8, pypy3.8]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python }}
       - run: pip install tox

diff --git a/.travis.yml b/.travis.yml
@@ -1,19 +1,15 @@
 language: python
 python:
   - "pypy3"
-  - "pypy"
+  - "3.9"
   - "3.8"
   - "3.7"
-  - "3.6"
-  - "3.5"
-  - "2.7"
-  - "3.9-dev"
 
 cache: pip
 
 env:
   global:
-    - TOXENV=base,optional,six19-optional
+    - TOXENV=base,optional
 
 install:
   - pip install tox

diff --git a/README.rst b/README.rst
@@ -91,7 +91,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
 Installation
 ------------
 
-html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
+html5lib works on CPython 3.7+ and PyPy3. To install:
 
 .. code-block:: bash
 
@@ -127,7 +127,7 @@ Please report any bugs on the `issue tracker
 Tests
 -----
 
-Unit tests require the ``pytest`` and ``mock`` libraries and can be
+Unit tests require the ``pytest`` library and can be
 run using the ``py.test`` command in the root directory.
 
 Test data are contained in a separate `html5lib-tests

diff --git a/debug-info.py b/debug-info.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import platform
 import sys
 
@@ -12,7 +10,7 @@
     "maxsize": sys.maxsize
 }
 
-search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"]
+search_modules = ["chardet", "genshi", "html5lib", "lxml"]
 found_modules = []
 
 for m in search_modules:

diff --git a/doc/conf.py b/doc/conf.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 #
 # html5lib documentation build configuration file, created by
 # sphinx-quickstart on Wed May  8 00:04:49 2013.
@@ -92,7 +91,7 @@
 ]
 
 
-class CExtMock(object):
+class CExtMock:
     """Required for autodoc on readthedocs.org where you cannot build C extensions."""
     def __init__(self, *args, **kwargs):
         pass

diff --git a/html5lib/__init__.py b/html5lib/__init__.py
@@ -20,7 +20,6 @@
 * :func:`~.serializer.serialize`
 """
 
-from __future__ import absolute_import, division, unicode_literals
 
 from .html5parser import HTMLParser, parse, parseFragment
 from .treebuilders import getTreeBuilder

diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
 import re
 import warnings
 
@@ -184,7 +182,7 @@ def escapeRegexp(string):
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
 
 
-class InfosetFilter(object):
+class InfosetFilter:
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
 
     def __init__(self,

diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
@@ -1,10 +1,7 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import text_type
-from six.moves import http_client, urllib
-
 import codecs
+import http.client
 import re
+import urllib.response
 from io import BytesIO, StringIO
 
 import webencodings
@@ -14,9 +11,9 @@
 from . import _utils
 
 # Non-unicode versions of constants for use in the pre-parser
-spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
-asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
-asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
+spaceCharactersBytes = frozenset(item.encode("ascii") for item in spaceCharacters)
+asciiLettersBytes = frozenset(item.encode("ascii") for item in asciiLetters)
+asciiUppercaseBytes = frozenset(item.encode("ascii") for item in asciiUppercase)
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
 
@@ -48,7 +45,7 @@
 charsUntilRegEx = {}
 
 
-class BufferedStream(object):
+class BufferedStream:
     """Buffering for streams that do not have buffering of their own
 
     The buffer is implemented as a list of chunks on the assumption that
@@ -86,7 +83,7 @@ def read(self, bytes):
             return self._readFromBuffer(bytes)
 
     def _bufferedBytes(self):
-        return sum([len(item) for item in self.buffer])
+        return sum(len(item) for item in self.buffer)
 
     def _readStream(self, bytes):
         data = self.stream.read(bytes)
@@ -125,15 +122,15 @@ def _readFromBuffer(self, bytes):
 def HTMLInputStream(source, **kwargs):
     # Work around Python bug #20007: read(0) closes the connection.
     # http://bugs.python.org/issue20007
-    if (isinstance(source, http_client.HTTPResponse) or
+    if (isinstance(source, http.client.HTTPResponse) or
         # Also check for addinfourl wrapping HTTPResponse
         (isinstance(source, urllib.response.addbase) and
-         isinstance(source.fp, http_client.HTTPResponse))):
+         isinstance(source.fp, http.client.HTTPResponse))):
         isUnicode = False
     elif hasattr(source, "read"):
-        isUnicode = isinstance(source.read(0), text_type)
+        isUnicode = isinstance(source.read(0), str)
     else:
-        isUnicode = isinstance(source, text_type)
+        isUnicode = isinstance(source, str)
 
     if isUnicode:
         encodings = [x for x in kwargs if x.endswith("_encoding")]
@@ -145,7 +142,7 @@ def HTMLInputStream(source, **kwargs):
         return HTMLBinaryInputStream(source, **kwargs)
 
 
-class HTMLUnicodeInputStream(object):
+class HTMLUnicodeInputStream:
     """Provides a unicode stream of characters to the HTMLTokenizer.
 
     This class takes care of character encoding and removing or replacing
@@ -325,7 +322,7 @@ def charsUntil(self, characters, opposite=False):
             if __debug__:
                 for c in characters:
                     assert(ord(c) < 128)
-            regex = "".join(["\\x%02x" % ord(c) for c in characters])
+            regex = "".join("\\x%02x" % ord(c) for c in characters)
             if not opposite:
                 regex = "^%s" % regex
             chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
@@ -524,7 +521,7 @@ def changeEncoding(self, newEncoding):
             self.rawStream.seek(0)
             self.charEncoding = (newEncoding, "certain")
             self.reset()
-            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
+            raise _ReparseException(f"Encoding changed from {self.charEncoding[0]} to {newEncoding}")
 
     def detectBOM(self):
         """Attempts to detect at BOM at the start of the stream. If
@@ -673,7 +670,7 @@ def jumpTo(self, bytes):
         return True
 
 
-class EncodingParser(object):
+class EncodingParser:
     """Mini parser for detecting character encoding from meta elements"""
 
     def __init__(self, data):
@@ -861,7 +858,7 @@ def getAttribute(self):
                 attrValue.append(c)
 
 
-class ContentAttrParser(object):
+class ContentAttrParser:
     def __init__(self, data):
         assert isinstance(data, bytes)
         self.data = data

diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
@@ -1,9 +1,4 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import unichr as chr
-
-from collections import deque, OrderedDict
-from sys import version_info
+from collections import deque
 
 from .constants import spaceCharacters
 from .constants import entities
@@ -18,13 +13,8 @@
 
 entitiesTrie = Trie(entities)
 
-if version_info >= (3, 7):
-    attributeMap = dict
-else:
-    attributeMap = OrderedDict
-
 
-class HTMLTokenizer(object):
+class HTMLTokenizer:
     """ This class takes care of tokenizing HTML.
 
     * self.currentToken
@@ -50,7 +40,7 @@ def __init__(self, stream, parser=None, **kwargs):
 
         # The current token being created
         self.currentToken = None
-        super(HTMLTokenizer, self).__init__()
+        super().__init__()
 
     def __iter__(self):
         """ This is where the magic happens.
@@ -236,7 +226,7 @@ def emitCurrentToken(self):
             token["name"] = token["name"].translate(asciiUpper2Lower)
             if token["type"] == tokenTypes["StartTag"]:
                 raw = token["data"]
-                data = attributeMap(raw)
+                data = dict(raw)
                 if len(raw) > len(data):
                     # we had some duplicated attribute, fix so first wins
                     data.update(raw[::-1])

diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
 from .py import Trie
 
 __all__ = ["Trie"]
diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
@@ -1,17 +1,12 @@
-from __future__ import absolute_import, division, unicode_literals
-
-try:
-    from collections.abc import Mapping
-except ImportError:  # Python 2.7
-    from collections import Mapping
+from collections.abc import Mapping
 
 
 class Trie(Mapping):
     """Abstract base class for tries"""
 
     def keys(self, prefix=None):
         # pylint:disable=arguments-differ
-        keys = super(Trie, self).keys()
+        keys = super().keys()
 
         if prefix is None:
             return set(keys)

diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
@@ -1,14 +1,11 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
 from bisect import bisect_left
 
 from ._base import Trie as ABCTrie
 
 
 class Trie(ABCTrie):
     def __init__(self, data):
-        if not all(isinstance(x, text_type) for x in data.keys()):
+        if not all(isinstance(x, str) for x in data.keys()):
             raise TypeError("All keys must be strings")
 
         self._data = data

diff --git a/html5lib/_utils.py b/html5lib/_utils.py
@@ -1,21 +1,9 @@
-from __future__ import absolute_import, division, unicode_literals
-
 from types import ModuleType
 
-try:
-    from collections.abc import Mapping
-except ImportError:
-    from collections import Mapping
+from collections.abc import Mapping
 
-from six import text_type, PY3
 
-if PY3:
-    import xml.etree.ElementTree as default_etree
-else:
-    try:
-        import xml.etree.cElementTree as default_etree
-    except ImportError:
-        import xml.etree.ElementTree as default_etree
+import xml.etree.ElementTree as default_etree
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
@@ -31,10 +19,10 @@
 # escapes.
 try:
     _x = eval('"\\uD800"')  # pylint:disable=eval-used
-    if not isinstance(_x, text_type):
+    if not isinstance(_x, str):
         # We need this with u"" because of http://bugs.jython.org/issue2039
         _x = eval('u"\\uD800"')  # pylint:disable=eval-used
-        assert isinstance(_x, text_type)
+        assert isinstance(_x, str)
 except Exception:
     supports_lone_surrogates = False
 else:
@@ -122,7 +110,7 @@ def moduleFactoryFactory(factory):
     moduleCache = {}
 
     def moduleFactory(baseModule, *args, **kwargs):
-        if isinstance(ModuleType.__name__, type("")):
+        if isinstance(ModuleType.__name__, str):
             name = "_%s_factory" % baseModule.__name__
         else:
             name = b"_%s_factory" % baseModule.__name__

diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
 import string
 
 EOF = None

diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
 from . import base
 
 from collections import OrderedDict