html5lib · ashleysommer · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/.appveyor.yml b/.appveyor.yml
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
@@ -12,9 +12,6 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         deps: [base, optional]
         include:
-          - python: "pypy-2.7"
-            os: ubuntu-latest
-            deps: base
           - python: "pypy-3.10"
             os: ubuntu-latest
             deps: base

diff --git a/README.rst b/README.rst
@@ -29,7 +29,7 @@ or:
 
 By default, the ``document`` will be an ``xml.etree`` element instance.
 Whenever possible, html5lib chooses the accelerated ``ElementTree``
-implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
+implementation.
 
 Two other tree types are supported: ``xml.dom.minidom`` and
 ``lxml.etree``. To use an alternative format, specify the name of
@@ -41,18 +41,6 @@ a treebuilder:
   with open("mydocument.html", "rb") as f:
       lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
 
-When using with ``urllib2`` (Python 2), the charset from HTTP should be
-pass into html5lib as follows:
-
-.. code-block:: python
-
-  from contextlib import closing
-  from urllib2 import urlopen
-  import html5lib
-
-  with closing(urlopen("http://example.com/")) as f:
-      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
-
 When using with ``urllib.request`` (Python 3), the charset from HTTP
 should be passed into html5lib as follows:
 
@@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
 Installation
 ------------
 
-html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
+html5lib works on CPython 3.8+ and PyPy. To install:
 
 .. code-block:: bash
 

diff --git a/debug-info.py b/debug-info.py
@@ -1,4 +1,3 @@
-from __future__ import print_function, unicode_literals
 
 import platform
 import sys
@@ -12,7 +11,7 @@
     "maxsize": sys.maxsize
 }
 
-search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"]
+search_modules = ["chardet", "genshi", "html5lib", "lxml"]
 found_modules = []
 
 for m in search_modules:

diff --git a/doc/conf.py b/doc/conf.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 #
 # html5lib documentation build configuration file, created by
 # sphinx-quickstart on Wed May  8 00:04:49 2013.
@@ -100,7 +99,7 @@
 }
 
 
-class CExtMock(object):
+class CExtMock:
     """Required for autodoc on readthedocs.org where you cannot build C extensions."""
     def __init__(self, *args, **kwargs):
         pass

diff --git a/html5lib/__init__.py b/html5lib/__init__.py
@@ -20,7 +20,6 @@
 * :func:`~.serializer.serialize`
 """
 
-from __future__ import absolute_import, division, unicode_literals
 
 from .html5parser import HTMLParser, parse, parseFragment
 from .treebuilders import getTreeBuilder

diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import re
 import warnings
@@ -181,7 +180,7 @@ def escapeRegexp(string):
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
 
 
-class InfosetFilter(object):
+class InfosetFilter:
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
 
     def __init__(self,

diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
@@ -1,13 +1,12 @@
-from __future__ import absolute_import, division, unicode_literals
 
-from six import text_type
-from six.moves import http_client, urllib
+import http.client
+import urllib.response
 
 import codecs
 import re
 from io import BytesIO, StringIO
 
-import webencodings
+from .contrib import webencodings
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
 from .constants import _ReparseException
@@ -48,7 +47,7 @@
 charsUntilRegEx = {}
 
 
-class BufferedStream(object):
+class BufferedStream:
     """Buffering for streams that do not have buffering of their own
 
     The buffer is implemented as a list of chunks on the assumption that
@@ -125,15 +124,15 @@ def _readFromBuffer(self, bytes):
 def HTMLInputStream(source, **kwargs):
     # Work around Python bug #20007: read(0) closes the connection.
     # http://bugs.python.org/issue20007
-    if (isinstance(source, http_client.HTTPResponse) or
+    if (isinstance(source, http.client.HTTPResponse) or
         # Also check for addinfourl wrapping HTTPResponse
         (isinstance(source, urllib.response.addbase) and
-         isinstance(source.fp, http_client.HTTPResponse))):
+         isinstance(source.fp, http.client.HTTPResponse))):
         isUnicode = False
     elif hasattr(source, "read"):
-        isUnicode = isinstance(source.read(0), text_type)
+        isUnicode = isinstance(source.read(0), str)
     else:
-        isUnicode = isinstance(source, text_type)
+        isUnicode = isinstance(source, str)
 
     if isUnicode:
         encodings = [x for x in kwargs if x.endswith("_encoding")]
@@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs):
         return HTMLBinaryInputStream(source, **kwargs)
 
 
-class HTMLUnicodeInputStream(object):
+class HTMLUnicodeInputStream:
     """Provides a unicode stream of characters to the HTMLTokenizer.
 
     This class takes care of character encoding and removing or replacing
@@ -673,7 +672,7 @@ def jumpTo(self, bytes):
         return True
 
 
-class EncodingParser(object):
+class EncodingParser:
     """Mini parser for detecting character encoding from meta elements"""
 
     def __init__(self, data):
@@ -861,7 +860,7 @@ def getAttribute(self):
                 attrValue.append(c)
 
 
-class ContentAttrParser(object):
+class ContentAttrParser:
     def __init__(self, data):
         assert isinstance(data, bytes)
         self.data = data

diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import unichr as chr
 
 from collections import deque, OrderedDict
 from sys import version_info
@@ -24,7 +21,7 @@
     attributeMap = OrderedDict
 
 
-class HTMLTokenizer(object):
+class HTMLTokenizer:
     """ This class takes care of tokenizing HTML.
 
     * self.currentToken

diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from .py import Trie
 

diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
@@ -1,9 +1,5 @@
-from __future__ import absolute_import, division, unicode_literals
 
-try:
-    from collections.abc import Mapping
-except ImportError:  # Python 2.7
-    from collections import Mapping
+from collections.abc import Mapping
 
 
 class Trie(Mapping):

diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
@@ -1,14 +1,11 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
 from bisect import bisect_left
 
 from ._base import Trie as ABCTrie
 
 
 class Trie(ABCTrie):
     def __init__(self, data):
-        if not all(isinstance(x, text_type) for x in data.keys()):
+        if not all(isinstance(x, str) for x in data.keys()):
             raise TypeError("All keys must be strings")
 
         self._data = data

diff --git a/html5lib/_utils.py b/html5lib/_utils.py
@@ -1,21 +1,9 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from types import ModuleType
 
-try:
-    from collections.abc import Mapping
-except ImportError:
-    from collections import Mapping
-
-from six import text_type, PY3
+from collections.abc import Mapping
 
-if PY3:
-    import xml.etree.ElementTree as default_etree
-else:
-    try:
-        import xml.etree.cElementTree as default_etree
-    except ImportError:
-        import xml.etree.ElementTree as default_etree
+import xml.etree.ElementTree as default_etree
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
@@ -31,10 +19,10 @@
 # escapes.
 try:
     _x = eval('"\\uD800"')  # pylint:disable=eval-used
-    if not isinstance(_x, text_type):
+    if not isinstance(_x, str):
         # We need this with u"" because of http://bugs.jython.org/issue2039
         _x = eval('u"\\uD800"')  # pylint:disable=eval-used
-        assert isinstance(_x, text_type)
+        assert isinstance(_x, str)
 except Exception:
     supports_lone_surrogates = False
 else:
@@ -122,7 +110,7 @@ def moduleFactoryFactory(factory):
     moduleCache = {}
 
     def moduleFactory(baseModule, *args, **kwargs):
-        if isinstance(ModuleType.__name__, type("")):
+        if isinstance(ModuleType.__name__, str):
             name = "_%s_factory" % baseModule.__name__
         else:
             name = b"_%s_factory" % baseModule.__name__

diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import string
 

diff --git a/html5lib/contrib/__init__.py b/html5lib/contrib/__init__.py
diff --git a/html5lib/contrib/webencodings/LiCENSE b/html5lib/contrib/webencodings/LiCENSE
@@ -0,0 +1,31 @@
+Copyright (c) 2012 by Simon Sapin.
+
+Some rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of the contributors may not be used to endorse or
+      promote products derived from this software without specific
+      prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from .py import Trie
 
diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import string
 