Drop/replace “is ASCII superset”-checking code

This change drops or replaces all code that checks whether an encoding is an ASCII superset, because that check no longer corresponds to any actual requirement in the Encoding spec (which now requires checking only whether an encoding is UTF-16BE or UTF-16LE).
validator · sideshowbarker · Sep 12, 2020 · Sep 12, 2020 · Sep 12, 2020 · Sep 12, 2020
commit eab36025dded2cc7a9dee75e32c003ce35cbf8b0
diff --git a/src/nu/validator/htmlparser/extra/ChardetSniffer.java b/src/nu/validator/htmlparser/extra/ChardetSniffer.java
@@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
         detector.Init(this);
         detector.DoIt(source, length, false);
         detector.DataEnd();
-        if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
+        if (returnValue != null && returnValue != Encoding.WINDOWS1252
+                && returnValue != Encoding.UTF16BE
+                && returnValue != Encoding.UTF16LE) {
             return returnValue;
         } else {
             return null;

diff --git a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
@@ -57,7 +57,8 @@ public Encoding sniff() throws IOException {
             if (actual != null) {
                 enc = actual;
             }
-            if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
+            if (enc != Encoding.WINDOWS1252 //
+                    && enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
                 return enc;
             } else {
                 return null;

diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java
@@ -370,12 +370,6 @@ public boolean internalEncodingDeclaration(String internalCharset)
             if (actual == null) {
                 actual = cs;
             }
-            if (!actual.isAsciiSuperset()) {
-                tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
-                        + internalCharset
-                        + "\u201D which is not an ASCII superset. Not changing the encoding.");
-                return false;
-            }
             if (characterEncoding == null) {
                 // Reader case
                 return true;

diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java
@@ -304,8 +304,6 @@ public class Encoding {
 
     private final Charset charset;
 
-    private final boolean asciiSuperset;
-
     private final boolean obscure;
 
     private final boolean shouldNot;
@@ -315,15 +313,6 @@ public class Encoding {
     private Encoding actualHtmlEncoding = null;
 
     static {
-        byte[] testBuf = new byte[0x7F];
-        for (int i = 0; i < 0x7F; i++) {
-            if (isAsciiSupersetnessSensitive(i)) {
-                testBuf[i] = (byte) i;
-            } else {
-                testBuf[i] = (byte) 0x20;
-            }
-        }
-
         Set<Encoding> encodings = new HashSet<Encoding>();
 
         SortedMap<String, Charset> charsets = Charset.availableCharsets();
@@ -398,12 +387,6 @@ asciiSuperset, isObscure(name),
         }
     }
 
-    private static boolean isAsciiSupersetnessSensitive(int c) {
-        return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
-                || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
-                || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
-    }
-
     private static boolean isObscure(String lowerCasePreferredIanaName) {
         return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1);
     }
@@ -419,38 +402,6 @@ private static boolean isShouldNot(String lowerCasePreferredIanaName) {
         return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1);
     }
 
-    /**
-     * @param testBuf
-     * @param cs
-     */
-    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
-        CharsetDecoder dec = cs.newDecoder();
-        dec.onMalformedInput(CodingErrorAction.REPORT);
-        dec.onUnmappableCharacter(CodingErrorAction.REPORT);
-        Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
-        try {
-            for (int i = 0; i < 0x7F; i++) {
-                if (isAsciiSupersetnessSensitive(i)) {
-                    if (r.read() != i) {
-                        return false;
-                    }
-                } else {
-                    if (r.read() != 0x20) {
-                        return false;
-                    }
-                }
-            }
-        } catch (IOException e) {
-            return false;
-        } catch (Exception e) {
-            return false;
-        } catch (CoderMalfunctionError e) {
-            return false;
-        }
-
-        return true;
-    }
-
     private static boolean isLikelyEbcdic(String canonName,
             boolean asciiSuperset) {
         if (!asciiSuperset) {
@@ -536,15 +487,6 @@ private Encoding(final String canonName, final Charset charset,
         this.likelyEbcdic = likelyEbcdic;
     }
 
-    /**
-     * Returns the asciiSuperset.
-     * 
-     * @return the asciiSuperset
-     */
-    public boolean isAsciiSuperset() {
-        return asciiSuperset;
-    }
-
     /**
      * Returns the canonName.
      * 

diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java
@@ -169,12 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException {
             } else {
                 Encoding cs = Encoding.forName(encoding);
                 String canonName = cs.getCanonName();
-                if (!cs.isAsciiSuperset()) {
-                    err("The encoding \u201C"
-                                + encoding
-                                + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
-                    return false;
-                }
                 if (!cs.isRegistered()) {
                     if (encoding.startsWith("x-")) {
                         err("The encoding \u201C"