Skip to content

Conform encoding handling to Encoding spec #48

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Drop/replace “is ASCII superset”-checking code
This change drops or replaces all code for checking whether a particular
encoding is an ASCII superset — because the code no longer corresponds
to actual requirements in the Encoding spec (which instead now requires
checking only whether an encoding is utf-16be or utf-16le).
  • Loading branch information
sideshowbarker committed Sep 2, 2021
commit eab36025dded2cc7a9dee75e32c003ce35cbf8b0
4 changes: 3 additions & 1 deletion src/nu/validator/htmlparser/extra/ChardetSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
detector.Init(this);
detector.DoIt(source, length, false);
detector.DataEnd();
if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
if (returnValue != null && returnValue != Encoding.WINDOWS1252
&& returnValue != Encoding.UTF16BE
&& returnValue != Encoding.UTF16LE) {
return returnValue;
} else {
return null;
Expand Down
3 changes: 2 additions & 1 deletion src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ public Encoding sniff() throws IOException {
if (actual != null) {
enc = actual;
}
if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
if (enc != Encoding.WINDOWS1252 //
&& enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
return enc;
} else {
return null;
Expand Down
6 changes: 0 additions & 6 deletions src/nu/validator/htmlparser/io/Driver.java
Original file line number Diff line number Diff line change
Expand Up @@ -370,12 +370,6 @@ public boolean internalEncodingDeclaration(String internalCharset)
if (actual == null) {
actual = cs;
}
if (!actual.isAsciiSuperset()) {
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
+ internalCharset
+ "\u201D which is not an ASCII superset. Not changing the encoding.");
return false;
}
if (characterEncoding == null) {
// Reader case
return true;
Expand Down
58 changes: 0 additions & 58 deletions src/nu/validator/htmlparser/io/Encoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,6 @@ public class Encoding {

private final Charset charset;

private final boolean asciiSuperset;

private final boolean obscure;

private final boolean shouldNot;
Expand All @@ -315,15 +313,6 @@ public class Encoding {
private Encoding actualHtmlEncoding = null;

static {
byte[] testBuf = new byte[0x7F];
for (int i = 0; i < 0x7F; i++) {
if (isAsciiSupersetnessSensitive(i)) {
testBuf[i] = (byte) i;
} else {
testBuf[i] = (byte) 0x20;
}
}

Set<Encoding> encodings = new HashSet<Encoding>();

SortedMap<String, Charset> charsets = Charset.availableCharsets();
Expand Down Expand Up @@ -398,12 +387,6 @@ asciiSuperset, isObscure(name),
}
}

/**
 * Tells whether <code>c</code> lies in one of the ASCII ranges that the
 * ASCII-superset probe compares byte-for-byte after decoding.
 *
 * @param c the value to classify
 * @return <code>true</code> if <code>c</code> is in 0x09-0x0D, 0x20-0x22,
 *         0x26-0x27, 0x2C-0x3F, 0x41-0x5A or 0x61-0x7A;
 *         <code>false</code> otherwise
 */
private static boolean isAsciiSupersetnessSensitive(int c) {
    if (c >= 0x09 && c <= 0x0D) {
        // TAB through CR
        return true;
    }
    if (c >= 0x20 && c <= 0x22) {
        // space, '!' and '"'
        return true;
    }
    if (c == 0x26 || c == 0x27) {
        // '&' and apostrophe
        return true;
    }
    if (c >= 0x2C && c <= 0x3F) {
        // ',' through '?' (includes the digits)
        return true;
    }
    if (c >= 0x41 && c <= 0x5A) {
        // 'A' through 'Z'
        return true;
    }
    // 'a' through 'z' is the only remaining accepted range
    return c >= 0x61 && c <= 0x7A;
}

/**
 * Reports whether the given name is missing from <code>NOT_OBSCURE</code>.
 *
 * NOTE(review): the lookup uses binary search, so this presumes
 * <code>NOT_OBSCURE</code> is kept sorted -- confirm at its declaration.
 *
 * @param lowerCasePreferredIanaName the name to look up
 * @return <code>true</code> if the name is not found in
 *         <code>NOT_OBSCURE</code>
 */
private static boolean isObscure(String lowerCasePreferredIanaName) {
    int position = Arrays.binarySearch(NOT_OBSCURE,
            lowerCasePreferredIanaName);
    // binarySearch yields a negative value when the key is absent.
    return position < 0;
}
Expand All @@ -419,38 +402,6 @@ private static boolean isShouldNot(String lowerCasePreferredIanaName) {
return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1);
}

/**
 * Decodes <code>testBuf</code> with a strict decoder for <code>cs</code>
 * and checks that each of the first 0x7F decoded characters is the
 * expected one: the position itself where
 * <code>isAsciiSupersetnessSensitive</code> accepts it, and 0x20
 * everywhere else.
 *
 * @param testBuf the bytes to decode
 * @param cs the charset whose decoder is being checked
 * @return <code>true</code> if every decoded character matched;
 *         <code>false</code> on the first mismatch or if decoding fails
 */
private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
    // Report malformed or unmappable input instead of substituting, so
    // that bad input surfaces as a failure rather than a silent replacement.
    CharsetDecoder strictDecoder = cs.newDecoder();
    strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
    strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader decoded = new InputStreamReader(
            new ByteArrayInputStream(testBuf), strictDecoder);
    try {
        for (int position = 0; position < 0x7F; position++) {
            int expected = isAsciiSupersetnessSensitive(position)
                    ? position
                    : 0x20;
            if (decoded.read() != expected) {
                return false;
            }
        }
        return true;
    } catch (IOException e) {
        return false;
    } catch (Exception e) {
        return false;
    } catch (CoderMalfunctionError e) {
        return false;
    }
}

private static boolean isLikelyEbcdic(String canonName,
boolean asciiSuperset) {
if (!asciiSuperset) {
Expand Down Expand Up @@ -536,15 +487,6 @@ private Encoding(final String canonName, final Charset charset,
this.likelyEbcdic = likelyEbcdic;
}

/**
 * Accessor for the <code>asciiSuperset</code> flag of this encoding.
 *
 * @return the current value of <code>asciiSuperset</code>
 */
public boolean isAsciiSuperset() {
    return this.asciiSuperset;
}

/**
* Returns the canonName.
*
Expand Down
6 changes: 0 additions & 6 deletions src/nu/validator/htmlparser/io/MetaSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException {
} else {
Encoding cs = Encoding.forName(encoding);
String canonName = cs.getCanonName();
if (!cs.isAsciiSuperset()) {
err("The encoding \u201C"
+ encoding
+ "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
return false;
}
if (!cs.isRegistered()) {
if (encoding.startsWith("x-")) {
err("The encoding \u201C"
Expand Down