Skip to content

Conform encoding handling to Encoding spec #48

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Drop superseded encoding-checking code
This change drops some code which performs various encoding checks that
no longer correspond to any current requirements in the Encoding spec.
  • Loading branch information
sideshowbarker committed Sep 2, 2021
commit 24538913bedd0259cfa19075160dadc822bb6d1c
27 changes: 0 additions & 27 deletions src/nu/validator/htmlparser/io/Driver.java
Original file line number Diff line number Diff line change
Expand Up @@ -470,33 +470,6 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
protected Encoding whineAboutEncodingAndReturnActual(String encoding,
Encoding cs) throws SAXException {
String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
if (encoding.startsWith("x-")) {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding. (Charmod C022)");
} else {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
}
} else if (!canonName.equals(encoding)) {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+ canonName + "\u201D. (Charmod C024)");
}
if (cs.isShouldNot()) {
tokenizer.warn("Authors should not use the character encoding \u201C"
+ encoding
+ "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isLikelyEbcdic()) {
tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
tokenizer.warn("The character encoding \u201C"
+ encoding
+ "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
}
if (!canonName.equals(encoding)) {
tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName));
}
Expand Down
72 changes: 0 additions & 72 deletions src/nu/validator/htmlparser/io/Encoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,6 @@ public class Encoding {

/**
 * The windows-1252 encoding. Assigned once in the static initializer from
 * {@code forName("windows-1252")}.
 */
public static final Encoding WINDOWS1252;

/**
 * Names of encodings that authors should not use.
 *
 * Looked up with {@link java.util.Arrays#binarySearch(Object[], Object)},
 * so the entries MUST stay in ascending {@link String#compareTo} order.
 */
private static final String[] SHOULD_NOT = { "jisx02121990", "xjis0208" };

/**
 * Names of encodings that are banned.
 *
 * Looked up with {@link java.util.Arrays#binarySearch(Object[], Object)},
 * so the entries MUST stay in ascending {@link String#compareTo} order;
 * an out-of-order entry is silently never found. Note that "xutf16..."
 * sorts before "xutf32...".
 */
private static final String[] BANNED = { "bocu1", "cesu8", "compoundtext",
        "iscii91", "macarabic", "maccentraleurroman", "maccroatian",
        "maccyrillic", "macdevanagari", "macfarsi", "macgreek",
        "macgujarati", "macgurmukhi", "machebrew", "macicelandic",
        "macroman", "macromanian", "macthai", "macturkish", "macukranian",
        "scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname",
        "xjisautodetect", "xutf16bebom", "xutf16lebom",
        "xutf16oppositeendian", "xutf16platformendian", "xutf32bebom",
        "xutf32lebom", "xutf32oppositeendian", "xutf32platformendian" };

/** Lookup table from a name key to its {@code Encoding}. */
private static Map<String, Encoding> encodingByLabel =
        new HashMap<String, Encoding>();

Expand Down Expand Up @@ -304,12 +293,6 @@ public class Encoding {

/** The {@link Charset} associated with this encoding. */
private final Charset charset;

/**
 * Value reported by {@link #isObscure()}.
 * NOTE(review): presumably computed by the static isObscure(name) helper
 * when the instance is built -- confirm against the constructor.
 */
private final boolean obscure;

/**
 * Value reported by {@link #isShouldNot()}.
 * NOTE(review): presumably derived from the SHOULD_NOT table -- confirm.
 */
private final boolean shouldNot;

/**
 * Value reported by {@link #isLikelyEbcdic()}.
 * NOTE(review): presumably computed by the static
 * isLikelyEbcdic(canonName, asciiSuperset) helper -- confirm.
 */
private final boolean likelyEbcdic;

static {
Set<Encoding> encodings = new HashSet<Encoding>();

Expand Down Expand Up @@ -345,30 +328,6 @@ asciiSuperset, isObscure(name),
WINDOWS1252 = forName("windows-1252");
}

/**
 * Tells whether a name is missing from the {@code NOT_OBSCURE} table.
 *
 * @param lowerCasePreferredIanaName the name to look up
 * @return {@code true} when a binary search of {@code NOT_OBSCURE} does
 *         not find the name; {@code false} when it does
 */
private static boolean isObscure(String lowerCasePreferredIanaName) {
    int position = Arrays.binarySearch(NOT_OBSCURE,
            lowerCasePreferredIanaName);
    // binarySearch reports "not found" as a negative insertion marker.
    return position < 0;
}

/**
 * Tells whether a name denotes a banned encoding.
 *
 * A name is banned when it starts with {@code "xibm"} or when a binary
 * search of the {@code BANNED} table finds it. The binary search only
 * gives meaningful answers if {@code BANNED} is sorted.
 *
 * @param lowerCasePreferredIanaName the name to check
 * @return {@code true} if the name is considered banned
 */
private static boolean isBanned(String lowerCasePreferredIanaName) {
    boolean hasXibmPrefix = lowerCasePreferredIanaName.startsWith("xibm");
    // Short-circuit: the table is consulted only for non-"xibm" names.
    return hasXibmPrefix
            || Arrays.binarySearch(BANNED, lowerCasePreferredIanaName) >= 0;
}

/**
 * Tells whether a name appears in the {@code SHOULD_NOT} table.
 *
 * @param lowerCasePreferredIanaName the name to look up
 * @return {@code true} when a binary search of {@code SHOULD_NOT} finds
 *         the name
 */
private static boolean isShouldNot(String lowerCasePreferredIanaName) {
    int position = Arrays.binarySearch(SHOULD_NOT,
            lowerCasePreferredIanaName);
    return position >= 0;
}

/**
 * Heuristic for spotting EBCDIC-style encodings by name.
 *
 * An encoding that is an ASCII superset is never reported. Otherwise the
 * result is {@code true} when the name starts with {@code "cp"},
 * {@code "ibm"} or {@code "xibm"}.
 *
 * @param canonName the encoding name to inspect
 * @param asciiSuperset whether the encoding is an ASCII superset
 * @return {@code true} if the encoding is not an ASCII superset and its
 *         name carries one of the prefixes above
 */
private static boolean isLikelyEbcdic(String canonName,
        boolean asciiSuperset) {
    if (asciiSuperset) {
        return false;
    }
    for (String prefix : new String[] { "cp", "ibm", "xibm" }) {
        if (canonName.startsWith(prefix)) {
            return true;
        }
    }
    return false;
}

public static Encoding forName(String name) {
Encoding rv = encodingByLabel.get(toNameKey(name));
if (rv == null) {
Expand Down Expand Up @@ -454,37 +413,6 @@ public String getCanonName() {
return canonName;
}

/**
 * Reports the value of this encoding's {@code likelyEbcdic} flag.
 *
 * @return {@code true} if this encoding has been flagged as likely
 *         EBCDIC-based
 */
public boolean isLikelyEbcdic() {
    return this.likelyEbcdic;
}

/**
 * Reports the value of this encoding's {@code obscure} flag.
 *
 * @return {@code true} if this encoding has been flagged as obscure
 */
public boolean isObscure() {
    return this.obscure;
}

/**
 * Reports the value of this encoding's {@code shouldNot} flag.
 *
 * @return {@code true} if this encoding has been flagged as one that
 *         should not be used
 */
public boolean isShouldNot() {
    return this.shouldNot;
}

/**
 * Reports whether this encoding counts as registered, judged purely by
 * its canonical name: any name starting with {@code "x-"} is treated as
 * unregistered.
 *
 * @return {@code false} if the canonical name begins with {@code "x-"},
 *         {@code true} otherwise
 */
public boolean isRegistered() {
    boolean hasExperimentalPrefix = canonName.startsWith("x-");
    return !hasExperimentalPrefix;
}

/**
* @return
* @see java.nio.charset.Charset#canEncode()
Expand Down
22 changes: 0 additions & 22 deletions src/nu/validator/htmlparser/io/MetaSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,28 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException {
} else {
Encoding cs = Encoding.forName(encoding);
String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
if (encoding.startsWith("x-")) {
err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding. (Charmod C022)");
} else {
err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
}
} else if (!cs.getCanonName().equals(encoding)) {
err("The encoding \u201C" + encoding
+ "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+ canonName + "\u201D. (Charmod C024)");
}
if (cs.isShouldNot()) {
warn("Authors should not use the character encoding \u201C"
+ encoding
+ "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
}
if (!cs.getCanonName().equals(encoding)) {
err(Encoding.msgNotCanonicalName(encoding, canonName));
this.characterEncoding = cs;
Expand Down