Skip to content

Commit ce94b4c

Browse files
committed
GH-3281: Update TokenizerText for checking RDF Strings
1 parent 22b36dc commit ce94b4c

File tree

5 files changed

+99
-39
lines changed

5 files changed

+99
-39
lines changed

jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java

Lines changed: 62 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,6 @@ public final class TokenizerText implements Tokenizer
5555
private final PeekReader reader;
5656
// Whether whitespace between tokens includes newlines (in various forms).
5757
private final boolean singleLineMode;
58-
// Indicator. The PeekReader should throw java.nio.charset.MalformedInputException
59-
private final boolean isASCII;
6058
// The code assumes that errors throw exception and so stop parsing.
6159
private final ErrorHandler errorHandler;
6260

@@ -67,15 +65,14 @@ public final class TokenizerText implements Tokenizer
6765

6866
public static Tokenizer fromString(String string) { return create().fromString(string).build(); }
6967

70-
/*package*/ static TokenizerText internal(PeekReader reader, boolean singleLineMode, boolean isASCII, ErrorHandler errorHandler) {
71-
return new TokenizerText(reader, singleLineMode, isASCII, errorHandler);
68+
/*package*/ static TokenizerText internal(PeekReader reader, boolean singleLineMode, ErrorHandler errorHandler) {
69+
return new TokenizerText(reader, singleLineMode, errorHandler);
7270
}
7371

74-
private TokenizerText(PeekReader reader, boolean singleLineMode, boolean isASCII, ErrorHandler errorHandler) {
72+
private TokenizerText(PeekReader reader, boolean singleLineMode, ErrorHandler errorHandler) {
7573
this.reader = Objects.requireNonNull(reader, "PeekReader");
7674
this.singleLineMode = singleLineMode;
7775
this.errorHandler = Objects.requireNonNull(errorHandler, "ErrorHandler");
78-
this.isASCII = isASCII;
7976
}
8077

8178
@Override
@@ -215,7 +212,7 @@ private Token parseToken() {
215212
int ch3 = reader.peekChar();
216213
if ( ch3 == ch ) {
217214
reader.readChar(); // Read potential third quote.
218-
token.setImage(readStringQuote3(ch, false));
215+
token.setImage(readStringQuote3(ch));
219216
StringType st = (ch == CH_QUOTE1) ? StringType.LONG_STRING1 : StringType.LONG_STRING2;
220217
token.setStringType(st);
221218
} else {
@@ -539,17 +536,13 @@ private String readIRI() {
539536
fatal("Broken IRI (CR): %s", stringBuilder.toString()); return null;
540537
case CH_GT:
541538
// Done!
542-
return stringBuilder.toString();
539+
String str = stringBuilder.toString();
540+
checkRDFString(str);
541+
return str;
543542
case CH_RSLASH:
544-
if ( VeryVeryLaxIRI )
545-
// Includes unicode escapes and also \n etc
546-
ch = readLiteralEscape();
547-
else
548-
// NORMAL
549-
ch = readUnicodeEscape();
543+
ch = readUnicodeEscape();
550544
// Don't check legality of ch (strict syntax at this point).
551-
// That does not mean it is a good idea to bypass checking.
552-
// Bad characters will lead to trouble elsewhere.
545+
// IRI parsing will catch errors.
553546
break;
554547
case CH_LT:
555548
// Probably a corrupt file so treat as fatal.
@@ -558,7 +551,7 @@ private String readIRI() {
558551
error("Bad character in IRI (tab character): <%s[tab]...>", stringBuilder.toString()); break;
559552
case '{': case '}': case '"': case '|': case '^': case '`' :
560553
if ( ! VeryVeryLaxIRI )
561-
warning("Illegal character in IRI (codepoint 0x%02X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
554+
warning("Illegal character in IRI (codepoint U+%04X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
562555
break;
563556
case SPC:
564557
if ( ! AllowSpacesInIRI )
@@ -786,13 +779,42 @@ else if ( ch == CH_RSLASH )
786779
throw new ARQInternalErrorException("Not a '\\' or a '%' character");
787780
}
788781

782+
/**
783+
* Apply any checks for "RDF String" to a string that has already had escape processing applied.
784+
* An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates.
785+
* Because this is Java (strings are sequences of UTF-16 code units), we check that there are no unpaired surrogates.
786+
* A surrogate pair is high-low.
787+
*/
788+
private void checkRDFString(String string) {
789+
for ( int i = 0 ; i < string.length() ; i++ ) {
790+
// Not "codePointAt" which does surrogate processing.
791+
char ch = string.charAt(i);
792+
793+
if ( ! Character.isValidCodePoint(ch) )
794+
warning("Illegal code point in \\U sequence value: 0x%08X", ch);
795+
796+
// Check surrogate pairs are pairs.
797+
if ( Character.isHighSurrogate(ch) ) {
798+
i++;
799+
if ( i == string.length() )
800+
fatal("Bad surrogate pair (end of string)");
801+
char ch1 = string.charAt(i);
802+
if ( ! Character.isLowSurrogate(ch1) ) {
803+
fatal("Bad surrogate pair (high surrogate not followed by low surrogate)");
804+
}
805+
} else if ( Character.isLowSurrogate(ch) ) {
806+
fatal("Bad surrogate pair (low surrogate not preceded by a high surrogate)");
807+
}
808+
}
809+
}
810+
811+
789812
// Get characters between two markers.
790-
// strEscapes may be processed
813+
// String escapes may be processed
791814
private String readStringQuote1(int startCh, int endCh) {
792-
// Position at start of string.
793-
stringBuilder.setLength(0);
794815
// Assumes first delimiter char read already.
795-
// Reads terminating delimiter
816+
// Reads the terminating delimiter.
817+
stringBuilder.setLength(0);
796818

797819
for (;;) {
798820
int ch = reader.readChar();
@@ -805,9 +827,12 @@ private String readStringQuote1(int startCh, int endCh) {
805827
warning("Unicode non-character U+%04X in string", ch);
806828
if ( ch == EOF )
807829
fatal("Broken token: %s", stringBuilder.toString());
808-
else if ( ch == endCh )
809-
return stringBuilder.toString();
810-
else if ( ch == NL )
830+
else if ( ch == endCh ) {
831+
// Done!
832+
String str = stringBuilder.toString();
833+
checkRDFString(str);
834+
return str;
835+
} else if ( ch == NL )
811836
fatal("Broken token (newline in string)", stringBuilder.toString());
812837
else if ( ch == CR )
813838
fatal("Broken token (carriage return in string)", stringBuilder.toString());
@@ -823,7 +848,9 @@ else if ( ch == CH_RSLASH )
823848
}
824849
}
825850

826-
private String readStringQuote3(int quoteChar, boolean endNL) {
851+
private String readStringQuote3(int quoteChar) {
852+
// Assumes 3 character delimiter has been read.
853+
// Reads the terminating delimiter.
827854
stringBuilder.setLength(0);
828855
for (;;) {
829856
int ch = reader.readChar();
@@ -833,13 +860,14 @@ private String readStringQuote3(int quoteChar, boolean endNL) {
833860
warning("Unicode replacement character U+FFFD in string");
834861
}
835862
if ( ch == EOF ) {
836-
if ( endNL )
837-
return stringBuilder.toString();
838863
fatal("Broken long string");
839-
}
840-
else if ( ch == quoteChar ) {
841-
if ( threeQuotes(quoteChar) )
842-
return stringBuilder.toString();
864+
} else if ( ch == quoteChar ) {
865+
if ( threeQuotes(quoteChar) ) {
866+
String str = stringBuilder.toString();
867+
checkRDFString(str);
868+
return str;
869+
}
870+
// quote, not triple. It is a normal character.
843871
} else if ( ch == CH_RSLASH )
844872
ch = readLiteralEscape();
845873
insertCodepoint(stringBuilder, ch);
@@ -1325,7 +1353,7 @@ private final int readLiteralEscape() {
13251353
case 'u': return readUnicode4Escape();
13261354
case 'U': return readUnicode8Escape();
13271355
default:
1328-
fatal("Illegal escape sequence value: %c (0x%02X)", c, c);
1356+
fatal("Illegal escape sequence value: %c (0x%02X)",c , c);
13291357
return 0;
13301358
}
13311359
}
@@ -1356,8 +1384,8 @@ private final int readUnicode4Escape() {
13561384

13571385
private final int readUnicode8Escape() {
13581386
int ch8 = readHexSequence(8);
1359-
if ( ch8 > Character.MAX_CODE_POINT )
1360-
fatal("Illegal code point in \\U sequence value: 0x%08X", ch8);
1387+
if ( ! Character.isValidCodePoint(ch8) )
1388+
fatal("Illegal code point from \\U sequence value: 0x%08X", ch8);
13611389
return ch8;
13621390
}
13631391

jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,6 @@ public Tokenizer build() {
129129
throw new IllegalStateException("No data source");
130130
}
131131

132-
return TokenizerText.internal(pr, singleLineMode, !utf8, errHandler);
132+
return TokenizerText.internal(pr, singleLineMode, errHandler);
133133
}
134134
}

jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,13 +212,21 @@ protected Node stripSign(Node node) {
212212
return NodeFactory.createLiteral(lex, lang, dt);
213213
}
214214

215-
// Because of Java (Java strings have surrogate pairs) we only detect singleton surrogates.
216-
protected void checkString(String string, int line, int column) {
215+
/**
216+
* Apply any checks for "RDF String" to a string that has already had escape processing applied.
217+
* An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates.
218+
* Because this is Java (strings are sequences of UTF-16 code units), we check that there are no unpaired surrogates.
219+
* A surrogate pair is high-low.
220+
*/
221+
protected static void checkRDFString(String string, int line, int column) {
217222
// Checks for unpaired (lone) surrogates; well-formed high-low pairs are allowed.
218223
for ( int i = 0; i < string.length(); i++ ) {
219224
// Not "codePointAt" which does surrogate processing.
220225
char ch = string.charAt(i);
221226

227+
if ( ! Character.isValidCodePoint(ch) )
228+
throw new QueryParseException(String.format("Illegal code point in \\U sequence value: 0x%08X", ch), line, column);
229+
222230
// Check surrogate pairs are in pairs. Pairs are high-low.
223231
if ( Character.isLowSurrogate(ch) )
224232
throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column);
@@ -252,10 +260,15 @@ protected Node createLiteralLang(String lexicalForm, String langTagDir, int line
252260
return createLiteralAny(lexicalForm, langTagDir, null, null, line, column);
253261
}
254262

263+
private static void checkLexicalForm(String lexicalForm, int line, int column) {
264+
checkRDFString(lexicalForm, line, column);
265+
}
266+
255267
/**
256268
* Create a literal, given all possible component parts.
257269
*/
258270
private Node createLiteralAny(String lexicalForm, String langTag, String textDirStr, String datatypeURI, int line, int column) {
271+
checkLexicalForm(lexicalForm, line, column);
259272
Node n = null;
260273
// Can't have type and lang tag in parsing.
261274
if ( datatypeURI != null ) {
@@ -369,7 +382,7 @@ protected String resolveQuotedIRI(String iriStr, int line, int column) {
369382
iriStr = stripQuotes(iriStr);
370383
iriStr = unescapeUnicode(iriStr, line, column);
371384
// Check for Unicode surrogates
372-
checkString(iriStr, line, column);
385+
checkRDFString(iriStr, line, column);
373386
return resolveIRI(iriStr, line, column);
374387
}
375388

jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,25 @@ public void turtle_rdf12_bad_13() {
267267
Triple t = parseOneTriple("@version \"\"\"1.2\"\"\" <x:s> <x:p> 123 . ") ;
268268
}
269269

270+
@Test (expected=ExFatal.class)
271+
public void turtle_bad_surrogate_1() {
272+
Triple t = parseOneTriple("<x:s> <x:p> '\\ud800' . ") ;
273+
}
274+
275+
@Test (expected=ExFatal.class)
276+
public void turtle_bad_surrogate_2() {
277+
Triple t = parseOneTriple("<x:s> <x:p> '\\udff' . ") ;
278+
}
279+
@Test (expected=ExFatal.class)
280+
public void turtle_bad_surrogate_3() {
281+
Triple t = parseOneTriple("<x:s> <x:p> '\\U0000d800' . ") ;
282+
}
283+
284+
@Test (expected=ExFatal.class)
285+
public void turtle_bad_surrogate_4() {
286+
Triple t = parseOneTriple("<x:s> <x:p> '\\U0000dff' . ") ;
287+
}
288+
270289
// No Formulae. Not trig.
271290
@Test (expected=ExFatal.class)
272291
public void turtle_50() { parse("@prefix ex: <http://example/> . { ex:s ex:p 123 . } ") ; }

jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ public void syntax_unicode_escaped_surrogate_strings() {
7070

7171
@Test
7272
public void syntax_unicode_surrogate_pair_by_unicode_escape() {
73-
// Allow - because Java strings may have surrogate pairs so we allow then in unicode escapes if paired.
73+
// Allow - because Java strings may have surrogate pairs so we allow them in unicode escapes if paired.
7474
testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}");
7575

7676
// QueryParseException ex = assertThrows(QueryParseException.class, ()->testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}"));

0 commit comments

Comments
 (0)