GH-3281: Update TokenizerText for checking RDF Strings

afs · afs · commit ce94b4c95410 · 2025-06-26T18:45:53.000+01:00
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -55,8 +55,6 @@ public final class TokenizerText implements Tokenizer
     private final PeekReader reader;
     // Whether whitespace between tokens includes newlines (in various forms).
     private final boolean singleLineMode;
-    // Indicator. The PeekReader should throw java.nio.charset.MalformedInputException
-    private final boolean isASCII;
     // The code assumes that errors throw exception and so stop parsing.
     private final ErrorHandler errorHandler;
 
@@ -67,15 +65,14 @@ public final class TokenizerText implements Tokenizer
 
     public static Tokenizer fromString(String string) { return create().fromString(string).build(); }
 
-    /*package*/ static TokenizerText internal(PeekReader reader, boolean singleLineMode, boolean isASCII, ErrorHandler errorHandler) {
-        return new TokenizerText(reader, singleLineMode, isASCII, errorHandler);
+    /*package*/ static TokenizerText internal(PeekReader reader, boolean singleLineMode, ErrorHandler errorHandler) {
+        return new TokenizerText(reader, singleLineMode, errorHandler);
     }
 
-    private TokenizerText(PeekReader reader, boolean singleLineMode, boolean isASCII, ErrorHandler errorHandler) {
+    private TokenizerText(PeekReader reader, boolean singleLineMode, ErrorHandler errorHandler) {
         this.reader = Objects.requireNonNull(reader, "PeekReader");
         this.singleLineMode = singleLineMode;
         this.errorHandler = Objects.requireNonNull(errorHandler, "ErrorHandler");
-        this.isASCII = isASCII;
     }
 
     @Override
@@ -215,7 +212,7 @@ private Token parseToken() {
                 int ch3 = reader.peekChar();
                 if ( ch3 == ch ) {
                     reader.readChar();     // Read potential third quote.
-                    token.setImage(readStringQuote3(ch, false));
+                    token.setImage(readStringQuote3(ch));
                     StringType st = (ch == CH_QUOTE1) ? StringType.LONG_STRING1 : StringType.LONG_STRING2;
                     token.setStringType(st);
                 } else {
@@ -539,17 +536,13 @@ private String readIRI() {
                     fatal("Broken IRI (CR): %s", stringBuilder.toString()); return null;
                 case CH_GT:
                     // Done!
-                    return stringBuilder.toString();
+                    String str = stringBuilder.toString();
+                    checkRDFString(str);
+                    return str;
                 case CH_RSLASH:
-                    if ( VeryVeryLaxIRI )
-                        // Includes unicode escapes and also \n etc
-                        ch = readLiteralEscape();
-                    else
-                        // NORMAL
-                        ch = readUnicodeEscape();
+                    ch = readUnicodeEscape();
                     // Don't check legality of ch (strict syntax at this point).
-                    // That does not mean it is a good idea to bypass checking.
-                    // Bad characters will lead to trouble elsewhere.
+                    // IRI parsing will catch errors.
                     break;
                 case CH_LT:
                     // Probably a corrupt file so treat as fatal.
@@ -558,7 +551,7 @@ private String readIRI() {
                     error("Bad character in IRI (tab character): <%s[tab]...>", stringBuilder.toString()); break;
                 case '{': case '}': case '"': case '|': case '^': case '`' :
                     if ( ! VeryVeryLaxIRI )
-                        warning("Illegal character in IRI (codepoint 0x%02X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
+                        warning("Illegal character in IRI (codepoint U+%04X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
                     break;
                 case SPC:
                     if ( ! AllowSpacesInIRI )
@@ -786,13 +779,42 @@ else if ( ch == CH_RSLASH )
             throw new ARQInternalErrorException("Not a '\\' or a '%' character");
     }
 
+    /**
+     * Apply any checks for "RDF String" to a string that has already had escape processing applied.
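+     * An RDF String is a sequence of code points in the range U+0000 to U+10FFFF, excluding surrogates.
+     * Because Java strings are sequences of UTF-16 code units, the check is that there are no unpaired surrogates.
+     * A valid surrogate pair is a high surrogate immediately followed by a low surrogate.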
+     */
+    private void checkRDFString(String string) {
+        for ( int i = 0 ; i < string.length() ; i++ ) {
+            // Not "codePointAt" which does surrogate processing.
+            char ch = string.charAt(i);
+
+            if ( ! Character.isValidCodePoint(ch) )
+                warning("Illegal code point in \\U sequence value: 0x%08X", ch);
+
+            // Check surrogate pairs are pairs.
+            if ( Character.isHighSurrogate(ch) ) {
+                i++;
+                if ( i == string.length() )
+                    fatal("Bad surrogate pair (end of string)");
+                char ch1 = string.charAt(i);
+                if ( ! Character.isLowSurrogate(ch1) ) {
+                    fatal("Bad surrogate pair (high surrogate not followed by low surrogate)");
+                }
+            } else if ( Character.isLowSurrogate(ch) ) {
+                fatal("Bad surrogate pair (low surrogate not preceded by a high surrogate)");
+            }
+        }
+    }
+
+
     // Get characters between two markers.
-    // strEscapes may be processed
+    // String escapes may be processed
     private String readStringQuote1(int startCh, int endCh) {
-        // Position at start of string.
-        stringBuilder.setLength(0);
         // Assumes first delimiter char read already.
-        // Reads terminating delimiter
+        // Reads the terminating delimiter.
+        stringBuilder.setLength(0);
 
         for (;;) {
             int ch = reader.readChar();
@@ -805,9 +827,12 @@ private String readStringQuote1(int startCh, int endCh) {
                 warning("Unicode non-character U+%04X in string", ch);
             if ( ch == EOF )
                 fatal("Broken token: %s", stringBuilder.toString());
-            else if ( ch == endCh )
-                return stringBuilder.toString();
-            else if ( ch == NL )
+            else if ( ch == endCh ) {
+                // Done!
+                String str = stringBuilder.toString();
+                checkRDFString(str);
+                return str;
+            } else if ( ch == NL )
                 fatal("Broken token (newline in string)", stringBuilder.toString());
             else if ( ch == CR )
                 fatal("Broken token (carriage return in string)", stringBuilder.toString());
@@ -823,7 +848,9 @@ else if ( ch == CH_RSLASH )
         }
     }
 
-    private String readStringQuote3(int quoteChar, boolean endNL) {
+    private String readStringQuote3(int quoteChar) {
+        // Assumes the opening three-character delimiter has already been read.
+        // Reads the terminating delimiter.
         stringBuilder.setLength(0);
         for (;;) {
             int ch = reader.readChar();
@@ -833,13 +860,14 @@ private String readStringQuote3(int quoteChar, boolean endNL) {
                     warning("Unicode replacement character U+FFFD in string");
             }
             if ( ch == EOF ) {
-                if ( endNL )
-                    return stringBuilder.toString();
                 fatal("Broken long string");
-            }
-            else if ( ch == quoteChar ) {
-                if ( threeQuotes(quoteChar) )
-                    return stringBuilder.toString();
+            } else if ( ch == quoteChar ) {
+                if ( threeQuotes(quoteChar) ) {
+                    String str = stringBuilder.toString();
+                    checkRDFString(str);
+                    return str;
+                }
+                // A single quote character, not a closing triple quote: treat it as a normal character.
             } else if ( ch == CH_RSLASH )
                 ch = readLiteralEscape();
             insertCodepoint(stringBuilder, ch);
@@ -1325,7 +1353,7 @@ private final int readLiteralEscape() {
             case 'u':   return readUnicode4Escape();
             case 'U':   return readUnicode8Escape();
             default:
-                fatal("Illegal escape sequence value: %c (0x%02X)", c, c);
+                fatal("Illegal escape sequence value: %c (0x%02X)",c , c);
                 return 0;
         }
     }
@@ -1356,8 +1384,8 @@ private final int readUnicode4Escape() {
 
     private final int readUnicode8Escape() {
         int ch8 = readHexSequence(8);
-        if ( ch8 > Character.MAX_CODE_POINT )
-            fatal("Illegal code point in \\U sequence value: 0x%08X", ch8);
+        if ( ! Character.isValidCodePoint(ch8) )
+            fatal("Illegal code point from \\U sequence value: 0x%08X", ch8);
         return ch8;
     }
 
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerTextBuilder.java
@@ -129,6 +129,6 @@ public Tokenizer build() {
             throw new IllegalStateException("No data source");
         }
 
-        return TokenizerText.internal(pr, singleLineMode, !utf8, errHandler);
+        return TokenizerText.internal(pr, singleLineMode, errHandler);
     }
 }
diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java b/jena-arq/src/main/java/org/apache/jena/sparql/lang/QueryParserBase.java
@@ -212,13 +212,21 @@ protected Node stripSign(Node node) {
         return NodeFactory.createLiteral(lex, lang, dt);
     }
 
-    // Because of Java (Java strings have surrogate pairs) we only detect singleton surrogates.
-    protected void checkString(String string, int line, int column) {
+    /**
+     * Apply any checks for "RDF String" to a string that has already had escape processing applied.
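+     * An RDF String is a sequence of code points in the range U+0000 to U+10FFFF, excluding surrogates.
+     * Because Java strings are sequences of UTF-16 code units, the check is that there are no unpaired surrogates.
+     * A valid surrogate pair is a high surrogate immediately followed by a low surrogate.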
+     */
+    protected static void checkRDFString(String string, int line, int column) {
         // Checks for bare surrogate pairs.
         for ( int i = 0; i < string.length(); i++ ) {
             // Not "codePointAt" which does surrogate processing.
             char ch = string.charAt(i);
 
+            if ( ! Character.isValidCodePoint(ch) )
+                throw new QueryParseException(String.format("Illegal code point in \\U sequence value: 0x%08X", ch), line, column);
+
             // Check surrogate pairs are in pairs. Pairs are high-low.
             if ( Character.isLowSurrogate(ch) )
                 throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column);
@@ -252,10 +260,15 @@ protected Node createLiteralLang(String lexicalForm, String langTagDir, int line
         return createLiteralAny(lexicalForm, langTagDir, null, null, line, column);
     }
 
+    private static void checkLexicalForm(String lexicalForm, int line, int column) {
+        checkRDFString(lexicalForm, line, column);
+    }
+
     /**
      * Create a literal, given all possible component parts.
      */
     private Node createLiteralAny(String lexicalForm, String langTag, String textDirStr, String datatypeURI, int line, int column) {
+        checkLexicalForm(lexicalForm, line, column);
         Node n = null;
         // Can't have type and lang tag in parsing.
         if ( datatypeURI != null ) {
@@ -369,7 +382,7 @@ protected String resolveQuotedIRI(String iriStr, int line, int column) {
         iriStr = stripQuotes(iriStr);
         iriStr = unescapeUnicode(iriStr, line, column);
         // Check for Unicode surrogates
-        checkString(iriStr, line, column);
+        checkRDFString(iriStr, line, column);
         return resolveIRI(iriStr, line, column);
     }
 
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
@@ -267,6 +267,25 @@ public void turtle_rdf12_bad_13() {
         Triple t = parseOneTriple("@version \"\"\"1.2\"\"\" <x:s> <x:p> 123 . ") ;
     }
 
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_1() {
+        Triple t = parseOneTriple("<x:s> <x:p> '\\ud800' . ") ;
+    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_2() {
+        Triple t = parseOneTriple("<x:s> <x:p> '\\udff' . ") ;
+    }
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_3() {
+        Triple t = parseOneTriple("<x:s> <x:p> '\\U0000d800' . ") ;
+    }
+
+    @Test (expected=ExFatal.class)
+    public void turtle_bad_surrogate_4() {
+        Triple t = parseOneTriple("<x:s> <x:p> '\\U0000dff' . ") ;
+    }
+
     // No Formulae. Not trig.
     @Test (expected=ExFatal.class)
     public void turtle_50()     { parse("@prefix ex:  <http://example/> .  { ex:s ex:p 123 . } ") ; }
diff --git a/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java b/jena-arq/src/test/java/org/apache/jena/sparql/syntax/TestQueryParser.java
@@ -70,7 +70,7 @@ public void syntax_unicode_escaped_surrogate_strings() {
 
     @Test
     public void syntax_unicode_surrogate_pair_by_unicode_escape() {
-        // Allow - because Java strings may have surrogate pairs so we allow then in unicode escapes if paired.
+        // Allow - because Java strings may have surrogate pairs so we allow them in unicode escapes if paired.
         testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}");
 
 //        QueryParseException ex = assertThrows(QueryParseException.class,  ()->testParse("SELECT * { ?s ?p '\\uD801\\uDC37'}"));

Original file line number	Diff line number	Diff line change
`@@ -129,6 +129,6 @@ public Tokenizer build() {`
`129`	`129`	`throw new IllegalStateException("No data source");`
`130`	`130`	`}`
`131`	`131`
`132`		`- return TokenizerText.internal(pr, singleLineMode, !utf8, errHandler);`
	`132`	`+ return TokenizerText.internal(pr, singleLineMode, errHandler);`
`133`	`133`	`}`
`134`	`134`	`}`