@@ -55,8 +55,6 @@ public final class TokenizerText implements Tokenizer
55
55
private final PeekReader reader ;
56
56
// Whether whitespace between tokens includes newlines (in various forms).
57
57
private final boolean singleLineMode ;
58
- // Indicator. The PeekReader should throw java.nio.charset.MalformedInputException
59
- private final boolean isASCII ;
60
58
// The code assumes that errors throw exception and so stop parsing.
61
59
private final ErrorHandler errorHandler ;
62
60
@@ -67,15 +65,14 @@ public final class TokenizerText implements Tokenizer
67
65
68
66
public static Tokenizer fromString (String string ) { return create ().fromString (string ).build (); }
69
67
70
- /*package*/ static TokenizerText internal (PeekReader reader , boolean singleLineMode , boolean isASCII , ErrorHandler errorHandler ) {
71
- return new TokenizerText (reader , singleLineMode , isASCII , errorHandler );
68
+ /*package*/ static TokenizerText internal (PeekReader reader , boolean singleLineMode , ErrorHandler errorHandler ) {
69
+ return new TokenizerText (reader , singleLineMode , errorHandler );
72
70
}
73
71
74
- private TokenizerText (PeekReader reader , boolean singleLineMode , boolean isASCII , ErrorHandler errorHandler ) {
72
+ private TokenizerText (PeekReader reader , boolean singleLineMode , ErrorHandler errorHandler ) {
75
73
this .reader = Objects .requireNonNull (reader , "PeekReader" );
76
74
this .singleLineMode = singleLineMode ;
77
75
this .errorHandler = Objects .requireNonNull (errorHandler , "ErrorHandler" );
78
- this .isASCII = isASCII ;
79
76
}
80
77
81
78
@ Override
@@ -215,7 +212,7 @@ private Token parseToken() {
215
212
int ch3 = reader .peekChar ();
216
213
if ( ch3 == ch ) {
217
214
reader .readChar (); // Read potential third quote.
218
- token .setImage (readStringQuote3 (ch , false ));
215
+ token .setImage (readStringQuote3 (ch ));
219
216
StringType st = (ch == CH_QUOTE1 ) ? StringType .LONG_STRING1 : StringType .LONG_STRING2 ;
220
217
token .setStringType (st );
221
218
} else {
@@ -539,17 +536,13 @@ private String readIRI() {
539
536
fatal ("Broken IRI (CR): %s" , stringBuilder .toString ()); return null ;
540
537
case CH_GT :
541
538
// Done!
542
- return stringBuilder .toString ();
539
+ String str = stringBuilder .toString ();
540
+ checkRDFString (str );
541
+ return str ;
543
542
case CH_RSLASH :
544
- if ( VeryVeryLaxIRI )
545
- // Includes unicode escapes and also \n etc
546
- ch = readLiteralEscape ();
547
- else
548
- // NORMAL
549
- ch = readUnicodeEscape ();
543
+ ch = readUnicodeEscape ();
550
544
// Don't check legality of ch (strict syntax at this point).
551
- // That does not mean it is a good idea to bypass checking.
552
- // Bad characters will lead to trouble elsewhere.
545
+ // IRI parsing will catch errors.
553
546
break ;
554
547
case CH_LT :
555
548
// Probably a corrupt file so treat as fatal.
@@ -558,7 +551,7 @@ private String readIRI() {
558
551
error ("Bad character in IRI (tab character): <%s[tab]...>" , stringBuilder .toString ()); break ;
559
552
case '{' : case '}' : case '"' : case '|' : case '^' : case '`' :
560
553
if ( ! VeryVeryLaxIRI )
561
- warning ("Illegal character in IRI (codepoint 0x%02X , '%c'): <%s[%c]...>" , ch , (char )ch , stringBuilder .toString (), (char )ch );
554
+ warning ("Illegal character in IRI (codepoint U+%04X , '%c'): <%s[%c]...>" , ch , (char )ch , stringBuilder .toString (), (char )ch );
562
555
break ;
563
556
case SPC :
564
557
if ( ! AllowSpacesInIRI )
@@ -786,13 +779,42 @@ else if ( ch == CH_RSLASH )
786
779
throw new ARQInternalErrorException ("Not a '\\ ' or a '%' character" );
787
780
}
788
781
782
+ /**
783
+ * Apply any checks for "RDF String" to a string that has already had escape processing applied.
784
+ * An RDF String is a sequence of codepoints in the range U+0000 to U+10FFFF, excluding surrogates.
785
+ * Because this is java, we test for no non-paired surrogates.
786
+ * A surrogate pair is high-low.
787
+ */
788
+ private void checkRDFString (String string ) {
789
+ for ( int i = 0 ; i < string .length () ; i ++ ) {
790
+ // Not "codePointAt" which does surrogate processing.
791
+ char ch = string .charAt (i );
792
+
793
+ if ( ! Character .isValidCodePoint (ch ) )
794
+ warning ("Illegal code point in \\ U sequence value: 0x%08X" , ch );
795
+
796
+ // Check surrogate pairs are pairs.
797
+ if ( Character .isHighSurrogate (ch ) ) {
798
+ i ++;
799
+ if ( i == string .length () )
800
+ fatal ("Bad surrogate pair (end of string)" );
801
+ char ch1 = string .charAt (i );
802
+ if ( ! Character .isLowSurrogate (ch1 ) ) {
803
+ fatal ("Bad surrogate pair (high surrogate not followed by low surrogate)" );
804
+ }
805
+ } else if ( Character .isLowSurrogate (ch ) ) {
806
+ fatal ("Bad surrogate pair (low surrogate not preceded by a high surrogate)" );
807
+ }
808
+ }
809
+ }
810
+
811
+
789
812
// Get characters between two markers.
790
- // strEscapes may be processed
813
+ // String escapes may be processed
791
814
private String readStringQuote1 (int startCh , int endCh ) {
792
- // Position at start of string.
793
- stringBuilder .setLength (0 );
794
815
// Assumes first delimiter char read already.
795
- // Reads terminating delimiter
816
+ // Reads the terminating delimiter.
817
+ stringBuilder .setLength (0 );
796
818
797
819
for (;;) {
798
820
int ch = reader .readChar ();
@@ -805,9 +827,12 @@ private String readStringQuote1(int startCh, int endCh) {
805
827
warning ("Unicode non-character U+%04X in string" , ch );
806
828
if ( ch == EOF )
807
829
fatal ("Broken token: %s" , stringBuilder .toString ());
808
- else if ( ch == endCh )
809
- return stringBuilder .toString ();
810
- else if ( ch == NL )
830
+ else if ( ch == endCh ) {
831
+ // Done!
832
+ String str = stringBuilder .toString ();
833
+ checkRDFString (str );
834
+ return str ;
835
+ } else if ( ch == NL )
811
836
fatal ("Broken token (newline in string)" , stringBuilder .toString ());
812
837
else if ( ch == CR )
813
838
fatal ("Broken token (carriage return in string)" , stringBuilder .toString ());
@@ -823,7 +848,9 @@ else if ( ch == CH_RSLASH )
823
848
}
824
849
}
825
850
826
- private String readStringQuote3 (int quoteChar , boolean endNL ) {
851
+ private String readStringQuote3 (int quoteChar ) {
852
+ // Assumes 3 character delimiter has been read.
853
+ // Reads the terminating delimiter.
827
854
stringBuilder .setLength (0 );
828
855
for (;;) {
829
856
int ch = reader .readChar ();
@@ -833,13 +860,14 @@ private String readStringQuote3(int quoteChar, boolean endNL) {
833
860
warning ("Unicode replacement character U+FFFD in string" );
834
861
}
835
862
if ( ch == EOF ) {
836
- if ( endNL )
837
- return stringBuilder .toString ();
838
863
fatal ("Broken long string" );
839
- }
840
- else if ( ch == quoteChar ) {
841
- if ( threeQuotes (quoteChar ) )
842
- return stringBuilder .toString ();
864
+ } else if ( ch == quoteChar ) {
865
+ if ( threeQuotes (quoteChar ) ) {
866
+ String str = stringBuilder .toString ();
867
+ checkRDFString (str );
868
+ return str ;
869
+ }
870
+ // quote, not triple. It is a normal character.
843
871
} else if ( ch == CH_RSLASH )
844
872
ch = readLiteralEscape ();
845
873
insertCodepoint (stringBuilder , ch );
@@ -1325,7 +1353,7 @@ private final int readLiteralEscape() {
1325
1353
case 'u' : return readUnicode4Escape ();
1326
1354
case 'U' : return readUnicode8Escape ();
1327
1355
default :
1328
- fatal ("Illegal escape sequence value: %c (0x%02X)" , c , c );
1356
+ fatal ("Illegal escape sequence value: %c (0x%02X)" ,c , c );
1329
1357
return 0 ;
1330
1358
}
1331
1359
}
@@ -1356,8 +1384,8 @@ private final int readUnicode4Escape() {
1356
1384
1357
1385
private final int readUnicode8Escape () {
1358
1386
int ch8 = readHexSequence (8 );
1359
- if ( ch8 > Character .MAX_CODE_POINT )
1360
- fatal ("Illegal code point in \\ U sequence value: 0x%08X" , ch8 );
1387
+ if ( ! Character .isValidCodePoint ( ch8 ) )
1388
+ fatal ("Illegal code point from \\ U sequence value: 0x%08X" , ch8 );
1361
1389
return ch8 ;
1362
1390
}
1363
1391
0 commit comments