Skip to content

Commit 96c4f8f

Browse files
committed
Support unicode escapes in TextFormat.
NOTE: The existing JSON support handles surrogate pairs differently than the conformance test for TextFormat does. This implementation follows the handling recommended by the conformance test, even though it differs from what the C++ implementation does. Also update the list of known failing conformance tests. Progress on apple#1085
1 parent 650964e commit 96c4f8f

File tree

4 files changed

+217
-10
lines changed

4 files changed

+217
-10
lines changed
Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,2 @@
1-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
2-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
3-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
4-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
5-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
6-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
7-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
8-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
91
Required.Proto3.TextFormatInput.StringLiteralIncludesLFBytes
102
Required.Proto3.TextFormatInput.StringLiteralIncludesLFString

Sources/SwiftProtobuf/TextFormatScanner.swift

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ private let asciiLowerS = UInt8(ascii: "s")
6161
private let asciiLowerT = UInt8(ascii: "t")
6262
private let asciiUpperT = UInt8(ascii: "T")
6363
private let asciiLowerU = UInt8(ascii: "u")
64+
private let asciiUpperU = UInt8(ascii: "U")
6465
private let asciiLowerV = UInt8(ascii: "v")
6566
private let asciiLowerX = UInt8(ascii: "x")
6667
private let asciiLowerY = UInt8(ascii: "y")
@@ -80,6 +81,13 @@ private func fromHexDigit(_ c: UInt8) -> UInt8? {
8081
return nil
8182
}
8283

84+
private func uint32FromHexDigit(_ c: UInt8) -> UInt32? {
85+
guard let u8 = fromHexDigit(c) else {
86+
return nil
87+
}
88+
return UInt32(u8)
89+
}
90+
8391
// Protobuf Text encoding assumes that you're working directly
8492
// in UTF-8. So this implementation converts the string to UTF8,
8593
// then decodes it into a sequence of bytes, then converts
@@ -116,6 +124,58 @@ private func decodeString(_ s: String) -> String? {
116124
out.append(n)
117125
bytes = savedPosition
118126
}
127+
case asciiLowerU, asciiUpperU: // "u"
128+
// \u - 4 hex digits, \U 8 hex digits:
129+
if let digit1 = bytes.next(),
130+
let d1 = uint32FromHexDigit(digit1),
131+
let digit2 = bytes.next(),
132+
let d2 = uint32FromHexDigit(digit2),
133+
let digit3 = bytes.next(),
134+
let d3 = uint32FromHexDigit(digit3),
135+
let digit4 = bytes.next(),
136+
let d4 = uint32FromHexDigit(digit4) {
137+
var codePoint = (d1 << 12) + (d2 << 8) + (d3 << 4) + d4
138+
if escaped == asciiUpperU {
139+
if let digit5 = bytes.next(),
140+
let d5 = uint32FromHexDigit(digit5),
141+
let digit6 = bytes.next(),
142+
let d6 = uint32FromHexDigit(digit6),
143+
let digit7 = bytes.next(),
144+
let d7 = uint32FromHexDigit(digit7),
145+
let digit8 = bytes.next(),
146+
let d8 = uint32FromHexDigit(digit8) {
147+
codePoint = (codePoint << 16) + (d5 << 12) + (d6 << 8) + (d7 << 4) + d8
148+
} else {
149+
// Malformed \U escape
150+
return nil
151+
}
152+
}
153+
switch codePoint {
154+
case 0...0x7f:
155+
// 1 byte encoding
156+
out.append(UInt8(truncatingIfNeeded: codePoint))
157+
case 0x80...0x7ff:
158+
// 2 byte encoding
159+
out.append(0xC0 + UInt8(truncatingIfNeeded: codePoint >> 6))
160+
out.append(0x80 + UInt8(truncatingIfNeeded: codePoint & 0x3F))
161+
case 0x800...0xffff:
162+
// 3 byte encoding
163+
out.append(0xE0 + UInt8(truncatingIfNeeded: codePoint >> 12))
164+
out.append(0x80 + UInt8(truncatingIfNeeded: (codePoint >> 6) & 0x3F))
165+
out.append(0x80 + UInt8(truncatingIfNeeded: codePoint & 0x3F))
166+
case 0x10000...0x10FFFF:
167+
// 4 byte encoding
168+
out.append(0xF0 + UInt8(truncatingIfNeeded: codePoint >> 18))
169+
out.append(0x80 + UInt8(truncatingIfNeeded: (codePoint >> 12) & 0x3F))
170+
out.append(0x80 + UInt8(truncatingIfNeeded: (codePoint >> 6) & 0x3F))
171+
out.append(0x80 + UInt8(truncatingIfNeeded: codePoint & 0x3F))
172+
default:
173+
return nil
174+
}
175+
} else {
176+
// Malformed \u,\U escape
177+
return nil
178+
}
119179
case asciiLowerX: // "x"
120180
// Unlike C/C++, protobuf only allows 1 or 2 digits here:
121181
if let byte = bytes.next(), let digit = fromHexDigit(byte) {
@@ -315,6 +375,39 @@ internal struct TextFormatScanner {
315375
}
316376
}
317377
count += 1
378+
case asciiLowerU, asciiUpperU: // 'u' or 'U' unicode escape
379+
let numDigits = (escaped == asciiLowerU) ? 4 : 8
380+
var codePoint: UInt32 = 0
381+
for i in 0..<numDigits {
382+
guard p != end else {
383+
throw TextFormatDecodingError.malformedText // unicode escape must 4/8 digits
384+
}
385+
if let digit = uint32FromHexDigit(p[i]) {
386+
codePoint = (codePoint << 4) + digit
387+
} else {
388+
throw TextFormatDecodingError.malformedText // wasn't a hex digit
389+
}
390+
}
391+
p += numDigits
392+
switch codePoint {
393+
case 0...0x7f:
394+
// 1 byte encoding
395+
count += 1
396+
case 0x80...0x7ff:
397+
// 2 byte encoding
398+
count += 2
399+
case 0xD800...0xDFFF:
400+
// Surrogate pair (low or high), shouldn't get a unicode literal of those.
401+
throw TextFormatDecodingError.malformedText
402+
case 0x800...0xffff:
403+
// 3 byte encoding
404+
count += 3
405+
case 0x10000...0x10FFFF:
406+
// 4 byte encoding
407+
count += 4
408+
default:
409+
throw TextFormatDecodingError.malformedText // Isn't a valid unicode character
410+
}
318411
case asciiLowerX: // 'x' hexadecimal escape
319412
if p != end && fromHexDigit(p[0]) != nil {
320413
p += 1
@@ -387,6 +480,39 @@ internal struct TextFormatScanner {
387480
out[0] = digit1Value
388481
out += 1
389482
}
483+
case asciiLowerU, asciiUpperU:
484+
let numDigits = (escaped == asciiLowerU) ? 4 : 8
485+
var codePoint: UInt32 = 0
486+
for i in 0..<numDigits {
487+
codePoint = (codePoint << 4) + uint32FromHexDigit(p[i])!
488+
}
489+
p += numDigits
490+
switch codePoint {
491+
case 0...0x7f:
492+
// 1 byte encoding
493+
out[0] = UInt8(truncatingIfNeeded: codePoint)
494+
out += 1
495+
case 0x80...0x7ff:
496+
// 2 byte encoding
497+
out[0] = 0xC0 + UInt8(truncatingIfNeeded: codePoint >> 6)
498+
out[1] = 0x80 + UInt8(truncatingIfNeeded: codePoint & 0x3F)
499+
out += 2
500+
case 0x800...0xffff:
501+
// 3 byte encoding
502+
out[0] = 0xE0 + UInt8(truncatingIfNeeded: codePoint >> 12)
503+
out[1] = 0x80 + UInt8(truncatingIfNeeded: (codePoint >> 6) & 0x3F)
504+
out[2] = 0x80 + UInt8(truncatingIfNeeded: codePoint & 0x3F)
505+
out += 3
506+
case 0x10000...0x10FFFF:
507+
// 4 byte encoding
508+
out[0] = 0xF0 + UInt8(truncatingIfNeeded: codePoint >> 18)
509+
out[1] = 0x80 + UInt8(truncatingIfNeeded: (codePoint >> 12) & 0x3F)
510+
out[2] = 0x80 + UInt8(truncatingIfNeeded: (codePoint >> 6) & 0x3F)
511+
out[3] = 0x80 + UInt8(truncatingIfNeeded: codePoint & 0x3F)
512+
out += 4
513+
default:
514+
preconditionFailure() // Already validated, can't happen
515+
}
390516
case asciiLowerX: // 'x' hexadecimal escape
391517
// We already validated, so we know there's at least one digit:
392518
var n = fromHexDigit(p[0])!

Tests/LinuxMain.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,8 +423,10 @@ extension Test_Conformance {
423423
("testInt32_min_roundtrip", testInt32_min_roundtrip),
424424
("testInt32_toosmall", testInt32_toosmall),
425425
("testRepeatedBoolWrapper", testRepeatedBoolWrapper),
426-
("testString_badUnicodeEscape", testString_badUnicodeEscape),
426+
("testString_unicodeEscape", testString_unicodeEscape),
427427
("testString_surrogates", testString_surrogates),
428+
("testBytes_unicodeEscape", testBytes_unicodeEscape),
429+
("testBytes_surrogates", testBytes_surrogates),
428430
("testMaps_TextFormatKeysSorted", testMaps_TextFormatKeysSorted)
429431
]
430432
}

Tests/SwiftProtobufTests/Test_Conformance.swift

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,57 @@ class Test_Conformance: XCTestCase, PBTestHelpers {
7777
}
7878
}
7979

80-
func testString_badUnicodeEscape() {
80+
func testString_unicodeEscape() {
81+
assertTextFormatDecodeSucceeds("optional_string: \"\\u1234\"") {
82+
return $0.optionalString == "\u{1234}"
83+
}
84+
assertTextFormatDecodeSucceeds("optional_string: \"\\U0001F601\"") {
85+
return $0.optionalString == "\u{1F601}"
86+
}
87+
88+
assertTextFormatDecodeFails("optional_string: \"\\u")
89+
assertTextFormatDecodeFails("optional_string: \"\\uDC\"")
90+
assertTextFormatDecodeFails("optional_string: \"\\uDCXY\"")
91+
assertTextFormatDecodeFails("optional_string: \"\\U")
92+
assertTextFormatDecodeFails("optional_string: \"\\UDC\"")
93+
assertTextFormatDecodeFails("optional_string: \"\\UDCXY\"")
94+
assertTextFormatDecodeFails("optional_string: \"\\U1234DC\"")
95+
assertTextFormatDecodeFails("optional_string: \"\\U1234DCXY\"")
96+
97+
assertJSONDecodeSucceeds("{\"optional_string\": \"\\u1234\"}") {
98+
return $0.optionalString == "\u{1234}"
99+
}
100+
81101
assertJSONDecodeFails("{\"optionalString\": \"\\u")
82102
assertJSONDecodeFails("{\"optionalString\": \"\\uDC\"}")
83103
assertJSONDecodeFails("{\"optionalString\": \"\\uDCXY\"}")
84104
}
85105

86106
func testString_surrogates() {
107+
// Unpaired low surrogate
108+
assertTextFormatDecodeFails("optional_string: \"\\uDC00\"")
109+
assertTextFormatDecodeFails("optional_string: \"\\uDC00x\"")
110+
assertTextFormatDecodeFails("optional_string: \"\\uDC00\\b\"")
111+
assertTextFormatDecodeFails("optional_string: \"\\U0000DC00\"")
112+
assertTextFormatDecodeFails("optional_string: \"\\U0000DC00x\"")
113+
assertTextFormatDecodeFails("optional_string: \"\\U0000DC00\\b\"")
114+
// Unpaired high surrogate
115+
assertTextFormatDecodeFails("optional_string: \"\\uD800\"")
116+
assertTextFormatDecodeFails("optional_string: \"\\uD800\\u0061\"")
117+
assertTextFormatDecodeFails("optional_string: \"\\uD800abcdefghijkl\"")
118+
assertTextFormatDecodeFails("optional_string: \"\\U0000D800\"")
119+
assertTextFormatDecodeFails("optional_string: \"\\U0000D800\\u0061\"")
120+
assertTextFormatDecodeFails("optional_string: \"\\U0000D800abcdefghijkl\"")
121+
// Mis-ordered surrogate
122+
assertTextFormatDecodeFails("optional_string: \"\\uDE01\\uD83D\"")
123+
assertTextFormatDecodeFails("optional_string: \"\\U0000DE01\\uD83D\"")
124+
// Correct surrogate
125+
// NOTE: This differs from JSON at the moment in that paired surrogates fail here;
126+
// there is a conformance test that recommends this even though the
127+
// C++ impl accepts it.
128+
assertTextFormatDecodeFails("optional_string: \"\\uD83D\\uDE01\"")
129+
assertTextFormatDecodeFails("optional_string: \"\\U0000D83D\\uDE01\"")
130+
87131
// Unpaired low surrogate
88132
assertJSONDecodeFails("{\"optionalString\": \"\\uDC00\"}")
89133
assertJSONDecodeFails("{\"optionalString\": \"\\uDC00x\"}")
@@ -100,6 +144,49 @@ class Test_Conformance: XCTestCase, PBTestHelpers {
100144
}
101145
}
102146

147+
func testBytes_unicodeEscape() {
148+
assertTextFormatDecodeSucceeds("optional_bytes: \"\\u1234\"") {
149+
return $0.optionalBytes == Data("\u{1234}".utf8)
150+
}
151+
assertTextFormatDecodeSucceeds("optional_bytes: \"\\U0001F601\"") {
152+
return $0.optionalBytes == Data("\u{1F601}".utf8)
153+
}
154+
155+
assertTextFormatDecodeFails("optional_bytes: \"\\u")
156+
assertTextFormatDecodeFails("optional_bytes: \"\\uDC\"")
157+
assertTextFormatDecodeFails("optional_bytes: \"\\uDCXY\"")
158+
assertTextFormatDecodeFails("optional_bytes: \"\\U")
159+
assertTextFormatDecodeFails("optional_bytes: \"\\UDC\"")
160+
assertTextFormatDecodeFails("optional_bytes: \"\\UDCXY\"")
161+
assertTextFormatDecodeFails("optional_bytes: \"\\U1234DC\"")
162+
assertTextFormatDecodeFails("optional_bytes: \"\\U1234DCXY\"")
163+
}
164+
165+
func testBytes_surrogates() {
166+
// Unpaired low surrogate
167+
assertTextFormatDecodeFails("optional_bytes: \"\\uDC00\"")
168+
assertTextFormatDecodeFails("optional_bytes: \"\\uDC00x\"")
169+
assertTextFormatDecodeFails("optional_bytes: \"\\uDC00\\b\"")
170+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000DC00\"")
171+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000DC00x\"")
172+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000DC00\\b\"")
173+
// Unpaired high surrogate
174+
assertTextFormatDecodeFails("optional_bytes: \"\\uD800\"")
175+
assertTextFormatDecodeFails("optional_bytes: \"\\uD800\\u0061\"")
176+
assertTextFormatDecodeFails("optional_bytes: \"\\uD800abcdefghijkl\"")
177+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000D800\"")
178+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000D800\\u0061\"")
179+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000D800abcdefghijkl\"")
180+
// Mis-ordered surrogate
181+
assertTextFormatDecodeFails("optional_bytes: \"\\uDE01\\uD83D\"")
182+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000DE01\\uD83D\"")
183+
// Correct surrogate
184+
// NOTE: Conformance test recommends this even though the C++ impl
185+
// accepts it.
186+
assertTextFormatDecodeFails("optional_bytes: \"\\uD83D\\uDE01\"")
187+
assertTextFormatDecodeFails("optional_bytes: \"\\U0000D83D\\uDE01\"")
188+
}
189+
103190
func testMaps_TextFormatKeysSorted() {
104191
assertTextFormatEncode("map_string_string {\n key: \"a\"\n value: \"value\"\n}\nmap_string_string {\n key: \"b\"\n value: \"value\"\n}\nmap_string_string {\n key: \"c\"\n value: \"value\"\n}\n") {(o: inout MessageTestType) in
105192
o.mapStringString = ["c":"value", "b":"value", "a":"value"]

0 commit comments

Comments
 (0)