Skip to content

Commit c2c63ba

Browse files
committed
query: Fix escape sequence parsing in anonymous node patterns
Fixes tree-sitter#776 Fixes tree-sitter#760
1 parent 071f4e4 commit c2c63ba

File tree

3 files changed

+86
-115
lines changed

3 files changed

+86
-115
lines changed

cli/src/tests/query_test.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1475,16 +1475,19 @@ fn test_query_matches_with_anonymous_tokens() {
14751475
r#"
14761476
";" @punctuation
14771477
"&&" @operator
1478+
"\"" @quote
14781479
"#,
14791480
)
14801481
.unwrap();
14811482

14821483
assert_query_matches(
14831484
language,
14841485
&query,
1485-
"foo(a && b);",
1486+
r#"foo(a && "b");"#,
14861487
&[
14871488
(1, vec![("operator", "&&")]),
1489+
(2, vec![("quote", "\"")]),
1490+
(2, vec![("quote", "\"")]),
14881491
(0, vec![("punctuation", ";")]),
14891492
],
14901493
);

lib/src/array.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,24 @@ extern "C" {
5252
(self)->size += (count))
5353

5454
// Append all of the elements of the `other` array to the end of `self`,
// passing `(other)->size` as the count and `(other)->contents` as the source.
#define array_push_all(self, other) \
55-
array_splice((self), (self)->size, 0, (other)->size, (other)->contents)
55+
array_extend((self), (other)->size, (other)->contents)
56+
57+
// Append `count` elements to the end of the array, reading their values from the
58+
// `contents` pointer.
// Expands to `array__splice` with `(self)->size` as the index and 0 as the
// old-element count.
59+
#define array_extend(self, count, contents) \
60+
array__splice( \
61+
(VoidArray *)(self), array__elem_size(self), (self)->size, \
62+
0, count, contents \
63+
)
5664

5765
// Remove `old_count` elements from the array starting at the given `index`. At
5866
// the same index, insert `new_count` new elements, reading their values from the
5967
// `new_contents` pointer.
// Forwards to `array__splice`, passing the array cast to `VoidArray *` together
// with its element size from `array__elem_size(self)`.
60-
#define array_splice(self, index, old_count, new_count, new_contents) \
61-
array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \
62-
new_count, new_contents)
68+
#define array_splice(self, index, old_count, new_count, new_contents) \
69+
array__splice( \
70+
(VoidArray *)(self), array__elem_size(self), index, \
71+
old_count, new_count, new_contents \
72+
)
6373

6474
// Insert one `element` into the array at the given `index`.
6575
#define array_insert(self, index, element) \

lib/src/query.c

Lines changed: 68 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ struct TSQuery {
214214
Array(TSQueryPredicateStep) predicate_steps;
215215
Array(QueryPattern) patterns;
216216
Array(StepOffset) step_offsets;
217+
Array(char) string_buffer;
217218
const TSLanguage *language;
218219
uint16_t wildcard_root_pattern_count;
219220
TSSymbol *symbol_map;
@@ -439,67 +440,6 @@ static uint16_t symbol_table_insert_name(
439440
return self->slices.size - 1;
440441
}
441442

442-
// Insert a string literal into the symbol table after processing its escape
// sequences, and return the id of the resulting entry.
//
// `escaped_name` points to `escaped_length` bytes of raw literal text. A
// backslash causes the following byte to be read as an escape: `n`, `r`, `t`
// and `0` become newline, carriage return, tab and NUL; any other byte is
// copied unchanged. A trailing backslash with nothing after it is dropped.
//
// If the unescaped string is already present (as reported by
// `symbol_table_id_for_name`), the existing id is returned and the bytes
// reserved here are released. Otherwise a new zero-terminated slice is
// recorded and its index is returned.
static uint16_t symbol_table_insert_name_with_escapes(
443-
SymbolTable *self,
444-
const char *escaped_name,
445-
uint32_t escaped_length
446-
) {
447-
Slice slice = {
448-
.offset = self->characters.size,
449-
.length = 0,
450-
};
451-
array_grow_by(&self->characters, escaped_length + 1); // room for every input byte plus the terminator
452-
453-
// Copy the contents of the literal into the characters buffer, processing escape
454-
// sequences like \n and \". This needs to be done before checking if the literal
455-
// is already present, in order to do the string comparison.
456-
bool is_escaped = false;
457-
for (unsigned i = 0; i < escaped_length; i++) {
458-
const char *src = &escaped_name[i];
459-
char *dest = &self->characters.contents[slice.offset + slice.length];
460-
if (is_escaped) {
461-
switch (*src) {
462-
case 'n':
463-
*dest = '\n';
464-
break;
465-
case 'r':
466-
*dest = '\r';
467-
break;
468-
case 't':
469-
*dest = '\t';
470-
break;
471-
case '0':
472-
*dest = '\0';
473-
break;
474-
default:
475-
*dest = *src;
476-
break;
477-
}
478-
is_escaped = false;
479-
slice.length++;
480-
} else {
481-
if (*src == '\\') {
482-
is_escaped = true;
483-
} else {
484-
*dest = *src;
485-
slice.length++;
486-
}
487-
}
488-
}
489-
490-
// If the string is already present, remove the redundant content from the characters
491-
// buffer and return the existing id.
492-
int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length);
493-
if (id >= 0) {
494-
self->characters.size -= (escaped_length + 1); // undo the array_grow_by reservation made above
495-
return id;
496-
}
497-
498-
self->characters.contents[slice.offset + slice.length] = 0;
499-
array_push(&self->slices, slice);
500-
return self->slices.size - 1;
501-
}
502-
503443
/************
504444
* QueryStep
505445
************/
@@ -1393,6 +1333,59 @@ static void ts_query__finalize_steps(TSQuery *self) {
13931333
}
13941334
}
13951335

1336+
// Parse a double-quoted string literal from the stream, storing its contents,
// with escape sequences processed, in `self->string_buffer`.
//
// The stream must be positioned on the opening double quote; if it is not,
// `TSQueryErrorSyntax` is returned immediately. On success the closing double
// quote is consumed and `TSQueryErrorNone` is returned. If an unescaped
// newline is encountered, or `stream_advance` fails before the closing quote
// is found (presumably at the end of the input), the stream is reset to
// where the literal started and `TSQueryErrorSyntax` is returned.
//
// After a backslash, `n`, `r`, `t` and `0` are translated to newline, carriage
// return, tab and NUL. Any other escaped character is copied as-is
// (`stream->next_size` bytes, presumably the encoded size of that character).
//
// `string_buffer` is cleared once the opening quote has been seen, so the
// result of a previous call is discarded; callers read the parsed text from
// `self->string_buffer.contents` and `self->string_buffer.size`.
static TSQueryError ts_query__parse_string_literal(
1337+
TSQuery *self,
1338+
Stream *stream
1339+
) {
1340+
const char *string_start = stream->input;
1341+
if (stream->next != '"') return TSQueryErrorSyntax;
1342+
stream_advance(stream);
1343+
const char *prev_position = stream->input; // start of the run of literal bytes not yet copied
1344+
1345+
bool is_escaped = false;
1346+
array_clear(&self->string_buffer);
1347+
for (;;) {
1348+
if (is_escaped) {
1349+
is_escaped = false;
1350+
switch (stream->next) {
1351+
case 'n':
1352+
array_push(&self->string_buffer, '\n');
1353+
break;
1354+
case 'r':
1355+
array_push(&self->string_buffer, '\r');
1356+
break;
1357+
case 't':
1358+
array_push(&self->string_buffer, '\t');
1359+
break;
1360+
case '0':
1361+
array_push(&self->string_buffer, '\0');
1362+
break;
1363+
default:
1364+
array_extend(&self->string_buffer, stream->next_size, stream->input);
1365+
break;
1366+
}
1367+
prev_position = stream->input + stream->next_size; // next literal run begins after the escaped character
1368+
} else {
1369+
if (stream->next == '\\') {
1370+
array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); // flush the literal bytes seen before the backslash
1371+
prev_position = stream->input + 1; // step over the backslash itself
1372+
is_escaped = true;
1373+
} else if (stream->next == '"') {
1374+
array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); // flush the remaining literal bytes
1375+
stream_advance(stream); // consume the closing quote
1376+
return TSQueryErrorNone;
1377+
} else if (stream->next == '\n') {
1378+
stream_reset(stream, string_start);
1379+
return TSQueryErrorSyntax;
1380+
}
1381+
}
1382+
if (!stream_advance(stream)) {
1383+
stream_reset(stream, string_start);
1384+
return TSQueryErrorSyntax;
1385+
}
1386+
}
1387+
}
1388+
13961389
// Parse a single predicate associated with a pattern, adding it to the
13971390
// query's internal `predicate_steps` array. Predicates are arbitrary
13981391
// S-expressions associated with a pattern which are meant to be handled at
@@ -1458,44 +1451,17 @@ static TSQueryError ts_query__parse_predicate(
14581451

14591452
// Parse a string literal
14601453
else if (stream->next == '"') {
1461-
stream_advance(stream);
1462-
1463-
// Parse the string content
1464-
bool is_escaped = false;
1465-
const char *string_content = stream->input;
1466-
for (;;) {
1467-
if (is_escaped) {
1468-
is_escaped = false;
1469-
} else {
1470-
if (stream->next == '\\') {
1471-
is_escaped = true;
1472-
} else if (stream->next == '"') {
1473-
break;
1474-
} else if (stream->next == '\n') {
1475-
stream_reset(stream, string_content - 1);
1476-
return TSQueryErrorSyntax;
1477-
}
1478-
}
1479-
if (!stream_advance(stream)) {
1480-
stream_reset(stream, string_content - 1);
1481-
return TSQueryErrorSyntax;
1482-
}
1483-
}
1484-
uint32_t length = stream->input - string_content;
1485-
1486-
// Add a step for the node
1487-
uint16_t id = symbol_table_insert_name_with_escapes(
1454+
TSQueryError e = ts_query__parse_string_literal(self, stream);
1455+
if (e) return e;
1456+
uint16_t id = symbol_table_insert_name(
14881457
&self->predicate_values,
1489-
string_content,
1490-
length
1458+
self->string_buffer.contents,
1459+
self->string_buffer.size
14911460
);
14921461
array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
14931462
.type = TSQueryPredicateStepTypeString,
14941463
.value_id = id,
14951464
}));
1496-
1497-
if (stream->next != '"') return TSQueryErrorSyntax;
1498-
stream_advance(stream);
14991465
}
15001466

15011467
// Parse a bare symbol
@@ -1761,33 +1727,22 @@ static TSQueryError ts_query__parse_pattern(
17611727

17621728
// Parse a double-quoted anonymous leaf node expression
17631729
else if (stream->next == '"') {
1764-
stream_advance(stream);
1765-
1766-
// Parse the string content
1767-
const char *string_content = stream->input;
1768-
while (stream->next != '"') {
1769-
if (!stream_advance(stream)) {
1770-
stream_reset(stream, string_content - 1);
1771-
return TSQueryErrorSyntax;
1772-
}
1773-
}
1774-
uint32_t length = stream->input - string_content;
1730+
const char *string_start = stream->input;
1731+
TSQueryError e = ts_query__parse_string_literal(self, stream);
1732+
if (e) return e;
17751733

17761734
// Add a step for the node
17771735
TSSymbol symbol = ts_language_symbol_for_name(
17781736
self->language,
1779-
string_content,
1780-
length,
1737+
self->string_buffer.contents,
1738+
self->string_buffer.size,
17811739
false
17821740
);
17831741
if (!symbol) {
1784-
stream_reset(stream, string_content);
1742+
stream_reset(stream, string_start + 1);
17851743
return TSQueryErrorNodeType;
17861744
}
17871745
array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1788-
1789-
if (stream->next != '"') return TSQueryErrorSyntax;
1790-
stream_advance(stream);
17911746
}
17921747

17931748
// Parse a field-prefixed pattern
@@ -1977,6 +1932,7 @@ TSQuery *ts_query_new(
19771932
.predicate_steps = array_new(),
19781933
.patterns = array_new(),
19791934
.step_offsets = array_new(),
1935+
.string_buffer = array_new(),
19801936
.symbol_map = symbol_map,
19811937
.wildcard_root_pattern_count = 0,
19821938
.language = language,
@@ -2056,6 +2012,7 @@ TSQuery *ts_query_new(
20562012
}
20572013

20582014
ts_query__finalize_steps(self);
2015+
array_delete(&self->string_buffer);
20592016
return self;
20602017
}
20612018

@@ -2066,6 +2023,7 @@ void ts_query_delete(TSQuery *self) {
20662023
array_delete(&self->predicate_steps);
20672024
array_delete(&self->patterns);
20682025
array_delete(&self->step_offsets);
2026+
array_delete(&self->string_buffer);
20692027
symbol_table_delete(&self->captures);
20702028
symbol_table_delete(&self->predicate_values);
20712029
ts_free(self->symbol_map);

0 commit comments

Comments
 (0)