Skip to content

Commit 066fd77

Browse files
authored
Fix cases where error recovery could infinite loop (tree-sitter#4257)
* Rename corpus test functions to allow easy filtering by language * Use usize for seed argument * Avoid retaining useless stack versions when reductions merge We found this problem when debugging an infinite loop that happened during error recovery when using the Zig grammar. The large number of unnecessary paused stack versions were preventing the correct recovery strategy from being tried. * Fix leaked lookahead token when reduction results in a merged stack * Enable running PHP tests in CI * Fix possible infinite loop during error recovery at EOF * Account for external scanner state changes when detecting changed ranges in subtrees
1 parent 8138dba commit 066fd77

File tree

8 files changed

+110
-56
lines changed

8 files changed

+110
-56
lines changed

cli/src/fuzz/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ fn regex_env_var(name: &'static str) -> Option<Regex> {
5656
pub fn new_seed() -> usize {
5757
int_env_var("TREE_SITTER_SEED").unwrap_or_else(|| {
5858
let mut rng = rand::thread_rng();
59-
rng.gen::<usize>()
59+
let seed = rng.gen::<usize>();
60+
eprintln!("Seed: {seed}");
61+
seed
6062
})
6163
}
6264

cli/src/tests/corpus_test.rs

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use crate::{
2323
};
2424

2525
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
26-
fn test_corpus_for_bash(seed: usize) {
26+
fn test_corpus_for_bash_language(seed: usize) {
2727
test_language_corpus(
2828
"bash",
2929
seed,
@@ -39,73 +39,77 @@ fn test_corpus_for_bash(seed: usize) {
3939
}
4040

4141
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
42-
fn test_corpus_for_c(seed: usize) {
42+
fn test_corpus_for_c_language(seed: usize) {
4343
test_language_corpus("c", seed, None, None);
4444
}
4545

4646
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
47-
fn test_corpus_for_cpp(seed: usize) {
47+
fn test_corpus_for_cpp_language(seed: usize) {
4848
test_language_corpus("cpp", seed, None, None);
4949
}
5050

5151
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
52-
fn test_corpus_for_embedded_template(seed: usize) {
52+
fn test_corpus_for_embedded_template_language(seed: usize) {
5353
test_language_corpus("embedded-template", seed, None, None);
5454
}
5555

5656
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
57-
fn test_corpus_for_go(seed: usize) {
57+
fn test_corpus_for_go_language(seed: usize) {
5858
test_language_corpus("go", seed, None, None);
5959
}
6060

6161
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
62-
fn test_corpus_for_html(seed: usize) {
62+
fn test_corpus_for_html_language(seed: usize) {
6363
test_language_corpus("html", seed, None, None);
6464
}
6565

6666
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
67-
fn test_corpus_for_java(seed: usize) {
68-
test_language_corpus("java", seed, None, None);
67+
fn test_corpus_for_java_language(seed: usize) {
68+
test_language_corpus(
69+
"java",
70+
seed,
71+
Some(&["java - corpus - expressions - switch with unnamed pattern variable"]),
72+
None,
73+
);
6974
}
7075

7176
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
72-
fn test_corpus_for_javascript(seed: usize) {
77+
fn test_corpus_for_javascript_language(seed: usize) {
7378
test_language_corpus("javascript", seed, None, None);
7479
}
7580

7681
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
77-
fn test_corpus_for_json(seed: usize) {
82+
fn test_corpus_for_json_language(seed: usize) {
7883
test_language_corpus("json", seed, None, None);
7984
}
8085

81-
#[ignore]
8286
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
83-
fn test_corpus_for_php(seed: usize) {
84-
test_language_corpus("php", seed, None, None);
87+
fn test_corpus_for_php_language(seed: usize) {
88+
test_language_corpus("php", seed, None, Some("php"));
8589
}
8690

8791
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
88-
fn test_corpus_for_python(seed: usize) {
92+
fn test_corpus_for_python_language(seed: usize) {
8993
test_language_corpus("python", seed, None, None);
9094
}
9195

9296
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
93-
fn test_corpus_for_ruby(seed: usize) {
97+
fn test_corpus_for_ruby_language(seed: usize) {
9498
test_language_corpus("ruby", seed, None, None);
9599
}
96100

97101
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
98-
fn test_corpus_for_rust(seed: usize) {
102+
fn test_corpus_for_rust_language(seed: usize) {
99103
test_language_corpus("rust", seed, None, None);
100104
}
101105

102106
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
103-
fn test_corpus_for_typescript(seed: usize) {
107+
fn test_corpus_for_typescript_language(seed: usize) {
104108
test_language_corpus("typescript", seed, None, Some("typescript"));
105109
}
106110

107111
#[test_with_seed(retry=10, seed=*START_SEED, seed_fn=new_seed)]
108-
fn test_corpus_for_tsx(seed: usize) {
112+
fn test_corpus_for_tsx_language(seed: usize) {
109113
test_language_corpus("typescript", seed, None, Some("tsx"));
110114
}
111115

lib/src/get_changed_ranges.c

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ typedef struct {
108108
const TSLanguage *language;
109109
unsigned visible_depth;
110110
bool in_padding;
111+
Subtree prev_external_token;
111112
} Iterator;
112113

113114
static Iterator iterator_new(
@@ -127,6 +128,7 @@ static Iterator iterator_new(
127128
.language = language,
128129
.visible_depth = 1,
129130
.in_padding = false,
131+
.prev_external_token = NULL_SUBTREE,
130132
};
131133
}
132134

@@ -244,6 +246,10 @@ static bool iterator_descend(Iterator *self, uint32_t goal_position) {
244246

245247
position = child_right;
246248
if (!ts_subtree_extra(*child)) structural_child_index++;
249+
Subtree last_external_token = ts_subtree_last_external_token(*child);
250+
if (last_external_token.ptr) {
251+
self->prev_external_token = last_external_token;
252+
}
247253
}
248254
} while (did_descend);
249255

@@ -268,6 +274,10 @@ static void iterator_advance(Iterator *self) {
268274

269275
const Subtree *parent = array_back(&self->cursor.stack)->subtree;
270276
uint32_t child_index = entry.child_index + 1;
277+
Subtree last_external_token = ts_subtree_last_external_token(*entry.subtree);
278+
if (last_external_token.ptr) {
279+
self->prev_external_token = last_external_token;
280+
}
271281
if (ts_subtree_child_count(*parent) > child_index) {
272282
Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree));
273283
uint32_t structural_child_index = entry.structural_child_index;
@@ -313,29 +323,41 @@ static IteratorComparison iterator_compare(
313323
TSSymbol new_alias_symbol = 0;
314324
iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start);
315325
iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start);
326+
TSSymbol old_symbol = ts_subtree_symbol(old_tree);
327+
TSSymbol new_symbol = ts_subtree_symbol(new_tree);
316328

317329
if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches;
318330
if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers;
331+
if (old_alias_symbol != new_alias_symbol || old_symbol != new_symbol) return IteratorDiffers;
332+
333+
uint32_t old_size = ts_subtree_size(old_tree).bytes;
334+
uint32_t new_size = ts_subtree_size(new_tree).bytes;
335+
TSStateId old_state = ts_subtree_parse_state(old_tree);
336+
TSStateId new_state = ts_subtree_parse_state(new_tree);
337+
bool old_has_external_tokens = ts_subtree_has_external_tokens(old_tree);
338+
bool new_has_external_tokens = ts_subtree_has_external_tokens(new_tree);
339+
uint32_t old_error_cost = ts_subtree_error_cost(old_tree);
340+
uint32_t new_error_cost = ts_subtree_error_cost(new_tree);
319341

320342
if (
321-
old_alias_symbol == new_alias_symbol &&
322-
ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree)
343+
old_start != new_start ||
344+
old_symbol == ts_builtin_sym_error ||
345+
old_size != new_size ||
346+
old_state == TS_TREE_STATE_NONE ||
347+
new_state == TS_TREE_STATE_NONE ||
348+
((old_state == ERROR_STATE) != (new_state == ERROR_STATE)) ||
349+
old_error_cost != new_error_cost ||
350+
old_has_external_tokens != new_has_external_tokens ||
351+
ts_subtree_has_changes(old_tree) ||
352+
(
353+
old_has_external_tokens &&
354+
!ts_subtree_external_scanner_state_eq(old_iter->prev_external_token, new_iter->prev_external_token)
355+
)
323356
) {
324-
if (old_start == new_start &&
325-
!ts_subtree_has_changes(old_tree) &&
326-
ts_subtree_symbol(old_tree) != ts_builtin_sym_error &&
327-
ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes &&
328-
ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE &&
329-
ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE &&
330-
(ts_subtree_parse_state(old_tree) == ERROR_STATE) ==
331-
(ts_subtree_parse_state(new_tree) == ERROR_STATE)) {
332-
return IteratorMatches;
333-
} else {
334-
return IteratorMayDiffer;
335-
}
357+
return IteratorMayDiffer;
336358
}
337359

338-
return IteratorDiffers;
360+
return IteratorMatches;
339361
}
340362

341363
#ifdef DEBUG_GET_CHANGED_RANGES
@@ -348,8 +370,8 @@ static inline void iterator_print_state(Iterator *self) {
348370
"(%-25s %s\t depth:%u [%u, %u] - [%u, %u])",
349371
name, self->in_padding ? "(p)" : " ",
350372
self->visible_depth,
351-
start.row + 1, start.column,
352-
end.row + 1, end.column
373+
start.row, start.column,
374+
end.row, end.column
353375
);
354376
}
355377
#endif
@@ -380,7 +402,7 @@ unsigned ts_subtree_get_changed_ranges(
380402

381403
do {
382404
#ifdef DEBUG_GET_CHANGED_RANGES
383-
printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column);
405+
printf("At [%-2u, %-2u] Compare ", position.extent.row, position.extent.column);
384406
iterator_print_state(&old_iter);
385407
printf("\tvs\t");
386408
iterator_print_state(&new_iter);

lib/src/parser.c

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,7 @@ static StackVersion ts_parser__reduce(
949949
// children.
950950
StackSliceArray pop = ts_stack_pop_count(self->stack, version, count);
951951
uint32_t removed_version_count = 0;
952+
uint32_t halted_version_count = ts_stack_halted_version_count(self->stack);
952953
for (uint32_t i = 0; i < pop.size; i++) {
953954
StackSlice slice = pop.contents[i];
954955
StackVersion slice_version = slice.version - removed_version_count;
@@ -957,11 +958,12 @@ static StackVersion ts_parser__reduce(
957958
// will all be sorted and truncated at the end of the outer parsing loop.
958959
// Allow the maximum version count to be temporarily exceeded, but only
959960
// by a limited threshold.
960-
if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) {
961+
if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW + halted_version_count) {
961962
ts_stack_remove_version(self->stack, slice_version);
962963
ts_subtree_array_delete(&self->tree_pool, &slice.subtrees);
963964
removed_version_count++;
964965
while (i + 1 < pop.size) {
966+
LOG("aborting reduce with too many versions")
965967
StackSlice next_slice = pop.contents[i + 1];
966968
if (next_slice.version != slice.version) break;
967969
ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees);
@@ -1318,10 +1320,23 @@ static void ts_parser__recover(
13181320
// and subsequently halted. Remove those versions.
13191321
for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) {
13201322
if (!ts_stack_is_active(self->stack, i)) {
1323+
LOG("removed paused version:%u", i);
13211324
ts_stack_remove_version(self->stack, i--);
1325+
LOG_STACK();
13221326
}
13231327
}
13241328

1329+
// If the parser is still in the error state at the end of the file, just wrap everything
1330+
// in an ERROR node and terminate.
1331+
if (ts_subtree_is_eof(lookahead)) {
1332+
LOG("recover_eof");
1333+
SubtreeArray children = array_new();
1334+
Subtree parent = ts_subtree_new_error_node(&children, false, self->language);
1335+
ts_stack_push(self->stack, version, parent, false, 1);
1336+
ts_parser__accept(self, version, lookahead);
1337+
return;
1338+
}
1339+
13251340
// If strategy 1 succeeded, a new stack version will have been created which is able to handle
13261341
// the current lookahead token. Now, in addition, try strategy 2 described above: skip the
13271342
// current lookahead token by wrapping it in an ERROR node.
@@ -1342,17 +1357,6 @@ static void ts_parser__recover(
13421357
return;
13431358
}
13441359

1345-
// If the parser is still in the error state at the end of the file, just wrap everything
1346-
// in an ERROR node and terminate.
1347-
if (ts_subtree_is_eof(lookahead)) {
1348-
LOG("recover_eof");
1349-
SubtreeArray children = array_new();
1350-
Subtree parent = ts_subtree_new_error_node(&children, false, self->language);
1351-
ts_stack_push(self->stack, version, parent, false, 1);
1352-
ts_parser__accept(self, version, lookahead);
1353-
return;
1354-
}
1355-
13561360
// Do not recover if the result would clearly be worse than some existing stack version.
13571361
unsigned new_cost =
13581362
current_error_cost + ERROR_COST_PER_SKIPPED_TREE +
@@ -1618,6 +1622,7 @@ static bool ts_parser__advance(
16181622
// an ambiguous state. REDUCE actions always create a new stack
16191623
// version, whereas SHIFT actions update the existing stack version
16201624
// and terminate this loop.
1625+
bool did_reduce = false;
16211626
StackVersion last_reduction_version = STACK_VERSION_NONE;
16221627
for (uint32_t i = 0; i < table_entry.action_count; i++) {
16231628
TSParseAction action = table_entry.actions[i];
@@ -1653,6 +1658,7 @@ static bool ts_parser__advance(
16531658
action.reduce.dynamic_precedence, action.reduce.production_id,
16541659
is_fragile, end_of_non_terminal_extra
16551660
);
1661+
did_reduce = true;
16561662
if (reduction_version != STACK_VERSION_NONE) {
16571663
last_reduction_version = reduction_version;
16581664
}
@@ -1704,9 +1710,12 @@ static bool ts_parser__advance(
17041710
continue;
17051711
}
17061712

1707-
// A non-terminal extra rule was reduced and merged into an existing
1708-
// stack version. This version can be discarded.
1709-
if (!lookahead.ptr) {
1713+
// A reduction was performed, but was merged into an existing stack version.
1714+
// This version can be discarded.
1715+
if (did_reduce) {
1716+
if (lookahead.ptr) {
1717+
ts_subtree_release(&self->tree_pool, lookahead);
1718+
}
17101719
ts_stack_halt(self->stack, version);
17111720
return true;
17121721
}
@@ -1755,7 +1764,7 @@ static bool ts_parser__advance(
17551764
// versions that exist. If some other version advances successfully, then
17561765
// this version can simply be removed. But if all versions end up paused,
17571766
// then error recovery is needed.
1758-
LOG("detect_error");
1767+
LOG("detect_error lookahead:%s", TREE_NAME(lookahead));
17591768
ts_stack_pause(self->stack, version, lookahead);
17601769
return true;
17611770
}
@@ -1844,6 +1853,7 @@ static unsigned ts_parser__condense_stack(TSParser *self) {
18441853
has_unpaused_version = true;
18451854
} else {
18461855
ts_stack_remove_version(self->stack, i);
1856+
made_changes = true;
18471857
i--;
18481858
n--;
18491859
}

lib/src/stack.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,17 @@ uint32_t ts_stack_version_count(const Stack *self) {
460460
return self->heads.size;
461461
}
462462

463+
uint32_t ts_stack_halted_version_count(Stack *self) {
464+
uint32_t count = 0;
465+
for (uint32_t i = 0; i < self->heads.size; i++) {
466+
StackHead *head = array_get(&self->heads, i);
467+
if (head->status == StackStatusHalted) {
468+
count++;
469+
}
470+
}
471+
return count;
472+
}
473+
463474
TSStateId ts_stack_state(const Stack *self, StackVersion version) {
464475
return array_get(&self->heads, version)->node->state;
465476
}
@@ -524,6 +535,7 @@ StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t c
524535
return stack__iter(self, version, pop_count_callback, &count, (int)count);
525536
}
526537

538+
527539
forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) {
528540
(void)payload;
529541
if (iterator->subtree_count >= 1) {

lib/src/stack.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ void ts_stack_delete(Stack *self);
3636
// Get the stack's current number of versions.
3737
uint32_t ts_stack_version_count(const Stack *self);
3838

39+
// Get the stack's current number of halted versions.
40+
uint32_t ts_stack_halted_version_count(Stack *self);
41+
3942
// Get the state at the top of the given version of the stack. If the stack is
4043
// empty, this returns the initial state, 0.
4144
TSStateId ts_stack_state(const Stack *self, StackVersion version);

lib/src/tree_cursor.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,8 +304,9 @@ int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint go
304304
}
305305

306306
TreeCursorStep ts_tree_cursor_goto_sibling_internal(
307-
TSTreeCursor *_self,
308-
bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)) {
307+
TSTreeCursor *_self,
308+
bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)
309+
) {
309310
TreeCursor *self = (TreeCursor *)_self;
310311
uint32_t initial_size = self->stack.size;
311312

xtask/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ struct Test {
144144
iterations: Option<u32>,
145145
/// Set the seed used to control random behavior.
146146
#[arg(long, short)]
147-
seed: Option<u32>,
147+
seed: Option<usize>,
148148
/// Print parsing log to stderr.
149149
#[arg(long, short)]
150150
debug: bool,

0 commit comments

Comments
 (0)