Skip to content

Commit d223a81

Browse files
committed
Allow empty external tokens during error recovery if they change the scanner's state
1 parent c0e1991 commit d223a81

File tree

4 files changed

+143
-52
lines changed

4 files changed

+143
-52
lines changed

lib/src/parser.c

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,8 @@ static Subtree ts_parser__lex(
408408
Length error_start_position = length_zero();
409409
Length error_end_position = length_zero();
410410
uint32_t lookahead_end_byte = 0;
411+
uint32_t external_scanner_state_len = 0;
412+
bool external_scanner_state_changed = false;
411413
ts_lexer_reset(&self->lexer, start_position);
412414

413415
for (;;) {
@@ -429,22 +431,32 @@ static Subtree ts_parser__lex(
429431
);
430432
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
431433

432-
// Zero-length external tokens are generally allowed, but they're not
433-
// allowed right after a syntax error. This is for two reasons:
434-
// 1. After a syntax error, the lexer is looking for any possible token,
435-
// as opposed to the specific set of tokens that are valid in some
436-
// parse state. In this situation, it's very easy for an external
437-
// scanner to produce unwanted zero-length tokens.
438-
// 2. The parser sometimes inserts *missing* tokens to recover from
439-
// errors. These tokens are also zero-length. If we allow more
440-
// zero-length tokens to be created after missing tokens, it
441-
// can lead to infinite loops. Forbidding zero-length tokens
442-
// right at the point of error recovery is a conservative strategy
443-
// for preventing this kind of infinite loop.
444-
if (found_token && (
445-
self->lexer.token_end_position.bytes > current_position.bytes ||
446-
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
447-
)) {
434+
if (found_token) {
435+
external_scanner_state_len = self->language->external_scanner.serialize(
436+
self->external_scanner_payload,
437+
self->lexer.debug_buffer
438+
);
439+
external_scanner_state_changed = !ts_external_scanner_state_eq(
440+
ts_subtree_external_scanner_state(external_token),
441+
self->lexer.debug_buffer,
442+
external_scanner_state_len
443+
);
444+
445+
// When recovering from an error, ignore any zero-length external tokens
446+
// unless they have changed the external scanner's state. This helps to
447+
// avoid infinite loops which could otherwise occur, because the lexer is
448+
// looking for any possible token, instead of looking for the specific set of
449+
// tokens that are valid in some parse state.
450+
if (
451+
self->lexer.token_end_position.bytes == current_position.bytes &&
452+
(error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) &&
453+
!external_scanner_state_changed
454+
) {
455+
found_token = false;
456+
}
457+
}
458+
459+
if (found_token) {
448460
found_external_token = true;
449461
called_get_column = self->lexer.did_get_column;
450462
break;
@@ -549,15 +561,13 @@ static Subtree ts_parser__lex(
549561
);
550562

551563
if (found_external_token) {
552-
unsigned length = self->language->external_scanner.serialize(
553-
self->external_scanner_payload,
554-
self->lexer.debug_buffer
555-
);
564+
MutableSubtree mut_result = ts_subtree_to_mut_unsafe(result);
556565
ts_external_scanner_state_init(
557-
&((SubtreeHeapData *)result.ptr)->external_scanner_state,
566+
&mut_result.ptr->external_scanner_state,
558567
self->lexer.debug_buffer,
559-
length
568+
external_scanner_state_len
560569
);
570+
mut_result.ptr->has_external_scanner_state_change = external_scanner_state_changed;
561571
}
562572
}
563573

@@ -1199,6 +1209,15 @@ static void ts_parser__recover(
11991209
return;
12001210
}
12011211

1212+
if (
1213+
did_recover &&
1214+
ts_subtree_has_external_scanner_state_change(lookahead)
1215+
) {
1216+
ts_stack_halt(self->stack, version);
1217+
ts_subtree_release(&self->tree_pool, lookahead);
1218+
return;
1219+
}
1220+
12021221
// If the parser is still in the error state at the end of the file, just wrap everything
12031222
// in an ERROR node and terminate.
12041223
if (ts_subtree_is_eof(lookahead)) {
@@ -1929,6 +1948,7 @@ TSTree *ts_parser_parse(
19291948
}
19301949
} while (version_count != 0);
19311950

1951+
assert(self->finished_tree.ptr);
19321952
ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language);
19331953
LOG("done");
19341954
LOG_TREE(self->finished_tree);

lib/src/subtree.c

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@ typedef struct {
2121
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
2222
#define TS_MAX_TREE_POOL_SIZE 32
2323

24-
static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0};
25-
2624
// ExternalScannerState
2725

2826
void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) {
@@ -58,11 +56,10 @@ const char *ts_external_scanner_state_data(const ExternalScannerState *self) {
5856
}
5957
}
6058

61-
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) {
62-
return a == b || (
63-
a->length == b->length &&
64-
!memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length)
65-
);
59+
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *buffer, unsigned length) {
60+
return
61+
a->length == length &&
62+
memcmp(ts_external_scanner_state_data(a), buffer, length) == 0;
6663
}
6764

6865
// SubtreeArray
@@ -214,6 +211,7 @@ Subtree ts_subtree_new_leaf(
214211
.fragile_right = false,
215212
.has_changes = false,
216213
.has_external_tokens = has_external_tokens,
214+
.has_external_scanner_state_change = false,
217215
.depends_on_column = depends_on_column,
218216
.is_missing = false,
219217
.is_keyword = is_keyword,
@@ -381,6 +379,7 @@ void ts_subtree_summarize_children(
381379
self.ptr->node_count = 1;
382380
self.ptr->has_external_tokens = false;
383381
self.ptr->depends_on_column = false;
382+
self.ptr->has_external_scanner_state_change = false;
384383
self.ptr->dynamic_precedence = 0;
385384

386385
uint32_t structural_index = 0;
@@ -398,6 +397,10 @@ void ts_subtree_summarize_children(
398397
self.ptr->depends_on_column = true;
399398
}
400399

400+
if (ts_subtree_has_external_scanner_state_change(child)) {
401+
self.ptr->has_external_scanner_state_change = true;
402+
}
403+
401404
if (i == 0) {
402405
self.ptr->padding = ts_subtree_padding(child);
403406
self.ptr->size = ts_subtree_size(child);
@@ -521,6 +524,7 @@ MutableSubtree ts_subtree_new_node(
521524
.visible = metadata.visible,
522525
.named = metadata.named,
523526
.has_changes = false,
527+
.has_external_scanner_state_change = false,
524528
.fragile_left = fragile,
525529
.fragile_right = fragile,
526530
.is_keyword = false,
@@ -1024,14 +1028,26 @@ void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *
10241028
fprintf(f, "}\n");
10251029
}
10261030

1027-
bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) {
1028-
const ExternalScannerState *state1 = &empty_state;
1029-
const ExternalScannerState *state2 = &empty_state;
1030-
if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) {
1031-
state1 = &self.ptr->external_scanner_state;
1032-
}
1033-
if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) {
1034-
state2 = &other.ptr->external_scanner_state;
1031+
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self) {
1032+
static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0};
1033+
if (
1034+
self.ptr &&
1035+
!self.data.is_inline &&
1036+
self.ptr->has_external_tokens &&
1037+
self.ptr->child_count == 0
1038+
) {
1039+
return &self.ptr->external_scanner_state;
1040+
} else {
1041+
return &empty_state;
10351042
}
1036-
return ts_external_scanner_state_eq(state1, state2);
1043+
}
1044+
1045+
bool ts_subtree_external_scanner_state_eq(Subtree a, Subtree b) {
1046+
const ExternalScannerState *state_a = ts_subtree_external_scanner_state(a);
1047+
const ExternalScannerState *state_b = ts_subtree_external_scanner_state(b);
1048+
return ts_external_scanner_state_eq(
1049+
state_a,
1050+
ts_external_scanner_state_data(state_b),
1051+
state_b->length
1052+
);
10371053
}

lib/src/subtree.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ typedef struct {
114114
Length size;
115115
uint32_t lookahead_bytes;
116116
uint32_t error_cost;
117-
uint32_t child_count;
117+
uint16_t child_count;
118118
TSSymbol symbol;
119119
TSStateId parse_state;
120120

@@ -125,6 +125,7 @@ typedef struct {
125125
bool fragile_right : 1;
126126
bool has_changes : 1;
127127
bool has_external_tokens : 1;
128+
bool has_external_scanner_state_change : 1;
128129
bool depends_on_column: 1;
129130
bool is_missing : 1;
130131
bool is_keyword : 1;
@@ -135,8 +136,8 @@ typedef struct {
135136
uint32_t visible_child_count;
136137
uint32_t named_child_count;
137138
uint32_t node_count;
138-
uint32_t repeat_depth;
139139
int32_t dynamic_precedence;
140+
uint16_t repeat_depth;
140141
uint16_t production_id;
141142
struct {
142143
TSSymbol symbol;
@@ -174,6 +175,8 @@ typedef struct {
174175

175176
void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned);
176177
const char *ts_external_scanner_state_data(const ExternalScannerState *);
178+
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *, unsigned);
179+
void ts_external_scanner_state_delete(ExternalScannerState *self);
177180

178181
void ts_subtree_array_copy(SubtreeArray, SubtreeArray *);
179182
void ts_subtree_array_clear(SubtreePool *, SubtreeArray *);
@@ -206,6 +209,7 @@ Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *);
206209
char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all);
207210
void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *);
208211
Subtree ts_subtree_last_external_token(Subtree);
212+
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self);
209213
bool ts_subtree_external_scanner_state_eq(Subtree, Subtree);
210214

211215
#define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name)
@@ -331,6 +335,10 @@ static inline bool ts_subtree_has_external_tokens(Subtree self) {
331335
return self.data.is_inline ? false : self.ptr->has_external_tokens;
332336
}
333337

338+
static inline bool ts_subtree_has_external_scanner_state_change(Subtree self) {
339+
return self.data.is_inline ? false : self.ptr->has_external_scanner_state_change;
340+
}
341+
334342
static inline bool ts_subtree_depends_on_column(Subtree self) {
335343
return self.data.is_inline ? false : self.ptr->depends_on_column;
336344
}
Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,76 @@
1-
==========================================
2-
errors in if statements
3-
==========================================
1+
=============================================
2+
incomplete condition in if statement
3+
=============================================
44

55
if a is:
6-
print b
7-
print c
6+
print b
7+
print c
8+
print d
89

910
---
1011

1112
(module
12-
(if_statement (identifier) (ERROR) (block
13-
(print_statement (identifier))
14-
(print_statement (identifier)))))
13+
(if_statement
14+
condition: (identifier)
15+
(ERROR)
16+
consequence: (block
17+
(print_statement argument: (identifier))
18+
(print_statement argument: (identifier))))
19+
(print_statement argument: (identifier)))
1520

1621
==========================================
17-
errors in function definitions
22+
extra colon in function definition
1823
==========================================
1924

2025
def a()::
2126
b
2227
c
28+
d
29+
30+
---
31+
32+
(module
33+
(function_definition
34+
name: (identifier)
35+
parameters: (parameters)
36+
(ERROR)
37+
body: (block
38+
(expression_statement (identifier))
39+
(expression_statement (identifier))))
40+
(expression_statement (identifier)))
41+
42+
========================================================
43+
incomplete if statement in function definition
44+
========================================================
45+
46+
def a():
47+
if a
48+
49+
---
50+
51+
(module
52+
(function_definition
53+
name: (identifier)
54+
parameters: (parameters)
55+
(ERROR (identifier))
56+
body: (block)))
57+
58+
========================================================
59+
incomplete expression before triple-quoted string
60+
========================================================
61+
62+
def a():
63+
b.
64+
"""
65+
c
66+
"""
2367

2468
---
2569

2670
(module
27-
(function_definition (identifier) (parameters) (ERROR) (block
28-
(expression_statement (identifier))
29-
(expression_statement (identifier)))))
71+
(function_definition
72+
name: (identifier)
73+
parameters: (parameters)
74+
(ERROR (identifier))
75+
body: (block
76+
(expression_statement (string)))))

0 commit comments

Comments
 (0)