Skip to content

Commit dedcc52

Browse files
Ignore external tokens that are zero-length and extra (tree-sitter#4213)
Co-authored-by: Anthony <[email protected]>
1 parent 14b8ead commit dedcc52

File tree

4 files changed

+71
-16
lines changed

4 files changed

+71
-16
lines changed

lib/src/parser.c

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -556,27 +556,29 @@ static Subtree ts_parser__lex(
556556
external_scanner_state_len
557557
);
558558

559-
// When recovering from an error, ignore any zero-length external tokens
560-
// unless they have changed the external scanner's state. This helps to
561-
// avoid infinite loops which could otherwise occur, because the lexer is
562-
// looking for any possible token, instead of looking for the specific set of
563-
// tokens that are valid in some parse state.
559+
// Avoid infinite loops caused by the external scanner returning empty tokens.
560+
// Empty tokens are needed in some circumstances, e.g. indent/dedent tokens
561+
// in Python. Ignore the following classes of empty tokens:
564562
//
565-
// Note that it's possible that the token end position may be *before* the
566-
// original position of the lexer because of the way that tokens are positioned
567-
// at included range boundaries: when a token is terminated at the start of
568-
// an included range, it is marked as ending at the *end* of the preceding
569-
// included range.
563+
// * Tokens produced during error recovery. When recovering from an error,
564+
// all tokens are allowed, so it's easy to accidentally return unwanted
565+
// empty tokens.
566+
// * Tokens that are marked as 'extra' in the grammar. These don't change
567+
// the parse state, so they would definitely cause an infinite loop.
570568
if (
571569
self->lexer.token_end_position.bytes <= current_position.bytes &&
572-
(error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) &&
573570
!external_scanner_state_changed
574571
) {
575-
LOG(
576-
"ignore_empty_external_token symbol:%s",
577-
SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol])
578-
)
579-
found_token = false;
572+
TSSymbol symbol = self->language->external_scanner.symbol_map[self->lexer.data.result_symbol];
573+
TSStateId next_parse_state = ts_language_next_state(self->language, parse_state, symbol);
574+
bool token_is_extra = (next_parse_state == parse_state);
575+
if (error_mode || !ts_stack_has_advanced_since_error(self->stack, version) || token_is_extra) {
576+
LOG(
577+
"ignore_empty_external_token symbol:%s",
578+
SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol])
579+
);
580+
found_token = false;
581+
}
580582
}
581583
}
582584

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
==========================
2+
A document
3+
==========================
4+
5+
a b
6+
7+
---
8+
9+
(document)
Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
module.exports = grammar({
2+
name: 'epsilon_external_extra_tokens',
3+
4+
extras: $ => [/\s/, $.comment],
5+
6+
externals: $ => [$.comment],
7+
8+
rules: {
9+
document: $ => seq('a', 'b'),
10+
}
11+
});
Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
1+
#include "tree_sitter/parser.h"
2+
3+
/*
 * Token kinds this external scanner can report. COMMENT is the only one and
 * has the value 0.
 * NOTE(review): presumably this index has to match the position of `comment`
 * in the grammar's `externals` list — confirm against the grammar.
 */
enum TokenType {
  COMMENT = 0
};
6+
7+
/*
 * Creates the scanner payload.
 *
 * Nothing is allocated here; the returned payload is always NULL.
 */
void *tree_sitter_epsilon_external_extra_tokens_external_scanner_create(void) {
  void *no_payload = NULL;
  return no_payload;
}
10+
11+
bool tree_sitter_epsilon_external_extra_tokens_external_scanner_scan(
12+
void *payload,
13+
TSLexer *lexer,
14+
const bool *valid_symbols
15+
) {
16+
lexer->result_symbol = COMMENT;
17+
return true;
18+
}
19+
20+
/*
 * Serialization callback.
 *
 * Leaves `buffer` untouched and returns 0; `payload` is ignored as well.
 */
unsigned tree_sitter_epsilon_external_extra_tokens_external_scanner_serialize(
  void *payload,
  char *buffer
) {
  (void)payload;
  (void)buffer;

  const unsigned bytes_written = 0;
  return bytes_written;
}
26+
27+
/*
 * Deserialization callback.
 *
 * Intentionally a no-op: none of the arguments are read.
 */
void tree_sitter_epsilon_external_extra_tokens_external_scanner_deserialize(
  void *payload,
  const char *buffer,
  unsigned length
) {
  (void)payload;
  (void)buffer;
  (void)length;
}
32+
33+
/*
 * Teardown callback.
 *
 * Intentionally a no-op: `payload` is not read or freed.
 */
void tree_sitter_epsilon_external_extra_tokens_external_scanner_destroy(void *payload) {
  (void)payload;
}

0 commit comments

Comments
 (0)