Skip to content

Commit f8b8596

Browse files
author
Max Brunsfeld
authored
Merge pull request tree-sitter#35 from tree-sitter/handle-invalid-chars-at-eof
Handle invalid chars at EOF
2 parents b6e2bed + 4f0c83b commit f8b8596

File tree

9 files changed

+94
-112
lines changed

9 files changed

+94
-112
lines changed

include/tree_sitter/parser.h

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,19 @@ typedef struct {
2828
bool structural : 1;
2929
} TSSymbolMetadata;
3030

31-
typedef enum {
32-
TSTransitionTypeMain,
33-
TSTransitionTypeSeparator,
34-
TSTransitionTypeError,
35-
} TSTransitionType;
36-
3731
typedef struct TSLexer {
38-
void (*advance)(struct TSLexer *, TSStateId, TSTransitionType);
32+
void (*advance)(struct TSLexer *, TSStateId, bool);
3933

4034
TSLength current_position;
41-
TSLength token_end_position;
4235
TSLength token_start_position;
43-
TSLength error_end_position;
4436

4537
const char *chunk;
4638
size_t chunk_start;
4739
size_t chunk_size;
4840

4941
size_t lookahead_size;
5042
int32_t lookahead;
51-
TSStateId starting_state;
5243
TSSymbol result_symbol;
53-
bool result_follows_error;
54-
int32_t first_unexpected_character;
5544

5645
TSInput input;
5746
TSDebugger debugger;
@@ -94,7 +83,7 @@ struct TSLanguage {
9483
const unsigned short *parse_table;
9584
const TSParseActionEntry *parse_actions;
9685
const TSStateId *lex_states;
97-
bool (*lex_fn)(TSLexer *, TSStateId, bool);
86+
bool (*lex_fn)(TSLexer *, TSStateId);
9887
};
9988

10089
/*
@@ -106,22 +95,18 @@ struct TSLanguage {
10695
next_state: \
10796
lookahead = lexer->lookahead;
10897

109-
#define GO_TO_STATE(state_value) \
110-
{ \
111-
state = state_value; \
112-
goto next_state; \
113-
}
114-
11598
#define ADVANCE(state_value) \
11699
{ \
117-
lexer->advance(lexer, state_value, TSTransitionTypeMain); \
118-
GO_TO_STATE(state_value); \
100+
lexer->advance(lexer, state_value, false); \
101+
state = state_value; \
102+
goto next_state; \
119103
}
120104

121105
#define SKIP(state_value) \
122106
{ \
123-
lexer->advance(lexer, state_value, TSTransitionTypeSeparator); \
124-
GO_TO_STATE(state_value); \
107+
lexer->advance(lexer, state_value, true); \
108+
state = state_value; \
109+
goto next_state; \
125110
}
126111

127112
#define ACCEPT_TOKEN(symbol_value) \
@@ -130,14 +115,7 @@ struct TSLanguage {
130115
return true; \
131116
}
132117

133-
#define LEX_ERROR() \
134-
if (error_mode) { \
135-
if (state == TS_STATE_ERROR) \
136-
lexer->advance(lexer, state, TSTransitionTypeError); \
137-
GO_TO_STATE(TS_STATE_ERROR); \
138-
} else { \
139-
return false; \
140-
}
118+
#define LEX_ERROR() return false
141119

142120
/*
143121
* Parse Table Macros

spec/runtime/parser_spec.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,15 @@ describe("Parser", [&]() {
162162
AssertThat(get_node_text(last), Equals("true"));
163163
});
164164
});
165+
166+
describe("when there is an unterminated error", [&]() {
167+
it("maintains a consistent tree", [&]() {
168+
ts_document_set_language(doc, get_test_language("javascript"));
169+
set_text("a; /* b");
170+
assert_root_node(
171+
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
172+
});
173+
});
165174
});
166175

167176
describe("handling extra tokens", [&]() {

src/compiler/generate_code/c_code.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ class CCodeGenerator {
184184

185185
void add_lex_function() {
186186
line(
187-
"static bool ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) {");
187+
"static bool ts_lex(TSLexer *lexer, TSStateId state) {");
188188
indent([&]() {
189189
line("START_LEXER();");
190190
_switch("state", [&]() {

src/runtime/length.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ static inline void ts_length_set_unknown(TSLength *self) {
1414
self->columns = 0;
1515
}
1616

17+
/*
 * Return whichever of the two lengths spans fewer characters.
 *
 * Only the `chars` field takes part in the comparison. When both lengths
 * have the same character count, `len2` is returned.
 */
static inline TSLength ts_length_min(TSLength len1, TSLength len2) {
  if (len1.chars < len2.chars) {
    return len1;
  }
  return len2;
}
20+
1721
static inline TSLength ts_length_add(TSLength len1, TSLength len2) {
1822
TSLength result;
1923
result.chars = len1.chars + len2.chars;

src/runtime/lexer.c

Lines changed: 6 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -47,16 +47,14 @@ static void ts_lexer__get_lookahead(TSLexer *self) {
4747
LOG_LOOKAHEAD();
4848
}
4949

50-
static void ts_lexer__advance(TSLexer *self, TSStateId state,
51-
TSTransitionType transition_type) {
50+
static void ts_lexer__advance(TSLexer *self, TSStateId state, bool skip) {
5251

5352
if (self->chunk == empty_chunk)
5453
return;
5554

5655
if (self->lookahead_size) {
5756
self->current_position.bytes += self->lookahead_size;
5857
self->current_position.chars++;
59-
6058
if (self->lookahead == '\n') {
6159
self->current_position.rows++;
6260
self->current_position.columns = 0;
@@ -65,25 +63,11 @@ static void ts_lexer__advance(TSLexer *self, TSStateId state,
6563
}
6664
}
6765

68-
switch (transition_type) {
69-
case TSTransitionTypeSeparator:
70-
if (self->result_follows_error) {
71-
LOG("skip_error state:%d", state);
72-
} else {
73-
LOG("skip_separator state:%d", state);
74-
self->token_start_position = self->current_position;
75-
}
76-
break;
77-
case TSTransitionTypeError:
78-
LOG("skip_error state:%d", state);
79-
self->result_follows_error = true;
80-
self->error_end_position = self->current_position;
81-
if (!self->first_unexpected_character)
82-
self->first_unexpected_character = self->lookahead;
83-
break;
84-
default:
85-
LOG("advance state:%d", state);
86-
break;
66+
if (skip) {
67+
LOG("skip_separator state:%d", state);
68+
self->token_start_position = self->current_position;
69+
} else {
70+
LOG("advance state:%d", state);
8771
}
8872

8973
if (self->current_position.bytes >= self->chunk_start + self->chunk_size)
@@ -109,7 +93,6 @@ void ts_lexer_init(TSLexer *self) {
10993

11094
static inline void ts_lexer__reset(TSLexer *self, TSLength position) {
11195
self->token_start_position = position;
112-
self->token_end_position = position;
11396
self->current_position = position;
11497

11598
self->chunk = 0;
@@ -132,34 +115,12 @@ void ts_lexer_reset(TSLexer *self, TSLength position) {
132115

133116
void ts_lexer_start(TSLexer *self, TSStateId lex_state) {
134117
LOG("start_lex state:%d, pos:%lu", lex_state, self->current_position.chars);
135-
LOG_LOOKAHEAD();
136118

137-
self->starting_state = lex_state;
138119
self->token_start_position = self->current_position;
139-
self->result_follows_error = false;
140120
self->result_symbol = 0;
141-
self->first_unexpected_character = 0;
142121

143122
if (!self->chunk)
144123
ts_lexer__get_chunk(self);
145124
if (!self->lookahead_size)
146125
ts_lexer__get_lookahead(self);
147126
}
148-
149-
void ts_lexer_finish(TSLexer *self, TSLexerResult *result) {
150-
result->padding =
151-
ts_length_sub(self->token_start_position, self->token_end_position);
152-
153-
if (self->result_follows_error) {
154-
result->symbol = ts_builtin_sym_error;
155-
result->size =
156-
ts_length_sub(self->error_end_position, self->token_start_position);
157-
result->first_unexpected_character = self->first_unexpected_character;
158-
ts_lexer_reset(self, self->error_end_position);
159-
} else {
160-
result->symbol = self->result_symbol;
161-
result->size =
162-
ts_length_sub(self->current_position, self->token_start_position);
163-
self->token_end_position = self->current_position;
164-
}
165-
}

src/runtime/lexer.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,10 @@ extern "C" {
77

88
#include "tree_sitter/parser.h"
99

10-
typedef struct {
11-
TSSymbol symbol;
12-
TSLength padding;
13-
TSLength size;
14-
int32_t first_unexpected_character;
15-
} TSLexerResult;
16-
1710
void ts_lexer_init(TSLexer *);
1811
void ts_lexer_set_input(TSLexer *, TSInput);
1912
void ts_lexer_reset(TSLexer *, TSLength);
2013
void ts_lexer_start(TSLexer *, TSStateId);
21-
void ts_lexer_finish(TSLexer *, TSLexerResult *);
2214

2315
#ifdef __cplusplus
2416
}

src/runtime/parser.c

Lines changed: 53 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -242,37 +242,67 @@ static bool parser__condense_stack(Parser *self) {
242242
return result;
243243
}
244244

245-
static TSTree *parser__lex(Parser *self, TSStateId parse_state, bool error_mode) {
246-
TSStateId state = self->language->lex_states[parse_state];
247-
LOG("lex state:%d", state);
245+
static TSTree *parser__lex(Parser *self, TSStateId parse_state) {
246+
TSStateId start_state = self->language->lex_states[parse_state];
247+
TSStateId current_state = start_state;
248+
TSLength start_position = self->lexer.current_position;
249+
TSLength position = start_position;
250+
LOG("lex state:%d", start_state);
251+
252+
bool skipped_error = false;
253+
int32_t first_error_character = 0;
254+
TSLength error_start_position, error_end_position;
255+
256+
ts_lexer_start(&self->lexer, start_state);
257+
258+
while (!self->language->lex_fn(&self->lexer, current_state)) {
259+
if (current_state != TS_STATE_ERROR) {
260+
LOG("retry_in_error_mode");
261+
ts_lexer_reset(&self->lexer, position);
262+
ts_lexer_start(&self->lexer, start_state);
263+
current_state = TS_STATE_ERROR;
264+
continue;
265+
}
248266

249-
TSLength position = self->lexer.current_position;
267+
if (self->lexer.lookahead == 0) {
268+
self->lexer.result_symbol = ts_builtin_sym_error;
269+
break;
270+
}
250271

251-
ts_lexer_start(&self->lexer, state);
252-
if (!self->language->lex_fn(&self->lexer, state, error_mode)) {
253-
ts_lexer_reset(&self->lexer, position);
254-
ts_lexer_start(&self->lexer, state);
255-
assert(self->language->lex_fn(&self->lexer, TS_STATE_ERROR, true));
256-
}
272+
if (self->lexer.current_position.chars == position.chars) {
273+
if (!skipped_error) {
274+
error_start_position = self->lexer.current_position;
275+
first_error_character = self->lexer.lookahead;
276+
}
277+
skipped_error = true;
278+
self->lexer.advance(&self->lexer, TS_STATE_ERROR, false);
279+
error_end_position = self->lexer.current_position;
280+
}
257281

258-
TSLexerResult lex_result;
259-
ts_lexer_finish(&self->lexer, &lex_result);
282+
position = self->lexer.current_position;
283+
}
260284

261285
TSTree *result;
262-
if (lex_result.symbol == ts_builtin_sym_error) {
263-
result = ts_tree_make_error(lex_result.size, lex_result.padding,
264-
lex_result.first_unexpected_character);
286+
287+
if (skipped_error) {
288+
error_start_position = ts_length_min(error_start_position, self->lexer.token_start_position);
289+
TSLength padding = ts_length_sub(error_start_position, start_position);
290+
TSLength size = ts_length_sub(error_end_position, error_start_position);
291+
ts_lexer_reset(&self->lexer, error_end_position);
292+
result = ts_tree_make_error(size, padding, first_error_character);
265293
} else {
266-
result = ts_tree_make_leaf(
267-
lex_result.symbol, lex_result.padding, lex_result.size,
268-
ts_language_symbol_metadata(self->language, lex_result.symbol));
269-
if (!result)
270-
return NULL;
271-
result->parse_state = parse_state;
294+
TSSymbol symbol = self->lexer.result_symbol;
295+
TSLength padding = ts_length_sub(self->lexer.token_start_position, start_position);
296+
TSLength size = ts_length_sub(self->lexer.current_position, self->lexer.token_start_position);
297+
result = ts_tree_make_leaf(symbol, padding, size,
298+
ts_language_symbol_metadata(self->language, symbol));
272299
}
273300

274-
result->first_leaf.lex_state = state;
301+
if (!result)
302+
return NULL;
275303

304+
result->parse_state = parse_state;
305+
result->first_leaf.lex_state = start_state;
276306
return result;
277307
}
278308

@@ -333,8 +363,7 @@ static TSTree *parser__get_lookahead(Parser *self, StackVersion version,
333363

334364
ts_lexer_reset(&self->lexer, position);
335365
TSStateId parse_state = ts_stack_top_state(self->stack, version);
336-
bool error_mode = parse_state == TS_STATE_ERROR;
337-
return parser__lex(self, parse_state, error_mode);
366+
return parser__lex(self, parse_state);
338367

339368
error:
340369
return NULL;

src/runtime/tree.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,15 @@ void ts_tree_edit(TSTree *self, TSInputEdit edit) {
369369
}
370370
}
371371

372+
/*
 * Write a human-readable rendering of the character `c` into the buffer `s`
 * of capacity `n`, using the same truncation rules as snprintf.
 *
 *   - 0 is rendered as EOF.
 *   - Newline, tab and carriage return are rendered as quoted escape
 *     sequences ('\n', '\t', '\r') so the output stays on a single line.
 *   - Printable ASCII (0x20..0x7E) is rendered as the quoted character.
 *   - Everything else (other control characters, DEL, non-ASCII code points,
 *     negative values) is rendered as its decimal numeric value.
 *
 * Returns the number of characters the full rendering requires, excluding the
 * terminating NUL (which may exceed `n` when the output was truncated), or 0
 * if snprintf reported an error.
 */
static size_t ts_tree__write_char_to_string(char *s, size_t n, int32_t c) {
  int written;

  if (c == 0)
    written = snprintf(s, n, "EOF");
  else if (c == '\n')
    written = snprintf(s, n, "'\\n'");
  else if (c == '\t')
    written = snprintf(s, n, "'\\t'");
  else if (c == '\r')
    written = snprintf(s, n, "'\\r'");
  else if (c >= 0x20 && c < 0x7f)
    /* Printable ASCII only: a raw control byte would corrupt the output. */
    written = snprintf(s, n, "'%c'", (int)c);
  else
    written = snprintf(s, n, "%d", (int)c);

  /* A negative return signals an encoding error; don't let it wrap around
   * to a huge size_t that the caller would add to its cursor. */
  return written < 0 ? 0 : (size_t)written;
}
380+
372381
static size_t ts_tree__write_to_string(const TSTree *self,
373382
const TSLanguage *language, char *string,
374383
size_t limit, bool is_root,
@@ -386,8 +395,8 @@ static size_t ts_tree__write_to_string(const TSTree *self,
386395
if (visible) {
387396
if (self->symbol == ts_builtin_sym_error && self->child_count == 0 &&
388397
self->size.chars > 0) {
389-
cursor +=
390-
snprintf(*writer, limit, "(UNEXPECTED '%c'", self->lookahead_char);
398+
cursor += snprintf(*writer, limit, "(UNEXPECTED ");
399+
cursor += ts_tree__write_char_to_string(*writer, limit, self->lookahead_char);
391400
} else {
392401
cursor += snprintf(*writer, limit, "(%s",
393402
ts_language_symbol_name(language, self->symbol));

src/runtime/tree.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ typedef struct TSTree {
2525
size_t named_child_count;
2626
union {
2727
struct TSTree **children;
28-
char lookahead_char;
28+
int32_t lookahead_char;
2929
};
3030

3131
TSLength padding;

0 commit comments

Comments
 (0)