Skip to content

Commit 4c08325

Browse files
committed
fix(lib): advance the lookahead end byte by 4 when there's an invalid code point
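This helps in the case where an edit is made in the middle of a code point while bytes 1-3 are still valid: up to 4 bytes may be read before a code point is identified as invalid, so the lookahead end byte must advance by 4 bytes rather than 1.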
1 parent 61d0395 commit 4c08325

File tree

2 files changed

+24
-1
lines changed

2 files changed

+24
-1
lines changed

cli/src/tests/tree_test.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,29 @@ fn test_get_changed_ranges() {
679679
}
680680
}
681681

682+
// Regression test for edits that land in the middle of a multi-byte UTF-8
// code point. The source is parsed, a single byte is inserted inside a
// 3-byte code point, the result is re-parsed incrementally, the edit is then
// inverted and the source re-parsed once more. The final tree's S-expression
// must equal that of the first tree.
#[test]
683+
fn test_consistency_with_mid_codepoint_edit() {
684+
let mut parser = Parser::new();
685+
parser.set_language(&get_language("php/php")).unwrap();
686+
// PHP source whose heredoc delimiter is made of two 3-byte UTF-8 code
// points (E5 AD 97 and E6 BC A2), occupying bytes 12..=17.
let mut source_code =
687+
b"\n<?php\n\n<<<'\xE5\xAD\x97\xE6\xBC\xA2'\n T\n\xE5\xAD\x97\xE6\xBC\xA2;".to_vec();
688+
let mut tree = parser.parse(&source_code, None).unwrap();
689+

690+
// Insert one byte (46, ASCII '.') at byte offset 17, i.e. between the
// second and third bytes of the code point E6 BC A2, leaving invalid UTF-8.
let edit = Edit {
691+
position: 17,
692+
deleted_length: 0,
693+
inserted_text: vec![46],
694+
};
695+
perform_edit(&mut tree, &mut source_code, &edit).unwrap();
696+
// Incremental re-parse of the edited source, reusing the edited tree.
let mut tree2 = parser.parse(&source_code, Some(&tree)).unwrap();
697+

698+
// NOTE(review): `invert_edit` is defined elsewhere; presumably it builds
// the edit that undoes `edit` — confirm against its definition.
let inverted = invert_edit(&source_code, &edit);
699+
perform_edit(&mut tree2, &mut source_code, &inverted).unwrap();
700+
let tree3 = parser.parse(&source_code, Some(&tree2)).unwrap();
701+

702+
// After the round trip, the re-parsed tree must match the first tree.
assert_eq!(tree3.root_node().to_sexp(), tree.root_node().to_sexp());
703+
}
704+
682705
/// Returns the byte index of the first occurrence of `substring` in `text`.
///
/// # Panics
///
/// Panics if `text` is not valid UTF-8 or if `substring` does not occur in it.
fn index_of(text: &[u8], substring: &str) -> usize {
683706
str::from_utf8(text).unwrap().find(substring).unwrap()
684707
}

lib/src/lexer.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
365365
// Therefore, the next byte *after* the current (invalid) character
366366
// affects the interpretation of the current character.
367367
if (self->data.lookahead == TS_DECODE_ERROR) {
368-
current_lookahead_end_byte++;
368+
current_lookahead_end_byte += 4; // the maximum number of bytes read to identify an invalid code point
369369
}
370370

371371
if (current_lookahead_end_byte > *lookahead_end_byte) {

0 commit comments

Comments
 (0)