Skip to content

Commit 44ebabd

Browse files
Author: Max Brunsfeld
Merge pull request tree-sitter#191 from tree-sitter/incomplete-multi-byte-characters
Handle input chunks that end within multi-byte characters
2 parents 126f84a + acc937b commit 44ebabd

File tree

10 files changed: 64 additions and 116 deletions

src/runtime/lexer.c

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88
#define LOG(...) \
99
if (self->logger.log) { \
1010
snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \
11-
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
11+
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
1212
}
1313

1414
#define LOG_CHARACTER(message, character) \
15-
LOG(character < 255 ? message " character:'%c'" : message " character:%d", character)
15+
LOG( \
16+
32 <= character && character < 127 ? \
17+
message " character:'%c'" : \
18+
message " character:%d", character \
19+
)
1620

1721
static const char empty_chunk[3] = { 0, 0 };
1822

@@ -27,6 +31,12 @@ static void ts_lexer__get_chunk(Lexer *self) {
2731
if (!self->chunk_size) self->chunk = empty_chunk;
2832
}
2933

34+
typedef utf8proc_ssize_t (*DecodeFunction)(
35+
const utf8proc_uint8_t *,
36+
utf8proc_ssize_t,
37+
utf8proc_int32_t *
38+
);
39+
3040
static void ts_lexer__get_lookahead(Lexer *self) {
3141
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
3242
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
@@ -38,15 +48,22 @@ static void ts_lexer__get_lookahead(Lexer *self) {
3848
return;
3949
}
4050

41-
if (self->input.encoding == TSInputEncodingUTF8) {
42-
int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead);
43-
if (lookahead_size < 0) {
44-
self->lookahead_size = 1;
45-
} else {
46-
self->lookahead_size = lookahead_size;
47-
}
48-
} else {
49-
self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead);
51+
DecodeFunction decode =
52+
self->input.encoding == TSInputEncodingUTF8 ? utf8proc_iterate : utf16_iterate;
53+
54+
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
55+
56+
// If this chunk ended in the middle of a multi-byte character,
57+
// try again with a fresh chunk.
58+
if (self->data.lookahead == -1 && size < 4) {
59+
ts_lexer__get_chunk(self);
60+
chunk = (const uint8_t *)self->chunk;
61+
size = self->chunk_size;
62+
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
63+
}
64+
65+
if (self->data.lookahead == -1) {
66+
self->lookahead_size = 1;
5067
}
5168
}
5269

src/runtime/utf16.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
#include "runtime/utf16.h"
22

3-
int utf16_iterate(const uint8_t *string, size_t length, int32_t *code_point) {
3+
utf8proc_ssize_t utf16_iterate(
4+
const utf8proc_uint8_t *string,
5+
utf8proc_ssize_t length,
6+
utf8proc_int32_t *code_point
7+
) {
48
if (length < 2) {
59
*code_point = -1;
610
return 0;

src/runtime/utf16.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@ extern "C" {
77

88
#include <stdint.h>
99
#include <stdlib.h>
10+
#include "utf8proc.h"
1011

1112
// Analogous to utf8proc's utf8proc_iterate function. Reads one code point from
12-
// the given string and stores it in the location pointed to by `code_point`.
13+
// the given UTF16 string and stores it in the location pointed to by `code_point`.
1314
// Returns the number of bytes in `string` that were read.
14-
int utf16_iterate(const uint8_t *string, size_t length, int32_t *code_point);
15+
utf8proc_ssize_t utf16_iterate(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_int32_t *);
1516

1617
#ifdef __cplusplus
1718
}

test/helpers/encoding_helpers.cc

Lines changed: 0 additions & 64 deletions
This file was deleted.

test/helpers/encoding_helpers.h

Lines changed: 0 additions & 15 deletions
This file was deleted.

test/helpers/spy_input.cc

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#include "helpers/spy_input.h"
2-
#include "helpers/encoding_helpers.h"
32
#include "helpers/point_helpers.h"
43
#include "runtime/point.h"
54
#include <string.h>
@@ -46,19 +45,13 @@ const char *SpyInput::read(void *payload, uint32_t byte_offset,
4645
TSPoint position, uint32_t *bytes_read) {
4746
auto spy = static_cast<SpyInput *>(payload);
4847

49-
if (byte_offset >= spy->content.size()) {
50-
*bytes_read = 0;
51-
return "";
48+
unsigned end_byte = byte_offset + spy->chars_per_chunk;
49+
if (end_byte > spy->content.size()) {
50+
end_byte = spy->content.size();
5251
}
5352

54-
long byte_count = string_byte_for_character(spy->encoding, spy->content, byte_offset, spy->chars_per_chunk);
55-
if (byte_count < 0) {
56-
byte_count = spy->content.size() - byte_offset;
57-
}
58-
59-
string result = spy->content.substr(byte_offset, byte_count);
60-
*bytes_read = byte_count;
61-
add_byte_range(&spy->ranges_read, byte_offset, byte_count);
53+
*bytes_read = end_byte - byte_offset;
54+
add_byte_range(&spy->ranges_read, byte_offset, *bytes_read);
6255

6356
/*
6457
* This class stores its entire `content` in a contiguous buffer, but we want
@@ -70,9 +63,9 @@ const char *SpyInput::read(void *payload, uint32_t byte_offset,
7063
* can detect code reading too many bytes from the buffer.
7164
*/
7265
delete[] spy->buffer;
73-
if (byte_count) {
74-
spy->buffer = new char[byte_count];
75-
memcpy(spy->buffer, result.data(), byte_count);
66+
if (*bytes_read) {
67+
spy->buffer = new char[*bytes_read]();
68+
memcpy(spy->buffer, spy->content.data() + byte_offset, *bytes_read);
7669
} else {
7770
spy->buffer = nullptr;
7871
}

test/integration/real_grammars.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "helpers/spy_input.h"
66
#include "helpers/stderr_logger.h"
77
#include "helpers/point_helpers.h"
8-
#include "helpers/encoding_helpers.h"
98
#include "helpers/record_alloc.h"
109
#include "helpers/random_helpers.h"
1110
#include "helpers/scope_sequence.h"
@@ -57,7 +56,7 @@ for (auto &language_name : test_languages) {
5756
SpyInput *input;
5857

5958
it(("parses " + entry.description + ": initial parse").c_str(), [&]() {
60-
input = new SpyInput(entry.input, 3);
59+
input = new SpyInput(entry.input, 4);
6160
if (debug_graphs_enabled) printf("%s\n\n", input->content.c_str());
6261

6362
TSTree *tree = ts_parser_parse(parser, nullptr, input->input());
@@ -77,8 +76,8 @@ for (auto &language_name : test_languages) {
7776
set<pair<size_t, string>> insertions;
7877

7978
for (size_t i = 0; i < 60; i++) {
80-
size_t edit_position = default_generator(utf8_char_count(entry.input));
81-
size_t deletion_size = default_generator(utf8_char_count(entry.input) - edit_position);
79+
size_t edit_position = default_generator(entry.input.size());
80+
size_t deletion_size = default_generator(entry.input.size() - edit_position);
8281
string inserted_text = default_generator.words(default_generator(4) + 1);
8382

8483
if (insertions.insert({edit_position, inserted_text}).second) {

test/runtime/parser_test.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,21 @@ describe("Parser", [&]() {
544544
root = ts_tree_root_node(tree);
545545
AssertThat(ts_node_end_point(root), Equals<TSPoint>({0, 28}));
546546
});
547+
548+
it("handles input chunks that end in the middle of multi-byte characters", [&]() {
549+
ts_parser_set_language(parser, load_real_language("c"));
550+
spy_input->content = "A b = {'👍','👍'};";
551+
spy_input->chars_per_chunk = 4;
552+
553+
tree = ts_parser_parse(parser, nullptr, spy_input->input());
554+
root = ts_tree_root_node(tree);
555+
assert_root_node(
556+
"(translation_unit (declaration "
557+
"(type_identifier) "
558+
"(init_declarator "
559+
"(identifier) "
560+
"(initializer_list (char_literal) (char_literal)))))");
561+
});
547562
});
548563

549564
describe("set_language(language)", [&]() {

test/runtime/tree_test.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
#include "helpers/load_language.h"
1212
#include "helpers/random_helpers.h"
1313
#include "helpers/read_test_entries.h"
14-
#include "helpers/encoding_helpers.h"
1514
#include "helpers/tree_helpers.h"
1615

1716
TSPoint point(uint32_t row, uint32_t column) {
@@ -71,8 +70,8 @@ describe("Tree", [&]() {
7170
for (unsigned j = 0; j < 10; j++) {
7271
random.sleep_some();
7372

74-
size_t edit_position = random(utf8_char_count(input->content));
75-
size_t deletion_size = random(utf8_char_count(input->content) - edit_position);
73+
size_t edit_position = random(input->content.size());
74+
size_t deletion_size = random(input->content.size() - edit_position);
7675
string inserted_text = random.words(random(4) + 1);
7776

7877
TSInputEdit edit = input->replace(edit_position, deletion_size, inserted_text);

tests.gyp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
'test/compiler/rules/character_set_test.cc',
5151
'test/compiler/rules/rule_test.cc',
5252
'test/compiler/util/string_helpers_test.cc',
53-
'test/helpers/encoding_helpers.cc',
5453
'test/helpers/file_helpers.cc',
5554
'test/helpers/load_language.cc',
5655
'test/helpers/point_helpers.cc',

0 commit comments

Comments (0)