Skip to content

Commit 2bb20fe

Browse files
rooneyamaanq
andauthored
feat: allow external scanners to use the logger
Co-authored-by: Amaan Qureshi <[email protected]>
1 parent fec6c77 commit 2bb20fe

File tree

5 files changed

+40
-0
lines changed

5 files changed

+40
-0
lines changed

cli/src/tests/parser_test.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1422,6 +1422,30 @@ if foo && bar || baz {}
14221422
parser.parse(&input, Some(&tree)).unwrap();
14231423
}
14241424

1425+
#[test]
1426+
fn test_parsing_with_scanner_logging() {
1427+
let dir = fixtures_dir().join("test_grammars").join("external_tokens");
1428+
let grammar_json = load_grammar_file(&dir.join("grammar.js"), None).unwrap();
1429+
let (grammar_name, parser_code) = generate_parser_for_grammar(&grammar_json).unwrap();
1430+
1431+
let mut parser = Parser::new();
1432+
parser
1433+
.set_language(&get_test_language(&grammar_name, &parser_code, Some(&dir)))
1434+
.unwrap();
1435+
1436+
let mut found = false;
1437+
parser.set_logger(Some(Box::new(|log_type, message| {
1438+
if log_type == LogType::Lex && message == "Found a percent string" {
1439+
found = true;
1440+
}
1441+
})));
1442+
1443+
let source_code = "x + %(sup (external) scanner?)";
1444+
1445+
parser.parse(source_code, None).unwrap();
1446+
assert!(found);
1447+
}
1448+
14251449
const fn simple_range(start: usize, end: usize) -> Range {
14261450
Range {
14271451
start_byte: start,

docs/section-3-creating-parsers.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,7 @@ This function is responsible for recognizing external tokens. It should return `
862862
* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of codepoints since the start of the current line. The codepoint position is recalculated on every call to this function by reading from the start of the line.
863863
* **`bool (*is_at_included_range_start)(const TSLexer *)`** - A function for checking whether the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), the scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`.
864864
* **`bool (*eof)(const TSLexer *)`** - A function for determining whether the lexer is at the end of the file. The value of `lookahead` will be `0` at the end of a file, but this function should be used instead of checking for that value because the `0` or "NUL" value is also a valid character that could be present in the file being parsed.
865+
* **`void (*log)(const TSLexer *, const char *format, ...)`** - A `printf`-like function for logging. The log is viewable through, e.g., `tree-sitter parse --debug` or the browser's console after checking the `log` option in the [Playground](./playground).
865866

866867
The third argument to the `scan` function is an array of booleans that indicates which of the external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic.
867868

lib/src/lexer.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "./subtree.h"
44
#include "./length.h"
55
#include "./unicode.h"
6+
#include <stdarg.h>
67

78
#define LOG(message, character) \
89
if (self->logger.log) { \
@@ -284,6 +285,17 @@ static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
284285
}
285286
}
286287

288+
static void ts_lexer__log(const TSLexer *_self, const char *fmt, ...) {
289+
Lexer *self = (Lexer *)_self;
290+
va_list args;
291+
va_start(args, fmt);
292+
if (self->logger.log) {
293+
vsnprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, fmt, args);
294+
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer);
295+
}
296+
va_end(args);
297+
}
298+
287299
void ts_lexer_init(Lexer *self) {
288300
*self = (Lexer) {
289301
.data = {
@@ -295,6 +307,7 @@ void ts_lexer_init(Lexer *self) {
295307
.get_column = ts_lexer__get_column,
296308
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
297309
.eof = ts_lexer__eof,
310+
.log = ts_lexer__log,
298311
.lookahead = 0,
299312
.result_symbol = 0,
300313
},

lib/src/parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct TSLexer {
4747
uint32_t (*get_column)(TSLexer *);
4848
bool (*is_at_included_range_start)(const TSLexer *);
4949
bool (*eof)(const TSLexer *);
50+
void (*log)(const TSLexer *, const char *, ...);
5051
};
5152

5253
typedef enum {

test/fixtures/test_grammars/external_tokens/scanner.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ bool tree_sitter_external_tokens_external_scanner_scan(
7777

7878
for (;;) {
7979
if (scanner->depth == 0) {
80+
lexer->log(lexer, "Found a percent string");
8081
lexer->result_symbol = percent_string;
8182
return true;
8283
}

0 commit comments

Comments
 (0)