Skip to content

Commit 500f432

Browse files
committed
feat: add the ability to specify a custom decode function
1 parent e27160b commit 500f432

File tree

10 files changed

+347
-16
lines changed

10 files changed

+347
-16
lines changed

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ tree-sitter-loader.workspace = true
7171
tree-sitter-tags.workspace = true
7272

7373
[dev-dependencies]
74+
encoding_rs = "0.8.35"
75+
widestring = "1.1.0"
7476
tree_sitter_proc_macro = { path = "src/tests/proc_macro", package = "tree-sitter-tests-proc-macro" }
7577

7678
tempfile.workspace = true

cli/src/tests/parser_test.rs

Lines changed: 171 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::{
44
};
55

66
use tree_sitter::{
7-
IncludedRangesError, InputEdit, LogType, ParseOptions, ParseState, Parser, Point, Range,
7+
Decode, IncludedRangesError, InputEdit, LogType, ParseOptions, ParseState, Parser, Point, Range,
88
};
99
use tree_sitter_generate::{generate_parser_for_grammar, load_grammar_file};
1010
use tree_sitter_proc_macro::retry;
@@ -1646,6 +1646,176 @@ fn test_parsing_by_halting_at_offset() {
16461646
assert!(seen_byte_offsets.len() > 100);
16471647
}
16481648

1649+
/// Parses native-endian UTF-32 input by supplying a custom [`Decode`]
/// implementation, and checks that the resulting tree matches the expected
/// structure for the Rust snippet.
#[test]
fn test_decode_utf32() {
    use widestring::u32cstr;

    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    let utf32_text = u32cstr!("pub fn foo() { println!(\"€50\"); }");
    // SAFETY: `as_ptr()` points at `len()` initialized `u32` code units, so
    // `len() * 4` bytes are readable from it. `u8` has an alignment of 1, so the
    // cast cannot produce a misaligned pointer, and the string the bytes are
    // borrowed from stays alive for as long as the slice is used below.
    let utf32_text = unsafe {
        std::slice::from_raw_parts(utf32_text.as_ptr().cast::<u8>(), utf32_text.len() * 4)
    };

    struct U32Decoder;

    impl Decode for U32Decoder {
        /// Reads one 4-byte code unit and reports 4 bytes consumed, or
        /// `(0, 0)` when fewer than 4 bytes remain.
        fn decode(bytes: &[u8]) -> (i32, u32) {
            match bytes {
                // The buffer above is a reinterpretation of in-memory `u32`s, so
                // it is in native byte order; a native-endian read handles both
                // big- and little-endian targets without `cfg` duplication.
                [b0, b1, b2, b3, ..] => (i32::from_ne_bytes([*b0, *b1, *b2, *b3]), 4),
                _ => {
                    println!("bad decode: {bytes:?}");
                    (0, 0)
                }
            }
        }
    }

    let tree = parser
        .parse_custom_encoding::<U32Decoder, _, _>(
            // Hand back everything from `offset` onwards; an offset past the
            // end yields an empty slice instead of panicking.
            &mut |offset, _| utf32_text.get(offset..).unwrap_or_default(),
            None,
            None,
        )
        .unwrap();

    assert_eq!(
        tree.root_node().to_sexp(),
        "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (expression_statement (macro_invocation macro: (identifier) (token_tree (string_literal (string_content))))))))"
    );
}
1707+
1708+
/// Parses Windows-1252 encoded input through a custom single-byte [`Decode`]
/// implementation and checks the shape of the resulting tree.
#[test]
fn test_decode_cp1252() {
    use encoding_rs::WINDOWS_1252;

    struct Cp1252Decoder;

    impl Decode for Cp1252Decoder {
        /// Maps the first byte straight to a code point of the same value and
        /// consumes one byte; returns `(0, 0)` when the input is empty.
        fn decode(bytes: &[u8]) -> (i32, u32) {
            bytes
                .first()
                .map_or((0, 0), |&first| (i32::from(first), 1))
        }
    }

    let (encoded, _, _) = WINDOWS_1252.encode("pub fn foo() { println!(\"€50\"); }");

    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // The read callback returns the rest of the encoded buffer from `offset`.
    let mut read_from = |offset, _| &encoded[offset..];
    let tree = parser
        .parse_custom_encoding::<Cp1252Decoder, _, _>(&mut read_from, None, None)
        .unwrap();

    let expected_sexp = "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (expression_statement (macro_invocation macro: (identifier) (token_tree (string_literal (string_content))))))))";
    assert_eq!(tree.root_node().to_sexp(), expected_sexp);
}
1743+
1744+
/// Parses input encoded with `encoding_rs::MACINTOSH` through a custom
/// single-byte [`Decode`] implementation and checks the shape of the
/// resulting tree.
#[test]
fn test_decode_macintosh() {
    use encoding_rs::MACINTOSH;

    struct MacintoshDecoder;

    impl Decode for MacintoshDecoder {
        /// Maps the first byte straight to a code point of the same value (no
        /// character-table lookup) and consumes one byte; returns `(0, 0)`
        /// when the input is empty.
        fn decode(bytes: &[u8]) -> (i32, u32) {
            match bytes.first() {
                Some(&first) => (i32::from(first), 1),
                None => (0, 0),
            }
        }
    }

    let (encoded, _, _) = MACINTOSH.encode("pub fn foo() { println!(\"€50\"); }");

    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // The read callback returns the rest of the encoded buffer from `offset`.
    let mut read_from = |offset, _| &encoded[offset..];
    let tree = parser
        .parse_custom_encoding::<MacintoshDecoder, _, _>(&mut read_from, None, None)
        .unwrap();

    let expected_sexp = "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (expression_statement (macro_invocation macro: (identifier) (token_tree (string_literal (string_content))))))))";
    assert_eq!(tree.root_node().to_sexp(), expected_sexp);
}
1779+
1780+
/// Parses input in an ad-hoc "UTF-24LE" layout (each code point stored as
/// three little-endian bytes) through a custom [`Decode`] implementation and
/// checks the shape of the resulting tree.
#[test]
fn test_decode_utf24le() {
    struct Utf24LeDecoder;

    impl Decode for Utf24LeDecoder {
        /// Reads three little-endian bytes as one code point and consumes
        /// three bytes; returns `(0, 0)` when fewer than three bytes remain.
        fn decode(bytes: &[u8]) -> (i32, u32) {
            match bytes {
                [low, mid, high, ..] => (i32::from_le_bytes([*low, *mid, *high, 0]), 3),
                _ => (0, 0),
            }
        }
    }

    // Keep the three low-order bytes of every code point, least significant
    // byte first.
    let encoded: Vec<u8> = "pub fn foo() { println!(\"€50\"); }"
        .chars()
        .flat_map(|c| {
            let [low, mid, high, _] = (c as u32).to_le_bytes();
            [low, mid, high]
        })
        .collect();

    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // The read callback returns the rest of the encoded buffer from `offset`.
    let mut read_from = |offset, _| &encoded[offset..];
    let tree = parser
        .parse_custom_encoding::<Utf24LeDecoder, _, _>(&mut read_from, None, None)
        .unwrap();

    let expected_sexp = "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (expression_statement (macro_invocation macro: (identifier) (token_tree (string_literal (string_content))))))))";
    assert_eq!(tree.root_node().to_sexp(), expected_sexp);
}
1818+
16491819
const fn simple_range(start: usize, end: usize) -> Range {
16501820
Range {
16511821
start_byte: start,

docs/section-2-using-parsers.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,22 @@ typedef struct {
149149
uint32_t *bytes_read
150150
);
151151
TSInputEncoding encoding;
152+
DecodeFunction decode;
152153
} TSInput;
153154
```
154155

156+
If you want to parse text that is not encoded in UTF-8 or UTF-16, you can set the `decode` field of the input to a function of your own that decodes the text. The signature of the `DecodeFunction` is as follows:
157+
158+
```c
159+
typedef uint32_t (*DecodeFunction)(
160+
const uint8_t *string,
161+
uint32_t length,
162+
int32_t *code_point
163+
);
164+
```
165+
166+
The `string` argument is a pointer to the text to decode, which comes from the `read` function, and the `length` argument is the length of the `string`. The `code_point` argument is a pointer to an integer that your `decode` callback must write the decoded code point to. The function should return the number of bytes decoded.
167+
155168
### Syntax Nodes
156169

157170
Tree-sitter provides a [DOM](https://en.wikipedia.org/wiki/Document_Object_Model)-style interface for inspecting syntax trees. A syntax node's _type_ is a string that indicates which grammar rule the node represents.

lib/binding_rust/bindings.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,13 @@ pub struct TSQueryCursor {
3535
pub struct TSLookaheadIterator {
3636
_unused: [u8; 0],
3737
}
38+
pub type DecodeFunction = ::core::option::Option<
39+
unsafe extern "C" fn(string: *const u8, length: u32, code_point: *mut i32) -> u32,
40+
>;
3841
pub const TSInputEncodingUTF8: TSInputEncoding = 0;
3942
pub const TSInputEncodingUTF16LE: TSInputEncoding = 1;
4043
pub const TSInputEncodingUTF16BE: TSInputEncoding = 2;
44+
pub const TSInputEncodingCustom: TSInputEncoding = 3;
4145
pub type TSInputEncoding = ::core::ffi::c_uint;
4246
pub const TSSymbolTypeRegular: TSSymbolType = 0;
4347
pub const TSSymbolTypeAnonymous: TSSymbolType = 1;
@@ -71,6 +75,7 @@ pub struct TSInput {
7175
) -> *const ::core::ffi::c_char,
7276
>,
7377
pub encoding: TSInputEncoding,
78+
pub decode: DecodeFunction,
7479
}
7580
#[repr(C)]
7681
#[derive(Debug, Copy, Clone)]
@@ -212,7 +217,7 @@ extern "C" {
212217
) -> *mut TSTree;
213218
}
214219
extern "C" {
215-
#[doc = " Use the parser to parse some source code and create a syntax tree, with some options.\n\n See [`ts_parser_parse`] for more details."]
220+
#[doc = " Use the parser to parse some source code and create a syntax tree, with some options.\n\n See [`ts_parser_parse`] for more details.\n\n See [`TSParseOptions`] for more details on the options."]
216221
pub fn ts_parser_parse_with_options(
217222
self_: *mut TSParser,
218223
old_tree: *const TSTree,

0 commit comments

Comments
 (0)