Skip to content

Commit 9866674

Browse files
authored
Merge pull request tree-sitter#1660 from alex-pinkus/expanded-regex-support
Expand regex support to include emojis and binary ops
2 parents 5eb0a30 + 8fadf18 commit 9866674

File tree

7 files changed

+172
-22
lines changed

7 files changed

+172
-22
lines changed

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ lazy_static = "1.2.0"
3232
regex = "1"
3333
regex-syntax = "0.6.4"
3434
rustc-hash = "1"
35+
semver = "1.0"
3536
serde = { version = "1.0.130", features = ["derive"] }
3637
smallbitvec = "2.5.1"
3738
tiny_http = "0.8"

cli/src/generate/mod.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use self::rules::AliasMap;
2020
use anyhow::{anyhow, Context, Result};
2121
use lazy_static::lazy_static;
2222
use regex::{Regex, RegexBuilder};
23+
use semver::Version;
2324
use std::fs;
2425
use std::io::Write;
2526
use std::path::{Path, PathBuf};
@@ -178,10 +179,20 @@ fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
178179
.stdin
179180
.take()
180181
.expect("Failed to open stdin for node");
182+
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
183+
.expect("Could not parse this package's version as semver.");
184+
write!(
185+
node_stdin,
186+
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
187+
global.TREE_SITTER_CLI_VERSION_MINOR = {};
188+
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
189+
cli_version.major, cli_version.minor, cli_version.patch,
190+
)
191+
.expect("Failed to write tree-sitter version to node's stdin");
181192
let javascript_code = include_bytes!("./dsl.js");
182193
node_stdin
183194
.write(javascript_code)
184-
.expect("Failed to write to node's stdin");
195+
.expect("Failed to write grammar dsl to node's stdin");
185196
drop(node_stdin);
186197
let output = node_process
187198
.wait_with_output()

cli/src/generate/nfa.rs

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,20 @@ impl CharacterSet {
276276
}
277277
}
278278

279+
/// Produces a `CharacterSet` containing every character in `self` that is not present in
280+
/// `other`.
281+
pub fn difference(mut self, mut other: CharacterSet) -> CharacterSet {
282+
self.remove_intersection(&mut other);
283+
self
284+
}
285+
286+
/// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
287+
/// `other`; characters present in both sets are excluded.
288+
pub fn symmetric_difference(mut self, mut other: CharacterSet) -> CharacterSet {
289+
self.remove_intersection(&mut other);
290+
self.add(&other)
291+
}
292+
279293
pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
280294
self.ranges.iter().flat_map(|r| r.clone())
281295
}
@@ -817,7 +831,7 @@ mod tests {
817831
}
818832

819833
#[test]
820-
fn test_character_set_remove_intersection() {
834+
fn test_character_set_intersection_difference_ops() {
821835
struct Row {
822836
left: CharacterSet,
823837
right: CharacterSet,
@@ -942,6 +956,25 @@ mod tests {
942956
"row {}b: {:?} - {:?}",
943957
i, row.right, row.left
944958
);
959+
960+
assert_eq!(
961+
row.left.clone().difference(row.right.clone()),
962+
row.left_only,
963+
"row {}b: {:?} -- {:?}",
964+
i,
965+
row.left,
966+
row.right
967+
);
968+
969+
let symm_difference = row.left_only.clone().add(&mut row.right_only.clone());
970+
assert_eq!(
971+
row.left.clone().symmetric_difference(row.right.clone()),
972+
symm_difference,
973+
"row {}b: {:?} ~~ {:?}",
974+
i,
975+
row.left,
976+
row.right
977+
)
945978
}
946979
}
947980

cli/src/generate/prepare_grammar/expand_tokens.rs

Lines changed: 110 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ use anyhow::{anyhow, Context, Result};
66
use lazy_static::lazy_static;
77
use regex::Regex;
88
use regex_syntax::ast::{
9-
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
10-
RepetitionRange,
9+
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
10+
ClassUnicodeKind, RepetitionKind, RepetitionRange,
1111
};
1212
use std::collections::HashMap;
1313
use std::i32;
@@ -240,19 +240,14 @@ impl NfaBuilder {
240240
self.push_advance(chars, next_state_id);
241241
Ok(true)
242242
}
243-
Class::Bracketed(class) => match &class.kind {
244-
ClassSet::Item(item) => {
245-
let mut chars = self.expand_character_class(&item)?;
246-
if class.negated {
247-
chars = chars.negate();
248-
}
249-
self.push_advance(chars, next_state_id);
250-
Ok(true)
243+
Class::Bracketed(class) => {
244+
let mut chars = self.translate_class_set(&class.kind)?;
245+
if class.negated {
246+
chars = chars.negate();
251247
}
252-
ClassSet::BinaryOp(_) => Err(anyhow!(
253-
"Regex error: Binary operators in character classes aren't supported"
254-
)),
255-
},
248+
self.push_advance(chars, next_state_id);
249+
Ok(true)
250+
}
256251
},
257252
Ast::Repetition(repetition) => match repetition.op.kind {
258253
RepetitionKind::ZeroOrOne => {
@@ -319,6 +314,27 @@ impl NfaBuilder {
319314
}
320315
}
321316

317+
fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
318+
match &class_set {
319+
ClassSet::Item(item) => self.expand_character_class(&item),
320+
ClassSet::BinaryOp(binary_op) => {
321+
let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
322+
let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
323+
match binary_op.kind {
324+
ClassSetBinaryOpKind::Intersection => {
325+
Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
326+
}
327+
ClassSetBinaryOpKind::Difference => {
328+
Ok(lhs_char_class.difference(rhs_char_class))
329+
}
330+
ClassSetBinaryOpKind::SymmetricDifference => {
331+
Ok(lhs_char_class.symmetric_difference(rhs_char_class))
332+
}
333+
}
334+
}
335+
}
336+
}
337+
322338
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
323339
self.nfa.states.push(NfaState::Accept {
324340
variable_index: 0,
@@ -384,6 +400,13 @@ impl NfaBuilder {
384400
}
385401
Ok(set)
386402
}
403+
ClassSetItem::Bracketed(class) => {
404+
let mut set = self.translate_class_set(&class.kind)?;
405+
if class.negated {
406+
set = set.negate();
407+
}
408+
Ok(set)
409+
}
387410
_ => Err(anyhow!(
388411
"Regex error: Unsupported character class syntax {:?}",
389412
item
@@ -782,6 +805,79 @@ mod tests {
782805
("\u{1000b}", Some((3, "\u{1000b}"))),
783806
],
784807
},
808+
// Emojis
809+
Row {
810+
rules: vec![Rule::pattern(r"\p{Emoji}+")],
811+
separators: vec![],
812+
examples: vec![
813+
("🐎", Some((0, "🐎"))),
814+
("🐴🐴", Some((0, "🐴🐴"))),
815+
("#0", Some((0, "#0"))), // These chars are technically emojis!
816+
("⻢", None),
817+
("♞", None),
818+
("horse", None),
819+
],
820+
},
821+
// Intersection
822+
Row {
823+
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
824+
separators: vec![],
825+
examples: vec![
826+
("456", Some((0, "456"))),
827+
("64", Some((0, "64"))),
828+
("452", Some((0, "45"))),
829+
("91", None),
830+
("8", None),
831+
("3", None),
832+
],
833+
},
834+
// Difference
835+
Row {
836+
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
837+
separators: vec![],
838+
examples: vec![
839+
("123", Some((0, "123"))),
840+
("83", Some((0, "83"))),
841+
("9", Some((0, "9"))),
842+
("124", Some((0, "12"))),
843+
("67", None),
844+
("4", None),
845+
],
846+
},
847+
// Symmetric difference
848+
Row {
849+
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
850+
separators: vec![],
851+
examples: vec![
852+
("123", Some((0, "123"))),
853+
("83", Some((0, "83"))),
854+
("9", Some((0, "9"))),
855+
("124", Some((0, "12"))),
856+
("67", None),
857+
("4", None),
858+
],
859+
},
860+
// Nested set operations
861+
Row {
862+
// 0 1 2 3 4 5 6 7 8 9
863+
// [0-5]: y y y y y y
864+
// [2-4]: y y y
865+
// [0-5]--[2-4]: y y y
866+
// [3-9]: y y y y y y y
867+
// [6-7]: y y
868+
// [3-9]--[6-7]: y y y y y
869+
// final regex: y y y y y y
870+
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
871+
separators: vec![],
872+
examples: vec![
873+
("01", Some((0, "01"))),
874+
("432", Some((0, "43"))),
875+
("8", Some((0, "8"))),
876+
("9", Some((0, "9"))),
877+
("2", None),
878+
("567", None),
879+
],
880+
},
785881
];
786882

787883
for Row {

cli/src/generate/prepare_grammar/unicode-properties.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

script/generate-unicode-categories-json

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ const PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/PropList.txt'
1212
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt'
1313
const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyValueAliases.txt'
1414
const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyAliases.txt'
15+
const EMOJI_DATA_URL = 'https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt'
1516

1617
const fs = require('fs');
1718
const path = require('path');
@@ -23,6 +24,7 @@ const propertyData = cachedDownload(PROPERTY_URL);
2324
const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
2425
const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
2526
const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
27+
const emojiData = cachedDownload(EMOJI_DATA_URL);
2628
function cachedDownload(url) {
2729
let downloadPath = path.join('.', 'target', path.basename(url))
2830
if (fs.existsSync(downloadPath)) {
@@ -41,7 +43,7 @@ const propertyAliases = {}
4143
let data, row, lineStart, lineEnd;
4244

4345
// Parse the properties
44-
data = propertyData + derivedPropertyData;
46+
data = propertyData + derivedPropertyData + emojiData;
4547
row = 0;
4648
lineStart = 0;
4749
lineEnd = -1;
@@ -79,7 +81,7 @@ while (lineStart < data.length) {
7981

8082
const property = data.slice(propertyStart, propertyEnd).trim();
8183

82-
console.log(codePoints, property);
84+
console.log("Property:", codePoints, property);
8385

8486

8587
for (let c = codePoints[0]; c <= codePoints[1]; c++) {
@@ -123,7 +125,7 @@ while (lineStart < data.length) {
123125
const name = data.slice(nameStart, nameEnd);
124126
const category = data.slice(categoryStart, categoryEnd);
125127

126-
console.log(codePoint, category, name);
128+
console.log("Category:", codePoint, category, name);
127129

128130
// Group the code points by their category.
129131
if (!categories[category]) {
@@ -181,7 +183,7 @@ while (lineStart < data.length) {
181183
lineDone = true;
182184
}
183185
const alias = data.slice(aliasStart, aliasEnd).trim();
184-
console.log(alias, shortName);
186+
console.log("Category alias:", alias, shortName);
185187
categoryAliases[alias] = shortName;
186188
aliasStart = aliasEnd + 1;
187189
} while (!lineDone);
@@ -229,7 +231,7 @@ while (lineStart < data.length) {
229231
} else {
230232
alias = data.slice(nameStart, nameEnd).trim();
231233
}
232-
console.log(alias, longName);
234+
console.log("Property alias:", alias, longName);
233235
propertyAliases[alias] = longName;
234236
nameStart = nameEnd + 1;
235237
} while (!lineDone);

0 commit comments

Comments
 (0)