Skip to content

Commit e88dd22

Browse files
committed
Support {} quantifier syntax in regexes
1 parent 16376c4 commit e88dd22

File tree

2 files changed

+141
-0
lines changed

2 files changed

+141
-0
lines changed

src/compiler/prepare_grammar/parse_regex.cc

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "compiler/prepare_grammar/parse_regex.h"
22
#include <string>
33
#include <utility>
4+
#include <cwctype>
45
#include <vector>
56
#include "compiler/rule.h"
67
#include "compiler/util/string_helpers.h"
@@ -12,6 +13,7 @@ namespace prepare_grammar {
1213
using std::string;
1314
using std::vector;
1415
using std::pair;
16+
using std::iswdigit;
1517
using rules::CharacterSet;
1618
using rules::Blank;
1719
using rules::Rule;
@@ -85,6 +87,56 @@ class PatternParser {
8587
next();
8688
result = Rule::choice({result, Blank{}});
8789
break;
90+
case '{': {
  // Handles a brace quantifier applied to `result`: `{n}`, `{n,}`, `{n,m}`,
  // `{,m}` or `{,}`. When the text after '{' does not form one of these
  // shapes, the parser is rewound to the checkpoint taken before the '{'
  // was consumed, so the brace is not treated as a quantifier.
  Checkpoint checkpoint = get_checkpoint();
  next();

  // Largest repetition count accepted. The rule is physically copied once
  // per repetition below, so an unbounded count turns a short pattern into
  // an enormous allocation. Previously the digits went through std::stoi,
  // which throws std::out_of_range once they exceed the range of int.
  // 65535 was chosen to mirror the ceiling PCRE places on {} counts.
  const unsigned max_repeat_count = 65535;

  // Consumes a run of decimal digits, accumulating their value in `count`.
  // `has_digits` is set if at least one digit was read. `too_large` is set
  // once the value passes the cap; the remaining digits are still consumed
  // so that the closing-brace check below sees the whole token.
  auto read_count = [&](unsigned &count, bool &has_digits, bool &too_large) {
    while (iswdigit(peek())) {
      has_digits = true;
      if (!too_large) {
        // `count` is at most 65535 here, so this cannot overflow.
        count = count * 10 + static_cast<unsigned>(peek() - '0');
        if (count > max_repeat_count) {
          too_large = true;
        }
      }
      next();
    }
  };

  bool count_too_large = false;

  unsigned min_count = 0;
  bool has_min = false;
  read_count(min_count, has_min, count_too_large);

  bool has_comma = false;
  unsigned max_count = 0;
  bool has_max = false;
  if (peek() == ',') {
    next();
    has_comma = true;
    read_count(max_count, has_max, count_too_large);
  }

  // A quantifier needs the closing brace and at least a number or a comma;
  // a bare `{}` is not one.
  if (peek() == '}' && (has_min || has_comma)) {
    next();

    if (count_too_large) {
      return error("number too large in {} quantifier");
    }

    if (has_min) {
      // `min_count` mandatory copies of the rule come first.
      vector<Rule> entries(min_count, result);
      if (has_max) {
        if (max_count < min_count) {
          return error("numbers out of order in {} quantifier");
        }
        // Each copy beyond the minimum is individually optional.
        vector<Rule> optional_entries(max_count - min_count, Rule::choice({result, Blank{}}));
        entries.insert(entries.end(), optional_entries.begin(), optional_entries.end());
      } else if (has_comma) {
        // NOTE(review): `{n,}` appends a bare Rule::repeat. If Rule::repeat
        // means "one or more", this demands n+1 occurrences rather than n;
        // confirm against Rule::repeat's definition.
        entries.push_back(Rule::repeat(result));
      }
      result = Rule::seq(entries);
    } else if (has_max) {
      // `{,m}`: up to `max_count` optional copies.
      vector<Rule> optional_entries(max_count, Rule::choice({result, Blank{}}));
      result = Rule::seq(optional_entries);
    } else {
      // `{,}`: no bounds given.
      // NOTE(review): same question as above about whether Rule::repeat
      // admits zero occurrences.
      result = Rule::repeat(result);
    }
  } else {
    // Not a quantifier after all: undo everything consumed since the '{'.
    revert(checkpoint);
  }

  break;
}
88140
}
89141
}
90142

@@ -245,6 +297,20 @@ class PatternParser {
245297
iter += lookahead_size;
246298
}
247299

300+
// Snapshot of the parser's read state, taken before a speculative parse so
// the parser can be rewound if that parse does not pan out.
struct Checkpoint {
  // Saved value of the parser's `iter` cursor.
  uint8_t const *iter;
  // Saved value of the parser's `lookahead`.
  int32_t lookahead;
};
304+
305+
// Captures the current `iter` and `lookahead` so they can later be restored
// with revert().
Checkpoint get_checkpoint() {
  Checkpoint snapshot;
  snapshot.iter = iter;
  snapshot.lookahead = lookahead;
  return snapshot;
}
308+
309+
// Restores `iter` and `lookahead` to the values stored in `checkpoint`,
// discarding whatever was consumed since it was taken.
void revert(Checkpoint checkpoint) {
  lookahead = checkpoint.lookahead;
  iter = checkpoint.iter;
}
313+
248314
uint32_t peek() {
249315
return lookahead;
250316
}

test/compiler/prepare_grammar/parse_regex_test.cc

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,76 @@ describe("parse_regex", []() {
274274
CharacterSet{{'/'}},
275275
}),
276276
},
277+
278+
{
279+
"characters with quantifiers",
280+
"a{3}",
281+
Rule::seq({
282+
CharacterSet{{'a'}},
283+
CharacterSet{{'a'}},
284+
CharacterSet{{'a'}},
285+
}),
286+
},
287+
288+
{
289+
"character classes with quantifiers",
290+
"[a-f]{3}",
291+
Rule::seq({
292+
CharacterSet().include('a', 'f'),
293+
CharacterSet().include('a', 'f'),
294+
CharacterSet().include('a', 'f'),
295+
}),
296+
},
297+
298+
{
299+
"characters with open range quantifiers",
300+
"a{,} b{1,} c{,2}",
301+
Rule::seq({
302+
Rule::seq({
303+
Repeat{CharacterSet{{'a'}}},
304+
}),
305+
CharacterSet{{' '}},
306+
Rule::seq({
307+
CharacterSet{{'b'}},
308+
Repeat{CharacterSet{{'b'}}},
309+
}),
310+
CharacterSet{{' '}},
311+
Rule::seq({
312+
Rule::choice({CharacterSet{{'c'}}, Blank{}}),
313+
Rule::choice({CharacterSet{{'c'}}, Blank{}}),
314+
}),
315+
}),
316+
},
317+
318+
{
319+
"characters with closed range quantifiers",
320+
"a{2,4}",
321+
Rule::seq({
322+
CharacterSet{{'a'}},
323+
CharacterSet{{'a'}},
324+
Rule::choice({CharacterSet{{'a'}}, Blank{}}),
325+
Rule::choice({CharacterSet{{'a'}}, Blank{}}),
326+
}),
327+
},
328+
329+
{
330+
"curly braces that aren't quantifiers",
331+
"a{1b} c{2,d}",
332+
Rule::seq({
333+
CharacterSet{{'a'}},
334+
CharacterSet{{'{'}},
335+
CharacterSet{{'1'}},
336+
CharacterSet{{'b'}},
337+
CharacterSet{{'}'}},
338+
CharacterSet{{' '}},
339+
CharacterSet{{'c'}},
340+
CharacterSet{{'{'}},
341+
CharacterSet{{'2'}},
342+
CharacterSet{{','}},
343+
CharacterSet{{'d'}},
344+
CharacterSet{{'}'}},
345+
}),
346+
}
277347
};
278348

279349
struct InvalidInputRow {
@@ -313,6 +383,11 @@ describe("parse_regex", []() {
313383
"a]",
314384
"unmatched close square bracket",
315385
},
386+
{
387+
"numbers out of order in range quantifiers",
388+
"a{3,1}",
389+
"numbers out of order in {} quantifier",
390+
},
316391
};
317392

318393
for (auto &row : valid_inputs) {

0 commit comments

Comments
 (0)