Skip to content

Commit 827f5bb

Browse files
Karl Williamson, obra
Karl Williamson
authored and committed
PATCH: [perl #89750]: Unicode regex negated case-insensitivity
This patch causes inverted [bracketed] character classes to not handle multi-character folds. The reason is that these can lead to very counter-intuitive results (see bug discussion). In an inverted character class, only single-char folds are now generated. However, the fold for \xDF=>ss is hard-coded in, and it was too much trouble sending flags to the sub-subroutine that does this, so another check is done at the point of storing the list of multi-char folds. Since \xDF doesn't have a single-char fold, this works.
1 parent 36bb2ab commit 827f5bb

File tree

3 files changed

+28
-1
lines changed

3 files changed

+28
-1
lines changed

regcomp.c

+21-1
Original file line numberDiff line numberDiff line change
@@ -9552,6 +9552,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
95529552
IV namedclass;
95539553
char *rangebegin = NULL;
95549554
bool need_class = 0;
9555+
bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
95559556
SV *listsv = NULL;
95569557
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
95579558
than just initialized. */
@@ -9608,6 +9609,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
96089609
RExC_parse++;
96099610
if (!SIZE_ONLY)
96109611
ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9612+
9613+
/* We have decided to not allow multi-char folds in inverted character
9614+
* classes, due to the confusion that can happen, even with classes
9615+
* that are designed for a non-Unicode world: You have the peculiar
9616+
* case that:
9617+
"s s" =~ /^[^\xDF]+$/i => Y
9618+
"ss" =~ /^[^\xDF]+$/i => N
9619+
*
9620+
* See [perl #89750] */
9621+
allow_full_fold = FALSE;
96119622
}
96129623

96139624
if (SIZE_ONLY) {
@@ -10136,7 +10147,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
1013610147
/* Get its fold */
1013710148
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1013810149
STRLEN foldlen;
10139-
const UV f = to_uni_fold(j, foldbuf, &foldlen);
10150+
const UV f =
10151+
_to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
1014010152

1014110153
if (foldlen > (STRLEN)UNISKIP(f)) {
1014210154

@@ -10437,10 +10449,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
1043710449
* used later (regexec.c:S_reginclass()). */
1043810450
av_store(av, 0, listsv);
1043910451
av_store(av, 1, NULL);
10452+
10453+
/* Store any computed multi-char folds only if we are allowing
10454+
* them */
10455+
if (allow_full_fold) {
1044010456
av_store(av, 2, MUTABLE_SV(unicode_alternate));
1044110457
if (unicode_alternate) { /* This node is variable length */
1044210458
OP(ret) = ANYOFV;
1044310459
}
10460+
}
10461+
else {
10462+
av_store(av, 2, NULL);
10463+
}
1044410464
rv = newRV_noinc(MUTABLE_SV(av));
1044510465
n = add_data(pRExC_state, 1, "s");
1044610466
RExC_rxi->data->data[n] = (void*)rv;

t/re/fold_grind.t

+2
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,8 @@ foreach my $test (sort { numerically } keys %tests) {
452452
foreach my $bracketed (0, 1) { # Put rhs in [...], or not
453453
foreach my $inverted (0,1) {
454454
next if $inverted && ! $bracketed; # inversion only valid in [^...]
455+
next if $inverted && @target != 1; # [perl #89750] multi-char
456+
# not valid in [^...]
455457
456458
# In some cases, add an extra character that doesn't fold, and
457459
# looks ok in the output.

t/re/re_tests

+5
Original file line numberDiff line numberDiff line change
@@ -1517,4 +1517,9 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer
15171517
/s/aia S y $& S
15181518
/(?aia:s)/ \x{17F} n - -
15191519
/(?aia:s)/ S y $& S
1520+
1521+
# Normally 1E9E generates a multi-char fold, but not in inverted class;
1522+
# See [perl #89750]. This makes sure that the simple fold gets generated
1523+
# in that case, to DF.
1524+
/[^\x{1E9E}]/i \x{DF} n - -
15201525
# vim: softtabstop=0 noexpandtab

0 commit comments

Comments
 (0)