diff options
author | Bill Sommerfeld <sommerfeld@hamachi.org> | 2023-12-21 03:46:14 +0000 |
---|---|---|
committer | Kyle Evans <kevans@FreeBSD.org> | 2024-09-25 20:42:28 +0000 |
commit | d96ce6d000703f3f57d9214b741e16cc7741d77e (patch) | |
tree | 60d61784f39261244b7afbc9cf843f4bf8d6c45d | |
parent | 867aaad5c2bfdd8326fc805964e711ccfbb18d1e (diff) | |
download | src-d96ce6d000703f3f57d9214b741e16cc7741d77e.tar.gz src-d96ce6d000703f3f57d9214b741e16cc7741d77e.zip |
regex: mixed sets are misidentified as singletons
Fix "singleton" function used by regcomp() to turn character set matches
into exact character matches if a character set has exactly one
element.
The underlying cset representation is complex; most critically it
records"small" characters (codepoint less than either 128
or 256 depending on locale) in a bit vector, and "wide" characters in
a secondary array.
Unfortunately the "singleton" function uses to identify singleton sets
treated a cset as a singleton if either the "small" or the "wide" sets
had exactly one element (it would then ignore the other set).
The easiest way to demonstrate this bug:
$ export LANG=C.UTF-8
$ echo 'a' | grep '[abà]'
It should match (and print "a") but instead it doesn't match because the
single accented character in the set is misinterpreted as a singleton.
PR: 281710
Reviewed by: kevans, yuripv
Obtained from: illumos
(cherry picked from commit 8f7ed58a15556bf567ff876e1999e4fe4d684e1d)
-rw-r--r-- | lib/libc/regex/regcomp.c | 25 | ||||
-rwxr-xr-x | lib/libc/tests/regex/multibyte.sh | 43 |
2 files changed, 62 insertions, 6 deletions
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index 42fa1b99e58e..2897052fa0f8 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -1592,17 +1592,32 @@ singleton(cset *cs) { wint_t i, s, n; + /* Exclude the complicated cases we don't want to deal with */ + if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0) + return (OUT); + + if (cs->nwides > 1) + return (OUT); + + /* Count the number of characters present in the bitmap */ for (i = n = 0; i < NC; i++) if (CHIN(cs, i)) { n++; s = i; } - if (n == 1) - return (s); - if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && - cs->icase == 0) + + if (n > 1) + return (OUT); + + if (n == 1) { + if (cs->nwides == 0) + return (s); + else + return (OUT); + } + if (cs->nwides == 1) return (cs->wides[0]); - /* Don't bother handling the other cases. */ + return (OUT); } diff --git a/lib/libc/tests/regex/multibyte.sh b/lib/libc/tests/regex/multibyte.sh index a736352bf0a2..18323f500a2b 100755 --- a/lib/libc/tests/regex/multibyte.sh +++ b/lib/libc/tests/regex/multibyte.sh @@ -1,4 +1,3 @@ - atf_test_case bmpat bmpat_head() { @@ -45,8 +44,50 @@ icase_body() echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip" } +atf_test_case mbset cleanup +mbset_head() +{ + atf_set "descr" "Check multibyte sets matching" +} +mbset_body() +{ + export LC_CTYPE="C.UTF-8" + + # This involved an erroneously implemented optimization which reduces + # single-element sets to an exact match with a single codepoint. + # Match sets record small-codepoint characters in a bitmap and + # large-codepoint characters in an array; the optimization would falsely + # trigger if either the bitmap or the array was a singleton, ignoring + # the members of the other side of the set. + # + # To exercise this, we construct sets which have one member of one side + # and one or more of the other, and verify that all members can be + # found. + printf "a" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset + printf "à" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset + printf "a" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset + printf "à" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset + printf "á" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset + printf "à" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset + printf "a" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset + printf "b" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset + printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset + printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset + printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset + printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset + printf "á" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset + printf "à" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset + printf "a" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset + printf "b" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset +} +mbset_cleanup() +{ + rm -f mbset +} + atf_init_test_cases() { atf_add_test_case bmpat atf_add_test_case icase + atf_add_test_case mbset } |