1 files changed, 104 insertions, 88 deletions
diff --git a/contrib/bmake/str.c b/contrib/bmake/str.c
index c64d407cf676..1349831af2f1 100644
--- a/contrib/bmake/str.c
+++ b/contrib/bmake/str.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: str.c,v 1.89 2022/03/03 19:50:01 rillig Exp $	*/
+/*	$NetBSD: str.c,v 1.102 2024/01/05 23:22:06 rillig Exp $	*/
 
 /*
  * Copyright (c) 1988, 1989, 1990, 1993
@@ -71,7 +71,7 @@
 #include "make.h"
 
 /*	"@(#)str.c	5.8 (Berkeley) 6/1/90"	*/
-MAKE_RCSID("$NetBSD: str.c,v 1.89 2022/03/03 19:50:01 rillig Exp $");
+MAKE_RCSID("$NetBSD: str.c,v 1.102 2024/01/05 23:22:06 rillig Exp $");
 
 
 static HashTable interned_strings;
@@ -107,6 +107,10 @@ str_concat3(const char *s1, const char *s2, const char *s3)
  * Fracture a string into an array of words (as delineated by tabs or spaces)
  * taking quotation marks into account.
  *
+ * A string that is empty or only contains whitespace nevertheless results in
+ * a single word.  This is unexpected in many places, and the caller needs to
+ * correct for this edge case.
+ *
  * If expand is true, quotes are removed and escape sequences such as \r, \t,
  * etc... are expanded. In this case, return NULL on parse errors.
  *
@@ -219,7 +223,7 @@ Substring_Words(const char *str, bool expand)
 				if (word_start == NULL)
 					word_start = word_end;
 				*word_end++ = '\\';
-				/* catch '\' at end of line */
+				/* catch lonely '\' at end of string */
 				if (str_p[1] == '\0')
 					continue;
 				ch = *++str_p;
@@ -293,111 +297,123 @@ Str_Words(const char *str, bool expand)
 }
 
 /*
- * Str_Match -- Test if a string matches a pattern like "*.[ch]".
- * The following special characters are known *?\[] (as in fnmatch(3)).
+ * XXX: In the extreme edge case that one of the characters is from the basic
+ * execution character set and the other isn't, the result of the comparison
+ * differs depending on whether plain char is signed or unsigned.
+ *
+ * An example is the character range from \xE4 to 'a', where \xE4 may come
+ * from U+00E4 'Latin small letter A with diaeresis'.
+ *
+ * If char is signed, \xE4 evaluates to -28, the first half of the condition
+ * becomes -28 <= '0' && '0' <= 'a', which evaluates to true.
+ *
+ * If char is unsigned, \xE4 evaluates to 228, the second half of the
+ * condition becomes 'a' <= '0' && '0' <= 228, which evaluates to false.
+ */
+static bool
+in_range(char e1, char c, char e2)
+{
+	return (e1 <= c && c <= e2) || (e2 <= c && c <= e1);
+}
+
+/*
+ * Test if a string matches a pattern like "*.[ch]". The pattern matching
+ * characters are '*', '?' and '[]', as in fnmatch(3).
  *
- * XXX: this function does not detect or report malformed patterns.
+ * See varmod-match.mk for examples and edge cases.
  */
-bool
+StrMatchResult
 Str_Match(const char *str, const char *pat)
 {
-	for (;;) {
-		/*
-		 * See if we're at the end of both the pattern and the
-		 * string. If so, we succeeded.  If we're at the end of the
-		 * pattern but not at the end of the string, we failed.
-		 */
-		if (*pat == '\0')
-			return *str == '\0';
-		if (*str == '\0' && *pat != '*')
-			return false;
-
-		/*
-		 * A '*' in the pattern matches any substring.  We handle this
-		 * by calling ourselves for each suffix of the string.
-		 */
-		if (*pat == '*') {
-			pat++;
-			while (*pat == '*')
-				pat++;
-			if (*pat == '\0')
-				return true;
-			while (*str != '\0') {
-				if (Str_Match(str, pat))
-					return true;
-				str++;
-			}
-			return false;
-		}
+	StrMatchResult res = { NULL, false };
+	bool asterisk = false;
+	const char *fixed_str = str;
+	const char *fixed_pat = pat;
+
+match_fixed_length:
+	str = fixed_str;
+	pat = fixed_pat;
+	for (; *pat != '\0' && *pat != '*'; str++, pat++) {
+		if (*str == '\0')
+			return res;
+
+		if (*pat == '?')	/* match any single character */
+			continue;
 
-		/* A '?' in the pattern matches any single character. */
-		if (*pat == '?')
-			goto thisCharOK;
-
-		/*
-		 * A '[' in the pattern matches a character from a list.
-		 * The '[' is followed by the list of acceptable characters,
-		 * or by ranges (two characters separated by '-'). In these
-		 * character lists, the backslash is an ordinary character.
-		 */
-		if (*pat == '[') {
+		if (*pat == '[') {	/* match a character from a list */
 			bool neg = pat[1] == '^';
 			pat += neg ? 2 : 1;
 
-			for (;;) {
-				if (*pat == ']' || *pat == '\0') {
-					if (neg)
-						break;
-					return false;
-				}
-				/*
-				 * XXX: This naive comparison makes the
-				 * control flow of the pattern parser
-				 * dependent on the actual value of the
-				 * string.  This is unpredictable.  It may be
-				 * though that the code only looks wrong but
-				 * actually all code paths result in the same
-				 * behavior.  This needs further tests.
-				 */
-				if (*pat == *str)
-					break;
-				if (pat[1] == '-') {
-					if (pat[2] == '\0')
-						return neg;
-					if (pat[0] <= *str && *str <= pat[2])
-						break;
-					if (pat[2] <= *str && *str <= pat[0])
-						break;
-					pat += 2;
-				}
-				pat++;
+		next_char_in_list:
+			if (*pat == '\0')
+				res.error = "Unfinished character list";
+			if (*pat == ']' || *pat == '\0') {
+				if (neg)
+					goto end_of_char_list;
+				goto no_match;
 			}
+			if (*pat == *str)
+				goto end_of_char_list;
+			if (pat[1] == '-' && pat[2] == '\0') {
+				res.error = "Unfinished character range";
+				res.matched = neg;
+				return res;
+			}
+			if (pat[1] == '-') {
+				if (in_range(pat[0], *str, pat[2]))
+					goto end_of_char_list;
+				pat += 2;
+			}
+			pat++;
+			goto next_char_in_list;
+
+		end_of_char_list:
 			if (neg && *pat != ']' && *pat != '\0')
-				return false;
+				goto no_match;
 			while (*pat != ']' && *pat != '\0')
 				pat++;
 			if (*pat == '\0')
 				pat--;
-			goto thisCharOK;
+			continue;
 		}
 
-		/*
-		 * A backslash in the pattern matches the character following
-		 * it exactly.
-		 */
-		if (*pat == '\\') {
+		if (*pat == '\\')	/* match the next character exactly */
 			pat++;
-			if (*pat == '\0')
-				return false;
+		if (*pat != *str) {
+			if (asterisk && str == fixed_str) {
+				while (*str != '\0' && *str != *pat)
+					str++;
+				fixed_str = str;
+				goto match_fixed_length;
+			}
+			goto no_match;
 		}
+	}
 
-		if (*pat != *str)
-			return false;
-
-	thisCharOK:
-		pat++;
-		str++;
+	if (*pat == '*') {
+		asterisk = true;
+		while (*pat == '*')
+			pat++;
+		if (*pat == '\0') {
+			res.matched = true;
+			return res;
+		}
+		fixed_str = str;
+		fixed_pat = pat;
+		goto match_fixed_length;
+	}
+	if (asterisk && *str != '\0') {
+		fixed_str += strlen(str);
+		goto match_fixed_length;
 	}
+	res.matched = *str == '\0';
+	return res;
+
+no_match:
+	if (!asterisk)
+		return res;
+	fixed_str++;
+	goto match_fixed_length;
 }
 
 void