aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Warner Losh <imp@FreeBSD.org>, 2023-11-26 15:12:04 +0000
committer: Warner Losh <imp@FreeBSD.org>, 2023-11-26 15:12:04 +0000
commit: 18df98168fa7af30532be5c8988b80f2987c9b84 (patch)
tree: 7fa027d6f9d99ae0820175adb08771ef11585b5d
parent: 2e406c584fe40d654e4b8042006d2206eed016b3 (diff)
download: src-18df98168fa7af30532be5c8988b80f2987c9b84.tar.gz
download: src-18df98168fa7af30532be5c8988b80f2987c9b84.zip
ota: Import ota from 20231124 (930d75157063) [vendor/one-true-awk/930d75157063]
Minor bug fixes to the man page, tests, fnematch (fixing erroneous output) and string escape sequences; flow control improved by optimizing gototab. Nov 24, 2023: Fix issue #199: gototab improvements to dynamically resize the table, qsort and bsearch to improve the lookup speed as the table gets larger for multibyte input. Thanks to Arnold Robbins. Nov 23, 2023: Fix Issue #169, related to escape sequences in strings. Thanks to Github user rajeevvp. Fix Issue #147, reported by Github user drawkula, and fixed by Miguel Pineiro Jr. Nov 20, 2023: Rewrite of fnematch to fix a number of issues, including extraneous output, out-of-bounds access, number of bytes to push back after a failed match, etc. Thanks to Miguel Pineiro Jr. Nov 15, 2023: Man page edit, regression test fixes. Thanks to Arnold Robbins. Consolidation of sub and gsub into dosub, removing duplicate code. Thanks to Miguel Pineiro Jr. gcc replaced with cc everywhere. Sponsored by: Netflix
-rw-r--r-- FIXES 24
-rw-r--r-- README.md 4
-rw-r--r-- awk.1 3
-rw-r--r-- awk.h 11
-rw-r--r-- b.c 279
-rwxr-xr-x bugs-fixed/REGRESS 2
-rw-r--r-- lex.c 14
-rw-r--r-- main.c 2
-rw-r--r-- makefile 8
-rw-r--r-- maketab.c 4
-rw-r--r-- proto.h 3
-rw-r--r-- run.c 273
-rwxr-xr-x testdir/Compare.tt 2
-rwxr-xr-x testdir/REGRESS 2
-rwxr-xr-x testdir/T.csv 1
-rwxr-xr-x testdir/T.flags 5
-rwxr-xr-x testdir/T.misc 14
17 files changed, 354 insertions, 297 deletions
diff --git a/FIXES b/FIXES
index a13ca50ccde0..52f49e3eed0f 100644
--- a/FIXES
+++ b/FIXES
@@ -25,6 +25,29 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
+Nov 24, 2023:
+ Fix issue #199: gototab improvements to dynamically resize the
+ table, qsort and bsearch to improve the lookup speed as the
+ table gets larger for multibyte input. thanks to Arnold Robbins.
+
+Nov 23, 2023:
+ Fix Issue #169, related to escape sequences in strings.
+ Thanks to Github user rajeevvp.
+ Fix Issue #147, reported by Github user drawkula, and fixed
+ by Miguel Pineiro Jr.
+
+Nov 20, 2023:
+ rewrite of fnematch to fix a number of issues, including
+ extraneous output, out-of-bounds access, number of bytes
+ to push back after a failed match etc.
+ thanks to Miguel Pineiro Jr.
+
+Nov 15, 2023:
+ Man page edit, regression test fixes. thanks to Arnold Robbins.
+ consolidation of sub and gsub into dosub, removing duplicate
+ code. thanks to Miguel Pineiro Jr.
+ gcc replaced with cc everywhere.
+
Oct 30, 2023:
multiple fixes and a minor code cleanup.
disabled utf-8 for non-multibyte locales, such as C or POSIX.
@@ -32,7 +55,6 @@ Oct 30, 2023:
systems. also fixed an out-of-bounds read for empty CCL.
fixed a buffer overflow in substr with utf-8 strings.
many thanks to Todd C Miller.
-
Sep 24, 2023:
fnematch and getrune have been overhauled to solve issues around
diff --git a/README.md b/README.md
index daace23e166e..84fb06e48833 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,6 @@ Aribtrary characters may be included with `\u` followed by 1 to 8 hexadecimal di
### Regular expressions ###
Regular expressions may include UTF-8 code points, including `\u`.
-Character classes are likely to be limited to about 256 characters
-when expanded.
### CSV ###
@@ -145,4 +143,4 @@ is not at the top of our priority list.
#### Last Updated
-Sun 15 Oct 2023 06:28:36 IDT
+Mon 16 Oct 2023 11:23:08 IDT
diff --git a/awk.1 b/awk.1
index f814e2348343..496a2a652379 100644
--- a/awk.1
+++ b/awk.1
@@ -638,6 +638,9 @@ the syntax is worse.
.PP
Input is expected to be UTF-8 encoded. Other multibyte
character sets are not handled.
+However, in eight-bit locales,
+.I awk
+treats each input byte as a separate character.
.SH UNUSUAL FLOATING-POINT VALUES
.I Awk
was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)
diff --git a/awk.h b/awk.h
index 4e066b9706c5..740447ee2167 100644
--- a/awk.h
+++ b/awk.h
@@ -254,14 +254,19 @@ typedef struct rrow {
int *lfollow;
} rrow;
-typedef struct gtt { /* gototab entry */
+typedef struct gtte { /* gototab entry: one (character, next state) transition */
unsigned int ch;
unsigned int state;
+} gtte;
+
+typedef struct gtt { /* gototab: growable array of entries for one state */
+ size_t allocated; /* number of gtte slots allocated in entries */
+ size_t inuse; /* number of leading slots that hold live entries */
+ gtte *entries; /* heap array, meant to stay sorted by ch so lookups can bsearch */
} gtt;
typedef struct fa {
- gtt **gototab;
- int gototab_len;
+ gtt *gototab;
uschar *out;
uschar *restr;
int **posns;
diff --git a/b.c b/b.c
index aa07d59fabac..881c052811b1 100644
--- a/b.c
+++ b/b.c
@@ -96,9 +96,8 @@ extern int u8_nextlen(const char *s);
mechanism of the goto table used 8-bit byte indices into the
gototab entries to compute the next state. Unicode is a lot
bigger, so the gototab entries are now structs with a character
- and a next state, and there is a linear search of the characters
- to find the state. (Yes, this is slower, by a significant
- amount. Tough.)
+ and a next state. These are sorted by code point and binary
+ searched.
Throughout the RE mechanism in b.c, utf-8 characters are
converted to their utf-32 value. This mostly shows up in
@@ -113,8 +112,10 @@ extern int u8_nextlen(const char *s);
*/
+static int entry_cmp(const void *l, const void *r);
static int get_gototab(fa*, int, int);
static int set_gototab(fa*, int, int, int);
+static void clear_gototab(fa*, int);
extern int u8_rune(int *, const uschar *);
static int *
@@ -142,7 +143,7 @@ resizesetvec(const char *f)
static void
resize_state(fa *f, int state)
{
- gtt **p;
+ gtt *p;
uschar *p2;
int **p3;
int i, new_count;
@@ -152,7 +153,7 @@ resize_state(fa *f, int state)
new_count = state + 10; /* needs to be tuned */
- p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
+ p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
if (p == NULL)
goto out;
f->gototab = p;
@@ -168,13 +169,14 @@ resize_state(fa *f, int state)
f->posns = p3;
for (i = f->state_count; i < new_count; ++i) {
- f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
- if (f->gototab[i] == NULL)
+ f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
+ if (f->gototab[i].entries == NULL)
goto out;
- f->out[i] = 0;
+ f->gototab[i].allocated = NCHARS;
+ f->gototab[i].inuse = 0;
+ f->out[i] = 0;
f->posns[i] = NULL;
}
- f->gototab_len = NCHARS; /* should be variable, growable */
f->state_count = new_count;
return;
out:
@@ -268,8 +270,7 @@ int makeinit(fa *f, bool anchor)
}
if ((f->posns[2])[1] == f->accept)
f->out[2] = 1;
- for (i = 0; i < NCHARS; i++)
- set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
+ clear_gototab(f, 2);
f->curstat = cgoto(f, 2, HAT);
if (anchor) {
*f->posns[2] = k-1; /* leave out position 0 */
@@ -595,32 +596,104 @@ int member(int c, int *sarg) /* is c in s? */
return(0);
}
+static void resize_gototab(fa *f, int state) /* double the entry array of f->gototab[state] */
+{
+ size_t new_size = f->gototab[state].allocated * 2;
+ gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
+ if (p == NULL)
+ overflo(__func__); // NOTE(review): presumably does not return -- p is used unchecked below
+
+ // realloc leaves the added memory uninitialized, so zero it
+ size_t orig_size = f->gototab[state].allocated; // the new second half is as large as the old array
+ memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clear only the new half; old entries are kept
+
+ f->gototab[state].allocated = new_size; // update gototab info
+ f->gototab[state].entries = p;
+}
+
static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
{
- int i;
- for (i = 0; i < f->gototab_len; i++) {
- if (f->gototab[state][i].ch == 0)
- break;
- if (f->gototab[state][i].ch == ch)
- return f->gototab[state][i].state;
- }
- return 0;
+ gtte key; /* search key: look up the next state recorded for ch in this state */
+ gtte *item;
+
+ key.ch = ch;
+ key.state = 0; /* irrelevant: entry_cmp compares ch only */
+ item = bsearch(& key, f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte),
+ entry_cmp); /* searches only the inuse leading entries */
+
+ if (item == NULL) /* no transition recorded for ch */
+ return 0;
+ else
+ return item->state;
+}
+
+static int entry_cmp(const void *l, const void *r) /* qsort/bsearch comparator: order gtte entries by ch */
+{
+ const gtte *left, *right;
+
+ left = (const gtte *) l;
+ right = (const gtte *) r;
+
+ return (left->ch > right->ch) - (left->ch < right->ch); /* -1, 0 or 1; avoids unsigned wraparound of a plain subtraction */
}
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
{
- int i;
- for (i = 0; i < f->gototab_len; i++) {
- if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
- f->gototab[state][i].ch = ch;
- f->gototab[state][i].state = val;
- return val;
+ if (f->gototab[state].inuse == 0) { /* record (state, ch) -> val; an empty table takes slot 0 */
+ f->gototab[state].entries[0].ch = ch;
+ f->gototab[state].entries[0].state = val;
+ f->gototab[state].inuse++;
+ return val;
+ } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
+ // ch sorts after the current last (largest) entry: append, no re-sort needed
+ gtt *tab = & f->gototab[state];
+ if (tab->inuse + 1 >= tab->allocated)
+ resize_gototab(f, state);
+
+ f->gototab[state].entries[f->gototab[state].inuse].ch = ch; /* first free slot, not the last live one */
+ f->gototab[state].entries[f->gototab[state].inuse].state = val;
+ f->gototab[state].inuse++;
+ return val;
+ } else {
+ // ch is not beyond the last entry: it may already be present
+ gtte key;
+ gtte *item;
+
+ key.ch = ch;
+ key.state = 0; /* irrelevant: entry_cmp compares ch only */
+ item = bsearch(& key, f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte),
+ entry_cmp);
+
+ if (item != NULL) {
+ // we have it, update state and return
+ item->state = val;
+ return item->state;
}
+ // otherwise, fall through to append the new entry and re-sort.
}
- overflo(__func__);
+
+ gtt *tab = & f->gototab[state];
+ if (tab->inuse + 1 >= tab->allocated)
+ resize_gototab(f, state);
+ f->gototab[state].entries[tab->inuse].ch = ch; /* store in the first free slot ... */
+ f->gototab[state].entries[tab->inuse].state = val;
+ ++tab->inuse; /* ... then count it, so qsort below sees the new entry */
+
+ qsort(f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte), entry_cmp);
+
return val; /* not used anywhere at the moment */
}
+static void clear_gototab(fa *f, int state) /* forget every transition recorded for state */
+{
+ memset(f->gototab[state].entries, 0,
+ f->gototab[state].allocated * sizeof(gtte)); /* zero every allocated slot, not just those in use */
+ f->gototab[state].inuse = 0; /* table is now empty; allocated is left unchanged */
+}
+
int match(fa *f, const char *p0) /* shortest match ? */
{
int s, ns;
@@ -759,59 +832,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
- int rune;
- size_t len;
- char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
- struct runedata result;
- int c, next;
-
- memset(&result, 0, sizeof(result));
-
- c = getc(fp);
- if (c == EOF)
- return result; // result.rune == 0 --> EOF
- else if (c < 128 || awk_mb_cur_max == 1) {
- result.bytes[0] = c;
- result.len = 1;
- result.rune = c;
-
- return result;
- }
-
- // need to get bytes and fill things in
- result.bytes[0] = c;
- result.len = 1;
-
- next = 1;
- for (int i = 1; i < MAX_UTF_BYTES; i++) {
- c = getc(fp);
- if (c == EOF)
- break;
- result.bytes[next++] = c;
- result.len++;
- }
-
- // put back any extra input bytes
- int actual_len = u8_nextlen(result.bytes);
- while (result.len > actual_len) {
- ungetc(result.bytes[--result.len], fp);
- }
-
- result.bytes[result.len] = '\0';
- (void) u8_rune(& result.rune, (uschar *) result.bytes);
-
- return result;
-}
-
-
/*
* NAME
* fnematch
@@ -829,58 +849,76 @@ struct runedata getrune(FILE *fp)
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
{
- char *buf = *pbuf;
+ char *i, *j, *k, *buf = *pbuf;
int bufsize = *pbufsize;
- int i, j, k, ns, s;
- struct runedata r;
+ int c, n, ns, s;
s = pfa->initstat;
patlen = 0;
/*
- * All indices relative to buf.
- * i <= j <= k <= bufsize
+ * buf <= i <= j <= k <= buf+bufsize
*
- * i: origin of active substring (first byte of first character)
- * j: current character (last byte of current character)
- * k: destination of next getc()
+ * i: origin of active substring
+ * j: current character
+ * k: destination of the next getc
*/
- i = -1, k = 0;
- do {
- j = i++;
- do {
- r = getrune(f);
- if ((++j + r.len) >= k) {
- if (k >= bufsize)
- if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
- FATAL("stream '%.30s...' too long", buf);
- }
- memcpy(buf + k, r.bytes, r.len);
- j += r.len - 1; // incremented next time around the loop
- k += r.len;
- if ((ns = get_gototab(pfa, s, r.rune)) != 0)
- s = ns;
- else
- s = cgoto(pfa, s, r.rune);
+ i = j = k = buf;
- if (pfa->out[s]) { /* final state */
- patlen = j - i + 1;
- if (r.rune == 0) /* don't count $ */
- patlen--;
+ do {
+ /*
+ * Call u8_rune with at least MAX_UTF_BYTES ahead in
+ * the buffer until EOF interferes.
+ */
+ if (k - j < MAX_UTF_BYTES) {
+ if (k + MAX_UTF_BYTES > buf + bufsize) {
+ adjbuf((char **) &buf, &bufsize,
+ bufsize + MAX_UTF_BYTES,
+ quantum, 0, "fnematch");
}
- } while (buf[j] && s != 1);
+ for (n = MAX_UTF_BYTES ; n > 0; n--) {
+ *k++ = (c = getc(f)) != EOF ? c : 0;
+ if (c == EOF) {
+ if (ferror(f))
+ FATAL("fnematch: getc error");
+ break;
+ }
+ }
+ }
+
+ j += u8_rune(&c, (uschar *)j);
+
+ if ((ns = get_gototab(pfa, s, c)) != 0)
+ s = ns;
+ else
+ s = cgoto(pfa, s, c);
+
+ if (pfa->out[s]) { /* final state */
+ patbeg = i;
+ patlen = j - i;
+ if (c == 0) /* don't count $ */
+ patlen--;
+ }
+
+ if (c && s != 1)
+ continue; /* origin i still viable, next j */
+ if (patlen)
+ break; /* best match found */
+
+ /* no match at origin i, next i and start over */
+ i += u8_rune(&c, (uschar *)i);
+ if (c == 0)
+ break; /* no match */
+ j = i;
s = 2;
- if (r.len > 1)
- i += r.len - 1; // i incremented around the loop
- } while (buf[i] && !patlen);
+ } while (1);
/* adjbuf() may have relocated a resized buffer. Inform the world. */
*pbuf = buf;
*pbufsize = bufsize;
if (patlen) {
- patbeg = (char *) buf + i;
/*
* Under no circumstances is the last character fed to
* the automaton part of the match. It is EOF's nullbyte,
@@ -893,11 +931,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
* terminate the buffer.
*/
do
- for (int ii = r.len; ii > 0; ii--)
- if (buf[--k] && ungetc(buf[k], f) == EOF)
- FATAL("unable to ungetc '%c'", buf[k]);
- while (k > i + patlen);
- buf[k] = '\0';
+ if (*--k && ungetc(*k, f) == EOF)
+ FATAL("unable to ungetc '%c'", *k);
+ while (k > patbeg + patlen);
+ *k = '\0';
return true;
}
else
@@ -1486,8 +1523,7 @@ int cgoto(fa *f, int s, int c)
/* add tmpset to current set of states */
++(f->curstat);
resize_state(f, f->curstat);
- for (i = 0; i < NCHARS; i++)
- set_gototab(f, f->curstat, 0, 0);
+ clear_gototab(f, f->curstat);
xfree(f->posns[f->curstat]);
p = intalloc(setcnt + 1, __func__);
@@ -1511,7 +1547,8 @@ void freefa(fa *f) /* free a finite automaton */
if (f == NULL)
return;
for (i = 0; i < f->state_count; i++)
- xfree(f->gototab[i])
+ xfree(f->gototab[i].entries);
+ xfree(f->gototab);
for (i = 0; i <= f->curstat; i++)
xfree(f->posns[i]);
for (i = 0; i <= f->accept; i++) {
diff --git a/bugs-fixed/REGRESS b/bugs-fixed/REGRESS
index 07160031ca07..98d578ac22ef 100755
--- a/bugs-fixed/REGRESS
+++ b/bugs-fixed/REGRESS
@@ -1,4 +1,4 @@
-#! /bin/bash
+#! /bin/sh
if [ ! -f ../a.out ]
then
diff --git a/lex.c b/lex.c
index c1b892be30f5..141cc81d2b59 100644
--- a/lex.c
+++ b/lex.c
@@ -430,8 +430,12 @@ int string(void)
{
int i;
+ if (!isxdigit(peek())) {
+ unput(c);
+ break;
+ }
n = 0;
- for (i = 1; i <= 2; i++) {
+ for (i = 0; i < 2; i++) {
c = input();
if (c == 0)
break;
@@ -442,13 +446,13 @@ int string(void)
n += (c - '0');
else
n += 10 + (c - 'a');
- } else
+ } else {
+ unput(c);
break;
+ }
}
- if (n)
+ if (i)
*bp++ = n;
- else
- unput(c);
break;
}
diff --git a/main.c b/main.c
index 3a205c80fa10..c478e321f886 100644
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
-const char *version = "version 20231030";
+const char *version = "version 20231124";
#define DEBUG
#include <stdio.h>
diff --git a/makefile b/makefile
index df966ef1b846..b47a8afcf39a 100644
--- a/makefile
+++ b/makefile
@@ -28,10 +28,10 @@ CFLAGS =
CFLAGS = -O2
# compiler options
-#CC = gcc -Wall -g -Wwrite-strings
-#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing
-#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
-HOSTCC = gcc -g -Wall -pedantic -Wcast-qual
+#CC = cc -Wall -g -Wwrite-strings
+#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing
+#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
+HOSTCC = cc -g -Wall -pedantic -Wcast-qual
CC = $(HOSTCC) # change this is cross-compiling.
# By fiat, to make our lives easier, yacc is now defined to be bison.
diff --git a/maketab.c b/maketab.c
index d4b756ad6706..3a80c87725ac 100644
--- a/maketab.c
+++ b/maketab.c
@@ -52,8 +52,8 @@ struct xx
{ ARRAY, "array", NULL },
{ INDIRECT, "indirect", "$(" },
{ SUBSTR, "substr", "substr" },
- { SUB, "sub", "sub" },
- { GSUB, "gsub", "gsub" },
+ { SUB, "dosub", "sub" },
+ { GSUB, "dosub", "gsub" },
{ INDEX, "sindex", "sindex" },
{ SPRINTF, "awksprintf", "sprintf " },
{ ADD, "arith", " + " },
diff --git a/proto.h b/proto.h
index 879fbde492b5..b44f9e7a5599 100644
--- a/proto.h
+++ b/proto.h
@@ -198,8 +198,7 @@ extern FILE *openfile(int, const char *, bool *);
extern const char *filename(FILE *);
extern Cell *closefile(Node **, int);
extern void closeall(void);
-extern Cell *sub(Node **, int);
-extern Cell *gsub(Node **, int);
+extern Cell *dosub(Node **, int);
extern Cell *gensub(Node **, int);
extern FILE *popen(const char *, const char *);
diff --git a/run.c b/run.c
index fcb070661e9e..5efade2108fb 100644
--- a/run.c
+++ b/run.c
@@ -1540,8 +1540,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. */
if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
; /* self-assignment: leave alone unless it's a field or NF */
else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
+ yf = getfval(y);
setsval(x, getsval(y));
- x->fval = getfval(y);
+ x->fval = yf;
x->tval |= NUM;
}
else if (isstr(y))
@@ -2492,169 +2493,143 @@ static void flush_all(void)
void backsub(char **pb_ptr, const char **sptr_ptr);
-Cell *sub(Node **a, int nnn) /* substitute command */
+Cell *dosub(Node **a, int subop) /* sub and gsub */
{
- const char *sptr, *q;
- Cell *x, *y, *result;
- char *t, *buf, *pb;
fa *pfa;
+ int tempstat;
+ char *repl;
+ Cell *x;
+
+ char *buf = NULL;
+ char *pb = NULL;
int bufsz = recsize;
- if ((buf = (char *) malloc(bufsz)) == NULL)
- FATAL("out of memory in sub");
- x = execute(a[3]); /* target string */
- t = getsval(x);
- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
- pfa = (fa *) a[1]; /* regular expression */
- else {
- y = execute(a[1]);
- pfa = makedfa(getsval(y), 1);
- tempfree(y);
+ const char *r, *s;
+ const char *start;
+ const char *noempty = NULL; /* empty match disallowed here */
+ size_t m = 0; /* match count */
+ size_t whichm; /* which match to select, 0 = global */
+ int mtype; /* match type */
+
+ if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */
+ pfa = (fa *) a[1];
+ } else {
+ x = execute(a[1]);
+ pfa = makedfa(getsval(x), 1);
+ tempfree(x);
}
- y = execute(a[2]); /* replacement string */
- result = False;
- if (pmatch(pfa, t)) {
- sptr = t;
- adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
- pb = buf;
- while (sptr < patbeg)
- *pb++ = *sptr++;
- sptr = getsval(y);
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
+
+ x = execute(a[2]); /* replacement string */
+ repl = tostring(getsval(x));
+ tempfree(x);
+
+ switch (subop) {
+ case SUB:
+ whichm = 1;
+ x = execute(a[3]); /* source string */
+ break;
+ case GSUB:
+ whichm = 0;
+ x = execute(a[3]); /* source string */
+ break;
+ default:
+ FATAL("dosub: unrecognized subop: %d", subop);
+ }
+
+ start = getsval(x);
+ while (pmatch(pfa, start)) {
+ if (buf == NULL) {
+ if ((pb = buf = malloc(bufsz)) == NULL)
+ FATAL("out of memory in dosub");
+ tempstat = pfa->initstat;
+ pfa->initstat = 2;
}
- *pb = '\0';
- if (pb > buf + bufsz)
- FATAL("sub result1 %.30s too big; can't happen", buf);
- sptr = patbeg + patlen;
- if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
- while ((*pb++ = *sptr++) != '\0')
- continue;
+
+ /* match types */
+ #define MT_IGNORE 0 /* unselected or invalid */
+ #define MT_INSERT 1 /* selected, empty */
+ #define MT_REPLACE 2 /* selected, not empty */
+
+ /* an empty match just after replacement is invalid */
+
+ if (patbeg == noempty && patlen == 0) {
+ mtype = MT_IGNORE; /* invalid, not counted */
+ } else if (whichm == ++m || whichm == 0) {
+ mtype = patlen ? MT_REPLACE : MT_INSERT;
+ } else {
+ mtype = MT_IGNORE; /* unselected, but counted */
}
- if (pb > buf + bufsz)
- FATAL("sub result2 %.30s too big; can't happen", buf);
- setsval(x, buf); /* BUG: should be able to avoid copy */
- result = True;
- }
- tempfree(x);
- tempfree(y);
- free(buf);
- return result;
-}
-Cell *gsub(Node **a, int nnn) /* global substitute */
-{
- Cell *x, *y;
- char *rptr, *pb;
- const char *q, *t, *sptr;
- char *buf;
- fa *pfa;
- int mflag, tempstat, num;
- int bufsz = recsize;
- int charlen = 0;
+ /* leading text: */
+ if (patbeg > start) {
+ adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
+ recsize, &pb, "dosub");
+ s = start;
+ while (s < patbeg)
+ *pb++ = *s++;
+ }
- if ((buf = (char *) malloc(bufsz)) == NULL)
- FATAL("out of memory in gsub");
- mflag = 0; /* if mflag == 0, can replace empty string */
- num = 0;
- x = execute(a[3]); /* target string */
- t = getsval(x);
- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
- pfa = (fa *) a[1]; /* regular expression */
- else {
- y = execute(a[1]);
- pfa = makedfa(getsval(y), 1);
- tempfree(y);
- }
- y = execute(a[2]); /* replacement string */
- if (pmatch(pfa, t)) {
- tempstat = pfa->initstat;
- pfa->initstat = 2;
- pb = buf;
- rptr = getsval(y);
- do {
- if (patlen == 0 && *patbeg != '\0') { /* matched empty string */
- if (mflag == 0) { /* can replace empty */
- num++;
- sptr = rptr;
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
- }
- }
- if (*t == '\0') /* at end */
- goto done;
- adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
- charlen = u8_nextlen(t);
- while (charlen-- > 0)
- *pb++ = *t++;
- if (pb > buf + bufsz) /* BUG: not sure of this test */
- FATAL("gsub result0 %.30s too big; can't happen", buf);
- mflag = 0;
- }
- else { /* matched nonempty string */
- num++;
- sptr = t;
- adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
- while (sptr < patbeg)
- *pb++ = *sptr++;
- sptr = rptr;
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
- }
- t = patbeg + patlen;
- if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
- goto done;
- if (pb > buf + bufsz)
- FATAL("gsub result1 %.30s too big; can't happen", buf);
- mflag = 1;
+ if (mtype == MT_IGNORE)
+ goto matching_text; /* skip replacement text */
+
+ r = repl;
+ while (*r != 0) {
+ adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
+ if (*r == '\\') {
+ backsub(&pb, &r);
+ } else if (*r == '&') {
+ r++;
+ adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
+ &pb, "dosub");
+ for (s = patbeg; s < patbeg+patlen; )
+ *pb++ = *s++;
+ } else {
+ *pb++ = *r++;
}
- } while (pmatch(pfa,t));
- sptr = t;
- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
- while ((*pb++ = *sptr++) != '\0')
- continue;
- done: if (pb < buf + bufsz)
- *pb = '\0';
- else if (*(pb-1) != '\0')
- FATAL("gsub result2 %.30s truncated; can't happen", buf);
- setsval(x, buf); /* BUG: should be able to avoid copy + free */
+ }
+
+matching_text:
+ if (mtype == MT_REPLACE || *patbeg == '\0')
+ goto next_search; /* skip matching text */
+
+ if (patlen == 0)
+ patlen = u8_nextlen(patbeg);
+ adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
+ s = patbeg;
+ while (s < patbeg + patlen)
+ *pb++ = *s++;
+
+next_search:
+ start = patbeg + patlen;
+ if (m == whichm || *patbeg == '\0')
+ break;
+ if (mtype == MT_REPLACE)
+ noempty = start;
+
+ #undef MT_IGNORE
+ #undef MT_INSERT
+ #undef MT_REPLACE
+ }
+
+ xfree(repl);
+
+ if (buf != NULL) {
pfa->initstat = tempstat;
+
+ /* trailing text */
+ adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
+ while ((*pb++ = *start++) != '\0')
+ ;
+
+ setsval(x, buf);
+ free(buf);
}
+
tempfree(x);
- tempfree(y);
x = gettemp();
x->tval = NUM;
- x->fval = num;
- free(buf);
- return(x);
+ x->fval = m;
+ return x;
}
Cell *gensub(Node **a, int nnn) /* global selective substitute */
diff --git a/testdir/Compare.tt b/testdir/Compare.tt
index ca828d258658..4b297d731c94 100755
--- a/testdir/Compare.tt
+++ b/testdir/Compare.tt
@@ -4,7 +4,7 @@ oldawk=${oldawk-awk}
awk=${awk-../a.out}
echo compiling time.c
-gcc time.c -o time
+cc time.c -o time
time=./time
echo time command = $time
diff --git a/testdir/REGRESS b/testdir/REGRESS
index 5c3667f5eede..b54ce3f68ea0 100755
--- a/testdir/REGRESS
+++ b/testdir/REGRESS
@@ -1,7 +1,7 @@
#!/bin/sh
uname -a
-gcc echo.c -o echo && echo echo compiled
+cc echo.c -o echo && echo echo compiled
oldawk=${oldawk-awk}
awk=${awk-../a.out}
diff --git a/testdir/T.csv b/testdir/T.csv
index 10da1ea90b8b..79c15104cfb3 100755
--- a/testdir/T.csv
+++ b/testdir/T.csv
@@ -77,5 +77,4 @@ a''b [a''b]
a, [a][]
"", [][]
, [][]
-a"b [a"b]
!!!!
diff --git a/testdir/T.flags b/testdir/T.flags
index 33d7c8db7fe6..17ce56133745 100755
--- a/testdir/T.flags
+++ b/testdir/T.flags
@@ -20,5 +20,6 @@ grep 'unknown option' foo >/dev/null || echo 'T.flags: bad unknown option'
$awk -F >foo 2>&1
grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator'
-$awk -F '' >foo 2>&1
-grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
+### Awk is now like gawk and splits into separate characters if FS = ""
+# $awk -F '' >foo 2>&1
+# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
diff --git a/testdir/T.misc b/testdir/T.misc
index 1e5c3c553d9e..b8ed3c1c45f9 100755
--- a/testdir/T.misc
+++ b/testdir/T.misc
@@ -510,3 +510,17 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error'
echo 1b >foo1
echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2
cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered'
+
+# Check handling of octal \OOO and hex \xHH esc. seqs. in strings.
+echo 'hello888
+hello
+hello
+helloxGOO
+hello
+0A' > foo1
+$awk 'BEGIN { print "hello\888" }' > foo2
+$awk 'BEGIN { print "hello\x000A" }' >> foo2
+$awk 'BEGIN { printf "hello\x0A" }' >> foo2
+$awk 'BEGIN { print "hello\xGOO" }' >> foo2
+$awk 'BEGIN { print "hello\x0A0A" }' >> foo2
+cmp -s foo1 foo2 || echo 'BAD: T.misc escape sequences in strings mishandled'