shithub: trueawk

Download patch

ref: c78a04e3f7969c206d199a3460f536c1bfab99a6
parent: c8b4698d57d7b1dc6275fd514fcdf163955e8f67
author: Arnold D. Robbins <arnold@skeeve.com>
date: Thu Sep 21 05:20:29 EDT 2023

Start on rewrite of fnematch.

--- a/b.c
+++ b/b.c
@@ -80,7 +80,9 @@
 fa	*fatab[NFA];
 int	nfatab	= 0;	/* entries in fatab */
 
+extern int u8_nextlen(const char *s);
 
+
 /* utf-8 mechanism:
 
    For most of Awk, utf-8 strings just "work", since they look like
@@ -754,36 +756,61 @@
 	return (0);
 }
 
-static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum,
-		   int *curpos, int *lastpos)
+
+#define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
+
+// Read one rune from the given FILE*. Return both its encoded bytes
+// and the decoded rune; an all-zero result (len == 0) means EOF.
+
+struct runedata {	// one character as read by getrune()
+	int rune;	// decoded rune; 0 when getrune() hit EOF
+	size_t len;	// count of bytes kept in bytes[]; 0 at EOF
+	char bytes[6];	// the raw input bytes, NUL-terminated by getrune()
+};
+
+struct runedata getrune(FILE *fp)
 {
-	int c = 0;
-	char *buf = *pbuf;
-	static const int max_bytes = 4;	// max multiple bytes in UTF-8 is 4
-	int i, rune;
-	uschar private_buf[max_bytes + 1];
+	struct runedata result;
+	int c, next;
 
-	for (i = 0; i <= max_bytes; i++) {
-		if (++*curpos == *lastpos) {
-			if (*lastpos == *pbufsize)
-				if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune"))
-					FATAL("stream '%.30s...' too long", buf);
-			buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0;
-			private_buf[i] = c;
-		}
-		if (c == 0 || c < 128 ||  (c >> 6) == 4) { // 10xxxxxx starts a new character
-			ungetc(c, fp);
-			private_buf[i] = 0;
+	memset(&result, 0, sizeof(result));
+
+	c = getc(fp);
+	if (c == EOF)
+		return result;	// result.rune == 0 --> EOF
+	else if (c < 128) {
+		result.bytes[0] = c;
+		result.len = 1;
+		result.rune = c;
+
+		return result;
+	}
+
+	// non-ASCII first byte: read up to MAX_UTF_BYTES - 1 more bytes
+	result.bytes[0] = c;
+	result.len = 1;
+
+	next = 1;
+	for (int i = 1; i < MAX_UTF_BYTES; i++) {
+		c = getc(fp);
+		if (c == EOF)
 			break;
-		}
+		result.bytes[next++] = c;
+		result.len++;
 	}
 
-	u8_rune(& rune, private_buf);
+	// put back any extra input bytes
+	int actual_len = u8_nextlen(result.bytes);
+	while (result.len > actual_len) {
+		ungetc(result.bytes[--result.len], fp);
+	}
 
-	return rune;
+	result.bytes[result.len] = '\0';
+	(void) u8_rune(& result.rune, (uschar *) result.bytes);
+
+	return result;
 }
 
-
 /*
  * NAME
  *     fnematch
@@ -803,8 +830,8 @@
 {
 	char *buf = *pbuf;
 	int bufsize = *pbufsize;
-	int c, i, j, k, ns, s;
-	int rune;
+	int i, j, k, ns, s;
+	struct runedata r;
 
 	s = pfa->initstat;
 	patlen = 0;
@@ -813,8 +840,8 @@
 	 * All indices relative to buf.
 	 * i <= j <= k <= bufsize
 	 *
-	 * i: origin of active substring
-	 * j: current character
+	 * i: origin of active substring (first byte of first character)
+	 * j: current character		(last byte of current character)
 	 * k: destination of next getc()
 	 */
 	i = -1, k = 0;
@@ -821,30 +848,26 @@
         do {
 		j = i++;
 		do {
-			if (++j == k) {
-				if (k == bufsize)
+			r = getrune(f);
+			if ((++j + r.len) >= k) {
+				if (k >= bufsize)
 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
 						FATAL("stream '%.30s...' too long", buf);
-				buf[k++] = (c = getc(f)) != EOF ? c : 0;
 			}
-			c = (uschar)buf[j];
-			if (c < 128)
-				rune = c;
-			else {
-				j--;
-				k--;
-				ungetc(c, f);
-				rune = getrune(f, &buf, &bufsize, quantum, &j, &k);
-			}
+			memcpy(buf + k, r.bytes, r.len);
+			j += r.len - 1;	// incremented next time around the loop
+			k += r.len;
+			if (r.len > 1)
+				i += r.len - 1;	// also
 
-			if ((ns = get_gototab(pfa, s, rune)) != 0)
+			if ((ns = get_gototab(pfa, s, r.rune)) != 0)
 				s = ns;
 			else
-				s = cgoto(pfa, s, rune);
+				s = cgoto(pfa, s, r.rune);
 
 			if (pfa->out[s]) {	/* final state */
 				patlen = j - i + 1;
-				if (c == 0)	/* don't count $ */
+				if (r.rune == 0)	/* don't count $ */
 					patlen--;
 			}
 		} while (buf[j] && s != 1);
@@ -869,8 +892,9 @@
 		 * terminate the buffer.
 		 */
 		do
-			if (buf[--k] && ungetc(buf[k], f) == EOF)
-				FATAL("unable to ungetc '%c'", buf[k]);
+			for (int ii = r.len; ii > 0; ii--)
+				if (buf[--k] && ungetc(buf[k], f) == EOF)
+					FATAL("unable to ungetc '%c'", buf[k]);
 		while (k > i + patlen);
 		buf[k] = '\0';
 		return true;
--- a/makefile
+++ b/makefile
@@ -23,9 +23,9 @@
 # ****************************************************************/
 
 CFLAGS = -fsanitize=address -O1 -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
-CFLAGS = -g
-CFLAGS =
-CFLAGS = -O2
+CFLAGS = -g3
+#CFLAGS =
+#CFLAGS = -O2
 
 # compiler options
 #CC = gcc -Wall -g -Wwrite-strings
--