shithub: trueawk

Download patch

ref: cf7cbbb1583eed2c961366f6ab62f14548052091
parent: 11b2b7b6d5c42ea63f79ce1c1d88264f83cc2155
parent: d801514094d1140dfc9f8571b9821082ddddf107
author: Arnold D. Robbins <arnold@skeeve.com>
date: Tue Oct 31 05:15:53 EDT 2023

Merge branch 'master' into improve-gototab

--- a/FIXES
+++ b/FIXES
@@ -25,6 +25,15 @@
 This file lists all bug fixes, changes, etc., made since the 
 second edition of the AWK book was published in September 2023.
 
+Oct 30, 2023:
+	multiple fixes and a minor code cleanup.
+	disabled utf-8 for non-multibyte locales, such as C or POSIX.
+	fixed a bad char * cast that caused incorrect results on big-endian
+	systems. also fixed an out-of-bounds read for an empty CCL.
+	fixed a buffer overflow in substr with utf-8 strings.
+	many thanks to Todd C Miller.
+	
+
 Sep 24, 2023:
 	fnematch and getrune have been overhauled to solve issues around
 	unicode FS and RS. also fixed gsub null match issue with unicode.
--- a/awk.h
+++ b/awk.h
@@ -64,6 +64,8 @@
 #define	RECSIZE	(8 * 1024)	/* sets limit on records, fields, etc., etc. */
 extern int	recsize;	/* size of current record, orig RECSIZE */
 
+extern size_t	awk_mb_cur_max;	/* max size of a multi-byte character */
+
 extern char	EMPTY[];	/* this avoid -Wwritable-strings issues */
 extern char	**FS;
 extern char	**RS;
--- a/b.c
+++ b/b.c
@@ -530,7 +530,7 @@
 			setvec[lp] = 1;
 			setcnt++;
 		}
-		if (type(p) == CCL && (*(char *) right(p)) == '\0')
+		if (type(p) == CCL && (*(int *) right(p)) == 0)
 			return(0);		/* empty CCL */
 		return(1);
 	case PLUS:
@@ -846,7 +846,7 @@
 	c = getc(fp);
 	if (c == EOF)
 		return result;	// result.rune == 0 --> EOF
-	else if (c < 128) {
+	else if (c < 128 || awk_mb_cur_max == 1) {
 		result.bytes[0] = c;
 		result.len = 1;
 		result.rune = c;
@@ -1038,7 +1038,7 @@
 		rtok = relex();
 		if (rtok == ')') {	/* special pleading for () */
 			rtok = relex();
-			return unary(op2(CCL, NIL, (Node *) tostring("")));
+			return unary(op2(CCL, NIL, (Node *) cclenter("")));
 		}
 		np = regexp();
 		if (rtok == ')') {
@@ -1061,7 +1061,7 @@
 		return (concat(op2(CAT, np, primary())));
 	case EMPTYRE:
 		rtok = relex();
-		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
 				primary())));
 	}
 	return (np);
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20231001";
+const char	*version = "version 20231030";
 
 #define DEBUG
 #include <stdio.h>
@@ -53,6 +53,8 @@
 
 bool	safe = false;	/* true => "safe" mode */
 
+size_t	awk_mb_cur_max = 1;
+
 static noreturn void fpecatch(int n
 #ifdef SA_SIGINFO
 	, siginfo_t *si, void *uc
@@ -116,6 +118,7 @@
 
 	setlocale(LC_CTYPE, "");
 	setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
+	awk_mb_cur_max = MB_CUR_MAX;
 	cmdname = argv[0];
 	if (argc == 1) {
 		fprintf(stderr,
--- a/run.c
+++ b/run.c
@@ -605,7 +605,7 @@
 	unsigned char c;
 
 	c = s[0];
-	if (c < 128)
+	if (c < 128 || awk_mb_cur_max == 1)
 		return 1; /* what if it's 0? */
 
 	n = strlen(s);
@@ -632,7 +632,7 @@
 	unsigned char c;
 
 	c = s[0];
-	if (c < 128) {
+	if (c < 128 || awk_mb_cur_max == 1) {
 		*rune = c;
 		return 1;
 	}
@@ -679,7 +679,7 @@
 	totlen = 0;
 	for (i = 0; i < n; i += len) {
 		c = s[i];
-		if (c < 128) {
+		if (c < 128 || awk_mb_cur_max == 1) {
 			len = 1;
 		} else {
 			len = u8_nextlen(&s[i]);
@@ -985,7 +985,7 @@
 	if (a[2] != NULL)
 		z = execute(a[2]);
 	s = getsval(x);
-	k = strlen(s) + 1;
+	k = u8_strlen(s) + 1;
 	if (k <= 1) {
 		tempfree(x);
 		tempfree(y);
@@ -1289,7 +1289,7 @@
 				int charval = (int) getfval(x);
 
 				if (charval != 0) {
-					if (charval < 128)
+					if (charval < 128 || awk_mb_cur_max == 1)
 						snprintf(p, BUFSZ(p), fmt, charval);
 					else {
 						// possible unicode character
@@ -1349,7 +1349,7 @@
 			int i;
 
 			if (ljust) { // print one char from t, then pad blanks
-				for (int i = 0; i < n; i++)
+				for (i = 0; i < n; i++)
 					*p++ = t[i];
 				for (i = 0; i < pad; i++) {
 					//printf(" ");
@@ -1360,7 +1360,7 @@
 					//printf(" ");
 					*p++ = ' ';
 				}
-				for (int i = 0; i < n; i++)
+				for (i = 0; i < n; i++)
 					*p++ = t[i];
 			}
 			*p = 0;
@@ -1977,7 +1977,7 @@
 	const char *ps = NULL;
 	size_t n       = 0;
 	wchar_t wc;
-	size_t sz = MB_CUR_MAX;
+	const size_t sz = awk_mb_cur_max;
 	int unused;
 
 	if (sz == 1) {
--