shithub: trueawk

Download patch

ref: d3a19e6f2533d479841e0a7e49582be8be3ea51c
parent: 48a41180edf8160b140ef710c20aa34d0b01b063
author: Brian Kernighan <fakeuser@fake.com>
date: Sat Aug 27 10:36:46 EDT 2022

fixed (i think) the csv code in lib.c, added tests to T.csv,
and also fixed splitting of input lines when FS="", plus tests in T.utf

--- a/lib.c
+++ b/lib.c
@@ -34,6 +34,8 @@
 #include <math.h>
 #include "awk.h"
 
+extern int u8_nextlen(const char *s);
+
 char	EMPTY[] = { '\0' };
 FILE	*infile	= NULL;
 bool	innew;		/* true = infile has not been read by readrec */
@@ -327,7 +329,7 @@
 	/* possibly with a final trailing \0 not associated with any field */
 	char *r, *fr, sep;
 	Cell *p;
-	int i, j, n, quote;
+	int i, j, n;
 
 	if (donefld)
 		return;
@@ -366,8 +368,8 @@
 			*fr++ = 0;
 		}
 		*fr = 0;
-	} else if ((sep = *inputFS) == ',') {	/* CSV: handle quotes, \x, etc. */
-		for (i = 0; *r != '\0'; ) {
+	} else if ((sep = *inputFS) == ',') {	/* CSV processing.  no error handling */
+		for (;;) {
 			i++;
 			if (i > nfields)
 				growfldtab(i);
@@ -375,23 +377,13 @@
 				xfree(fldtab[i]->sval);
 			fldtab[i]->sval = fr;
 			fldtab[i]->tval = FLD | STR | DONTFREE;
-
-/* printf("fldbld 1 [%s] [%d:] [%s]\n", r, i, fr); */
-
-			if (*r == '"' /* || *r == '\'' */ ) { /* "..."; do not include '...' */
-				quote = *r++;
-				for ( ; *r != '\0'; ) {
-/* printf("fldbld 2   [%s]\n", r); */
-					if (*r == quote && r[1] != '\0' && r[1] == quote) {
+			if (*r == '"' ) { /* start of "..." */
+				for (r++ ; *r != '\0'; ) {
+					if (*r == '"' && r[1] != '\0' && r[1] == '"') {
 						r += 2; /* doubled quote */
-						*fr++ = quote;
-					} else if (*r == '\\') { /* BUG: off end? */
-						r++; /* backslashes inside "..." ??? */
-						*fr++ = *r++;
-					} else if (*r == quote && (r[1] == '\0' || r[1] == ',')) {
-						r++;
-						if (*r == ',')
-							r++;
+						*fr++ = '"';
+					} else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {
+						r++; /* skip over closing quote */
 						break;
 					} else {
 						*fr++ = *r++;
@@ -398,45 +390,34 @@
 					}
 				}
 				*fr++ = 0;
-				continue;
-			}
-
-			/* unquoted field */
-			for ( ; *r != '\0'; ) {
-				if (*r == ',') { /* bare comma ends field */
-					r++;
-					*fr++ = 0;
-					break;
-				} else if (*r == '\\') { /* BUG: could walk off end */
-					r++;
+			} else {	/* unquoted field */
+				while (*r != ',' && *r != '\0')
 					*fr++ = *r++;
-				} else {
-					*fr++ = *r++;
-				}
+				*fr++ = 0;
 			}
-			*fr++ = 0;
+			if (*r++ == 0)
+				break;
+
 		}
 		*fr = 0;
-	} else if ((sep = *inputFS) == 0) {		/* new: FS="" => 1 char/field */
-		for (i = 0; *r != '\0'; r += n) {
-			char buf[MB_LEN_MAX + 1];
-
+	} else if ((sep = *inputFS) == 0) {	/* new: FS="" => 1 char/field */
+		for (i = 0; *r != '\0'; ) {
+			char buf[10];
 			i++;
 			if (i > nfields)
 				growfldtab(i);
 			if (freeable(fldtab[i]))
 				xfree(fldtab[i]->sval);
-			n = mblen(r, MB_LEN_MAX);
-			if (n < 0)
-				n = 1;
-			memcpy(buf, r, n);
-			buf[n] = '\0';
+			n = u8_nextlen(r);
+			for (j = 0; j < n; j++)
+				buf[j] = *r++;
+			buf[j] = '\0';
 			fldtab[i]->sval = tostring(buf);
 			fldtab[i]->tval = FLD | STR;
 		}
 		*fr = 0;
 	} else if (*r != 0) {	/* if 0, it's a null field */
-		/* subtlecase : if length(FS) == 1 && length(RS > 0)
+		/* subtle case: if length(FS) == 1 && length(RS) > 0
 		 * \n is NOT a field separator (cf awk book 61,84).
 		 * this variable is tested in the inner while loop.
 		 */
--- /dev/null
+++ b/testdir/T.csv
@@ -1,0 +1,84 @@
+#!/bin/sh
+
+echo T.csv: tests of csv field splitting
+
+awk=${awk-../a.out}
+
+$awk '
+BEGIN {
+	FS = "\t"
+	awk = "../a.out"
+}
+NF == 0 || $1 ~ /^#/ {
+	next
+}
+$1 ~ /try/ {	# new test
+	nt++
+	sub(/try /, "")
+	prog = $0
+	printf("%3d  %s\n", nt, prog)
+	prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)
+	# print "prog is", prog
+	nt2 = 0
+	while (getline > 0) {
+		if (NF == 0)	# blank line terminates a sequence
+			break
+		input = $1
+		for (i = 2; i < NF; i++)	# input data
+			input = input "\t" $i
+		test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",
+			input, prog)
+		if ($NF == "\"\"")
+			output = ">foo2;"
+		else
+			output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)
+		gsub(/\\t/, "\t", output)
+		gsub(/\\n/, "\n", output)
+		run = sprintf("cmp foo1 foo2 || echo test %d.%d failed",
+			nt, ++nt2)
+		# print  "input is", input
+		# print  "test is", test
+		# print  "output is", output
+		# print  "run is", run
+		system(test output run)
+	}
+	tt += nt2
+}
+END { print tt, "tests" }
+' <<\!!!!
+# General format:
+# try program as rest of line
+# $1	$2	$3	output1  (\t for tab, \n for newline,
+# $1	$2	$3	output2  ("" for null)
+# ... terminated by blank line
+
+
+try BEGIN {FS=","}; { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }
+a	[a]
+  a	[  a]
+,a	[][a]
+ , a	[ ][ a]
+a,b	[a][b]
+a,b,c	[a][b][c]
+""	[]
+"abc"	[abc]
+"a""b"	[a"b]
+"a","b"	[a][b]
+a""b	[a""b]
+"a,b"	[a,b]
+""""	["]
+""""""	[""]
+"""x"""	["x"]
+,,""	[][][]
+a""b	[a""b]
+a"b	[a"b]
+a''b	[a''b]
+"abc	[abc]
+abc,"def	[abc][def]
+,,	[][][]
+a,	[a][]
+"",	[][]
+,	[][]
+"abc",def	[abc][def]
+	[]
+!!!!
--- a/testdir/T.utf
+++ b/testdir/T.utf
@@ -143,4 +143,16 @@
 για όλους τους καλούς ά	α.*α	3 3 15
 να έρθει στο πά	[^ν]	2 2 1
 
+# FS="" should split into unicode chars
+try emptyFS BEGIN {FS=""} {print NF}
+すべての善人のために	10
+の今がその時だ	7
+Сейчас	6
+现在是时候了	6
+给所有的好男	6
+来参加聚会。	6
+😀	1
+🖕 finger	8
+
+
 !!!!
--