shithub: trueawk

Download patch

ref: 1a7797e938dc2b0a52b2d4116a98c019c447660b
parent: 8444a9d1ddb54fe044d26a9ce30c2619a1bf566d
author: Brian Kernighan <fakeuser@fake.com>
date: Wed May 17 13:38:53 EDT 2023

fixes to --csv with embedded newlines

--- a/lib.c
+++ b/lib.c
@@ -289,6 +289,14 @@
 }
 
 
+/*******************
+ * loose ends here:
+ *   \r\n should become \n
+ *   what about a bare \r?  Excel uses that for embedded newlines
+ *   RFC 4180 does not allow a " inside an unquoted field
+ */
+
+
 int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* csv can have \n's */
 {			/* so read a complete record that might be multiple lines */
 	int sep, c;
@@ -303,9 +311,15 @@
 			    recsize, &rr, "readcsvrec 1"))
 				FATAL("input record `%.30s...' too long", buf);
 		*rr++ = c;
+		if (c == ',')
+			continue;
 
 		if (c != '"' ) {    	/* unquoted field; read until , or \n */
 			while ((c = getc(inf)) != ',' && c != '\n' && c != EOF) {
+				if (rr-buf+1 > bufsize)
+					if (!adjbuf(&buf, &bufsize, 1+rr-buf,
+					    recsize, &rr, "readcsvrec 2"))
+						FATAL("input record `%.30s...' too long", buf);
 				*rr++ = c;
 			}
 			if (c == ',')
@@ -313,6 +327,10 @@
 
 		} else { 		/* start of "..." */
 			while ((c = getc(inf)) != EOF) {
+				if (rr-buf+1 > bufsize)
+					if (!adjbuf(&buf, &bufsize, 1+rr-buf,
+					    recsize, &rr, "readcsvrec 3"))
+						FATAL("input record `%.30s...' too long", buf);
 				if (c != '"') {
 					*rr++ = c;
 				} else {
@@ -335,7 +353,7 @@
 		if (c == '\n' || c == EOF)
 			break;
 	}
-	if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 3"))
+	if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 4"))
 		FATAL("input record `%.30s...' too long", buf);
 	*rr = 0;
 	*pbuf = buf;
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20230314";
+const char	*version = "version 20230516";
 
 #define DEBUG
 #include <stdio.h>
--- a/testdir/T.csv
+++ b/testdir/T.csv
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-echo T.csv: tests of csv field splitting
+echo T.csv: tests of csv field splitting, no embedded newlines
 
 awk=${awk-../a.out}
 
@@ -69,16 +69,14 @@
 """"	["]
 """"""	[""]
 """x"""	["x"]
+""","""	[","]
 ,,""	[][][]
 a""b	[a""b]
 a"b	[a"b]
 a''b	[a''b]
-"abc	[abc]
-abc,"def	[abc][def]
 ,,	[][][]
 a,	[a][]
 "",	[][]
 ,	[][]
-"abc",def	[abc][def]
-	[]
+	
 !!!!
--- a/tran.c
+++ b/tran.c
@@ -348,6 +348,10 @@
 		(void*)vp, NN(vp->nval), s, vp->tval, donerec, donefld);
 	if ((vp->tval & (NUM | STR)) == 0)
 		funnyvar(vp, "assign to");
+	if (CSV && (vp == rsloc))
+		WARNING("danger: don't set RS when --csv is in effect");
+	if (CSV && (vp == fsloc))
+		WARNING("danger: don't set FS when --csv is in effect");
 	if (isfld(vp)) {
 		donerec = false;	/* mark $0 invalid */
 		fldno = atoi(vp->nval);
--