ref: 3c3338de9904dfd0a3ef49656542b26a660b8501
parent: 927dadbc783653f3f1023478ac6e5374f4b24d03
author: Brian Kernighan <fakeuser@fake.com>
date: Mon May 15 11:13:44 EDT 2023
first cut at proper csv with embedded newlines
--- a/lib.c
+++ b/lib.c
@@ -223,14 +223,19 @@
argno++;
}
+extern int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag);
+
int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one record into buf */
{- int sep, c, isrec;
- char *rr, *buf = *pbuf;
+ int sep, c, isrec; // POTENTIAL BUG? isrec is a macro in awk.h
+ char *rr = *pbuf, *buf = *pbuf;
int bufsize = *pbufsize;
char *rs = getsval(rsloc);
- if (*rs && rs[1]) {+ if (CSV) {+ c = readcsvrec(pbuf, pbufsize, inf, newflag);
+ isrec = (c == EOF && rr == buf) ? false : true;
+ } else if (*rs && rs[1]) {bool found;
fa *pfa = makedfa(rs, 1);
@@ -245,6 +250,7 @@
if (found)
setptr(patbeg, '\0');
isrec = (found == 0 && *buf == '\0') ? false : true;
+
} else { if ((sep = *rs) == 0) {sep = '\n';
@@ -282,6 +288,61 @@
return isrec;
}
+
+int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* csv can have \n's */
+{			/* so read a complete record that might be multiple lines */
+	/*
+	 * Reads characters from inf into *pbuf until a \n that is not inside a
+	 * "..." field, or until EOF.  Commas and quotes are stored as read; the
+	 * terminating \n is not stored.  adjbuf is asked for room before every
+	 * store, and *pbuf / *pbufsize are written back on return.
+	 * Returns the character that ended the record: '\n' or EOF.
+	 * newflag is accepted for signature parity with readrec and is unused.
+	 */
+	enum { START, PLAIN, QUOTED, ENDQ } st = START;	/* ENDQ: just saw a " inside "..." */
+	int c;
+	char *rr, *buf = *pbuf;
+	int bufsize = *pbufsize;
+
+	rr = buf;
+	while ((c = getc(inf)) != EOF) {
+		if (c == '\n' && st != QUOTED)	/* \n ends the record unless inside "..." */
+			break;
+
+		/* single store site: every byte is bounds-checked, whatever the field length */
+		if (rr-buf+1 > bufsize)
+			if (!adjbuf(&buf, &bufsize, 1+rr-buf,
+			    recsize, &rr, "readcsvrec 1"))
+				FATAL("input record `%.30s...' too long", buf);
+
+		if (st == ENDQ && c != '"' && c != ',') {
+			*rr = 0;	/* room was ensured above; terminate buf for the message */
+			FATAL("malformed csv record %.30s...", buf);
+		}
+		*rr++ = c;
+
+		switch (st) {
+		case START:	/* first char of a field; a bare , is an empty field */
+			st = (c == '"') ? QUOTED : (c == ',') ? START : PLAIN;
+			break;
+		case PLAIN:	/* unquoted field: a " in mid-field is ordinary data */
+			if (c == ',')
+				st = START;
+			break;
+		case QUOTED:	/* inside "...": everything is data, including , and \n */
+			if (c == '"')
+				st = ENDQ;
+			break;
+		case ENDQ:	/* "" is an escaped quote; , closes the field */
+			st = (c == '"') ? QUOTED : START;
+			break;
+		}
+	}
+	/* EOF right after a closing quote lands here too: a valid last record */
+	if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 3"))
+		FATAL("input record `%.30s...' too long", buf);
+	*rr = 0;
+	*pbuf = buf;
+	*pbufsize = bufsize;
+	DPRINTF("readcsvrec saw <%s>, returns %d\n", buf, c);
+	return c;
+}
+
char *getargv(int n) /* get ARGV[n] */
{Cell *x;
@@ -370,35 +431,37 @@
}
*fr = 0;
} else if (CSV) { /* CSV processing. no error handling */- for (;;) {- i++;
- if (i > nfields)
- growfldtab(i);
- if (freeable(fldtab[i]))
- xfree(fldtab[i]->sval);
- fldtab[i]->sval = fr;
- fldtab[i]->tval = FLD | STR | DONTFREE;
- if (*r == '"' ) { /* start of "..." */- for (r++ ; *r != '\0'; ) {- if (*r == '"' && r[1] != '\0' && r[1] == '"') {- r += 2; /* doubled quote */
- *fr++ = '"';
- } else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {- r++; /* skip over closing quote */
- break;
- } else {- *fr++ = *r++;
+ if (*r != 0) {+ for (;;) {+ i++;
+ if (i > nfields)
+ growfldtab(i);
+ if (freeable(fldtab[i]))
+ xfree(fldtab[i]->sval);
+ fldtab[i]->sval = fr;
+ fldtab[i]->tval = FLD | STR | DONTFREE;
+ if (*r == '"' ) { /* start of "..." */+ for (r++ ; *r != '\0'; ) {+ if (*r == '"' && r[1] != '\0' && r[1] == '"') {+ r += 2; /* doubled quote */
+ *fr++ = '"';
+ } else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {+ r++; /* skip over closing quote */
+ break;
+ } else {+ *fr++ = *r++;
+ }
}
+ *fr++ = 0;
+ } else { /* unquoted field */+ while (*r != ',' && *r != '\0')
+ *fr++ = *r++;
+ *fr++ = 0;
}
- *fr++ = 0;
- } else { /* unquoted field */- while (*r != ',' && *r != '\0')
- *fr++ = *r++;
- *fr++ = 0;
+ if (*r++ == 0)
+ break;
+
}
- if (*r++ == 0)
- break;
-
}
*fr = 0;
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */--- a/tran.c
+++ b/tran.c
@@ -308,7 +308,7 @@
} else if (&vp->fval == NF) {donerec = false; /* mark $0 invalid */
setlastfld(f);
- DPRINTF("setting NF to %g\n", f);+ DPRINTF("setfval: setting NF to %g\n", f); } else if (isrec(vp)) {donefld = false; /* mark $1... invalid */
donerec = true;
@@ -375,7 +375,7 @@
donerec = false; /* mark $0 invalid */
f = getfval(vp);
setlastfld(f);
- DPRINTF("setting NF to %g\n", f);+ DPRINTF("setsval: setting NF to %g\n", f);}
return(vp->sval);
--
⑨