ref: 9b5913fffba9965972d1a63ce019dc533c0be40c
parent: 4b1b2d6357efbfbf037409e17e91ccdbbae642c1
author: Brian Kernighan <fakeuser@fake.com>
date: Sun Jun 25 21:54:12 EDT 2023
added Arnold"s state machine for handling newlines inside quoted strings in --csv mode. Also fiddled awk.1 man page
--- a/lib.c
+++ b/lib.c
@@ -302,57 +302,29 @@
int sep, c;
char *rr = *pbuf, *buf = *pbuf;
int bufsize = *pbufsize;
+ bool in_quote = false;
sep = '\n'; /* the only separator; have to skip over \n embedded in "..." */
rr = buf;
- for (; (c=getc(inf)) != sep && c != EOF; ) {+ while ((c = getc(inf)) != EOF) {+ if (c == sep) {+ if (! in_quote)
+ break;
+ if (rr > buf && rr[-1] == '\r') // remove \r if was \r\n
+ rr--;
+ }
+
if (rr-buf+1 > bufsize)
if (!adjbuf(&buf, &bufsize, 1+rr-buf,
recsize, &rr, "readcsvrec 1"))
FATAL("input record `%.30s...' too long", buf);*rr++ = c;
- if (c == ',')
- continue;
+ if (c == '"')
+ in_quote = ! in_quote;
+ }
+ if (c == '\n' && rr > buf && rr[-1] == '\r') // remove \r if was \r\n
+ rr--;
- if (c != '"' ) { /* unquoted field; read until , or \n */- while ((c = getc(inf)) != ',' && c != '\n' && c != EOF) {- if (rr-buf+1 > bufsize)
- if (!adjbuf(&buf, &bufsize, 1+rr-buf,
- recsize, &rr, "readcsvrec 2"))
- FATAL("input record `%.30s...' too long", buf);- *rr++ = c;
- }
- if (c == ',')
- *rr++ = c;
-
- } else { /* start of "..." */- while ((c = getc(inf)) != EOF) {- if (rr-buf+1 > bufsize)
- if (!adjbuf(&buf, &bufsize, 1+rr-buf,
- recsize, &rr, "readcsvrec 3"))
- FATAL("input record `%.30s...' too long", buf);- if (c != '"') {- *rr++ = c;
- } else {- *rr++ = c;
- if ((c = getc(inf)) == ',') {- *rr++ = c;
- break;
- } else if (c == '\n') {- break;
- } else if (c == '"') {- *rr++ = c;
- } else {- *rr++ = c;
- FATAL("malformed csv record %.30s...", buf);- }
- }
- }
- }
-
- if (c == '\n' || c == EOF)
- break;
- }
if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 4"))
FATAL("input record `%.30s...' too long", buf);*rr = 0;
--
⑨