ref: d3a19e6f2533d479841e0a7e49582be8be3ea51c
parent: 48a41180edf8160b140ef710c20aa34d0b01b063
author: Brian Kernighan <fakeuser@fake.com>
date: Sat Aug 27 10:36:46 EDT 2022
fixed (I think) the CSV code in lib.c, added tests to T.csv, and also fixed splitting of input lines when FS="", plus tests in T.utf
--- a/lib.c
+++ b/lib.c
@@ -34,6 +34,8 @@
#include <math.h>
#include "awk.h"
+extern int u8_nextlen(const char *s);
+
char EMPTY[] = { '\0' };FILE *infile = NULL;
bool innew; /* true = infile has not been read by readrec */
@@ -327,7 +329,7 @@
/* possibly with a final trailing \0 not associated with any field */
char *r, *fr, sep;
Cell *p;
- int i, j, n, quote;
+ int i, j, n;
if (donefld)
return;
@@ -366,8 +368,8 @@
*fr++ = 0;
}
*fr = 0;
- } else if ((sep = *inputFS) == ',') { /* CSV: handle quotes, \x, etc. */- for (i = 0; *r != '\0'; ) {+ } else if ((sep = *inputFS) == ',') { /* CSV processing. no error handling */+ for (;;) {i++;
if (i > nfields)
growfldtab(i);
@@ -375,23 +377,13 @@
xfree(fldtab[i]->sval);
fldtab[i]->sval = fr;
fldtab[i]->tval = FLD | STR | DONTFREE;
-
-/* printf("fldbld 1 [%s] [%d:] [%s]\n", r, i, fr); */-
- if (*r == '"' /* || *r == '\'' */ ) { /* "..."; do not include '...' */- quote = *r++;
- for ( ; *r != '\0'; ) {-/* printf("fldbld 2 [%s]\n", r); */- if (*r == quote && r[1] != '\0' && r[1] == quote) {+ if (*r == '"' ) { /* start of "..." */+ for (r++ ; *r != '\0'; ) {+ if (*r == '"' && r[1] != '\0' && r[1] == '"') {r += 2; /* doubled quote */
- *fr++ = quote;
- } else if (*r == '\\') { /* BUG: off end? */- r++; /* backslashes inside "..." ??? */
- *fr++ = *r++;
- } else if (*r == quote && (r[1] == '\0' || r[1] == ',')) {- r++;
- if (*r == ',')
- r++;
+ *fr++ = '"';
+ } else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {+ r++; /* skip over closing quote */
break;
} else {*fr++ = *r++;
@@ -398,45 +390,34 @@
}
}
*fr++ = 0;
- continue;
- }
-
- /* unquoted field */
- for ( ; *r != '\0'; ) {- if (*r == ',') { /* bare comma ends field */- r++;
- *fr++ = 0;
- break;
- } else if (*r == '\\') { /* BUG: could walk off end */- r++;
+ } else { /* unquoted field */+ while (*r != ',' && *r != '\0')
*fr++ = *r++;
- } else {- *fr++ = *r++;
- }
+ *fr++ = 0;
}
- *fr++ = 0;
+ if (*r++ == 0)
+ break;
+
}
*fr = 0;
- } else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */- for (i = 0; *r != '\0'; r += n) {- char buf[MB_LEN_MAX + 1];
-
+ } else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */+ for (i = 0; *r != '\0'; ) {+ char buf[10];
i++;
if (i > nfields)
growfldtab(i);
if (freeable(fldtab[i]))
xfree(fldtab[i]->sval);
- n = mblen(r, MB_LEN_MAX);
- if (n < 0)
- n = 1;
- memcpy(buf, r, n);
- buf[n] = '\0';
+ n = u8_nextlen(r);
+ for (j = 0; j < n; j++)
+ buf[j] = *r++;
+ buf[j] = '\0';
fldtab[i]->sval = tostring(buf);
fldtab[i]->tval = FLD | STR;
}
*fr = 0;
} else if (*r != 0) { /* if 0, it's a null field */- /* subtlecase : if length(FS) == 1 && length(RS > 0)
+ /* subtle case: if length(FS) == 1 && length(RS) > 0
* \n is NOT a field separator (cf awk book 61,84).
* this variable is tested in the inner while loop.
*/
--- /dev/null
+++ b/testdir/T.csv
@@ -1,0 +1,84 @@
+#!/bin/sh
+
+echo T.csv: tests of csv field splitting
+
+awk=${awk-../a.out}+
+$awk '
+BEGIN {+ FS = "\t"
+ awk = "../a.out"
+}
+NF == 0 || $1 ~ /^#/ {+ next
+}
+$1 ~ /try/ { # new test+ nt++
+ sub(/try /, "")
+ prog = $0
+ printf("%3d %s\n", nt, prog)+ prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)+ # print "prog is", prog
+ nt2 = 0
+ while (getline > 0) {+ if (NF == 0) # blank line terminates a sequence
+ break
+ input = $1
+ for (i = 2; i < NF; i++) # input data
+ input = input "\t" $i
+ test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",+ input, prog)
+ if ($NF == "\"\"")
+ output = ">foo2;"
+ else
+ output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)+ gsub(/\\t/, "\t", output)
+ gsub(/\\n/, "\n", output)
+ run = sprintf("cmp foo1 foo2 || echo test %d.%d failed",+ nt, ++nt2)
+ # print "input is", input
+ # print "test is", test
+ # print "output is", output
+ # print "run is", run
+ system(test output run)
+ }
+ tt += nt2
+}
+END { print tt, "tests" }+' <<\!!!!
+# General format:
+# try program as rest of line
+# $1 $2 $3 output1 (\t for tab, \n for newline,
+# $1 $2 $3 output2 ("" for null)+# ... terminated by blank line
+
+
+try BEGIN {FS=","}; { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }+a [a]
+ a [ a]
+,a [][a]
+ , a [ ][ a]
+a,b [a][b]
+a,b,c [a][b][c]
+"" []
+"abc" [abc]
+"a""b" [a"b]
+"a","b" [a][b]
+a""b [a""b]
+"a,b" [a,b]
+"""" ["]
+"""""" [""]
+"""x""" ["x"]
+,,"" [][][]
+a""b [a""b]
+a"b [a"b]
+a''b [a''b]
+"abc [abc]
+abc,"def [abc][def]
+,, [][][]
+a, [a][]
+"", [][]
+, [][]
+"abc",def [abc][def]
+ []
+!!!!
--- a/testdir/T.utf
+++ b/testdir/T.utf
@@ -143,4 +143,16 @@
για όλους τους καλούς ά α.*α 3 3 15
να έρθει στο πά [^ν] 2 2 1
+# FS="" should split into unicode chars
+try emptyFS BEGIN {FS=""} {print NF}+すべての善人のために 10
+の今がその時だ 7
+Сейчас 6
+现在是时候了 6
+给所有的好男 6
+来参加聚会。 6
+😀 1
+🖕 finger 8
+
+
!!!!
--
⑨