shithub: trueawk

--- a/lib.c

+++ b/lib.c

@@ -34,6 +34,8 @@

 #include <math.h>

 #include "awk.h"

+extern int u8_nextlen(const char *s);

 char	EMPTY[] = { '\0' };

 FILE	*infile	= NULL;

 bool	innew;		/* true = infile has not been read by readrec */

@@ -327,7 +329,7 @@

 	/* possibly with a final trailing \0 not associated with any field */

 	char *r, *fr, sep;

 	Cell *p;

-	int i, j, n, quote;

+	int i, j, n;

 	if (donefld)

 		return;

@@ -366,8 +368,8 @@

 			*fr++ = 0;

 		*fr = 0;

-	} else if ((sep = *inputFS) == ',') {	/* CSV: handle quotes, \x, etc. */

-		for (i = 0; *r != '\0'; ) {

+	} else if ((sep = *inputFS) == ',') {	/* CSV processing.  no error handling */

+		for (;;) {

 			i++;

 			if (i > nfields)

 				growfldtab(i);

@@ -375,23 +377,13 @@

 				xfree(fldtab[i]->sval);

 			fldtab[i]->sval = fr;

 			fldtab[i]->tval = FLD | STR | DONTFREE;

-/* printf("fldbld 1 [%s] [%d:] [%s]\n", r, i, fr); */

-			if (*r == '"' /* || *r == '\'' */ ) { /* "..."; do not include '...' */

-				quote = *r++;

-				for ( ; *r != '\0'; ) {

-/* printf("fldbld 2   [%s]\n", r); */

-					if (*r == quote && r[1] != '\0' && r[1] == quote) {

+			if (*r == '"' ) { /* start of "..." */

+				for (r++ ; *r != '\0'; ) {

+					if (*r == '"' && r[1] != '\0' && r[1] == '"') {

 						r += 2; /* doubled quote */

-						*fr++ = quote;

-					} else if (*r == '\\') { /* BUG: off end? */

-						r++; /* backslashes inside "..." ??? */

-						*fr++ = *r++;

-					} else if (*r == quote && (r[1] == '\0' || r[1] == ',')) {

-						r++;

-						if (*r == ',')

-							r++;

+						*fr++ = '"';

+					} else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {

+						r++; /* skip over closing quote */

 						break;

 					} else {

 						*fr++ = *r++;

@@ -398,45 +390,34 @@

 				*fr++ = 0;

-				continue;

-			}

-			/* unquoted field */

-			for ( ; *r != '\0'; ) {

-				if (*r == ',') { /* bare comma ends field */

-					r++;

-					*fr++ = 0;

-					break;

-				} else if (*r == '\\') { /* BUG: could walk off end */

-					r++;

+			} else {	/* unquoted field */

+				while (*r != ',' && *r != '\0')

 					*fr++ = *r++;

-				} else {

-					*fr++ = *r++;

-				}

+				*fr++ = 0;

-			*fr++ = 0;

+			if (*r++ == 0)

+				break;

 		*fr = 0;

-	} else if ((sep = *inputFS) == 0) {		/* new: FS="" => 1 char/field */

-		for (i = 0; *r != '\0'; r += n) {

-			char buf[MB_LEN_MAX + 1];

+	} else if ((sep = *inputFS) == 0) {	/* new: FS="" => 1 char/field */

+		for (i = 0; *r != '\0'; ) {

+			char buf[10];

 			i++;

 			if (i > nfields)

 				growfldtab(i);

 			if (freeable(fldtab[i]))

 				xfree(fldtab[i]->sval);

-			n = mblen(r, MB_LEN_MAX);

-			if (n < 0)

-				n = 1;

-			memcpy(buf, r, n);

-			buf[n] = '\0';

+			n = u8_nextlen(r);

+			for (j = 0; j < n; j++)

+				buf[j] = *r++;

+			buf[j] = '\0';

 			fldtab[i]->sval = tostring(buf);

 			fldtab[i]->tval = FLD | STR;

 		*fr = 0;

 	} else if (*r != 0) {	/* if 0, it's a null field */

-		/* subtlecase : if length(FS) == 1 && length(RS > 0)

+		/* subtle case: if length(FS) == 1 && length(RS) > 0

 		 * \n is NOT a field separator (cf awk book 61,84).

 		 * this variable is tested in the inner while loop.

*/

--- /dev/null

+++ b/testdir/T.csv

@@ -1,0 +1,84 @@

+#!/bin/sh

+echo T.csv: tests of csv field splitting

+awk=${awk-../a.out}

+$awk '

+BEGIN {

+	FS = "\t"

+	awk = "../a.out"

+}

+NF == 0 || $1 ~ /^#/ {

+	next

+}

+$1 ~ /try/ {	# new test

+	nt++

+	sub(/try /, "")

+	prog = $0

+	printf("%3d  %s\n", nt, prog)

+	prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)

+	# print "prog is", prog

+	nt2 = 0

+	while (getline > 0) {

+		if (NF == 0)	# blank line terminates a sequence

+			break

+		input = $1

+		for (i = 2; i < NF; i++)	# input data

+			input = input "\t" $i

+		test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",

+			input, prog)

+		if ($NF == "\"\"")

+			output = ">foo2;"

+		else

+			output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)

+		gsub(/\\t/, "\t", output)

+		gsub(/\\n/, "\n", output)

+		run = sprintf("cmp foo1 foo2 || echo test %d.%d failed",

+			nt, ++nt2)

+		# print  "input is", input

+		# print  "test is", test

+		# print  "output is", output

+		# print  "run is", run

+		system(test output run)

+	}

+	tt += nt2

+}

+END { print tt, "tests" }

+' <<\!!!!

+# General format:

+# try program as rest of line

+# $1	$2	$3	output1  (\t for tab, \n for newline,

+# $1	$2	$3	output2  ("" for null)

+# ... terminated by blank line

+try BEGIN {FS=","}; { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }

+a	[a]

+  a	[  a]

+,a	[][a]

+ , a	[ ][ a]

+a,b	[a][b]

+a,b,c	[a][b][c]

+""	[]

+"abc"	[abc]

+"a""b"	[a"b]

+"a","b"	[a][b]

+a""b	[a""b]

+"a,b"	[a,b]

+""""	["]

+""""""	[""]

+"""x"""	["x"]

+,,""	[][][]

+a""b	[a""b]

+a"b	[a"b]

+a''b	[a''b]

+"abc	[abc]

+abc,"def	[abc][def]

+,,	[][][]

+a,	[a][]

+"",	[][]

+,	[][]

+"abc",def	[abc][def]

+	[]

+!!!!

--- a/testdir/T.utf

+++ b/testdir/T.utf

@@ -143,4 +143,16 @@

 για όλους τους καλούς ά	α.*α	3 3 15

 να έρθει στο πά	[^ν]	2 2 1

+# FS="" should split into unicode chars

+try emptyFS BEGIN {FS=""} {print NF}

+すべての善人のために	10

+の今がその時だ	7

+Сейчас	6

+现在是时候了	6

+给所有的好男	6

+来参加聚会。	6

+😀	1

+🖕 finger	8

 !!!!

--

⑨