shithub: trueawk

Download patch

ref: 927dadbc783653f3f1023478ac6e5374f4b24d03
parent: ac3084de9b64d0c838e520ea1ebdb2e1bde87b3f
author: Brian Kernighan <fakeuser@fake.com>
date: Wed Mar 15 13:10:39 EDT 2023

fixed split with CSV (I think)

--- a/README.unicode
+++ /dev/null
@@ -1,55 +1,0 @@
-From bwk@cs.princeton.edu  Wed May 25 15:55:09 2022
-X-Envelope-From: bwk@cs.princeton.edu
-X-Envelope-To: <arnold@skeeve.com>
-Return-Path: <bwk@cs.princeton.edu>
-Received: from violeteyes.cs.princeton.edu (violeteyes.cs.princeton.edu [128.112.136.55])
-	by freefriends.org (8.14.7/8.14.7) with ESMTP id 24PLt7fa003331
-	(version=TLSv1/SSLv3 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NOT)
-	for <arnold@skeeve.com>; Wed, 25 May 2022 15:55:09 -0600
-Received: from wash.cs.princeton.edu (wash.cs.princeton.edu [128.112.155.171])
-	(authenticated bits=0)
-	by violeteyes.cs.princeton.edu (8.14.7/8.14.7) with ESMTP id 24PLt4Hv011884
-	(version=TLSv1/SSLv3 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NOT);
-	Wed, 25 May 2022 17:55:07 -0400
-Date: Wed, 25 May 2022 17:55:04 -0400 (EDT)
-From: Brian Kernighan <bwk@cs.princeton.edu>
-To: Arnold Robbins <arnold@skeeve.com>
-cc: Brian Kernighan <bwk@cs.princeton.edu>
-Subject: awk and unicode
-Message-ID: <b742347a-cf40-e97c-6e9d-1847aff2cf5@cs.princeton.edu>
-MIME-Version: 1.0
-Content-Type: text/plain; format=flowed; charset=US-ASCII
-
-Hi, Arnold --
-
-Finally, with a bit of spare time after the academic
-treadmill slows, I have gotten back to futzing around with
-Unicode in awk.  I now have it mostly working (modulo
-inadequate tests), through a combination of using utf-8
-internally for functions like length(), and conversion to
-utf-32 in regular expressions.  The amount of actual change
-isn't too great, so I think this might be ok.
-
-I have not looked at range matches for regular expressions,
-since require a lot of really fiddly code.  I have not fixed
-the fnematch() code since I never noticed it before.  It
-looks like the ranges will work as is; fnematch needs fixed
-but I think it should be fairly easy.
-
-There is one realloc bug, which suggests that others lurk
-too, but it's confined to very large character classes, so I
-should be able to find it.
-
-I have tested this a fair amount but clearly more tests are
-needed.  I'm working on that, but if you have more tests
-hidden away, let me know.
-
-Once I figure out how (and do some more checking, I will try
-to submit a pull request.  I wish I understood git better,
-but in spite of your help, I still don't have a proper
-understanding, so this may take a while.
-
-Hope all is well and you're enjoying your visit to the US.
-
-Brian
-
--- a/b.c
+++ b/b.c
@@ -336,13 +336,13 @@
 /* in the parsing of regular expressions, metacharacters like . have */
 /* to be seen literally;  \056 is not a metacharacter. */
 
-int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
+int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
 {			/* only pick up one 8-bit byte (2 chars) */
 	const uschar *p;
 	int n = 0;
 	int i;
 
-	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
+	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
 		if (isdigit(*p))
 			n = 16 * n + *p - '0';
 		else if (*p >= 'a' && *p <= 'f')
@@ -354,6 +354,8 @@
 	return n;
 }
 
+
+
 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
 
 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
@@ -364,24 +366,26 @@
 
 /* BUG: should advance by utf-8 char even if makes no sense */
 
-	if ((c = *p++) == 't')
+	if ((c = *p++) == 't') {
 		c = '\t';
-	else if (c == 'n')
+	} else if (c == 'n') {
 		c = '\n';
-	else if (c == 'f')
+	} else if (c == 'f') {
 		c = '\f';
-	else if (c == 'r')
+	} else if (c == 'r') {
 		c = '\r';
-	else if (c == 'b')
+	} else if (c == 'b') {
 		c = '\b';
-	else if (c == 'v')
+	} else if (c == 'v') {
 		c = '\v';
-	else if (c == 'a')
+	} else if (c == 'a') {
 		c = '\a';
-	else if (c == '\\')
+	} else if (c == '\\') {
 		c = '\\';
-	else if (c == 'x') {	/* hexadecimal goo follows */
-		c = hexstr(&p);	/* this adds a null if number is invalid */
+	} else if (c == 'x') {	/* 2 hex digits follow */
+		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
+	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
+		c = hexstr(&p, 8);
 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
 		int n = c - '0';
 		if (isoctdigit(*p)) {
--- a/proto.h
+++ b/proto.h
@@ -43,7 +43,6 @@
 extern	int	makeinit(fa *, bool);
 extern	void	penter(Node *);
 extern	void	freetr(Node *);
-extern	int	hexstr(const uschar **);
 extern	int	quoted(const uschar **);
 extern	int	*cclenter(const char *);
 extern	noreturn void	overflo(const char *);
--- a/run.c
+++ b/run.c
@@ -1671,7 +1671,7 @@
 	origs = s = strdup(getsval(y));
 	tempfree(y);
 	arg3type = ptoi(a[3]);
-	if (a[2] == NULL) {		/* fs string */
+	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
 		fs = getsval(fsloc);
 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
 		x = execute(a[2]);
@@ -1738,7 +1738,41 @@
 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
   spdone:
 		pfa = NULL;
-	} else if (!CSV && sep == ' ') {
+
+	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
+		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
+		for (;;) {
+			char *fr = newt;
+			n++;
+			if (*s == '"' ) { /* start of "..." */
+				for (s++ ; *s != '\0'; ) {
+					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
+						s += 2; /* doubled quote */
+						*fr++ = '"';
+					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
+						s++; /* skip over closing quote */
+						break;
+					} else {
+						*fr++ = *s++;
+					}
+				}
+				*fr++ = 0;
+			} else {	/* unquoted field */
+				while (*s != ',' && *s != '\0')
+					*fr++ = *s++;
+				*fr++ = 0;
+			}
+			snprintf(num, sizeof(num), "%d", n);
+			if (is_number(newt, &result))
+				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
+			else
+				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
+			if (*s++ == '\0')
+				break;
+		}
+		free(newt);
+
+	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
 		for (n = 0; ; ) {
 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
 			while (ISWS(*s))
@@ -1761,6 +1795,7 @@
 			if (*s != '\0')
 				s++;
 		}
+
 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
 			char buf[10];
@@ -1778,41 +1813,7 @@
 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
 		}
 
-
-	} else if (CSV) {	/* CSV processing.  no error handling */
-		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
-		for (;;) {
-			char *fr = newt;
-			n++;
-			if (*s == '"' ) { /* start of "..." */
-				for (s++ ; *s != '\0'; ) {
-					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
-						s += 2; /* doubled quote */
-						*fr++ = '"';
-					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
-						s++; /* skip over closing quote */
-						break;
-					} else {
-						*fr++ = *s++;
-					}
-				}
-				*fr++ = 0;
-			} else {	/* unquoted field */
-				while (*s != ',' && *s != '\0')
-					*fr++ = *s++;
-				*fr++ = 0;
-			}
-			snprintf(num, sizeof(num), "%d", n);
-			if (is_number(newt, &result))
-				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
-			else
-				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
-			if (*s++ == '\0')
-				break;
-		}
-		free(newt);
-
-	} else if (*s != '\0') {
+	} else if (*s != '\0') {  /* some random single character */
 		for (;;) {
 			n++;
 			t = s;
--- a/testdir/T.csv
+++ b/testdir/T.csv
@@ -7,7 +7,7 @@
 $awk '
 BEGIN {
 	FS = "\t"
-	awk = "../a.out"
+	awk = "../a.out --csv"
 }
 NF == 0 || $1 ~ /^#/ {
 	next
@@ -53,7 +53,7 @@
 # ... terminated by blank line
 
 
-try BEGIN {FS=","}; { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }
+try  { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }
 a	[a]
   a	[  a]
 ,a	[][a]
--