shithub: trueawk

--- a/README.unicode

+++ /dev/null

@@ -1,55 +1,0 @@

-From bwk@cs.princeton.edu  Wed May 25 15:55:09 2022

-X-Envelope-From: bwk@cs.princeton.edu

-X-Envelope-To: <arnold@skeeve.com>

-Return-Path: <bwk@cs.princeton.edu>

-Received: from violeteyes.cs.princeton.edu (violeteyes.cs.princeton.edu [128.112.136.55])

-	by freefriends.org (8.14.7/8.14.7) with ESMTP id 24PLt7fa003331

-	(version=TLSv1/SSLv3 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NOT)

-	for <arnold@skeeve.com>; Wed, 25 May 2022 15:55:09 -0600

-Received: from wash.cs.princeton.edu (wash.cs.princeton.edu [128.112.155.171])

-	(authenticated bits=0)

-	by violeteyes.cs.princeton.edu (8.14.7/8.14.7) with ESMTP id 24PLt4Hv011884

-	(version=TLSv1/SSLv3 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NOT);

-	Wed, 25 May 2022 17:55:07 -0400

-Date: Wed, 25 May 2022 17:55:04 -0400 (EDT)

-From: Brian Kernighan <bwk@cs.princeton.edu>

-To: Arnold Robbins <arnold@skeeve.com>

-cc: Brian Kernighan <bwk@cs.princeton.edu>

-Subject: awk and unicode

-Message-ID: <b742347a-cf40-e97c-6e9d-1847aff2cf5@cs.princeton.edu>

-MIME-Version: 1.0

-Content-Type: text/plain; format=flowed; charset=US-ASCII

-Hi, Arnold --

-Finally, with a bit of spare time after the academic

-treadmill slows, I have gotten back to futzing around with

-Unicode in awk.  I now have it mostly working (modulo

-inadequate tests), through a combination of using utf-8

-internally for functions like length(), and conversion to

-utf-32 in regular expressions.  The amount of actual change

-isn't too great, so I think this might be ok.

-I have not looked at range matches for regular expressions,

-since require a lot of really fiddly code.  I have not fixed

-the fnematch() code since I never noticed it before.  It

-looks like the ranges will work as is; fnematch needs fixed

-but I think it should be fairly easy.

-There is one realloc bug, which suggests that others lurk

-too, but it's confined to very large character classes, so I

-should be able to find it.

-I have tested this a fair amount but clearly more tests are

-needed.  I'm working on that, but if you have more tests

-hidden away, let me know.

-Once I figure out how (and do some more checking, I will try

-to submit a pull request.  I wish I understood git better,

-but in spite of your help, I still don't have a proper

-understanding, so this may take a while.

-Hope all is well and you're enjoying your visit to the US.

-Brian

--- a/b.c

+++ b/b.c

@@ -336,13 +336,13 @@

 /* in the parsing of regular expressions, metacharacters like . have */

 /* to be seen literally;  \056 is not a metacharacter. */

-int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */

+int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */

 {			/* only pick up one 8-bit byte (2 chars) */

 	const uschar *p;

 	int n = 0;

 	int i;

-	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {

+	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {

 		if (isdigit(*p))

 			n = 16 * n + *p - '0';

 		else if (*p >= 'a' && *p <= 'f')

@@ -354,6 +354,8 @@

 	return n;

 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */

 int quoted(const uschar **pp)	/* pick up next thing after a \\ */

@@ -364,24 +366,26 @@

 /* BUG: should advance by utf-8 char even if makes no sense */

-	if ((c = *p++) == 't')

+	if ((c = *p++) == 't') {

 		c = '\t';

-	else if (c == 'n')

+	} else if (c == 'n') {

 		c = '\n';

-	else if (c == 'f')

+	} else if (c == 'f') {

 		c = '\f';

-	else if (c == 'r')

+	} else if (c == 'r') {

 		c = '\r';

-	else if (c == 'b')

+	} else if (c == 'b') {

 		c = '\b';

-	else if (c == 'v')

+	} else if (c == 'v') {

 		c = '\v';

-	else if (c == 'a')

+	} else if (c == 'a') {

 		c = '\a';

-	else if (c == '\\')

+	} else if (c == '\\') {

 		c = '\\';

-	else if (c == 'x') {	/* hexadecimal goo follows */

-		c = hexstr(&p);	/* this adds a null if number is invalid */

+	} else if (c == 'x') {	/* 2 hex digits follow */

+		c = hexstr(&p, 2);	/* this adds a null if number is invalid */

+	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */

+		c = hexstr(&p, 8);

 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */

 		int n = c - '0';

 		if (isoctdigit(*p)) {

--- a/proto.h

+++ b/proto.h

@@ -43,7 +43,6 @@

 extern	int	makeinit(fa *, bool);

 extern	void	penter(Node *);

 extern	void	freetr(Node *);

-extern	int	hexstr(const uschar **);

 extern	int	quoted(const uschar **);

 extern	int	*cclenter(const char *);

 extern	noreturn void	overflo(const char *);

--- a/run.c

+++ b/run.c

@@ -1671,7 +1671,7 @@

 	origs = s = strdup(getsval(y));

 	tempfree(y);

 	arg3type = ptoi(a[3]);

-	if (a[2] == NULL) {		/* fs string */

+	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */

 		fs = getsval(fsloc);

 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */

 		x = execute(a[2]);

@@ -1738,7 +1738,41 @@

 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);

   spdone:

 		pfa = NULL;

-	} else if (!CSV && sep == ' ') {

+	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */

+		char *newt = (char *) malloc(strlen(s) + 1); /* field buffer, reused for each field; +1 because an unquoted field can be all of s plus its NUL */

+		for (;;) {	/* one pass per field */

+			char *fr = newt;	/* write cursor into newt */

+			n++;	/* index of the field being built */

+			if (*s == '"' ) { /* start of "..." */

+				for (s++ ; *s != '\0'; ) {

+					if (*s == '"' && s[1] != '\0' && s[1] == '"') {

+						s += 2; /* doubled quote */

+						*fr++ = '"';

+					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {

+						s++; /* skip over closing quote */

+						break;

+					} else {

+						*fr++ = *s++;	/* ordinary character, or a quote not followed by quote, comma or end */

+					}

+				}

+				*fr++ = 0;	/* terminate the field */

+			} else {	/* unquoted field */

+				while (*s != ',' && *s != '\0')

+					*fr++ = *s++;

+				*fr++ = 0;	/* terminate the field */

+			}

+			snprintf(num, sizeof(num), "%d", n);	/* array subscript for this field */

+			if (is_number(newt, &result))

+				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);

+			else

+				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);

+			if (*s++ == '\0')	/* step past the comma; stop at end of string */

+				break;

+		}

+		free(newt);

+	} else if (!CSV && sep == ' ') { /* usual case: split on white space */

 		for (n = 0; ; ) {

 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')

 			while (ISWS(*s))

@@ -1761,6 +1795,7 @@

 			if (*s != '\0')

 				s++;

 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */

 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {

 			char buf[10];

@@ -1778,41 +1813,7 @@

 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);

-	} else if (CSV) {	/* CSV processing.  no error handling */

-		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */

-		for (;;) {

-			char *fr = newt;

-			n++;

-			if (*s == '"' ) { /* start of "..." */

-				for (s++ ; *s != '\0'; ) {

-					if (*s == '"' && s[1] != '\0' && s[1] == '"') {

-						s += 2; /* doubled quote */

-						*fr++ = '"';

-					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {

-						s++; /* skip over closing quote */

-						break;

-					} else {

-						*fr++ = *s++;

-					}

-				}

-				*fr++ = 0;

-			} else {	/* unquoted field */

-				while (*s != ',' && *s != '\0')

-					*fr++ = *s++;

-				*fr++ = 0;

-			}

-			snprintf(num, sizeof(num), "%d", n);

-			if (is_number(newt, &result))

-				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);

-			else

-				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);

-			if (*s++ == '\0')

-				break;

-		}

-		free(newt);

-	} else if (*s != '\0') {

+	} else if (*s != '\0') {  /* some random single character */

 		for (;;) {

 			n++;

 			t = s;

--- a/testdir/T.csv

+++ b/testdir/T.csv

@@ -7,7 +7,7 @@

 $awk '

 BEGIN {

 	FS = "\t"

-	awk = "../a.out"

+	awk = "../a.out --csv"

 NF == 0 || $1 ~ /^#/ {

 	next

@@ -53,7 +53,7 @@

 # ... terminated by blank line

-try BEGIN {FS=","}; { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }

+try  { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }

 a	[a]

   a	[  a]

 ,a	[][a]

--

⑨