ref: 927dadbc783653f3f1023478ac6e5374f4b24d03
parent: ac3084de9b64d0c838e520ea1ebdb2e1bde87b3f
author: Brian Kernighan <fakeuser@fake.com>
date: Wed Mar 15 13:10:39 EDT 2023
fixed split with CSV (i think)
--- a/README.unicode
+++ /dev/null
@@ -1,55 +1,0 @@
-From bwk@cs.princeton.edu Wed May 25 15:55:09 2022
-X-Envelope-From: bwk@cs.princeton.edu
-X-Envelope-To: <arnold@skeeve.com>
-Return-Path: <bwk@cs.princeton.edu>
-Received: from violeteyes.cs.princeton.edu (violeteyes.cs.princeton.edu [128.112.136.55])
- by freefriends.org (8.14.7/8.14.7) with ESMTP id 24PLt7fa003331
- (version=TLSv1/SSLv3 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NOT)
- for <arnold@skeeve.com>; Wed, 25 May 2022 15:55:09 -0600
-Received: from wash.cs.princeton.edu (wash.cs.princeton.edu [128.112.155.171])
- (authenticated bits=0)
- by violeteyes.cs.princeton.edu (8.14.7/8.14.7) with ESMTP id 24PLt4Hv011884
- (version=TLSv1/SSLv3 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NOT);
- Wed, 25 May 2022 17:55:07 -0400
-Date: Wed, 25 May 2022 17:55:04 -0400 (EDT)
-From: Brian Kernighan <bwk@cs.princeton.edu>
-To: Arnold Robbins <arnold@skeeve.com>
-cc: Brian Kernighan <bwk@cs.princeton.edu>
-Subject: awk and unicode
-Message-ID: <b742347a-cf40-e97c-6e9d-1847aff2cf5@cs.princeton.edu>
-MIME-Version: 1.0
-Content-Type: text/plain; format=flowed; charset=US-ASCII
-
-Hi, Arnold --
-
-Finally, with a bit of spare time after the academic
-treadmill slows, I have gotten back to futzing around with
-Unicode in awk. I now have it mostly working (modulo
-inadequate tests), through a combination of using utf-8
-internally for functions like length(), and conversion to
-utf-32 in regular expressions. The amount of actual change
-isn't too great, so I think this might be ok.
-
-I have not looked at range matches for regular expressions,
-since they require a lot of really fiddly code. I have not fixed
-the fnematch() code since I never noticed it before. It
-looks like the ranges will work as is; fnematch needs to be fixed
-but I think it should be fairly easy.
-
-There is one realloc bug, which suggests that others lurk
-too, but it's confined to very large character classes, so I
-should be able to find it.
-
-I have tested this a fair amount but clearly more tests are
-needed. I'm working on that, but if you have more tests
-hidden away, let me know.
-
-Once I figure out how (and do some more checking), I will try
-to submit a pull request. I wish I understood git better,
-but in spite of your help, I still don't have a proper
-understanding, so this may take a while.
-
-Hope all is well and you're enjoying your visit to the US.
-
-Brian
-
--- a/b.c
+++ b/b.c
@@ -336,13 +336,13 @@
/* in the parsing of regular expressions, metacharacters like . have */
/* to be seen literally; \056 is not a metacharacter. */
-int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
+int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
{ /* only pick up one 8-bit byte (2 chars) */const uschar *p;
int n = 0;
int i;
- for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {+ for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {if (isdigit(*p))
n = 16 * n + *p - '0';
else if (*p >= 'a' && *p <= 'f')
@@ -354,6 +354,8 @@
return n;
}
+
+
#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
int quoted(const uschar **pp) /* pick up next thing after a \\ */
@@ -364,24 +366,26 @@
/* BUG: should advance by utf-8 char even if makes no sense */
- if ((c = *p++) == 't')
+ if ((c = *p++) == 't') {c = '\t';
- else if (c == 'n')
+ } else if (c == 'n') {c = '\n';
- else if (c == 'f')
+ } else if (c == 'f') {c = '\f';
- else if (c == 'r')
+ } else if (c == 'r') {c = '\r';
- else if (c == 'b')
+ } else if (c == 'b') {c = '\b';
- else if (c == 'v')
+ } else if (c == 'v') {c = '\v';
- else if (c == 'a')
+ } else if (c == 'a') {c = '\a';
- else if (c == '\\')
+ } else if (c == '\\') {c = '\\';
- else if (c == 'x') { /* hexadecimal goo follows */- c = hexstr(&p); /* this adds a null if number is invalid */
+ } else if (c == 'x') { /* 2 hex digits follow */+ c = hexstr(&p, 2); /* this adds a null if number is invalid */
+ } else if (c == 'u') { /* unicode char number up to 8 hex digits */+ c = hexstr(&p, 8);
} else if (isoctdigit(c)) { /* \d \dd \ddd */int n = c - '0';
if (isoctdigit(*p)) {--- a/proto.h
+++ b/proto.h
@@ -43,7 +43,6 @@
extern int makeinit(fa *, bool);
extern void penter(Node *);
extern void freetr(Node *);
-extern int hexstr(const uschar **);
extern int quoted(const uschar **);
extern int *cclenter(const char *);
extern noreturn void overflo(const char *);
--- a/run.c
+++ b/run.c
@@ -1671,7 +1671,7 @@
origs = s = strdup(getsval(y));
tempfree(y);
arg3type = ptoi(a[3]);
- if (a[2] == NULL) { /* fs string */+ if (a[2] == NULL) { /* BUG: CSV should override implicit fs but not explicit */fs = getsval(fsloc);
} else if (arg3type == STRING) { /* split(str,arr,"string") */x = execute(a[2]);
@@ -1738,7 +1738,41 @@
setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
spdone:
pfa = NULL;
- } else if (!CSV && sep == ' ') {+
+ } else if (a[2] == NULL && CSV) { /* CSV only if no explicit separator */+ char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
+ for (;;) {+ char *fr = newt;
+ n++;
+ if (*s == '"' ) { /* start of "..." */+ for (s++ ; *s != '\0'; ) {+ if (*s == '"' && s[1] != '\0' && s[1] == '"') {+ s += 2; /* doubled quote */
+ *fr++ = '"';
+ } else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {+ s++; /* skip over closing quote */
+ break;
+ } else {+ *fr++ = *s++;
+ }
+ }
+ *fr++ = 0;
+ } else { /* unquoted field */+ while (*s != ',' && *s != '\0')
+ *fr++ = *s++;
+ *fr++ = 0;
+ }
+ snprintf(num, sizeof(num), "%d", n);
+ if (is_number(newt, &result))
+ setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
+ else
+ setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
+ if (*s++ == '\0')
+ break;
+ }
+ free(newt);
+
+ } else if (!CSV && sep == ' ') { /* usual case: split on white space */ for (n = 0; ; ) {#define ISWS(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
while (ISWS(*s))
@@ -1761,6 +1795,7 @@
if (*s != '\0')
s++;
}
+
} else if (sep == 0) { /* new: split(s, a, "") => 1 char/elem */ for (n = 0; *s != '\0'; s += u8_nextlen(s)) {char buf[10];
@@ -1778,41 +1813,7 @@
setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
}
-
- } else if (CSV) { /* CSV processing. no error handling */- char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
- for (;;) {- char *fr = newt;
- n++;
- if (*s == '"' ) { /* start of "..." */- for (s++ ; *s != '\0'; ) {- if (*s == '"' && s[1] != '\0' && s[1] == '"') {- s += 2; /* doubled quote */
- *fr++ = '"';
- } else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {- s++; /* skip over closing quote */
- break;
- } else {- *fr++ = *s++;
- }
- }
- *fr++ = 0;
- } else { /* unquoted field */- while (*s != ',' && *s != '\0')
- *fr++ = *s++;
- *fr++ = 0;
- }
- snprintf(num, sizeof(num), "%d", n);
- if (is_number(newt, &result))
- setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
- else
- setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
- if (*s++ == '\0')
- break;
- }
- free(newt);
-
- } else if (*s != '\0') {+ } else if (*s != '\0') { /* some random single character */ for (;;) {n++;
t = s;
--- a/testdir/T.csv
+++ b/testdir/T.csv
@@ -7,7 +7,7 @@
$awk '
BEGIN {FS = "\t"
- awk = "../a.out"
+ awk = "../a.out --csv"
}
NF == 0 || $1 ~ /^#/ {next
@@ -53,7 +53,7 @@
# ... terminated by blank line
-try BEGIN {FS=","}; { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }+try { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }a [a]
a [ a]
,a [][a]
--
⑨