ref: ac3084de9b64d0c838e520ea1ebdb2e1bde87b3f
parent: 25adef9bceb9addb1143c1f89dfa4fda0e6ce378
author: Brian Kernighan <fakeuser@fake.com>
date: Wed Mar 15 07:14:16 EDT 2023
added \u processing, based on plan9 runetochar()
--- a/awk.h
+++ b/awk.h
@@ -78,6 +78,8 @@
extern Awkfloat *RSTART;
extern Awkfloat *RLENGTH;
+extern bool CSV; /* true for csv input */
+
extern char *record; /* points to $0 */
extern int lineno; /* line number in awk program */
extern int errorflag; /* 1 if error has occurred */
--- a/lex.c
+++ b/lex.c
@@ -368,6 +368,8 @@
}
}
+extern int runetochar(char *str, int c);
+
int string(void)
{int c, n;
@@ -415,7 +417,7 @@
*bp++ = n;
break;
- case 'x': /* hex \x0-9a-fA-F + */
+ case 'x': /* hex \x0-9a-fA-F (exactly two) */
{int i;
@@ -438,6 +440,27 @@
*bp++ = n;
else
unput(c);
+ break;
+ }
+
+ case 'u': /* utf \u0-9a-fA-F (1..8) */
+ {+ int i;
+
+ n = 0;
+ for (i = 0; i < 8; i++) {+ c = input();
+ if (!isxdigit(c) || c == 0)
+ break;
+ c = tolower(c);
+ n *= 16;
+ if (isdigit(c))
+ n += (c - '0');
+ else
+ n += 10 + (c - 'a');
+ }
+ unput(c);
+ bp += runetochar(bp, n);
break;
}
--- a/lib.c
+++ b/lib.c
@@ -350,7 +350,7 @@
savefs();
if (strlen(inputFS) > 1) { /* it's a regular expression */i = refldbld(r, inputFS);
- } else if ((sep = *inputFS) == ' ') { /* default whitespace */+ } else if (!CSV && (sep = *inputFS) == ' ') { /* default whitespace */ for (i = 0; ; ) {while (*r == ' ' || *r == '\t' || *r == '\n')
r++;
@@ -369,7 +369,7 @@
*fr++ = 0;
}
*fr = 0;
- } else if ((sep = *inputFS) == ',') { /* CSV processing. no error handling */+ } else if (CSV) { /* CSV processing. no error handling */ for (;;) {i++;
if (i > nfields)
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@
THIS SOFTWARE.
****************************************************************/
-const char *version = "version 20221215";
+const char *version = "version 20230314";
#define DEBUG
#include <stdio.h>
@@ -49,6 +49,8 @@
static size_t npfile; /* number of filenames */
static size_t curpfile; /* current filename */
+bool CSV = false; /* true for csv input */
+
bool safe = false; /* true => "safe" mode */
static noreturn void fpecatch(int n
@@ -149,6 +151,12 @@
argc--;
argv++;
break;
+ }
+ if (strcmp(argv[1], "--csv") == 0) { /* turn on csv input processing */+ CSV = true;
+ argc--;
+ argv++;
+ continue;
}
switch (argv[1][1]) {case 's':
--- a/run.c
+++ b/run.c
@@ -724,9 +724,75 @@
return charnum;
}
+/* runetochar() adapted from rune.c in the Plan 9 distribution */
+
+enum /* UTF-8 bit-layout constants used by runetochar() */
+{+ Runeerror = 128, /* from somewhere else; runetochar() substitutes it for values above Runemax */
+ Runemax = 0x10FFFF, /* largest value runetochar() encodes unchanged */
+
+ Bit1 = 7, /* payload bits in a 1-byte sequence */
+ Bitx = 6, /* payload bits in each continuation byte */
+ Bit2 = 5, /* payload bits in the lead byte of a 2-byte sequence */
+ Bit3 = 4, /* payload bits in the lead byte of a 3-byte sequence */
+ Bit4 = 3, /* payload bits in the lead byte of a 4-byte sequence */
+ Bit5 = 2, /* only used here to derive T5 */
+
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000: tag of a continuation byte */
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000: tag of a 2-byte lead byte */
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000: tag of a 3-byte lead byte */
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000: tag of a 4-byte lead byte */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
+
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111: largest 1-byte value */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111: largest 2-byte value */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111: largest 3-byte value */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111: 21 payload bits */
+
+ Maskx = (1<<Bitx)-1, /* 0011 1111: selects the payload of a continuation byte */
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */
+
+};
+
+/*
+ * runetochar: store the UTF-8 encoding of the code point c at str.
+ *
+ * str must have room for up to 4 bytes; no terminating NUL is written.
+ * A value outside 0..Runemax (negative values included) is replaced by
+ * Runeerror before any length test, so the substitute is emitted in its
+ * own shortest form and a negative value is never stored as a raw byte.
+ * Returns the number of bytes stored (1 to 4).
+ */
+int runetochar(char *str, int c)
+{
+	/* range check first: every later test then sees a value in 0..Runemax */
+	if (c < 0 || c > Runemax)
+		c = Runeerror;
+
+	/* one character sequence 00000-0007F => 00-7F */
+	if (c <= Rune1) {
+		str[0] = c;
+		return 1;
+	}
+
+	/* two character sequence 00080-007FF => T2 Tx */
+	if (c <= Rune2) {
+		str[0] = T2 | (c >> 1*Bitx);
+		str[1] = Tx | (c & Maskx);
+		return 2;
+	}
+
+	/* three character sequence 00800-0FFFF => T3 Tx Tx */
+	if (c <= Rune3) {
+		str[0] = T3 | (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx | (c & Maskx);
+		return 3;
+	}
+
+	/* four character sequence 10000-10FFFF => T4 Tx Tx Tx */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
+}
+
+
/* ========== end of utf8 code =========== */
+
Cell *matchop(Node **a, int n) /* ~ and match() */
{Cell *x, *y;
@@ -1605,16 +1671,17 @@
origs = s = strdup(getsval(y));
tempfree(y);
arg3type = ptoi(a[3]);
- if (a[2] == NULL) /* fs string */
+ if (a[2] == NULL) { /* fs string */fs = getsval(fsloc);
- else if (arg3type == STRING) { /* split(str,arr,"string") */+ } else if (arg3type == STRING) { /* split(str,arr,"string") */x = execute(a[2]);
fs = origfs = strdup(getsval(x));
tempfree(x);
- } else if (arg3type == REGEXPR)
+ } else if (arg3type == REGEXPR) {fs = "(regexpr)"; /* split(str,arr,/regexpr/) */
- else
+ } else { FATAL("illegal type of split");+ }
sep = *fs;
ap = execute(a[1]); /* array name */
/* BUG 7/26/22: this appears not to reset array: see C1/asplit */
@@ -1671,7 +1738,7 @@
setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
spdone:
pfa = NULL;
- } else if (sep == ' ') {+ } else if (!CSV && sep == ' ') { for (n = 0; ; ) {#define ISWS(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
while (ISWS(*s))
@@ -1712,7 +1779,7 @@
}
- } else if (sep == ',') { /* CSV processing. no error handling */+ } else if (CSV) { /* CSV processing. no error handling */char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
for (;;) {char *fr = newt;
--
⑨