shithub: trueawk

Download patch

ref: ac3084de9b64d0c838e520ea1ebdb2e1bde87b3f
parent: 25adef9bceb9addb1143c1f89dfa4fda0e6ce378
author: Brian Kernighan <fakeuser@fake.com>
date: Wed Mar 15 07:14:16 EDT 2023

added \u processing, based on plan9 runetochar()

--- a/awk.h
+++ b/awk.h
@@ -78,6 +78,8 @@
 extern Awkfloat *RSTART;
 extern Awkfloat *RLENGTH;
 
+extern bool	CSV;		/* true for csv input */
+
 extern char	*record;	/* points to $0 */
 extern int	lineno;		/* line number in awk program */
 extern int	errorflag;	/* 1 if error has occurred */
--- a/lex.c
+++ b/lex.c
@@ -368,6 +368,8 @@
 	}
 }
 
+extern int runetochar(char *str, int c);
+
 int string(void)
 {
 	int c, n;
@@ -415,7 +417,7 @@
 				*bp++ = n;
 				break;
 
-			case 'x':	/* hex  \x0-9a-fA-F + */
+			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
 			    {
 				int i;
 
@@ -438,6 +440,27 @@
 					*bp++ = n;
 				else
 					unput(c);
+				break;
+			    }
+
+			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
+			    {
+				int i;
+
+				n = 0;
+				for (i = 0; i < 8; i++) {
+					c = input();
+					if (!isxdigit(c) || c == 0)
+						break;
+					c = tolower(c);
+					n *= 16;
+					if (isdigit(c))
+						n += (c - '0');
+					else
+						n += 10 + (c - 'a');
+				}
+				unput(c);
+				bp += runetochar(bp, n);
 				break;
 			    }
 
--- a/lib.c
+++ b/lib.c
@@ -350,7 +350,7 @@
 		savefs();
 	if (strlen(inputFS) > 1) {	/* it's a regular expression */
 		i = refldbld(r, inputFS);
-	} else if ((sep = *inputFS) == ' ') {	/* default whitespace */
+	} else if (!CSV && (sep = *inputFS) == ' ') {	/* default whitespace */
 		for (i = 0; ; ) {
 			while (*r == ' ' || *r == '\t' || *r == '\n')
 				r++;
@@ -369,7 +369,7 @@
 			*fr++ = 0;
 		}
 		*fr = 0;
-	} else if ((sep = *inputFS) == ',') {	/* CSV processing.  no error handling */
+	} else if (CSV) {	/* CSV processing.  no error handling */
 		for (;;) {
 			i++;
 			if (i > nfields)
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20221215";
+const char	*version = "version 20230314";
 
 #define DEBUG
 #include <stdio.h>
@@ -49,6 +49,8 @@
 static size_t	npfile;		/* number of filenames */
 static size_t	curpfile;	/* current filename */
 
+bool	CSV = false;	/* true for csv input */
+
 bool	safe = false;	/* true => "safe" mode */
 
 static noreturn void fpecatch(int n
@@ -149,6 +151,12 @@
 			argc--;
 			argv++;
 			break;
+		}
+		if (strcmp(argv[1], "--csv") == 0) {	/* turn on csv input processing */
+			CSV = true;
+			argc--;
+			argv++;
+			continue;
 		}
 		switch (argv[1][1]) {
 		case 's':
--- a/run.c
+++ b/run.c
@@ -724,9 +724,75 @@
 	return charnum;
 }
 
+/* runetochar() adapted from rune.c in the Plan 9 distribution */
+
+enum
+{
+	Runeerror = 0xFFFD,	/* U+FFFD REPLACEMENT CHARACTER, as in Plan 9 */
+	Runemax = 0x10FFFF,	/* largest code point runetochar() will encode */
+
+	Bit1    = 7,
+	Bitx    = 6,
+	Bit2    = 5,
+	Bit3    = 4,
+	Bit4    = 3,
+	Bit5    = 2,
+
+	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
+	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
+	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
+	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
+	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
+	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */
+
+	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
+	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
+	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
+	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0001 1111 1111 1111 1111 1111 */
+
+	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
+	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */
+};
+
+/*
+ * Store the UTF-8 encoding of c in str (room for 4 bytes needed, no NUL added)
+ * and return the byte count, 1..4; c outside 0..Runemax becomes Runeerror.
+ */
+int runetochar(char *str, int c)
+{
+	if (c < 0 || c > Runemax)	/* check first so Runeerror gets its shortest form */
+		c = Runeerror;
+	/* one character sequence 00000-0007F => 00-7F */
+	if (c <= Rune1) {
+		str[0] = c;
+		return 1;
+	}
+	/* two character sequence 00080-007FF => T2 Tx */
+	if (c <= Rune2) {
+		str[0] = T2 | (c >> 1*Bitx);
+		str[1] = Tx | (c & Maskx);
+		return 2;
+	}
+	/* three character sequence 00800-0FFFF => T3 Tx Tx */
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+	/* four character sequence 010000-10FFFF => T4 Tx Tx Tx */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
+}
+
+
 /* ========== end of utf8 code =========== */
 
 
+
 Cell *matchop(Node **a, int n)	/* ~ and match() */
 {
 	Cell *x, *y;
@@ -1605,16 +1671,17 @@
 	origs = s = strdup(getsval(y));
 	tempfree(y);
 	arg3type = ptoi(a[3]);
-	if (a[2] == NULL)		/* fs string */
+	if (a[2] == NULL) {		/* fs string */
 		fs = getsval(fsloc);
-	else if (arg3type == STRING) {	/* split(str,arr,"string") */
+	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
 		x = execute(a[2]);
 		fs = origfs = strdup(getsval(x));
 		tempfree(x);
-	} else if (arg3type == REGEXPR)
+	} else if (arg3type == REGEXPR) {
 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
-	else
+	} else {
 		FATAL("illegal type of split");
+	}
 	sep = *fs;
 	ap = execute(a[1]);	/* array name */
 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
@@ -1671,7 +1738,7 @@
 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
   spdone:
 		pfa = NULL;
-	} else if (sep == ' ') {
+	} else if (!CSV && sep == ' ') {
 		for (n = 0; ; ) {
 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
 			while (ISWS(*s))
@@ -1712,7 +1779,7 @@
 		}
 
 
-	} else if (sep == ',') {	/* CSV processing.  no error handling */
+	} else if (CSV) {	/* CSV processing.  no error handling */
 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
 		for (;;) {
 			char *fr = newt;
--