shithub: trueawk

Download patch

ref: cc9e9b68d124c6c7770eb3903329423ff9f17201
parent: e508d2861c26e89e7289a5a33d99df2951498ff4
author: Arnold Robbins <arnold@skeeve.com>
date: Tue Dec 8 03:05:22 EST 2020

Rework floating point conversions. (#98)


--- a/awk.1
+++ b/awk.1
@@ -577,3 +577,56 @@
 the syntax is worse.
 .PP
 Only eight-bit characters sets are handled correctly.
+.SH UNUSUAL FLOATING-POINT VALUES
+.I Awk
+was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)
+and Infinity values, which are supported by all modern floating-point
+hardware.
+.PP
+Because
+.I awk
+uses
+.IR strtod (3)
+and
+.IR atof (3)
+to convert string values to double-precision floating-point values,
+modern C libraries also convert strings starting with
+.B inf
+and
+.B nan
+into infinity and NaN values respectively.  This led to strange results,
+with something like this:
+.PP
+.EX
+.nf
+echo nancy | awk '{ print $1 + 0 }'
+.fi
+.EE
+.PP
+printing
+.B nan
+instead of zero.
+.PP
+.I Awk
+now follows GNU AWK, and prefilters string values before attempting
+to convert them to numbers, as follows:
+.TP
+.I "Hexadecimal values"
+Hexadecimal values (allowed since C99) convert to zero, as they did
+prior to C99.
+.TP
+.I "NaN values"
+The two strings
+.B +nan
+and
+.B \-nan
+(case independent) convert to NaN. No others do.
+(NaNs can have signs.)
+.TP
+.I "Infinity values"
+The two strings
+.B +inf
+and
+.B \-inf
+(case independent) convert to positive and negative infinity, respectively.
+No others do.
--- a/lex.c
+++ b/lex.c
@@ -191,7 +191,12 @@
 			return word(buf);
 		if (isdigit(c)) {
 			char *cp = tostring(buf);
-			yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
+			double result;
+
+			if (is_number(cp, & result))
+				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
+			else
+				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
 			free(cp);
 			/* should this also have STR set? */
 			RET(NUMBER);
--- a/lib.c
+++ b/lib.c
@@ -30,6 +30,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <limits.h>
+#include <math.h>
 #include "awk.h"
 
 char	EMPTY[] = { '\0' };
@@ -181,12 +182,14 @@
 			innew = false;
 		if (c != 0 || buf[0] != '\0') {	/* normal record */
 			if (isrecord) {
+				double result;
+
 				if (freeable(fldtab[0]))
 					xfree(fldtab[0]->sval);
 				fldtab[0]->sval = buf;	/* buf == record */
 				fldtab[0]->tval = REC | STR | DONTFREE;
-				if (is_number(fldtab[0]->sval)) {
-					fldtab[0]->fval = atof(fldtab[0]->sval);
+				if (is_number(fldtab[0]->sval, & result)) {
+					fldtab[0]->fval = result;
 					fldtab[0]->tval |= NUM;
 				}
 			}
@@ -293,6 +296,7 @@
 {
 	char *p;
 	Cell *q;
+	double result;
 
 	for (p=s; *p != '='; p++)
 		;
@@ -300,8 +304,8 @@
 	p = qstring(p, '\0');
 	q = setsymtab(s, p, 0.0, STR, symtab);
 	setsval(q, p);
-	if (is_number(q->sval)) {
-		q->fval = atof(q->sval);
+	if (is_number(q->sval, & result)) {
+		q->fval = result;
 		q->tval |= NUM;
 	}
 	DPRINTF("command line set %s to |%s|\n", s, p);
@@ -402,9 +406,11 @@
 	lastfld = i;
 	donefld = true;
 	for (j = 1; j <= lastfld; j++) {
+		double result;
+
 		p = fldtab[j];
-		if(is_number(p->sval)) {
-			p->fval = atof(p->sval);
+		if(is_number(p->sval, & result)) {
+			p->fval = result;
 			p->tval |= NUM;
 		}
 	}
@@ -756,24 +762,67 @@
 /* strtod is supposed to be a proper test of what's a valid number */
 /* appears to be broken in gcc on linux: thinks 0x123 is a valid FP number */
 /* wrong: violates 4.10.1.4 of ansi C standard */
+
 /* well, not quite. As of C99, hex floating point is allowed. so this is
- * a bit of a mess.
+ * a bit of a mess. We work around the mess by checking for a hexadecimal
+ * value and disallowing it. Similarly, we now follow gawk and allow only
+ * +nan, -nan, +inf, and -inf for NaN and infinity values.
  */
 
-#include <math.h>
-int is_number(const char *s)
+/*
+ * This routine now has a more complicated interface, the main point
+ * being to avoid the double conversion of a string to double, and
+ * also to convey out, if requested, the information that the numeric
+ * value was a leading string or is all of the string. The latter bit
+ * is used in getfval().
+ */
+
+bool is_valid_number(const char *s, bool trailing_stuff_ok,
+			bool *no_trailing, double *result)
 {
 	double r;
 	char *ep;
+	bool retval = false;
+
+	if (no_trailing)
+		*no_trailing = false;
+
+	while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
+		s++;
+
+	if (s[0] == '0' && tolower(s[1]) == 'x')	// no hex floating point, sorry
+		return false;
+
+	// allow +nan, -nan, +inf, -inf, any other letter, no
+	if (s[0] == '+' || s[0] == '-') {
+		if (strcasecmp(s+1, "nan") == 0 || strcasecmp(s+1, "inf") == 0)
+			return true;
+		else if (! isdigit(s[1]) && s[1] != '.')
+			return false;
+	}
+	else if (! isdigit(s[0]) && s[0] != '.')
+		return false;
+
 	errno = 0;
 	r = strtod(s, &ep);
 	if (ep == s || r == HUGE_VAL || errno == ERANGE)
-		return 0;
-	/* allow \r as well. windows files aren't going to go away. */
+		return false;
+
+	if (result != NULL)
+		*result = r;
+
+	/*
+	 * check for trailing stuff
+	 * allow \r as well. windows files aren't going to go away.
+	 */
 	while (*ep == ' ' || *ep == '\t' || *ep == '\n' || *ep == '\r')
 		ep++;
-	if (*ep == '\0')
-		return 1;
-	else
-		return 0;
+
+	if (no_trailing)
+		*no_trailing = (*ep == '\0');
+
+	// return true if found the end, or trailing stuff is allowed
+	retval = (*ep == '\0') || trailing_stuff_ok;
+
+	return retval;
 }
--- a/proto.h
+++ b/proto.h
@@ -146,7 +146,9 @@
 extern	void	bclass(int);
 extern	double	errcheck(double, const char *);
 extern	int	isclvar(const char *);
-extern	int	is_number(const char *);
+extern	bool	is_valid_number(const char *s, bool trailing_stuff_ok,
+				bool *no_trailing, double *result);
+#define is_number(s, val)	is_valid_number(s, false, NULL, val)
 
 extern	int	adjbuf(char **pb, int *sz, int min, int q, char **pbp, const char *what);
 extern	void	run(Node *);
--- a/run.c
+++ b/run.c
@@ -407,6 +407,7 @@
 	int bufsize = recsize;
 	int mode;
 	bool newflag;
+	double result;
 
 	if ((buf = (char *) malloc(bufsize)) == NULL)
 		FATAL("out of memory in getline");
@@ -429,15 +430,15 @@
 		} else if (a[0] != NULL) {	/* getline var <file */
 			x = execute(a[0]);
 			setsval(x, buf);
-			if (is_number(x->sval)) {
-				x->fval = atof(x->sval);
+			if (is_number(x->sval, & result)) {
+				x->fval = result;
 				x->tval |= NUM;
 			}
 			tempfree(x);
 		} else {			/* getline <file */
 			setsval(fldtab[0], buf);
-			if (is_number(fldtab[0]->sval)) {
-				fldtab[0]->fval = atof(fldtab[0]->sval);
+			if (is_number(fldtab[0]->sval, & result)) {
+				fldtab[0]->fval = result;
 				fldtab[0]->tval |= NUM;
 			}
 		}
@@ -448,8 +449,8 @@
 			n = getrec(&buf, &bufsize, false);
 			x = execute(a[0]);
 			setsval(x, buf);
-			if (is_number(x->sval)) {
-				x->fval = atof(x->sval);
+			if (is_number(x->sval, & result)) {
+				x->fval = result;
 				x->tval |= NUM;
 			}
 			tempfree(x);
@@ -726,7 +727,7 @@
 	if ((Awkfloat)INT_MAX < val)
 		FATAL("trying to access out of range field %s", x->nval);
 	m = (int) val;
-	if (m == 0 && !is_number(s = getsval(x)))	/* suspicion! */
+	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
 		/* BUG: can x->nval ever be null??? */
 	tempfree(x);
@@ -1259,6 +1260,7 @@
 	int sep;
 	char temp, num[50];
 	int n, tempstat, arg3type;
+	double result;
 
 	y = execute(a[0]);	/* source string */
 	origs = s = strdup(getsval(y));
@@ -1303,8 +1305,8 @@
 				snprintf(num, sizeof(num), "%d", n);
 				temp = *patbeg;
 				setptr(patbeg, '\0');
-				if (is_number(s))
-					setsymtab(num, s, atof(s), STR|NUM, (Array *) ap->sval);
+				if (is_number(s, & result))
+					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
 				else
 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
 				setptr(patbeg, temp);
@@ -1322,8 +1324,8 @@
 		}
 		n++;
 		snprintf(num, sizeof(num), "%d", n);
-		if (is_number(s))
-			setsymtab(num, s, atof(s), STR|NUM, (Array *) ap->sval);
+		if (is_number(s, & result))
+			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
 		else
 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
   spdone:
@@ -1343,8 +1345,8 @@
 			temp = *s;
 			setptr(s, '\0');
 			snprintf(num, sizeof(num), "%d", n);
-			if (is_number(t))
-				setsymtab(num, t, atof(t), STR|NUM, (Array *) ap->sval);
+			if (is_number(t, & result))
+				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
 			else
 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
 			setptr(s, temp);
@@ -1372,8 +1374,8 @@
 			temp = *s;
 			setptr(s, '\0');
 			snprintf(num, sizeof(num), "%d", n);
-			if (is_number(t))
-				setsymtab(num, t, atof(t), STR|NUM, (Array *) ap->sval);
+			if (is_number(t, & result))
+				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
 			else
 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
 			setptr(s, temp);
--- a/tran.c
+++ b/tran.c
@@ -129,9 +129,11 @@
 	free(cp->sval);
 	cp->sval = (char *) ARGVtab;
 	for (i = 0; i < ac; i++) {
+		double result;
+
 		sprintf(temp, "%d", i);
-		if (is_number(*av))
-			setsymtab(temp, *av, atof(*av), STR|NUM, ARGVtab);
+		if (is_number(*av, & result))
+			setsymtab(temp, *av, result, STR|NUM, ARGVtab);
 		else
 			setsymtab(temp, *av, 0.0, STR, ARGVtab);
 		av++;
@@ -148,13 +150,15 @@
 	free(cp->sval);
 	cp->sval = (char *) ENVtab;
 	for ( ; *envp; envp++) {
+		double result;
+
 		if ((p = strchr(*envp, '=')) == NULL)
 			continue;
 		if( p == *envp ) /* no left hand side name in env string */
 			continue;
 		*p++ = 0;	/* split into two strings at = */
-		if (is_number(p))
-			setsymtab(*envp, p, atof(p), STR|NUM, ENVtab);
+		if (is_number(p, & result))
+			setsymtab(*envp, p, result, STR|NUM, ENVtab);
 		else
 			setsymtab(*envp, p, 0.0, STR, ENVtab);
 		p[-1] = '=';	/* restore in case env is passed down to a shell */
@@ -399,9 +403,15 @@
 	else if (isrec(vp) && !donerec)
 		recbld();
 	if (!isnum(vp)) {	/* not a number */
-		vp->fval = atof(vp->sval);	/* best guess */
-		if (is_number(vp->sval) && !(vp->tval&CON))
-			vp->tval |= NUM;	/* make NUM only sparingly */
+		double fval;
+		bool no_trailing;
+
+		if (is_valid_number(vp->sval, true, & no_trailing, & fval)) {
+			vp->fval = fval;
+			if (no_trailing && !(vp->tval&CON))
+				vp->tval |= NUM;	/* make NUM only sparingly */
+		} else
+			vp->fval = 0.0;
 	}
 	DPRINTF("getfval %p: %s = %g, t=%o\n",
 		(void*)vp, NN(vp->nval), vp->fval, vp->tval);
--