ref: d133a4306688e51df959da814d48f10f33cad23a
dir: /re.c/
#include <u.h>
#include <libc.h>
#include <ctype.h>
#include <bio.h>
#include <regexp.h>
#include "awk.h"
#include "y.tab.h"
/* This file provides the interface between the main body of
* awk and the pattern matching package. It preprocesses
* patterns prior to compilation to provide awk-like semantics
* to character sequences not supported by the pattern package.
* The following conversions are performed:
*
* "()" -> "[]*"
* "[-" -> "[\-"
* "[^-" -> "[^\-"
* "-]" -> "\-]"
* "[]" -> "[]*"
* "\xdddd" -> "\z" where 'z' is the UTF sequence
* for the hex value
* "\ddd" -> "\o" where 'o' is a char octal value
* "\b" -> "\B" where 'B' is backspace
* "\t" -> "\T" where 'T' is tab
* "\f" -> "\F" where 'F' is form feed
* "\n" -> "\N" where 'N' is newline
* "\r" -> "\C" where 'C' is cr
*/
/* size in bytes of the preprocessing buffer, terminating NUL included */
#define MAXRE 512
static char re[MAXRE]; /* compre() builds the rewritten pattern here before regcomp() */
/* start of the text matched by the last pmatch()/nematch(); on failure it is reset to the caller's start pointer */
char *patbeg;
int patlen; /* length of that match (end minus start pointer), or -1 when nothing matched */
#define NPATS 20 /* number of slots in pattern cache */
/* cache of patterns compiled while compile_time is zero; compre() fills, searches and evicts it */
static struct pat_list /* dynamic pattern cache */
{
char *re; /* pattern text as passed to compre(), i.e. before preprocessing; the lookup key */
int use; /* 1 on insertion, +1 per cache hit; the lowest count is evicted first */
Reprog *program; /* result of regcomp() for this pattern */
} pattern[NPATS];
static int npats; /* cache fill level */
/*
 * Compile a pattern.
 *
 * pat is the pattern text; the return value is the Reprog * produced by
 * regcomp(), handed back as a void *.
 *
 * When compile_time is zero the pattern cache is searched first; a hit
 * increments that entry's use count and returns the cached program
 * without recompiling.  Otherwise pat is rewritten into the static
 * buffer re (see the conversion table at the top of this file), the
 * result is compiled, and - again only when compile_time is zero - the
 * program is stored in the cache under the original pattern text.
 *
 * overflow() is called whenever the rewritten pattern would not fit in
 * the MAXRE-byte buffer.
 */
void
*compre(char *pat)
{
int i, j, inclass;
char c, *p, *s;
Reprog *program;
if (!compile_time) { /* search cache for dynamic pattern */
for (i = 0; i < npats; i++)
if (!strcmp(pat, pattern[i].re)) {
pattern[i].use++;
return((void *) pattern[i].program);
}
}
/* Preprocess Pattern for compilation */
/* p is the write cursor in re, s the read cursor in pat */
p = re;
s = pat;
/* inclass is nonzero while the read cursor is inside a [...] class */
inclass = 0;
/*
 * Each branch below may emit some prefix bytes and replace c; the
 * statement at the bottom of the loop then appends c itself.
 */
while (c = *s++) {
if (c == '\\') {
/* quoted() consumes the escape and appends its translation itself */
quoted(&s, &p, re+MAXRE);
continue;
}
else if (!inclass && c == '(' && *s == ')') {
if (p < re+MAXRE-2) { /* '()' -> '[]*' */
*p++ = '[';
*p++ = ']';
c = '*';
s++;
}
else overflow();
}
else if (c == '['){ /* '[-' -> '[\-' */
inclass = 1;
if (*s == '-') {
if (p < re+MAXRE-2) {
*p++ = '[';
*p++ = '\\';
c = *s++;
}
else overflow();
} /* '[^-' -> '[^\-'*/
else if (*s == '^' && s[1] == '-'){
if (p < re+MAXRE-3) {
*p++ = '[';
*p++ = *s++;
*p++ = '\\';
c = *s++;
}
else overflow();
}
else if (*s == '['){ /* skip '[[' */
/* copy both brackets verbatim so the second is not taken as opening a class */
if (p < re+MAXRE-1)
*p++ = c;
else overflow();
c = *s++;
}
else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
if (p < re+MAXRE-2) {
*p++ = c;
*p++ = *s++;
c = *s++;
}
else overflow();
}
else if (*s == ']') { /* '[]' -> '[]*' */
if (p < re+MAXRE-2) {
*p++ = c;
*p++ = *s++;
c = '*';
/* the class was opened and closed in one step */
inclass = 0;
}
else overflow();
}
}
else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
if (p < re+MAXRE-1)
*p++ = '\\';
else overflow();
}
else if (c == ']')
inclass = 0;
/* common tail: append c, keeping one byte free for the terminating NUL */
if (p < re+MAXRE-1)
*p++ = c;
else overflow();
}
*p = 0;
program = regcomp(re); /* compile pattern */
if (!compile_time) {
if (npats < NPATS) /* Room in cache */
i = npats++;
else { /* Throw out least used */
/* strict '<' below means ties go to the lowest-numbered slot */
int use = pattern[0].use;
i = 0;
for (j = 1; j < NPATS; j++) {
if (pattern[j].use < use) {
use = pattern[j].use;
i = j;
}
}
xfree(pattern[i].program);
xfree(pattern[i].re);
}
/* NOTE(review): tostring() presumably returns a private copy of pat that xfree() can release - confirm */
pattern[i].re = tostring(pat);
pattern[i].program = program;
pattern[i].use = 1;
}
return((void *) program);
}
/* T/F match indication - matched string not exported */
int
match(void *p, char *s, char *)
{
return regexec((Reprog *) p, (char *) s, 0, 0);
}
/*
 * pmatch: match and record where the match lies.
 *
 * Runs the compiled program p over s, seeding the single Resub slot
 * with start as its start pointer and a null end pointer.
 * On success patbeg/patlen describe the matched text and 1 is returned.
 * On failure patbeg is set to start, patlen to -1, and 0 is returned.
 */
int
pmatch(void *p, char *s, char *start)
{
	Resub sub;
	int found;

	sub.sp = start;
	sub.ep = 0;
	found = regexec((Reprog *) p, s, &sub, 1);
	if (!found) {
		patlen = -1;
		patbeg = start;
		return 0;
	}
	patbeg = sub.sp;
	patlen = sub.ep - sub.sp;
	return 1;
}
/*
 * nematch: like pmatch(), but an empty match counts as a failure.
 *
 * Returns 1 with patbeg/patlen as left by pmatch() when the match is
 * at least one character long; otherwise resets patbeg to start and
 * patlen to -1 and returns 0.
 */
int
nematch(void *p, char *s, char *start)
{
	int matched;

	matched = (pmatch(p, s, start) == 1);
	if (!matched || patlen <= 0) {
		patlen = -1;
		patbeg = start;
		return 0;
	}
	return 1;
}
/* in the parsing of regular expressions, metacharacters like . have */
/* to be seen literally; \056 is not a metacharacter. */
/*
 * hexstr: evaluate the hex digits at *pp.
 *
 * Reads at most four hexadecimal digits (either case) starting at *pp,
 * advances *pp past the digits consumed, and returns their value.
 * Returns 0 and leaves *pp alone when *pp does not start with a hex digit.
 */
int
hexstr(char **pp)	/* find and eval hex string at pp, return new p */
{
	char *p = *pp;
	int n = 0;
	int i;

	/*
	 * Classify through unsigned char: handing a negative char (any
	 * byte >= 0x80 where char is signed) to the ctype routines is
	 * undefined.
	 */
	for (i = 0; i < 4 && isxdigit((unsigned char) p[i]); i++) {
		int c = (unsigned char) p[i];

		if (isdigit(c))
			n = 16 * n + c - '0';
		else if ('a' <= c && c <= 'f')
			n = 16 * n + c - 'a' + 10;
		else if ('A' <= c && c <= 'F')
			n = 16 * n + c - 'A' + 10;
	}
	*pp += i;
	return n;
}
/* look for awk-specific escape sequences */
#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */

/*
 * quoted: translate one backslash escape for the pattern package.
 *
 * On entry *s points just past the backslash in the source pattern and
 * *to at the next free byte of the output buffer, whose end is end.
 * On return both have been advanced past what was consumed and emitted.
 *
 *	\t \n \f \r \b	emit the control character itself, no backslash
 *	\xh..		emit '\' then the UTF encoding of up to four hex digits
 *	\d \dd \ddd	emit '\' then the byte with that octal value
 *	\ at end	emit "\\" (a literal backslash) and leave the NUL
 *			terminator unconsumed so the caller's scan stops on it
 *	\other		emit '\' then the character unchanged
 *
 * overflow() is called whenever the output buffer has no room left.
 */
void
quoted(char **s, char **to, char *end)	/* handle escaped sequence */
{
	char *p = *s;
	char *t = *to;
	Rune c;

	switch(c = *p++) {
	case 't':
		c = '\t';
		break;
	case 'n':
		c = '\n';
		break;
	case 'f':
		c = '\f';
		break;
	case 'r':
		c = '\r';
		break;
	case 'b':
		c = '\b';
		break;
	case '\0':
		/*
		 * Backslash was the last character of the pattern.  Do not
		 * step over the terminator (the caller would then scan past
		 * the end of the string); treat it as a literal backslash.
		 */
		p--;
		c = '\\';
		/* fall through */
	default:
		if (t < end-1)		/* all else must be escaped */
			*t++ = '\\';
		else overflow();
		if (c == 'x') {		/* hexadecimal goo follows */
			c = hexstr(&p);
			/*
			 * runetochar stores the encoding and returns its
			 * length; runelen alone would only skip over
			 * unwritten bytes.
			 */
			if (t < end-UTFmax)
				t += runetochar(t, &c);
			else overflow();
			*to = t;
			*s = p;
			return;
		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
			c -= '0';
			if (isoctdigit(*p)) {
				c = 8 * c + *p++ - '0';
				if (isoctdigit(*p))
					c = 8 * c + *p++ - '0';
			}
		}
		break;
	}
	/* keep one byte free for the terminating NUL the caller appends */
	if (t < end-1)
		*t++ = c;
	else overflow();
	*s = p;
	*to = t;
}
/*
 * pattern package error handler
 *
 * Passes the message s to FATAL as the argument of a "%s" format, so
 * any '%' in s is printed literally rather than interpreted.
 * NOTE(review): presumably called by the regexp library when regcomp()
 * rejects a pattern, and FATAL presumably does not return - confirm.
 */
void
regerror(char *s)
{
FATAL("%s", s);
}
/*
 * Report, through FATAL, that a preprocessed pattern does not fit in
 * the MAXRE-byte copy buffer.  Called by compre() and quoted().
 */
void
overflow(void)
{
FATAL("%s", "regular expression too big");
}