ref: 5c89ac1ce32bb5e1050ff6397bdfadad58ad3ffc
parent: ced2a9bc7cc247be72bd6915c9a01a7d2d2710a5
author: Jacob Moody <moody@posixcafe.org>
date: Mon Mar 3 01:00:09 EST 2025
libc: runecomp: unicode 16.0 and rewrite Unicode 16.0 adds some more complexity to normalization, take the chance to rethink the interface a bit more. Change mkfile's to build the tables instead of vendoring them.
--- a/lib/ucd/mkfile
+++ b/lib/ucd/mkfile
@@ -1,6 +1,6 @@
</$objtype/mkfile
-VERSION='15.0.0'
+VERSION='16.0.0'
URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
TXT=\
--- a/sys/include/libc.h
+++ b/sys/include/libc.h
@@ -77,12 +77,25 @@
extern long runestrlen(Rune*);
extern Rune* runestrstr(Rune*, Rune*);
-extern int runecomp(Rune*, Rune*, int);
-extern int runedecomp(Rune*, Rune*, int);
-extern int utfcomp(char*, char*, int);
-extern int utfdecomp(char*, char*, int);
-extern char* fullutfnorm(char*,int);
-extern Rune* fullrunenorm(Rune*,int);
+enum { Maxnormctx = 1+30 };
+typedef struct Norm Norm;
+struct Norm {
+ long (*getrune)(void*);
+ void *ctx;
+
+ struct {
+ Rune *e;
+ Rune a[Maxnormctx];
+ } ibuf, obuf;
+
+ int compose;
+};
+extern void norminit(Norm*, int, void*, long (*getrune)(void*));
+extern long normpull(Norm*, Rune*, long, int);
+extern long runecomp(Rune*, long, Rune*, long);
+extern long runedecomp(Rune*, long, Rune*, long);
+extern long utfcomp(char*, long, char*, long);
+extern long utfdecomp(char*, long, char*, long);
extern Rune* runewbreak(Rune*);
extern char* utfwbreak(char*);
--- a/sys/man/2/runecomp
+++ b/sys/man/2/runecomp
@@ -7,102 +7,145 @@
.br
.B #include <libc.h>
.PP
-.B
-int runecomp(Rune *dst, Rune *src, int max)
-.PP
-.B
-int runedecomp(Rune *dst, Rune *src, int max)
-.PP
-.B
-Rune* fullrunenorm(Rune *s, int n)
-.PP
-.B
-Rune* runegbreak(Rune *s)
-.PP
-.B
-Rune* runewbreak(Rune *s)
-.PP
-.B
-int utfcomp(char *dst, char *src, int max)
-.PP
-.B
-int utfdecomp(char *dst, char *src, int max)
-.PP
-.B
-char* fullutfnorm(char *s, int n)
-.PP
-.B
-char* utfgbreak(char *s)
-.PP
-.B
-char* utfwbreak(char *s)
+.ft L
+.nf
+.EX
+typedef struct Norm Norm;
+struct Norm {
+ ... /* internals */
+};
+
+void norminit(Norm *n, int comp, void *ctx, long (*getrune)(void *ctx));
+long normpull(Norm *n, Rune *dst, long max, int flush);
+
+long runecomp(Rune *dst, long ndst, Rune *src, long nsrc);
+long runedecomp(Rune *dst, long ndst, Rune *src, long nsrc);
+
+long utfcomp(char *dst, long ndst, char *src, long nsrc);
+long utfdecomp(char *dst, long ndst, char *src, long nsrc);
+
+Rune* runegbreak(Rune *s);
+Rune* runewbreak(Rune *s);
+
+char* utfgbreak(char *s);
+char* utfwbreak(char *s);
+.EE
.SH DESCRIPTION
-These routines help in handling
-graphemes that may span multiple runes.
-These routines are for use in font rendering
-and advanced text search; most programs do not
-need to perform normalization.
+These routines handle Unicode®
+abstract characters that span more
+than one codepoint.
+Normalization can be used to turn all codepoints
+into a consistent representation. This
+may be useful if a specific protocol requires normalization, or if
+the program is interested in semantically comparing
+irregular input.
.PP
+The
+.I Norm
+structure is the core structure for all normalization routines.
+.I Norminit
+initializes the structure.
+If the
+.B comp
+argument is non-zero, the output will be normalized to
+NFC (precomposed runes), otherwise it will be normalized
+to NFD (decomposed runes).
+The
+.B getrune
+argument provides the input for normalization, with each call
+returning the next rune of input, and -1 on EOF.
+The
+.B ctx
+argument is stored and passed on to the
+.B getrune
+function in every call.
+.I Normpull
+provides the normalized output, writing at most
+.B max
+elements into
+.BR dst .
+To implement normalization the
+.I Norm
+structure must buffer input until it knows that the context
+for a given base rune is complete.
+In order to accommodate callers which only have chunks
+of data to normalize at a time, the
+.I Norm
+structure maintains runes within its buffer even when
+.B getrune
+returns an EOF.
+The
+.B flush
+argument to
+.I normpull
+changes this behavior, and will instead flush out all
+runes within the structure's buffer when it receives an
+EOF from
+.BR getrune .
+The return value of
+.I normpull
+is the number of runes written to the output.
+.I Normpull
+does not null-terminate the output string,
+however, null bytes are passed through untouched.
+As such, if the input is null terminated, so is the
+output.
+.PP
.IR Runecomp ,
.IR runedecomp ,
.IR utfcomp ,
and
-.I utfdecomp
-perform Unicode® normalization on
-.IR src ,
-storing the result in
-.IR dst .
-No more than
-.I max
-elements will be written, and the resulting string
-will always be null terminated. The return value
-is always the total number of elements required to
-store the transformation. If this value is larger
-than the supplied
-.I max
-the caller can assume the result has been truncated.
-.I Runecomp
+.IR utfdecomp ,
+are abstractions on top of the
+.I Norm
+structure. They are designed to normalize
+fixed-sized input in one go.
+In all functions
+.B src
and
-.I utfcomp
-perform NFC normalization while
-.I runedecomp
+.B dst
+specify the source and destination strings
+respectively.
+The
+.B nsrc
and
-.I utfdecomp
-perform NFD normalization.
+.B ndst
+arguments specify the number of elements
+to process.
+Functions will never read more
+than the specified input, and will never write
+more than the specified output. If there is
+not enough room in the output buffer, the
+result is truncated.
+The return value is likewise the number of elements
+written to the output string.
+Like
+.IR normpull ,
+these functions do not explicitly null terminate
+the output, and pass null bytes through untouched.
.PP
-.IR Fullrunenorm ,
-and
-.I fullutfnorm
-determine if enough elements are present in
-.I s
-to perform normalization. If enough are present,
-a pointer is returned to the first element that begins
-the next context. Otherwise
-.I s
-is returned. No more then
-.I n
-elements will be read. In order to find the boundary, the
-first element of the next context must be peeked.
+The standard for normalization does not specify
+a maximum number of decomposed attaching runes
+that may follow a base rune.
+In order to implement normalization, within a bounded
+amount of memory, these functions implement a subset
+of normalization called Stream-Safe Text.
+This subset specifies that one base rune may have no
+more than 30 attaching runes.
+In order to break up input that contains runs of more than 30 attaching runes,
+these functions will insert the Combining Grapheme Joiner (U+034F) to provide
+a new base for the remaining combining runes.
.PP
.I Runegbreak
-and
-.I utfgbreak
-search
+(\fIrunewbreak\fR)
+return the next grapheme (word) break
+opportunity in
+.BR s ,
+or
.B s
-for the next grapheme break opportunity.
-If none is found before the end of the string,
-.I s
-is returned.
-.PP
-.I Runewbreak
-and
-.I utfwbreak
-search
-.B s
-for the next word break opportunity.
-If none is found before the end of the string,
-.I s
-is returned.
+if none is found.
+Utfgbreak and utfwbreak are UTF
+variants of these routines.
.SH SOURCE
.B /sys/src/libc/port/mkrunetype.c
.br
@@ -118,4 +161,5 @@
.IR utf (6),
.IR tcs (1)
.SH HISTORY
-This implementation was written for 9front (March, 2023).
+This implementation was first written for 9front (March, 2023).
+The implementation was rewritten (in part) for Unicode 16.0 (March, 2025).
--- a/sys/src/cmd/tcs/utf.c
+++ b/sys/src/cmd/tcs/utf.c
@@ -68,44 +68,56 @@
write(1, obuf, p-obuf);
}
+static struct {
+ int off;
+ int n;
+ Rune *input;
+} nctx;
+
+static long
+normgetrune(void*)
+{
+
+ if(nctx.off >= nctx.n)
+ return -1;
+
+ return nctx.input[nctx.off++];
+}
+
void
-utfnorm_out(Rune *base, int n, int (*fn)(Rune*,Rune*,int))
+utfnorm_out(Rune *base, int n, int comp)
{
- static Rune rbuf[32];
- static int nremain = 0;
- Rune src[N + 1 + nelem(rbuf)];
- Rune dst[N + 1 + nelem(rbuf)];
- Rune *p, *p2, *e;
- int i;
+ static Norm norm;
+ static int init;
+ Rune buf[N];
+ long w;
+ int flush;
- e = base+n;
- for(i = 0; i < nremain; i++,n++)
- src[i] = rbuf[i];
- nremain = 0;
- for(p2 = p = base; n > 0;){
- p2 = fullrunenorm(p, n);
- if(p == p2)
- break;
- n -= p2-p;
- for(;p < p2; p++)
- src[i++] = *p;
+ if(init == 0){
+ init++;
+ norminit(&norm, comp, nil, normgetrune);
}
- src[i] = 0;
- utf_out(dst, fn(dst, src, sizeof dst), nil);
- for(; p2 < e; p2++)
- rbuf[nremain++] = *p2;
+ nctx.off = 0;
+ nctx.n = n;
+ nctx.input = base;
+
+ flush = n == 0;
+ do {
+ w = normpull(&norm, buf, nelem(buf), flush);
+ utf_out(buf, w, nil);
+ } while(w != 0);
}
void
utfnfc_out(Rune *base, int n, long *)
{
- utfnorm_out(base, n, runecomp);
+ utfnorm_out(base, n, 1);
}
void
utfnfd_out(Rune *base, int n, long *)
{
- utfnorm_out(base, n, runedecomp);
+ utfnorm_out(base, n, 0);
}
void
--- a/sys/src/libc/mkfile
+++ b/sys/src/libc/mkfile
@@ -10,6 +10,7 @@
cd $i
mk $MKFLAGS install
}
+ @{ cd port && mk extra }
clean:V:
for(i in $DIRS $OLDCPUS test)@{
--- a/sys/src/libc/port/mkfile
+++ b/sys/src/libc/port/mkfile
@@ -63,9 +63,6 @@
rand.c\
readn.c\
rune.c\
- runebreak.c\
- runeistype.c\
- runenorm.c\
runestrcat.c\
runestrchr.c\
runestrcmp.c\
@@ -78,7 +75,6 @@
runestrrchr.c\
runestrlen.c\
runestrstr.c\
- runetotype.c\
sin.c\
sinh.c\
sqrt.c\
@@ -136,16 +132,26 @@
runebreak.$O: runebreakdata runebreak.c
UCD=\
- /lib/ucd/WordBreakProperty.txt\
- /lib/ucd/GraphemeBreakProperty.txt\
- /lib/ucd/emoji-data.txt\
/lib/ucd/CompositionExclusions.txt\
+ /lib/ucd/DerivedNormalizationProps.txt\
+ /lib/ucd/GraphemeBreakProperty.txt\
/lib/ucd/UnicodeData.txt\
+ /lib/ucd/WordBreakProperty.txt\
+ /lib/ucd/emoji-data.txt\
-/lib/ucd/%:
- cd /lib/ucd && mk $stem
+EXTRA=\
+ runebreak.$O\
+ runeistype.$O\
+ runenorm.$O\
+ runetotype.$O\
-runenormdata runetotypedata runeistypedata runebreakdata:
+GEN=\
+ runenormdata\
+ runetotypedata\
+ runeistypedata\
+ runebreakdata\
+
+$GEN: $UCD
@{
eval `{grep '^[A-Z]' /$cputype/mkfile}
$CC $CFLAGS -o mkrunetype.$O mkrunetype.c
@@ -153,4 +159,9 @@
$O.mkrunetype
}
-regen:V: runenormdata runetotypedata runeistypedata runebreakdata mkrunetype.c $UCD
+$EXTRA: $GEN
+
+extra:V: $EXTRA
+ ar vu $LIB $prereq
+
+regen:V: $GEN
--- a/sys/src/libc/port/mkrunetype.c
+++ b/sys/src/libc/port/mkrunetype.c
@@ -172,8 +172,8 @@
return size;
}
-static void
-mklkupmatrix(char *label, int *map, Param *p)
+static int
+mklkupmatrix(int, char *label, int *map, Param *p)
{
int bestsize, size, bestx, besty;
int x, y;
@@ -193,6 +193,7 @@
assert(bestsize != -1);
fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
param(p, bestx, besty);
+ return bestsize;
}
static int myismerged[NRUNES];
@@ -205,7 +206,9 @@
static int mydecomp[NRUNES];
static int mydespecial[256*3];
static int nspecial;
+static int maxdchain;
static int myccc[NRUNES];
+static int myqc[NRUNES];
typedef struct KV KV;
struct KV{
@@ -281,6 +284,19 @@
fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
}
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+
+ NFC_QC_No = 1, NFC_QC_Maybe = 2, NFD_QC_No = 4, NFD_QC_Maybe = 8,
+
+};
+
static void
mktables(void)
{
@@ -324,11 +340,22 @@
param(&p, 10, 7);
size = mklkup(normfd, "decomp", mydecomp, &p);
fprint(2, "%s: %d\n", "decomp", size);
+ fprint(normfd, "static enum { Maxdecomp = %d };\n\n", maxdchain);
param(&p, 9, 7);
size = mklkup(normfd, "ccc", myccc, &p);
fprint(2, "%s: %d\n", "ccc", size);
+ param(&p, 10, 6);
+ size = mklkup(normfd, "qc", myqc, &p);
+ fprint(2, "%s: %d\n", "qc", size);
+ fprint(normfd, "static\nenum {\n");
+ fprint(normfd, "\t%s = %d,\n", "Qnfcno", NFC_QC_No);
+ fprint(normfd, "\t%s = %d,\n", "Qnfcmay", NFC_QC_Maybe);
+ fprint(normfd, "\t%s = %d,\n", "Qnfdno", NFD_QC_No);
+ fprint(normfd, "\t%s = %d,\n", "Qnfdmay", NFD_QC_Maybe);
+ fprint(normfd, "};\n");
+
mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
mkrecomp(normfd);
@@ -389,21 +416,35 @@
return code;
}
-enum {
- OTHER,
- Hebrew_Letter, Newline, Extend, Format,
- Katakana, ALetter, MidLetter, MidNum,
- MidNumLet, Numeric, ExtendNumLet, WSegSpace,
- PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
- L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
- EMOJIEX = 0xB0,
-};
+static char*
+getextraline(Biobuf *b, int *s, int *e)
+{
+ char *dot, *p;
+again:
+ p = Brdline(b, '\n');
+ if(p == nil)
+ return nil;
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ goto again;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ *s = estrtoul(p, 16);
+ *e = estrtoul(dot, 16);
+ } else {
+ *s = *e = estrtoul(p, 16);
+ dot = p;
+ }
+ return dot;
+}
+
static void
markbreak(void)
{
Biobuf *b;
- char *p, *dot;
+ char *dot;
int i, s, e;
uchar v;
@@ -411,19 +452,7 @@
if(b == nil)
sysfatal("could not load word breaks: %r");
- while((p = Brdline(b, '\n')) != nil){
- p[Blinelen(b)-1] = 0;
- if(p[0] == 0 || p[0] == '#')
- continue;
- if((dot = strstr(p, "..")) != nil){
- *dot = 0;
- dot += 2;
- s = estrtoul(p, 16);
- e = estrtoul(dot, 16);
- } else {
- s = e = estrtoul(p, 16);
- dot = p;
- }
+ while((dot = getextraline(b, &s, &e)) != nil){
v = 0;
if(strstr(dot, "ExtendNumLet") != nil)
v = ExtendNumLet;
@@ -455,19 +484,7 @@
if(b == nil)
sysfatal("could not load Grapheme breaks: %r");
- while((p = Brdline(b, '\n')) != nil){
- p[Blinelen(b)-1] = 0;
- if(p[0] == 0 || p[0] == '#')
- continue;
- if((dot = strstr(p, "..")) != nil){
- *dot = 0;
- dot += 2;
- s = estrtoul(p, 16);
- e = estrtoul(dot, 16);
- } else {
- s = e = estrtoul(p, 16);
- dot = p;
- }
+ while((dot = getextraline(b, &s, &e)) != nil){
v = 0;
if(strstr(dot, "; Prepend #") != nil)
v = PREPEND;
@@ -498,19 +515,7 @@
if(b == nil)
sysfatal("could not load emoji-data: %r");
- while((p = Brdline(b, '\n')) != nil){
- p[Blinelen(b)-1] = 0;
- if(p[0] == 0 || p[0] == '#')
- continue;
- if((dot = strstr(p, "..")) != nil){
- *dot = 0;
- dot += 2;
- s = estrtoul(p, 16);
- e = estrtoul(dot, 16);
- } else {
- s = e = estrtoul(p, 16);
- dot = p;
- }
+ while((dot = getextraline(b, &s, &e)) != nil){
v = 0;
if(strstr(dot, "; Extended_Pictographic") != nil)
v = EMOJIEX;
@@ -518,6 +523,26 @@
mybreak[i] |= v;
}
Bterm(b);
+
+ b = Bopen("/lib/ucd/DerivedNormalizationProps.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load emoji-data: %r");
+
+ while((dot = getextraline(b, &s, &e)) != nil){
+ v = 0;
+ if(strstr(dot, "; NFC_QC; N") != nil)
+ v = NFC_QC_No;
+ else if(strstr(dot, "; NFC_QC; M") != nil)
+ v = NFC_QC_Maybe;
+ else if(strstr(dot, "; NFD_QC; N") != nil)
+ v = NFD_QC_No;
+ else if(strstr(dot, "; NFD_QC; M") != nil)
+ v = NFD_QC_Maybe;
+
+ for(i = s; i <= e; i++)
+ myqc[i] |= v;
+ }
+ Bterm(b);
}
static void
@@ -555,6 +580,21 @@
Bterm(b);
}
+static void
+findlongchain(void)
+{
+ int i, n, x, r1;
+
+ for(i = 0; i < NRUNES; i++)
+ for(x = i, n = 0; r1 = mydecomp[x]>>16; x = r1){
+ if(++n > maxdchain)
+ maxdchain = n;
+ if(r1 >= DSTART && r1 <0xF8FF)
+ r1 -= DSTART;
+ }
+ maxdchain *= 2;
+}
+
void
main(int, char)
{
@@ -683,6 +723,7 @@
}
Bterm(in);
+ findlongchain();
markexclusions();
/*
--- a/sys/src/libc/port/runenorm.c
+++ b/sys/src/libc/port/runenorm.c
@@ -22,61 +22,81 @@
TLast = TBase + TCount - 1,
};
-static void
-_runedecomp(Rune dst[2], Rune c)
+/*
+ * Most runes decompose in to one/two
+ * other runes with codepoints < 0xFFFF,
+ * however there are some exceptions.
+ * To keep the table size down we instead
+ * store an index in to an exception range
+ * within the private use section and use
+ * an exception table.
+ */
+enum {
+ Estart = 0xEEEE,
+ Estop = 0xF8FF,
+};
+
+static Rune
+_runedecomp(Rune c, Rune *r2)
{
uint x;
+ if(c < Runeself){
+ *r2 = 0;
+ return 0;
+ }
+
+ //korean
if(c >= SBase && c <= SLast){
c -= SBase;
x = c % TCount;
if(x){
- dst[0] = SBase + ((c / TCount) * TCount);
- dst[1] = TBase + x;
- return;
+ *r2 = TBase + x;
+ return SBase + (c - x);
}
- dst[0] = LBase + (c / NCount);
- dst[1] = VBase + ((c % NCount) / TCount);
- return;
+ *r2 = VBase + ((c % NCount) / TCount);
+ return LBase + (c / NCount);
}
+
x = decomplkup(c);
if((x & 0xFFFF) != 0){
- dst[0] = x>>16;
- dst[1] = x & 0xFFFF;
- return;
+ *r2 = x & 0xFFFF;
+ return x>>16;
}
x >>= 16;
- if(x >= 0xEEEE && x <0xF8FF){
- memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2);
- return;
+ if(x >= Estart && x < Estop){
+ Rune *r;
+ r = _decompexceptions[x - Estart];
+ *r2 = r[1];
+ return r[0];
}
- dst[0] = x;
- dst[1] = 0;
+ *r2 = 0;
+ return x;
}
static Rune
-_runerecomp(Rune r[2])
+_runerecomp(Rune r0, Rune r1)
{
uint x, y, *p, next;
- if(r[0] >= LBase && r[0] <= LLast){
- if(r[1] < VBase || r[1] > VLast)
+ if(r0 >= LBase && r0 <= LLast){
+ if(r1 < VBase || r1 > VLast)
return 0;
- x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount;
+ x = (r0 - LBase) * NCount + (r1 - VBase) * TCount;
return SBase + x;
}
- if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){
- if(r[1] > TBase && r[1] <= TLast)
- return r[0] + (r[1] - TBase);
+ if(r0 >= SBase && r0 <= SLast && (r0 - SBase) % TCount == 0){
+ if(r1 > TBase && r1 <= TLast)
+ return r0 + (r1 - TBase);
return 0;
}
- if(r[0] > 0xFFFF || r[1] > 0xFFFF){
+ if(r0 > 0xFFFF || r1 > 0xFFFF){
for(x = 0; x < nelem(_recompexceptions); x++)
- if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2])
+ if(r0 == _recompexceptions[x][1] && r1 == _recompexceptions[x][2])
return _recompexceptions[x][0];
return 0;
}
- y = x = r[0]<<16 | r[1];
+ y = x = r0<<16 | r1;
x ^= x >> 16;
x *= 0x21f0aaad;
x ^= x >> 15;
@@ -96,239 +116,329 @@
runecccsort(Rune *a, int len)
{
Rune r;
- int i;
- int fail;
+ int i, j;
- do {
- fail = 0;
- for(i = 0; i < len - 1; i++){
- if(ccclkup(a[i]) > ccclkup(a[i+1]) > 0){
- r = a[i];
- a[i] = a[i+1];
- a[i + 1] = r;
- fail = 1;
- }
- }
- } while(fail);
+ for(i = 1; i < len; i++){
+ r = a[i];
+ for(j = i; j > 0 && ccclkup(a[j-1]) > ccclkup(r); j--)
+ a[j] = a[j-1];
+ a[j] = r;
+ }
}
-char*
-fullutfnorm(char *s, int n)
+static int
+boundary(Rune r)
{
- Rune r, peek;
- char *p, *p2;
+ return !(qclkup(r) & (Qnfcno|Qnfcmay));
+}
- p = s;
- if(fullrune(p, n) == 0)
- return s;
+/*
+ * Stk stores the entire context for a chunk of
+ * an input string that is being normalized.
+ * In accordance to the standard, Unicode text
+ * has no upper bound for the amount of conjoining
+ * (also called non-starter) elements associated with
+ * a base rune. Thus to implement normalization within
+ * reasonable memory constraints we implement the
+ * "Stream-Safe Text Format" as defined in UAX #15 § 13.
+ */
+typedef struct {
+ Rune a[Maxnormctx];
+ Rune *e;
+} Stk;
- p += chartorune(&r, p);
- n -= (p - s);
+static int
+push(Stk *s, Rune c)
+{
+ int n, l;
+ Rune r2, b[Maxdecomp];
+ Rune *p = b + nelem(b) - 1;
- if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
- do {
- if(fullrune(p, n) == 0)
- return s;
- p2 = p + chartorune(&peek, p);
- n -= (p2 - p);
- p = p2;
- } while(n > 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast));
- if(n <= 0)
- return s;
- return p;
+ for(*p = c; c = _runedecomp(c, &r2); *p = c){
+ assert(p > b);
+ if(r2 != 0)
+ *p-- = r2;
}
- do {
- if(fullrune(p, n) == 0)
- return s;
- p2 = p + chartorune(&peek, p);
- n -= (p2 - p);
- p = p2;
- if(ccclkup(peek) == 0)
- return p;
- } while(n > 0);
-
- return s;
+ n = b + nelem(b) - p;
+ l = nelem(s->a) - (s->e - s->a);
+ if(n > l){
+ werrstr("runenorm: buffer overflow");
+ return -1;
+ }
+ l -= n;
+ for(; n > 0; n--)
+ *s->e++ = *p++;
+ return l;
}
-Rune*
-fullrunenorm(Rune *r, int n)
+/*
+ * Worst case recomposition, this happens when we have to compose
+ * two runes who both have a CCC of zero.
+ */
+static void
+worstrecomp(Stk *s)
{
- Rune *e, *p;
+ int done;
+ Rune c, *p, *rp;
- p = r;
- e = p + n;
+ for(done = 0; done == 0;){
+ done = 1;
+ for(p = s->a; p+1 < s->e; p++){
+ c = _runerecomp(p[0], p[1]);
+ if(c == 0)
+ continue;
+ done = 0;
+ *p = c;
+ for(rp = p+1; rp < s->e-1; rp++)
+ rp[0] = rp[1];
+ s->e--;
+ p--;
+ }
+ }
+}
- if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
- p++;
- while(p < e && (*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast))
- p++;
+static void
+cccrecomp(Stk *s)
+{
+ Rune c, *p, *rp;
- if(p >= e)
- return r;
- return p;
+ for(p = s->a + 1; p < s->e; p++){
+ c = _runerecomp(s->a[0], *p);
+ if(c != 0){
+ s->a[0] = c;
+ for(rp = p; rp < s->e-1; rp++){
+ rp[0] = rp[1];
+ }
+ s->e--;
+ p--;
+ } else while(p + 1 < s->e && ccclkup(p[0]) == ccclkup(p[1]))
+ p++;
}
+}
- for(; p < e && p + 1 < e; p++)
- if(ccclkup(p[1]) == 0)
- return p + 1;
+void
+norminit(Norm *n, int compose, void *ctx, long (*getrune)(void*))
+{
+ memset(n, 0, sizeof *n);
+ n->ctx = ctx;
+ n->getrune = getrune;
+ n->compose = compose;
+ n->obuf.e = n->obuf.a;
+ n->ibuf.e = n->ibuf.a;
+}
+int NORMDEBUG;
+
+static long
+peekrune(Norm *n)
+{
+ long r;
+
+ if(n->ibuf.e > n->ibuf.a)
+ return n->ibuf.e[-1];
+
+ r = n->getrune(n->ctx);
+ if(r >= 0)
+ *n->ibuf.e++ = r;
return r;
}
-static int
-runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+static long
+getrune(Norm *n)
{
- Rune c, r[2], _stack[32];
- Rune *p, *stack, *sp, *tp;
- char *strp, *strstop;
- Rune *rp, *rrp;
- Rune *stop;
- Rune peek;
- int w, w2, size;
- int mode;
+ if(n->ibuf.e > n->ibuf.a)
+ return *--n->ibuf.e;
+ return n->getrune(n->ctx);
+}
- if(src){
- mode = 1;
- p = src;
- stop = dst + (max - 1);
- strp = "";
- strstop = nil;
- } else {
- mode = 0;
- p = L"";
- stop = nil;
- strp = ssrc;
- strstop = sdst + (max - 1);
- }
+long
+normpull(Norm *n, Rune *rdst, long max, int flush)
+{
+ Rune *rp, *re;
+ Stk stk;
+ Rune *dot;
+ int r;
+ long c;
- stack = _stack + nelem(_stack)/2;
- size = 0;
- w = w2 = 0;
- while(*strp || *p){
- if(mode)
- c = *p;
- else
- w = chartorune(&c, strp);
-
- sp = stack - 1;
- tp = stack;
- _runedecomp(r, c);
- while(r[0] != 0){
- c = r[0];
- if(r[1] != 0){
- *sp-- = r[1];
- if(sp == _stack)
- break;
- }
- _runedecomp(r, c);
+ rp = rdst;
+ re = rdst + max;
+ dot = nil;
+ c = 0;
+ while(rp < re){
+ if(n->obuf.e != n->obuf.a){
+ memcpy(stk.a, n->obuf.a, (n->obuf.e - n->obuf.a)*sizeof(Rune));
+ stk.e = stk.a + (n->obuf.e - n->obuf.a);
+ n->obuf.e = n->obuf.a;
+ c = stk.a[0];
+ goto Flush;
}
- *sp = c;
- if(mode)
- peek = p[1];
- else
- w2 = chartorune(&peek, strp+w);
-
- if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
- while(peek != 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)){
- *tp++ = peek;
- if(mode){
- p++;
- peek = p[1];
- } else {
- strp += w;
- w = w2;
- w2 = chartorune(&peek, strp+w);
+ stk.e = stk.a;
+ c = getrune(n);
+ if(c < 0)
+ break;
+ push(&stk, c);
+ c = peekrune(n);
+ if(stk.e == stk.a+1 && stk.a[0] < Runeself && c < Runeself && c >= 0)
+ goto Flush;
+ while(c >= 0 && ccclkup(c) != 0){
+ r = push(&stk, getrune(n));
+ c = peekrune(n);
+ if(r > 2)
+ continue;
+ if(ccclkup(stk.a[0]) != 0){
+ assert(r > 0);
+ r--;
+ } else
+ assert(r >= 0);
+ if(r == 0 || (c == 0x0344 && r < 2)){
+ /* in reverse */
+ if(r > 0){
+ getrune(n);
+ *n->ibuf.e++ = 0x301;
+ *n->ibuf.e++ = 0x308;
}
- if(tp == _stack + nelem(_stack))
- break;
+ *n->ibuf.e++ = 0x034F;
+ break;
}
}
- while(peek != 0 && ccclkup(peek) != 0){
- _runedecomp(r, peek);
- if(r[1] != 0){
- if(tp+1 >= _stack + nelem(_stack))
- break;
- *tp++ = r[0];
- *tp++ = r[1];
- } else if(r[0] != 0)
- *tp++ = r[0];
- else
- *tp++ = peek;
+ if(stk.e - stk.a > 1)
+ runecccsort(stk.a, stk.e - stk.a);
- if(mode){
- p++;
- peek = p[1];
- } else {
- strp += w;
- w = w2;
- w2 = chartorune(&peek, strp+w);
- }
- if(tp == _stack + nelem(_stack))
- break;
- }
- runecccsort(sp, tp - sp);
+ if(!n->compose)
+ goto Flush;
- if(compose && ccclkup(*sp) == 0){
- for(rp = sp + 1; rp < tp; rp++){
- r[0] = *sp;
- r[1] = *rp;
- c = _runerecomp(r);
- if(c != 0){
- *sp = c;
- for(rrp = rp; rrp > sp; rrp--)
- *rrp = rrp[-1];
- sp++;
- } else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1)))
- rp++;
+ if(ccclkup(stk.e[-1]) == 0){
+ Rune tmp;
+ while(c >= 0 && (!boundary(c) || !boundary(_runedecomp(c, &tmp)))){
+ if(push(&stk, getrune(n)) == -1){
+ *n->ibuf.e++ = c;
+ for(r = 0; r < Maxdecomp; r++)
+ *n->ibuf.e++ = *--stk.e;
+ break;
+ }
+ c = peekrune(n);
}
- }
+ worstrecomp(&stk);
+ } else if(ccclkup(stk.a[0]) == 0)
+ cccrecomp(&stk);
- for(; sp < tp; sp++){
- if(mode){
- if(dst < stop)
- *dst++ = *sp;
- size++;
- } else {
- w2 = runelen(*sp);
- if(sdst+w2 < strstop)
- sdst += runetochar(sdst, sp);
- size += w2;
+Flush:
+ if(flush || c >= 0)
+ for(dot = stk.a; dot < stk.e; dot++){
+ if(rp == re)
+ goto Out;
+ *rp++ = *dot;
}
- }
- if(mode)
- p++;
- else
- strp += w;
+ dot = nil;
+ if(c < 0)
+ break;
}
- if(mode)
- *dst = 0;
- else
- *sdst = 0;
- return size;
+Out:
+ if(c < 0 && !flush){
+ while(stk.e > stk.a)
+ *n->ibuf.e++ = *--stk.e;
+ }
+ if(dot != nil){
+ memcpy(n->obuf.a, dot, (stk.e - dot) * sizeof(Rune));
+ n->obuf.e = n->obuf.a + (stk.e - dot);
+ }
+
+ return rp - rdst;
}
-int
-runecomp(Rune *dst, Rune *src, int max)
+typedef struct {
+ Rune *s, *p;
+ int n;
+} Rctx;
+
+static long
+runegetrune(void *ctx)
{
- return runenorm(dst, src, nil, nil, max, 1);
+ Rctx *c;
+
+ c = ctx;
+ if(c->p >= c->s + c->n)
+ return -1;
+ return *c->p++;
}
-int
-runedecomp(Rune *dst, Rune *src, int max)
+static long
+runedostr(Rune *dst, long ndst, Rune *src, long nsrc, int comp)
{
- return runenorm(dst, src, nil, nil, max, 0);
+ Rctx c;
+ Norm n;
+
+ c.s = c.p = src;
+ c.n = nsrc;
+ norminit(&n, comp, &c, runegetrune);
+ return normpull(&n, dst, ndst, 1);
}
-int
-utfcomp(char *dst, char *src, int max)
+long
+runecomp(Rune *dst, long ndst, Rune *src, long nsrc)
{
- return runenorm(nil, nil, dst, src, max, 1);
+ return runedostr(dst, ndst, src, nsrc, 1);
}
-int
-utfdecomp(char *dst, char *src, int max)
+long
+runedecomp(Rune *dst, long ndst, Rune *src, long nsrc)
{
- return runenorm(nil, nil, dst, src, max, 0);
+ return runedostr(dst, ndst, src, nsrc, 0);
+}
+
+typedef struct {
+ char *s, *p;
+ int n;
+} Uctx;
+
+static long
+utfgetrune(void *ctx)
+{
+ Uctx *c;
+ Rune r;
+
+ c = ctx;
+ if(c->p >= c->s + c->n)
+ return -1;
+ c->p += chartorune(&r, c->p);
+ return r;
+}
+
+static long
+utfdostr(char *dst, long ndst, char *src, long nsrc, int comp)
+{
+ Uctx c;
+ Norm n;
+ Rune buf[Maxnormctx];
+ long i, w;
+ char *e, *p;
+
+ c.s = c.p = src;
+ c.n = nsrc;
+ norminit(&n, comp, &c, utfgetrune);
+ for(p = dst, e = dst + ndst; p < e;){
+ w = normpull(&n, buf, nelem(buf), 1);
+ if(w == 0)
+ break;
+ for(i = 0; i < w; i++){
+ if(p + runelen(buf[i]) >= e)
+ break;
+ p += runetochar(p, buf+i);
+ }
+ }
+ return p - dst;
+}
+
+long
+utfcomp(char *dst, long ndst, char *src, long nsrc)
+{
+ return utfdostr(dst, ndst, src, nsrc, 1);
+}
+
+long
+utfdecomp(char *dst, long ndst, char *src, long nsrc)
+{
+ return utfdostr(dst, ndst, src, nsrc, 0);
}
--- a/sys/src/libc/test/runenorm.c
+++ b/sys/src/libc/test/runenorm.c
@@ -2,6 +2,9 @@
#include <libc.h>
#include <bio.h>
+//Annoying to get a gauge of how broken if we bail on the first failure
+#define print sysfatal
+
static int
estrtoul(char *s)
{
@@ -14,31 +17,245 @@
return code;
}
+#pragma varargck type "V" Rune*
+static int
+vrunefmt(Fmt *f)
+{
+ Rune *s;
+ int n;
+
+ s = va_arg(f->args, Rune*);
+ n = fmtprint(f, "%S(", s);
+ for(; *s != 0; s++)
+ n += fmtprint(f, "%X ", *s);
+ n += fmtprint(f, ")");
+ return n;
+}
+
+typedef struct {
+ Rune src[64];
+ Rune nfc[70];
+ Rune nfd[70];
+} Line;
+
+typedef struct {
+ Rune *s, *p;
+ int n;
+} Ctx;
+
+static long
+getrune(void *ctx)
+{
+ Ctx *c;
+
+ c = ctx;
+ if(c->p >= c->s + c->n)
+ return -1;
+ return *c->p++;
+}
+
+static void
+testline(Line *l)
+{
+ Norm rd, rc;
+ static Ctx ctx;
+ Rune out1[70];
+ Rune out2[70];
+ int i, n;
+
+ norminit(&rd, 0, &ctx, getrune);
+ norminit(&rc, 1, &ctx, getrune);
+ ctx.s = ctx.p = l->src;
+ ctx.n = runestrlen(l->src) + 1;
+
+ n = normpull(&rd, out1, nelem(out1), 1);
+ if(out1[n-1] != '\0')
+ sysfatal("norm ate null");
+ if(runestrcmp(l->nfd, out1) != 0)
+ print("(1) %V %V %V\n", l->src, l->nfd, out1);
+
+ ctx.p = ctx.s;
+ n = normpull(&rc, out2, nelem(out2), 1);
+ if(out2[n-1] != '\0')
+ sysfatal("norm ate null");
+ if(runestrcmp(l->nfc, out2) != 0)
+ print("(2) %V %V %V\n", l->src, l->nfc, out2);
+
+ ctx.p = ctx.s;
+ i = 0;
+ do {
+ n = normpull(&rd, out1 + i, 1, 1);
+ i += n;
+ } while(n != 0);
+ if(runestrcmp(l->nfd, out1) != 0)
+ print("rune-by-rune nfd fail %V %V\n", l->nfd, out1);
+
+ ctx.p = ctx.s;
+ i = 0;
+ do {
+ n = normpull(&rc, out2 + i, 1, 1);
+ i += n;
+ } while(n != 0);
+ if(runestrcmp(l->nfc, out2) != 0)
+ print("rune-by-rune nfc fail %V %V\n", l->nfc, out2);
+}
+
+static void
+testutfline(Line *l)
+{
+ char out1[128], out2[128];
+ char buf1[128], buf2[128], buf3[128];
+
+ snprint(buf1, sizeof buf1, "%S", l->src);
+ snprint(buf2, sizeof buf2, "%S", l->nfd);
+ snprint(buf3, sizeof buf3, "%S", l->nfc);
+
+ utfcomp(out1, sizeof out1, buf1, strlen(buf1)+1);
+ utfdecomp(out2, sizeof out2, buf1, strlen(buf1)+1);
+
+ if(strcmp(out1, buf3) != 0)
+ print("utfline fail nfc: %s %s %s\n", buf1, buf3, out1);
+
+ if(strcmp(out2, buf2) != 0)
+ print("utfline fail nfd: %s %s %s\n", buf1, buf2, out2);
+}
+
+static void
+testedge(void)
+{
+ Line l;
+ int i;
+
+ /*
+ * Test that we correctly break up long attacher
+ * runs with U+034F
+ */
+ l.src[0] = L'U';
+ for(i = 1; i < nelem(l.src)-1; i++)
+ l.src[i] = 0x308;
+ l.src[nelem(l.src)-1] = 0;
+
+ memcpy(l.nfd, l.src, sizeof(l.src));
+ l.nfd[31] = 0x34F;
+ l.nfd[62] = 0x34F;
+ l.nfd[63] = 0x308;
+ l.nfd[64] = 0x308;
+ l.nfd[65] = 0;
+
+ memcpy(l.nfc, l.src, sizeof l.src);
+ l.nfc[0] = 0xDC;
+ l.nfc[30] = 0x34F;
+ l.nfc[61] = 0x34F;
+ l.nfc[62] = 0x308;
+ l.nfc[63] = 0x308;
+ l.nfc[64] = 0;
+
+ testline(&l);
+
+ for(i = 0; i < nelem(l.src)-1; i++)
+ l.src[i] = 0x308;
+ l.src[nelem(l.src)-1] = 0;
+ memcpy(l.nfd, l.src, sizeof l.src);
+ l.nfd[30] = 0x034F;
+ l.nfd[61] = 0x034F;
+ l.nfd[62] = 0x308;
+ l.nfd[63] = 0x308;
+ l.nfd[64] = 0x308;
+ l.nfd[65] = 0;
+ memcpy(l.nfc, l.nfd, sizeof l.nfd);
+
+ testline(&l);
+
+ l.src[0] = L'U';
+ for(i = 1; i < 30; i++)
+ l.src[i] = 0x300;
+ l.src[i++] = 0x0344;
+ l.src[i] = 0;
+ memcpy(l.nfc, l.src, sizeof l.src);
+ memcpy(l.nfd, l.src, sizeof l.src);
+ l.nfd[30] = 0x034F;
+ l.nfd[31] = 0x308;
+ l.nfd[32] = 0x301;
+ l.nfd[33] = 0;
+ l.nfc[0] = 0xD9;
+ l.nfc[29] = 0x034F;
+ l.nfc[30] = 0x308;
+ l.nfc[31] = 0x301;
+ l.nfc[32] = 0;
+
+ testline(&l);
+
+ for(i = 0; i < 59; i++)
+ l.src[i] = 0x300;
+ l.src[i++] = 0x0344;
+ l.src[i] = 0;
+ memcpy(l.nfd, l.src, sizeof l.src);
+ l.nfd[30] = 0x34F;
+ l.nfd[59] = 0x300;
+ l.nfd[60] = 0x34F;
+ l.nfd[61] = 0x308;
+ l.nfd[62] = 0x301;
+ l.nfd[63] = 0x0;
+ memcpy(l.nfc, l.nfd, sizeof l.nfd);
+
+ testline(&l);
+
+ l.src[0] = 0x16D63;
+ for(i = 1; i < 33; i++)
+ l.src[i] = 0x16D67;
+ l.src[i] = 0;
+ memcpy(l.nfd, l.src, sizeof l.src);
+
+ l.nfc[0] = 0x16D6A;
+ for(i = 1; i < 1+15; i++)
+ l.nfc[i] = 0x16D68;
+ l.nfc[i] = 0;
+
+ testline(&l);
+}
+
+static void
+testeof(void)
+{
+ Norm n;
+ Ctx ctx;
+ Rune buf[16], out[16], *p;
+
+ buf[0] = L'u';
+ ctx.s = ctx.p = buf;
+ ctx.n = 1;
+
+ norminit(&n, 1, &ctx, getrune);
+ for(p = out; ctx.p < ctx.s + ctx.n;){
+ p += normpull(&n, p, sizeof out - (p - out), 0);
+ }
+ if(p != out)
+ print("norm flushed when we told it not to");
+ buf[0] = L'̈';
+ buf[1] = L'a';
+ ctx.s = ctx.p = buf;
+ ctx.n = 2;
+ normpull(&n, p, sizeof out - (p - out), 1);
+ if(out[0] != L'ü' || out[1] != 'a')
+ print("eof test fail: %X\n", out[0]);
+}
+
void
main(int, char)
{
- Rune buffer1[64];
- Rune buffer2[64];
- char utfbuff1[128];
- char utfbuff2[128];
- char srctmp[128], tmp1[128], tmp2[128];
char *fields[10];
char *runes[32];
char *p;
- int n, n2;
+ int n;
int i;
- uint fail;
Biobuf *b;
+ fmtinstall('V', vrunefmt);
b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
if(b == nil)
sysfatal("could not load composition exclusions: %r");
- struct {
- Rune src[32];
- Rune nfc[32];
- Rune nfd[32];
- } test;
+ Line test;
while((p = Brdline(b, '\n')) != nil){
p[Blinelen(b)-1] = 0;
if(p[0] == 0 || p[0] == '#' || p[0] == '@')
@@ -59,34 +276,10 @@
test.nfd[i] = estrtoul(runes[i]);
test.nfd[i] = 0;
- n = runecomp(buffer1, test.src, nelem(buffer1));
- n2 = runedecomp(buffer2, test.src, nelem(buffer2));
- fail = 0;
-
- if(runestrcmp(buffer1, test.nfc) != 0)
- fail |= 1<<0;
- if(runestrcmp(buffer2, test.nfd) != 0)
- fail |= 1<<1;
- if(fail)
- print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
- assert(n == runestrlen(test.nfc));
- assert(n2 == runestrlen(test.nfd));
-
- snprint(srctmp, sizeof tmp1, "%S", test.src);
- snprint(tmp1, sizeof tmp1, "%S", test.nfc);
- snprint(tmp2, sizeof tmp2, "%S", test.nfd);
-
- n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1));
- n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2));
-
- if(strcmp(utfbuff1, tmp1) != 0)
- fail |= 1<<2;
- if(strcmp(utfbuff2, tmp2) != 0)
- fail |= 1<<3;
- if(fail)
- print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
- assert(n == strlen(tmp1));
- assert(n2 == strlen(tmp2));
+ testline(&test);
+ testutfline(&test);
}
+ testedge();
+ testeof();
exits(nil);
}
--
⑨