ref: e8d4ad93771196cfb6a28d396e14b6fdc5e73f39
parent: bada775b5e90bff1f76034de4bd05c558ecb0848
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Wed Aug 20 13:22:08 EDT 2025
libc: move unicode data stuff from port/ to ucd/. This gets rid of the mk extra stuff in port/ and also handles nuke now, forcing regeneration of the data tables.
--- a/sys/src/libc/mkfile
+++ b/sys/src/libc/mkfile
@@ -1,16 +1,15 @@
</$objtype/mkfile
PORTDIRS=9sys 9syscall fmt port
-DIRS=$PORTDIRS $CPUS
+DIRS=$PORTDIRS $CPUS ucd
OLDCPUS=68000 68020 sparc
all install:V:
- for(i in $PORTDIRS $objtype)@{
+ for(i in $PORTDIRS $objtype ucd)@{
echo $i
cd $i
mk $MKFLAGS install
}
- @{ cd port && mk extra }
clean:V:
for(i in $DIRS $OLDCPUS test)@{
@@ -20,7 +19,7 @@
}
nuke:V:
- for(i in $PORTDIRS $objtype)@{
+ for(i in $PORTDIRS $objtype ucd)@{
echo $i
cd $i
mk $MKFLAGS nuke
--- a/sys/src/libc/port/mkfile
+++ b/sys/src/libc/port/mkfile
@@ -126,43 +126,3 @@
profile.$O: /sys/include/tos.h
malloc.$O pool.$O: /sys/include/pool.h
-
-runenorm.$O: runenormdata runenorm.c
-runetotype.$O: runetotypedata runetotype.c
-runeistype.$O: runeistypedata runeistype.c
-runebreak.$O: runebreakdata runebreak.c
-
-UCD=\
- /lib/ucd/CompositionExclusions.txt\
- /lib/ucd/DerivedNormalizationProps.txt\
- /lib/ucd/GraphemeBreakProperty.txt\
- /lib/ucd/UnicodeData.txt\
- /lib/ucd/WordBreakProperty.txt\
- /lib/ucd/emoji-data.txt\
-
-EXTRA=\
- runebreak.$O\
- runeistype.$O\
- runenorm.$O\
- runetotype.$O\
-
-GEN=\
- runenormdata\
- runetotypedata\
- runeistypedata\
- runebreakdata\
-
-$GEN: $UCD
- @{
- eval `{grep '^[A-Z]' /$cputype/mkfile}
- $CC $CFLAGS -o mkrunetype.$O mkrunetype.c
- $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
- ./$O.mkrunetype
- }
-
-$EXTRA: $GEN
-
-extra:V: $EXTRA
- ar vr $LIB $prereq
-
-regen:V: $GEN
--- a/sys/src/libc/port/mkrunetype.c
+++ /dev/null
@@ -1,789 +1,0 @@
-#include <u.h>
-#include <libc.h>
-#include <bio.h>
-
-enum{
- NRUNES = 1<<21
-};
-
-typedef struct Param Param;
-typedef struct Lvl Lvl;
-struct Lvl{
- int bits;
- int max;
- int mask;
-};
-struct Param{
- Lvl idx1;
- Lvl idx2;
- Lvl data;
-
- int round1max;
-};
-
-static void
-derive(Lvl *l)
-{
- l->max = 1 << l->bits;
- l->mask = l->max - 1;
-}
-
-static void
-param(Param *p, int idx1, int idx2)
-{
-
- assert(idx1 + idx2 < 21);
- p->idx1.bits = idx1;
- p->idx2.bits = idx2;
- p->data.bits = 21 - idx1 - idx2;
- derive(&p->idx1);
- derive(&p->idx2);
- derive(&p->data);
-
- p->round1max = NRUNES/p->data.max;
-}
-
-static int
-lkup(Param *p, int *idx1, int *idx2, int *data, int x)
-{
- int y, z;
-
- y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
- z = (((x)>>p->data.bits)&p->idx2.mask);
- return data[idx2[idx1[y] + z] + (x&p->data.mask)];
-}
-
-static int
-mkarrvar(int fd, char *name, int *d, int len)
-{
- int i, sz;
- int max, min;
- char *t;
-
- max = min = 0;
- for(i = 0; i < len; i++){
- if(d[i] > max)
- max = d[i];
- if(d[i] < min)
- min = d[i];
- }
- if(min == 0){
- if(max < 0xFF)
- t = "uchar", sz = 1;
- else if(max < 0xFFFF)
- t = "ushort", sz = 2;
- else
- t = "uint", sz = 4;
- } else {
- if(max < 1<<7)
- t = "char", sz = 1;
- else if(max < 1<<15)
- t = "short", sz = 2;
- else
- t = "int", sz = 4;
- }
- if(fd < 0)
- return sz * len;
-
- fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
- for(i = 0; i < len; i++){
- fprint(fd, "%d,", d[i]);
- if((i+1) % 16 == 0)
- fprint(fd, "\n\t");
- }
- fprint(fd, "\n};\n");
-
- return sz * len;
-}
-
-static int
-mkexceptarr(int fd, char *name, int *d, int n, int all)
-{
- int i;
- fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
- for(i = 0; i < n*3; i += 3){
- if(all && d[i] != 0)
- fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
- else if(!all)
- fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);
- if((i+3) % (8*3) == 0)
- fprint(fd, "\n\t");
- }
- fprint(fd, "\n};\n");
- return n * sizeof(Rune) * 2;
-}
-
-static int
-compact(int *data, int *idx, int nidx, int *src, int chunksize)
-{
- int i, n, ndata, best;
- int *dot, *lp, *rp;
-
- dot = src;
- ndata = 0;
- idx[0] = 0;
- for(i = 1; i <= nidx; i++){
- rp = dot + chunksize;
- lp = rp - 1;
-
- for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
- if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
- best = n+1;
- }
- memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
- ndata += (chunksize - best);
- idx[i] = idx[i - 1] + (chunksize - best);
- dot = rp;
- }
- return ndata;
-}
-
-
-static int
-mklkup(int fd, char *label, int *map, Param *p)
-{
- static int data[NRUNES];
- static int idx2[NRUNES];
- static int idx2dest[NRUNES];
- static int idx1[NRUNES];
- int i, nidx2, ndata;
- int size;
-
- ndata = compact(data, idx2, p->round1max, map, p->data.max);
- nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
-
- if(fd >= 0){
- for(i = 0; i < NRUNES; i++)
- if(map[i] != lkup(p, idx1, idx2dest, data, i))
- sysfatal("mismatch in %s at %d %d %d", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
- }
-
- size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
- size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
- size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
- if(fd >= 0){
- fprint(fd, "\n");
- fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
- fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
- fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
- fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
- label, label, label, label, label, label, label);
- }
- return size;
-}
-
-static int
-mklkupmatrix(int, char *label, int *map, Param *p)
-{
- int bestsize, size, bestx, besty;
- int x, y;
-
- bestsize = bestx = besty = -1;
- for(x = 4; x <= 12; x++)
- for(y=4; y <= (19 - x); y++){
- param(p, x, y);
- size = mklkup(-1, label, map, p);
- if(bestsize == -1 || size < bestsize){
- bestx = x;
- besty = y;
- bestsize = size;
- }
- }
-
- assert(bestsize != -1);
- fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
- param(p, bestx, besty);
- return bestsize;
-}
-
-static int myismerged[NRUNES];
-static int mytoupper[NRUNES];
-static int mytolower[NRUNES];
-static int mytotitle[NRUNES];
-static int mybreak[NRUNES];
-
-enum{ DSTART = 0xEEEE };
-static int mydecomp[NRUNES];
-static int mydespecial[256*3];
-static int nspecial;
-static int maxdchain;
-static int myccc[NRUNES];
-static int myqc[NRUNES];
-
-typedef struct KV KV;
-struct KV{
- uint key;
- uint val;
- ushort next;
-};
-
-static KV myrecomp[2000];
-static int nrecomp;
-
-static int recompext[256*3];
-static int nrecompext;
-
-static uint
-hash(uint x)
-{
- x ^= x >> 16;
- x *= 0x21f0aaad;
- x ^= x >> 15;
- x *= 0xd35a2d97;
- x ^= x >> 15;
- return x;
-}
-
-static void
-mkrecomp(int fd)
-{
- int i;
- KV *p;
- static KV vals[512];
- static KV coll[1000];
- int over;
- int maxchain;
-
- for(i = 0; i < nelem(vals); i++)
- vals[i] = (KV){0, 0, 0};
- for(i = 0; i < nelem(coll); i++)
- coll[i] = (KV){0, 0, 0};
- over = 1;
- for(i = 0; i < nrecomp; i++){
- p = vals + (hash(myrecomp[i].key) % nelem(vals));
- maxchain = 0;
- while(p->key != 0){
- maxchain++;
- if(p->next == 0){
- p->next = over;
- p = coll + over - 1;
- over++;
- } else
- p = coll + p->next - 1;
- }
- p->key = myrecomp[i].key;
- p->val = myrecomp[i].val;
- }
- fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
- fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
- for(p = vals, i = 0;; i++){
- assert(p->val < 0xFFFF);
- assert(p->next < 0xFFFF);
- fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
- if((i+1) % 8 == 0)
- fprint(fd, "\n\t");
-
- if(p == vals+nelem(vals)-1)
- p = coll;
- else if(p == coll + over - 2)
- break;
- else
- p++;
- }
- fprint(fd, "\n};\n");
- fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
-}
-
-enum {
- OTHER,
- Hebrew_Letter, Newline, Extend, Format,
- Katakana, ALetter, MidLetter, MidNum,
- MidNumLet, Numeric, ExtendNumLet, WSegSpace,
- PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
- L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
- EMOJIEX = 0xB0,
-
- NFC_QC_No = 1, NFC_QC_Maybe = 2, NFD_QC_No = 4, NFD_QC_Maybe = 8,
-
-};
-
-static void
-mktables(void)
-{
- Param p;
- int tofd, isfd, normfd, breakfd;
- int size;
-
- tofd = create("runetotypedata", OWRITE, 0664);
- if(tofd < 0)
- sysfatal("could not create runetotypedata: %r");
- param(&p, 10, 7);
- size = mklkup(tofd, "upper", mytoupper, &p);
- fprint(2, "%s: %d\n", "upper", size);
-
- size = mklkup(tofd, "lower", mytolower, &p);
- fprint(2, "%s: %d\n", "lower", size);
-
- size = mklkup(tofd, "title", mytotitle, &p);
- fprint(2, "%s: %d\n", "title", size);
- close(tofd);
-
- isfd = create("runeistypedata", OWRITE, 0664);
- if(isfd < 0)
- sysfatal("could not create runeistypedata: %r");
- param(&p, 11, 6);
- size = mklkup(isfd, "merged", myismerged, &p);
- fprint(2, "%s: %d\n", "merged", size);
- fprint(isfd, "static\nenum {\n");
- fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
- fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
- fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
- fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
- fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
- fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
- fprint(isfd, "};\n");
- close(isfd);
-
- normfd = create("runenormdata", OWRITE, 0664);
- if(normfd < 0)
- sysfatal("could not create runenormdata: %r");
- param(&p, 10, 7);
- size = mklkup(normfd, "decomp", mydecomp, &p);
- fprint(2, "%s: %d\n", "decomp", size);
- fprint(normfd, "static enum { Maxdecomp = %d };\n\n", maxdchain);
-
- param(&p, 9, 7);
- size = mklkup(normfd, "ccc", myccc, &p);
- fprint(2, "%s: %d\n", "ccc", size);
-
- param(&p, 10, 6);
- size = mklkup(normfd, "qc", myqc, &p);
- fprint(2, "%s: %d\n", "qc", size);
- fprint(normfd, "static\nenum {\n");
- fprint(normfd, "\t%s = %d,\n", "Qnfcno", NFC_QC_No);
- fprint(normfd, "\t%s = %d,\n", "Qnfcmay", NFC_QC_Maybe);
- fprint(normfd, "\t%s = %d,\n", "Qnfdno", NFD_QC_No);
- fprint(normfd, "\t%s = %d,\n", "Qnfdmay", NFD_QC_Maybe);
- fprint(normfd, "};\n");
-
- mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
- mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
- mkrecomp(normfd);
- close(normfd);
-
- param(&p, 10, 6);
- breakfd = create("runebreakdata", OWRITE, 0644);
- if(breakfd < 0)
- sysfatal("could not create runebreakdata: %r");
- size = mklkup(breakfd, "break", mybreak, &p);
- fprint(2, "%s: %d\n", "break", size);
-}
-
-enum {
- FIELD_CODE,
- FIELD_NAME,
- FIELD_CATEGORY,
- FIELD_COMBINING,
- FIELD_BIDIR,
- FIELD_DECOMP,
- FIELD_DECIMAL_DIG,
- FIELD_DIG,
- FIELD_NUMERIC_VAL,
- FIELD_MIRRORED,
- FIELD_UNICODE_1_NAME,
- FIELD_COMMENT,
- FIELD_UPPER,
- FIELD_LOWER,
- FIELD_TITLE,
- NFIELDS,
-};
-
-static int
-getunicodeline(Biobuf *in, char **fields)
-{
- char *p;
-
- if((p = Brdline(in, '\n')) == nil)
- return 0;
-
- p[Blinelen(in)-1] = '\0';
-
- if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS)
- sysfatal("bad number of fields");
-
- return 1;
-}
-
-static int
-estrtoul(char *s, int base)
-{
- char *epr;
- Rune code;
-
- code = strtoul(s, &epr, base);
- if(s == epr)
- sysfatal("bad code point hex string");
- return code;
-}
-
-static char*
-getextraline(Biobuf *b, int *s, int *e)
-{
- char *dot, *p;
-
-again:
- p = Brdline(b, '\n');
- if(p == nil)
- return nil;
- p[Blinelen(b)-1] = 0;
- if(p[0] == 0 || p[0] == '#')
- goto again;
- if((dot = strstr(p, "..")) != nil){
- *dot = 0;
- dot += 2;
- *s = estrtoul(p, 16);
- *e = estrtoul(dot, 16);
- } else {
- *s = *e = estrtoul(p, 16);
- dot = p;
- }
- return dot;
-}
-
-static void
-markbreak(void)
-{
- Biobuf *b;
- char *dot;
- int i, s, e;
- uchar v;
-
- b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
- if(b == nil)
- sysfatal("could not load word breaks: %r");
-
- while((dot = getextraline(b, &s, &e)) != nil){
- v = 0;
- if(strstr(dot, "ExtendNumLet") != nil)
- v = ExtendNumLet;
- else if(strstr(dot, "Hebrew_Letter") != nil)
- v = Hebrew_Letter;
- else if(strstr(dot, "Newline") != nil)
- v = Newline;
- else if(strstr(dot, "Extend") != nil)
- v = Extend;
- else if(strstr(dot, "Format") != nil)
- v = Format;
- else if(strstr(dot, "Katakana") != nil)
- v = Katakana;
- else if(strstr(dot, "ALetter") != nil)
- v = ALetter;
- else if(strstr(dot, "MidLetter") != nil)
- v = MidLetter;
- else if(strstr(dot, "MidNum") != nil)
- v = MidNum;
- else if(strstr(dot, "Numeric") != nil)
- v = Numeric;
- else if(strstr(dot, "WSegSpace") != nil)
- v = WSegSpace;
- for(i = s; i <= e; i++)
- mybreak[i] = v;
- }
- Bterm(b);
- b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
- if(b == nil)
- sysfatal("could not load Grapheme breaks: %r");
-
- while((dot = getextraline(b, &s, &e)) != nil){
- v = 0;
- if(strstr(dot, "; Prepend #") != nil)
- v = PREPEND;
- else if(strstr(dot, "; Control #") != nil)
- v = CONTROL;
- else if(strstr(dot, "; Extend #") != nil)
- v = EXTEND;
- else if(strstr(dot, "; Regional_Indicator #") != nil)
- v = REGION;
- else if(strstr(dot, "; SpacingMark #") != nil)
- v = SPACEMK;
- else if(strstr(dot, "; L #") != nil)
- v = L;
- else if(strstr(dot, "; V #") != nil)
- v = V;
- else if(strstr(dot, "; T #") != nil)
- v = T;
- else if(strstr(dot, "; LV #") != nil)
- v = LV;
- else if(strstr(dot, "; LVT #") != nil)
- v = LVT;
- for(i = s; i <= e; i++)
- mybreak[i] |= v;
- }
- Bterm(b);
-
- b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
- if(b == nil)
- sysfatal("could not load emoji-data: %r");
-
- while((dot = getextraline(b, &s, &e)) != nil){
- v = 0;
- if(strstr(dot, "; Extended_Pictographic") != nil)
- v = EMOJIEX;
- for(i = s; i <= e; i++)
- mybreak[i] |= v;
- }
- Bterm(b);
-
- b = Bopen("/lib/ucd/DerivedNormalizationProps.txt", OREAD);
- if(b == nil)
- sysfatal("could not load emoji-data: %r");
-
- while((dot = getextraline(b, &s, &e)) != nil){
- v = 0;
- if(strstr(dot, "; NFC_QC; N") != nil)
- v = NFC_QC_No;
- else if(strstr(dot, "; NFC_QC; M") != nil)
- v = NFC_QC_Maybe;
- else if(strstr(dot, "; NFD_QC; N") != nil)
- v = NFD_QC_No;
- else if(strstr(dot, "; NFD_QC; M") != nil)
- v = NFD_QC_Maybe;
-
- for(i = s; i <= e; i++)
- myqc[i] |= v;
- }
- Bterm(b);
-}
-
-static void
-markexclusions(void)
-{
- Biobuf *b;
- char *p;
- int i;
- uint x;
-
- b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
- if(b == nil)
- sysfatal("could not load composition exclusions: %r");
-
- while((p = Brdline(b, '\n')) != nil){
- p[Blinelen(b)-1] = 0;
- if(p[0] == 0 || p[0] == '#')
- continue;
- x = estrtoul(p, 16);
- for(i = 0; i < nrecomp; i++){
- if(myrecomp[i].val == x){
- myrecomp[i].val = 0;
- break;
- }
- }
- if(i == nrecomp){
- for(i = 0; i < nrecompext; i++){
- if(recompext[i*3] == x){
- recompext[i*3] = 0;
- break;
- }
- }
- }
- }
- Bterm(b);
-}
-
-static void
-findlongchain(void)
-{
- int i, n, x, r1;
-
- for(i = 0; i < NRUNES; i++)
- for(x = i, n = 0; r1 = mydecomp[x]>>16; x = r1){
- if(++n > maxdchain)
- maxdchain = n;
- if(r1 >= DSTART && r1 <0xF8FF)
- r1 -= DSTART;
- }
- maxdchain *= 2;
-}
-
-void
-main(int, char)
-{
- static char myisspace[NRUNES];
- static char myisalpha[NRUNES];
- static char myisdigit[NRUNES];
- static char myisupper[NRUNES];
- static char myislower[NRUNES];
- static char myistitle[NRUNES];
- Biobuf *in;
- char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
- char *p, *d;
- int i, code, last;
- int decomp[2], *ip;
-
- in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
- if(in == nil)
- sysfatal("can't open UnicodeData.txt: %r");
-
- for(i = 0; i < NRUNES; i++){
- mytoupper[i] = -1;
- mytolower[i] = -1;
- mytotitle[i] = -1;
- mydecomp[i] = 0;
- myccc[i] = 0;
- mybreak[i] = 0;
- }
-
- myisspace['\t'] = 1;
- myisspace['\n'] = 1;
- myisspace['\r'] = 1;
- myisspace['\f'] = 1;
- myisspace['\v'] = 1;
- myisspace[0x85] = 1; /* control char, "next line" */
- myisspace[0xfeff] = 1; /* zero-width non-break space */
-
- last = -1;
- nspecial = nrecomp = nrecompext = 0;
- while(getunicodeline(in, fields)){
- code = estrtoul(fields[FIELD_CODE], 16);
- if (code >= NRUNES)
- sysfatal("code-point value too big: %x", code);
- if(code <= last)
- sysfatal("bad code sequence: %x then %x", last, code);
- last = code;
-
- p = fields[FIELD_CATEGORY];
- if(strstr(fields[FIELD_NAME], ", First>") != nil){
- if(!getunicodeline(in, fields2))
- sysfatal("range start at eof");
- if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
- sysfatal("range start not followed by range end");
- last = estrtoul(fields2[FIELD_CODE], 16);
- if(last <= code)
- sysfatal("range out of sequence: %x then %x", code, last);
- if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
- sysfatal("range with mismatched category");
- }
-
- d = fields[FIELD_DECOMP];
- if(strlen(d) > 0 && strstr(d, "<") == nil){
- decomp[0] = estrtoul(d, 16);
- d = strstr(d, " ");
- if(d == nil){
- /* singleton recompositions are verboden */
- decomp[1] = 0;
- if(decomp[0] > 0xFFFF){
- ip = mydespecial + nspecial*3;
- ip[0] = code;
- ip[1] = decomp[0];
- ip[2] = 0;
- mydecomp[code] = (DSTART+nspecial)<<16;
- nspecial++;
- } else
- mydecomp[code] = decomp[0]<<16;
- } else {
- d++;
- decomp[1] = estrtoul(d, 16);
- if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
- ip = mydespecial + nspecial*3;
- ip[0] = code;
- ip[1] = decomp[0];
- ip[2] = decomp[1];
- mydecomp[code] = (DSTART+nspecial)<<16;
- nspecial++;
- ip = recompext + nrecompext*3;
- ip[0] = code;
- ip[1] = decomp[0];
- ip[2] = decomp[1];
- nrecompext++;
- } else {
- mydecomp[code] = decomp[0]<<16 | decomp[1];
- myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
- }
- }
- }
-
- for (; code <= last; code++){
- if(p[0] == 'L')
- myisalpha[code] = 1;
- if(p[0] == 'Z')
- myisspace[code] = 1;
-
- if(strcmp(p, "Lu") == 0)
- myisupper[code] = 1;
- if(strcmp(p, "Ll") == 0)
- myislower[code] = 1;
-
- if(strcmp(p, "Lt") == 0)
- myistitle[code] = 1;
-
- if(strcmp(p, "Nd") == 0)
- myisdigit[code] = 1;
-
- if(fields[FIELD_UPPER][0] != '\0')
- mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
-
- if(fields[FIELD_LOWER][0] != '\0')
- mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
-
- if(fields[FIELD_TITLE][0] != '\0')
- mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
-
- myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
- }
- }
-
- Bterm(in);
- findlongchain();
- markexclusions();
-
- /*
- * according to standard, if totitle(x) is not defined in ucd
- * but toupper(x) is, then totitle is defined to be toupper(x)
- */
- for(i = 0; i < NRUNES; i++){
- if(mytotitle[i] == -1
- && mytoupper[i] != -1
- && !myistitle[i])
- mytotitle[i] = mytoupper[i];
- }
-
- /*
- * A couple corrections:
- * is*(to*(x)) should be true.
- * restore undefined transformations.
- * store offset instead of value, makes them sparse.
- */
- for(i = 0; i < NRUNES; i++){
- if(mytoupper[i] != -1)
- myisupper[mytoupper[i]] = 1;
- else
- mytoupper[i] = i;
-
- if(mytolower[i] != -1)
- myislower[mytolower[i]] = 1;
- else
- mytolower[i] = i;
-
- if(mytotitle[i] != -1)
- myistitle[mytotitle[i]] = 1;
- else
- mytotitle[i] = i;
-
- mytoupper[i] = mytoupper[i] - i;
- mytolower[i] = mytolower[i] - i;
- mytotitle[i] = mytotitle[i] - i;
- }
-
- uchar b;
- for(i = 0; i < NRUNES; i++){
- b = 0;
- if(myisspace[i])
- b |= 1<<0;
- if(myisalpha[i])
- b |= 1<<1;
- if(myisdigit[i])
- b |= 1<<2;
- if(myisupper[i])
- b |= 1<<3;
- if(myislower[i])
- b |= 1<<4;
- if(myistitle[i])
- b |= 1<<5;
-
- myismerged[i] = b;
- }
-
- markbreak();
- mktables();
- exits(nil);
-}
--- a/sys/src/libc/port/runebreak.c
+++ /dev/null
@@ -1,293 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runebreakdata"
-
-enum {
- OTHER,
- Hebrew_Letter, Newline, Extend, Format,
- Katakana, ALetter, MidLetter, MidNum,
- MidNumLet, Numeric, ExtendNumLet, WSegSpace,
- PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
- L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
- EMOJIEX = 0xB0,
-
- ZWJ = 0x200DU,
- LINETAB = 0xB,
-};
-
-#define IS(x, y) ((x&0xf) == y)
-#define ISG(x, y) ((x&0xf0) == y)
-
-Rune*
-runegbreak(Rune *s)
-{
- Rune l, r;
- uchar lt, rt;
- Rune *p;
-
- p = s;
- if((l = *p++) == 0)
- return s;
- if((r = *p) == 0)
- return s;
- lt = breaklkup(l);
- rt = breaklkup(r);
- if(l == '\r' && r == '\n')
- goto Done;
- if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
- return p;
- if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
- return p;
- if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
- goto Done;
- if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
- goto Done;
- if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
- goto Done;
- if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
- goto Done;
- if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
- while(ISG(rt, EXTEND)){
- p++;
- if((r = *p) == 0)
- return s;
- rt = breaklkup(r);
- }
- if(r != ZWJ)
- return p;
- p++;
- if((r = *p) == 0)
- return s;
- rt = breaklkup(r);
- if(ISG(rt, EMOJIEX))
- goto Done;
- return p;
- }
- if(ISG(rt, EXTEND) || r == ZWJ)
- goto Done;
- if(ISG(lt, REGION) && ISG(rt, REGION))
- goto Done;
-
- return p;
-
-Done:
- if(p[1] == 0)
- return s;
- return p + 1;
-}
-
-char*
-utfgbreak(char *s)
-{
- Rune l, r;
- uchar lt, rt;
- char *p;
-
- p = s;
- p += chartorune(&l, p);
- if(l == 0)
- return s;
- chartorune(&r, p);
- if(r == 0)
- return s;
- lt = breaklkup(l);
- rt = breaklkup(r);
- if(l == '\r' && r == '\n')
- goto Done;
- if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
- return p;
- if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
- return p;
- if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
- goto Done;
- if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
- goto Done;
- if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
- goto Done;
- if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
- goto Done;
- if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
- while(ISG(rt, EXTEND)){
- p += chartorune(&r, p);
- chartorune(&r, p);
- if(r == 0)
- return s;
- rt = breaklkup(r);
- }
- if(r != ZWJ)
- return p;
-
- p += chartorune(&r, p);
- chartorune(&r, p);
- if(r == 0)
- return s;
- rt = breaklkup(r);
- if(ISG(rt, EMOJIEX))
- goto Done;
- return p;
- }
- if(ISG(rt, EXTEND) || r == ZWJ)
- goto Done;
- if(ISG(lt, REGION) && ISG(rt, REGION))
- goto Done;
-
- return p;
-
-Done:
- p += chartorune(&r, p);
- chartorune(&r, p);
- if(r == 0)
- return s;
- return p;
-}
-
-#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
-#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
-
-Rune*
-runewbreak(Rune *s)
-{
- Rune l, r;
- uchar lt, rt;
- Rune *p;
-
- p = s;
- if((l = *p++) == 0)
- return s;
- if((r = *p) == 0)
- return s;
- lt = breaklkup(l);
- rt = breaklkup(r);
- if(l == '\r' && r == '\n')
- goto Done;
- if(l == '\r' || l == '\n' || l == LINETAB)
- return p;
- if(r == '\r' || r == '\n' || l == LINETAB)
- return p;
- if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
- goto Done;
- if(IS(rt, Format) || IS(rt, Extend))
- goto Done;
- if(AH(lt)){
- if(AH(rt))
- goto Done;
- if((IS(rt, MidLetter) || MNLQ(rt)) && p[1] != 0 && AH(breaklkup(p[1])))
- goto Done;
- if(IS(lt, Hebrew_Letter) && r == '\'')
- goto Done;
- if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
- goto Done;
- if(IS(rt, Numeric))
- goto Done;
- }
- if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
- goto Done;
- if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
- goto Done;
- if(IS(lt, Katakana) && IS(rt, Katakana))
- goto Done;
- if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
- if(IS(rt, ExtendNumLet))
- goto Done;
- if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
- goto Done;
- if(ISG(lt, REGION)){
- if(ISG(rt, REGION))
- goto Done;
- if(r != ZWJ)
- return p;
- p++;
- if((r = *p) == 0)
- return s;
- rt = breaklkup(r);
- if(ISG(rt, REGION))
- goto Done;
- }
-
- return p;
-
-Done:
- if(p[1] == 0)
- return s;
- return p + 1;
-}
-
-char*
-utfwbreak(char *s)
-{
- Rune l, r;
- Rune peek;
- uchar lt, rt;
- char *p;
-
- p = s;
- p += chartorune(&l, p);
- if(l == 0)
- return s;
- chartorune(&peek, p+chartorune(&r, p));
- if(r == 0)
- return s;
- lt = breaklkup(l);
- rt = breaklkup(r);
- if(l == '\r' && r == '\n')
- goto Done;
- if(l == '\r' || l == '\n' || l == LINETAB)
- return p;
- if(r == '\r' || r == '\n' || l == LINETAB)
- return p;
- if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
- goto Done;
- if(IS(rt, Format) || IS(rt, Extend))
- goto Done;
- if(AH(lt)){
- if(AH(rt))
- goto Done;
- if(IS(rt, MidLetter) || MNLQ(rt))
- if(peek != 0 && AH(breaklkup(peek)))
- goto Done;
-
- if(IS(lt, Hebrew_Letter) && r == '\'')
- goto Done;
-
- if(IS(lt, Hebrew_Letter) && r == '"')
- if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
- goto Done;
-
- if(IS(rt, Numeric))
- goto Done;
- }
- if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
- goto Done;
- if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && peek != 0 && IS(breaklkup(peek), Numeric))
- goto Done;
- if(IS(lt, Katakana) && IS(rt, Katakana))
- goto Done;
- if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
- if(IS(rt, ExtendNumLet))
- goto Done;
- if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
- goto Done;
- if(ISG(lt, REGION)){
- if(ISG(rt, REGION))
- goto Done;
- if(r != ZWJ)
- return p;
- p += chartorune(&r, p);
- chartorune(&r, p);
- if(r == 0)
- return s;
- rt = breaklkup(r);
- if(ISG(rt, REGION))
- goto Done;
- }
-
- return p;
-
-Done:
- p += chartorune(&r, p);
- chartorune(&r, p);
- if(r == 0)
- return s;
- return p;
-}
--- a/sys/src/libc/port/runeistype.c
+++ /dev/null
@@ -1,52 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runeistypedata"
-
-int
-isspacerune(Rune c)
-{
- if(c > Runemax)
- return 0;
- return (mergedlkup(c) & Lspace) == Lspace;
-}
-
-int
-isalpharune(Rune c)
-{
- if(c > Runemax)
- return 0;
- return (mergedlkup(c) & Lalpha) == Lalpha;
-}
-
-int
-isdigitrune(Rune c)
-{
- if(c > Runemax)
- return 0;
- return (mergedlkup(c) & Ldigit) == Ldigit;
-}
-
-int
-isupperrune(Rune c)
-{
- if(c > Runemax)
- return 0;
- return (mergedlkup(c) & Lupper) == Lupper;
-}
-
-int
-islowerrune(Rune c)
-{
- if(c > Runemax)
- return 0;
- return (mergedlkup(c) & Llower) == Llower;
-}
-
-int
-istitlerune(Rune c)
-{
- if(c > Runemax)
- return 0;
- return (mergedlkup(c) & Ltitle) == Ltitle;
-}
--- a/sys/src/libc/port/runenorm.c
+++ /dev/null
@@ -1,444 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runenormdata"
-
-//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
-enum {
- SBase = 0xAC00,
- LBase = 0x1100,
- VBase = 0x1161,
- TBase = 0x11A7,
-
- LCount = 19,
- VCount = 21,
- TCount = 28,
- NCount = VCount * TCount,
- SCount = LCount * NCount,
-
- LLast = LBase + LCount - 1,
- SLast = SBase + SCount - 1,
- VLast = VBase + VCount - 1,
- TLast = TBase + TCount - 1,
-};
-
-/*
- * Most runes decompose in to one/two
- * other runes with codepoints < 0xFFFF,
- * however there are some exceptions.
- * To keep the table size down we instead
- * store an index in to an exception range
- * within the private use section and use
- * an exception table.
- */
-enum {
- Estart = 0xEEEE,
- Estop = 0xF8FF,
-};
-
-static Rune
-_runedecomp(Rune c, Rune *r2)
-{
- uint x;
-
- if(c < Runeself){
- *r2 = 0;
- return 0;
- }
-
- //korean
- if(c >= SBase && c <= SLast){
- c -= SBase;
- x = c % TCount;
- if(x){
- *r2 = TBase + x;
- return SBase + (c - x);
- }
- *r2 = VBase + ((c % NCount) / TCount);
- return LBase + (c / NCount);
- }
-
- x = decomplkup(c);
- if((x & 0xFFFF) != 0){
- *r2 = x & 0xFFFF;
- return x>>16;
- }
- x >>= 16;
- if(x >= Estart && x < Estop){
- Rune *r;
- r = _decompexceptions[x - Estart];
- *r2 = r[1];
- return r[0];
- }
- *r2 = 0;
- return x;
-}
-
-static Rune
-_runerecomp(Rune r0, Rune r1)
-{
- uint x, y, *p, next;
-
- if(r0 >= LBase && r0 <= LLast){
- if(r1 < VBase || r1 > VLast)
- return 0;
- x = (r0 - LBase) * NCount + (r1 - VBase) * TCount;
- return SBase + x;
- }
- if(r0 >= SBase && r0 <= SLast && (r0 - SBase) % TCount == 0){
- if(r1 > TBase && r1 <= TLast)
- return r0 + (r1 - TBase);
- return 0;
- }
- if(r0 > 0xFFFF || r1 > 0xFFFF){
- for(x = 0; x < nelem(_recompexceptions); x++)
- if(r0 == _recompexceptions[x][1] && r1 == _recompexceptions[x][2])
- return _recompexceptions[x][0];
- return 0;
- }
- y = x = r0<<16 | r1;
- x ^= x >> 16;
- x *= 0x21f0aaad;
- x ^= x >> 15;
- x *= 0xd35a2d97;
- x ^= x >> 15;
- p = _recompdata + (x%512)*2;
- while(p[0] != y){
- next = p[1]>>16;
- if(!next)
- return 0;
- p = _recompcoll + (next-1)*2;
- }
- return p[1] & 0xFFFF;
-}
-
-static void
-runecccsort(Rune *a, int len)
-{
- Rune r;
- int i, j;
-
- for(i = 1; i < len; i++){
- r = a[i];
- for(j = i; j > 0 && ccclkup(a[j-1]) > ccclkup(r); j--)
- a[j] = a[j-1];
- a[j] = r;
- }
-}
-
-static int
-boundary(Rune r)
-{
- return !(qclkup(r) & (Qnfcno|Qnfcmay));
-}
-
-/*
- * Stk stores the entire context for a chunk of
- * an input string that is being normalized.
- * In accordance to the standard, Unicode text
- * has no upper bound for the amount of conjoining
- * (also called non-starter) elements associated with
- * a base rune. Thus to implement normalization within
- * reasonable memory constraints we implement the
- * "Stream-Safe Text Format" as defined in UAX #15 § 13.
- */
-typedef struct {
- Rune a[Maxnormctx];
- Rune *e;
-} Stk;
-
-static int
-push(Stk *s, Rune c)
-{
- int n, l;
- Rune r2, b[Maxdecomp];
- Rune *p = b + nelem(b) - 1;
-
- for(*p = c; c = _runedecomp(c, &r2); *p = c){
- assert(p > b);
- if(r2 != 0)
- *p-- = r2;
- }
-
- n = b + nelem(b) - p;
- l = nelem(s->a) - (s->e - s->a);
- if(n > l){
- werrstr("runenorm: buffer overflow");
- return -1;
- }
- l -= n;
- for(; n > 0; n--)
- *s->e++ = *p++;
- return l;
-}
-
-/*
- * Worst case recomposition, this happens when we have to compose
- * two runes who both have a CCC of zero.
- */
-static void
-worstrecomp(Stk *s)
-{
- int done;
- Rune c, *p, *rp;
-
- for(done = 0; done == 0;){
- done = 1;
- for(p = s->a; p+1 < s->e; p++){
- c = _runerecomp(p[0], p[1]);
- if(c == 0)
- continue;
- done = 0;
- *p = c;
- for(rp = p+1; rp < s->e-1; rp++)
- rp[0] = rp[1];
- s->e--;
- p--;
- }
- }
-}
-
-static void
-cccrecomp(Stk *s)
-{
- Rune c, *p, *rp;
-
- for(p = s->a + 1; p < s->e; p++){
- c = _runerecomp(s->a[0], *p);
- if(c != 0){
- s->a[0] = c;
- for(rp = p; rp < s->e-1; rp++){
- rp[0] = rp[1];
- }
- s->e--;
- p--;
- } else while(p + 1 < s->e && ccclkup(p[0]) == ccclkup(p[1]))
- p++;
- }
-}
-
-void
-norminit(Norm *n, int compose, void *ctx, long (*getrune)(void*))
-{
- memset(n, 0, sizeof *n);
- n->ctx = ctx;
- n->getrune = getrune;
- n->compose = compose;
- n->obuf.e = n->obuf.a;
- n->ibuf.e = n->ibuf.a;
-}
-
-int NORMDEBUG;
-
-static long
-peekrune(Norm *n)
-{
- long r;
-
- if(n->ibuf.e > n->ibuf.a)
- return n->ibuf.e[-1];
-
- r = n->getrune(n->ctx);
- if(r >= 0)
- *n->ibuf.e++ = r;
- return r;
-}
-
-static long
-getrune(Norm *n)
-{
- if(n->ibuf.e > n->ibuf.a)
- return *--n->ibuf.e;
- return n->getrune(n->ctx);
-}
-
-long
-normpull(Norm *n, Rune *rdst, long max, int flush)
-{
- Rune *rp, *re;
- Stk stk;
- Rune *dot;
- int r;
- long c;
-
- rp = rdst;
- re = rdst + max;
- dot = nil;
- c = 0;
- while(rp < re){
- if(n->obuf.e != n->obuf.a){
- memcpy(stk.a, n->obuf.a, (n->obuf.e - n->obuf.a)*sizeof(Rune));
- stk.e = stk.a + (n->obuf.e - n->obuf.a);
- n->obuf.e = n->obuf.a;
- c = stk.a[0];
- goto Flush;
- }
-
- stk.e = stk.a;
- c = getrune(n);
- if(c < 0)
- break;
- push(&stk, c);
- c = peekrune(n);
- if(stk.e == stk.a+1 && stk.a[0] < Runeself && c < Runeself && c >= 0)
- goto Flush;
- while(c >= 0 && ccclkup(c) != 0){
- r = push(&stk, getrune(n));
- c = peekrune(n);
- if(r > 2)
- continue;
- if(ccclkup(stk.a[0]) != 0){
- assert(r > 0);
- r--;
- } else
- assert(r >= 0);
- if(r == 0 || (c == 0x0344 && r < 2)){
- /* in reverse */
- if(r > 0){
- getrune(n);
- *n->ibuf.e++ = 0x301;
- *n->ibuf.e++ = 0x308;
- }
- *n->ibuf.e++ = 0x034F;
- break;
- }
- }
- if(stk.e - stk.a > 1)
- runecccsort(stk.a, stk.e - stk.a);
-
- if(!n->compose)
- goto Flush;
-
- if(ccclkup(stk.e[-1]) == 0){
- Rune tmp;
- while(c >= 0 && (!boundary(c) || !boundary(_runedecomp(c, &tmp)))){
- if(push(&stk, getrune(n)) == -1){
- *n->ibuf.e++ = c;
- for(r = 0; r < Maxdecomp; r++)
- *n->ibuf.e++ = *--stk.e;
- break;
- }
- c = peekrune(n);
- }
- worstrecomp(&stk);
- } else if(ccclkup(stk.a[0]) == 0)
- cccrecomp(&stk);
-
-Flush:
- if(flush || c >= 0)
- for(dot = stk.a; dot < stk.e; dot++){
- if(rp == re)
- goto Out;
- *rp++ = *dot;
- }
- dot = nil;
- if(c < 0)
- break;
- }
-Out:
- if(c < 0 && !flush){
- while(stk.e > stk.a)
- *n->ibuf.e++ = *--stk.e;
- }
- if(dot != nil){
- memcpy(n->obuf.a, dot, (stk.e - dot) * sizeof(Rune));
- n->obuf.e = n->obuf.a + (stk.e - dot);
- }
-
- return rp - rdst;
-}
-
-typedef struct {
- Rune *s, *p;
- int n;
-} Rctx;
-
-static long
-runegetrune(void *ctx)
-{
- Rctx *c;
-
- c = ctx;
- if(c->p >= c->s + c->n)
- return -1;
- return *c->p++;
-}
-
-static long
-runedostr(Rune *dst, long ndst, Rune *src, long nsrc, int comp)
-{
- Rctx c;
- Norm n;
-
- c.s = c.p = src;
- c.n = nsrc;
- norminit(&n, comp, &c, runegetrune);
- return normpull(&n, dst, ndst, 1);
-}
-
-long
-runecomp(Rune *dst, long ndst, Rune *src, long nsrc)
-{
- return runedostr(dst, ndst, src, nsrc, 1);
-}
-
-long
-runedecomp(Rune *dst, long ndst, Rune *src, long nsrc)
-{
- return runedostr(dst, ndst, src, nsrc, 0);
-}
-
-typedef struct {
- char *s, *p;
- int n;
-} Uctx;
-
-static long
-utfgetrune(void *ctx)
-{
- Uctx *c;
- Rune r;
-
- c = ctx;
- if(c->p >= c->s + c->n)
- return -1;
- c->p += chartorune(&r, c->p);
- return r;
-}
-
-static long
-utfdostr(char *dst, long ndst, char *src, long nsrc, int comp)
-{
- Uctx c;
- Norm n;
- Rune buf[Maxnormctx];
- long i, w;
- char *e, *p;
-
- c.s = c.p = src;
- c.n = nsrc;
- norminit(&n, comp, &c, utfgetrune);
- for(p = dst, e = dst + ndst; p < e;){
- w = normpull(&n, buf, nelem(buf), 1);
- if(w == 0)
- break;
- for(i = 0; i < w; i++){
- if(p + runelen(buf[i]) >= e)
- break;
- p += runetochar(p, buf+i);
- }
- }
- return p - dst;
-}
-
-long
-utfcomp(char *dst, long ndst, char *src, long nsrc)
-{
- return utfdostr(dst, ndst, src, nsrc, 1);
-}
-
-long
-utfdecomp(char *dst, long ndst, char *src, long nsrc)
-{
- return utfdostr(dst, ndst, src, nsrc, 0);
-}
--- a/sys/src/libc/port/runetotype.c
+++ /dev/null
@@ -1,22 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runetotypedata"
-
-Rune
-toupperrune(Rune c)
-{
- return c + upperlkup(c);
-}
-
-Rune
-tolowerrune(Rune c)
-{
- return c + lowerlkup(c);
-}
-
-Rune
-totitlerune(Rune c)
-{
- return c + titlelkup(c);
-}
--- /dev/null
+++ b/sys/src/libc/ucd/mkfile
@@ -1,0 +1,42 @@
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libc.a
+
+OFILES=\
+ runenorm.$O\
+ runetotype.$O\
+ runeistype.$O\
+ runebreak.$O\
+
+CLEANFILES=$OFILES
+
+HFILES=/sys/include/libc.h
+
+</sys/src/cmd/mksyslib
+
+DATA=\
+ runenormdata\
+ runetotypedata\
+ runeistypedata\
+ runebreakdata\
+
+$OFILES: $DATA
+
+UCD=\
+ /lib/ucd/CompositionExclusions.txt\
+ /lib/ucd/DerivedNormalizationProps.txt\
+ /lib/ucd/GraphemeBreakProperty.txt\
+ /lib/ucd/UnicodeData.txt\
+ /lib/ucd/WordBreakProperty.txt\
+ /lib/ucd/emoji-data.txt\
+
+$DATA: $UCD
+ @{
+ eval `{grep '^[A-Z]' /$cputype/mkfile}
+ $CC $CFLAGS -o mkrunetype.$O mkrunetype.c
+ $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+ ./$O.mkrunetype
+ }
+
+nuke:V:
+ rm -f $DATA *.mkrunetype
--- /dev/null
+++ b/sys/src/libc/ucd/mkrunetype.c
@@ -1,0 +1,789 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+enum{
+ NRUNES = 1<<21
+};
+
+typedef struct Param Param;
+typedef struct Lvl Lvl;
+struct Lvl{
+ int bits;
+ int max;
+ int mask;
+};
+struct Param{
+ Lvl idx1;
+ Lvl idx2;
+ Lvl data;
+
+ int round1max;
+};
+
+static void
+derive(Lvl *l)
+{
+ l->max = 1 << l->bits;
+ l->mask = l->max - 1;
+}
+
+static void
+param(Param *p, int idx1, int idx2)
+{
+
+ assert(idx1 + idx2 < 21);
+ p->idx1.bits = idx1;
+ p->idx2.bits = idx2;
+ p->data.bits = 21 - idx1 - idx2;
+ derive(&p->idx1);
+ derive(&p->idx2);
+ derive(&p->data);
+
+ p->round1max = NRUNES/p->data.max;
+}
+
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+ int y, z;
+
+ y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
+ z = (((x)>>p->data.bits)&p->idx2.mask);
+ return data[idx2[idx1[y] + z] + (x&p->data.mask)];
+}
+
+static int	/* returns the table size in bytes; C source is written to fd only when fd >= 0 */
+mkarrvar(int fd, char *name, int *d, int len)
+{
+	int i, sz;
+	int max, min;
+	char *t;
+
+	max = min = 0;	/* min stays 0 unless the table holds a negative value */
+	for(i = 0; i < len; i++){
+		if(d[i] > max)
+			max = d[i];
+		if(d[i] < min)
+			min = d[i];
+	}
+	if(min == 0){
+		if(max < 0xFF)
+			t = "uchar", sz = 1;
+		else if(max < 0xFFFF)
+			t = "ushort", sz = 2;
+		else
+			t = "uint", sz = 4;
+	} else {
+		if(max < 1<<7 && min >= -(1<<7))	/* both bounds must fit the signed type */
+			t = "char", sz = 1;
+		else if(max < 1<<15 && min >= -(1<<15))
+			t = "short", sz = 2;
+		else
+			t = "int", sz = 4;
+	}
+	if(fd < 0)	/* size estimate only, nothing is emitted */
+		return sz * len;
+
+	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+	for(i = 0; i < len; i++){
+		fprint(fd, "%d,", d[i]);
+		if((i+1) % 16 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+
+	return sz * len;
+}
+
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+ int i;
+ fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
+ for(i = 0; i < n*3; i += 3){
+ if(all && d[i] != 0)
+ fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
+ else if(!all)
+ fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);
+ if((i+3) % (8*3) == 0)
+ fprint(fd, "\n\t");
+ }
+ fprint(fd, "\n};\n");
+ return n * sizeof(Rune) * 2;
+}
+
+static int
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+ int i, n, ndata, best;
+ int *dot, *lp, *rp;
+
+ dot = src;
+ ndata = 0;
+ idx[0] = 0;
+ for(i = 1; i <= nidx; i++){
+ rp = dot + chunksize;
+ lp = rp - 1;
+
+ for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
+ if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
+ best = n+1;
+ }
+ memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
+ ndata += (chunksize - best);
+ idx[i] = idx[i - 1] + (chunksize - best);
+ dot = rp;
+ }
+ return ndata;
+}
+
+
+static int
+mklkup(int fd, char *label, int *map, Param *p)
+{
+ static int data[NRUNES];
+ static int idx2[NRUNES];
+ static int idx2dest[NRUNES];
+ static int idx1[NRUNES];
+ int i, nidx2, ndata;
+ int size;
+
+ ndata = compact(data, idx2, p->round1max, map, p->data.max);
+ nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
+
+ if(fd >= 0){
+ for(i = 0; i < NRUNES; i++)
+ if(map[i] != lkup(p, idx1, idx2dest, data, i))
+ sysfatal("mismatch in %s at %d %d %d", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
+ }
+
+ size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
+ size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
+ size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
+ if(fd >= 0){
+ fprint(fd, "\n");
+ fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
+ fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
+ fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+ fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
+ label, label, label, label, label, label, label);
+ }
+ return size;
+}
+
+static int
+mklkupmatrix(int, char *label, int *map, Param *p)
+{
+ int bestsize, size, bestx, besty;
+ int x, y;
+
+ bestsize = bestx = besty = -1;
+ for(x = 4; x <= 12; x++)
+ for(y=4; y <= (19 - x); y++){
+ param(p, x, y);
+ size = mklkup(-1, label, map, p);
+ if(bestsize == -1 || size < bestsize){
+ bestx = x;
+ besty = y;
+ bestsize = size;
+ }
+ }
+
+ assert(bestsize != -1);
+ fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
+ param(p, bestx, besty);
+ return bestsize;
+}
+
+static int myismerged[NRUNES];
+static int mytoupper[NRUNES];
+static int mytolower[NRUNES];
+static int mytotitle[NRUNES];
+static int mybreak[NRUNES];
+
+enum{ DSTART = 0xEEEE };
+static int mydecomp[NRUNES];
+static int mydespecial[256*3];
+static int nspecial;
+static int maxdchain;
+static int myccc[NRUNES];
+static int myqc[NRUNES];
+
+typedef struct KV KV;
+struct KV{
+ uint key;
+ uint val;
+ ushort next;
+};
+
+static KV myrecomp[2000];
+static int nrecomp;
+
+static int recompext[256*3];
+static int nrecompext;
+
+static uint
+hash(uint x)
+{
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ return x;
+}
+
+static void
+mkrecomp(int fd)
+{
+ int i;
+ KV *p;
+ static KV vals[512];
+ static KV coll[1000];
+ int over;
+ int maxchain;
+
+ for(i = 0; i < nelem(vals); i++)
+ vals[i] = (KV){0, 0, 0};
+ for(i = 0; i < nelem(coll); i++)
+ coll[i] = (KV){0, 0, 0};
+ over = 1;
+ for(i = 0; i < nrecomp; i++){
+ p = vals + (hash(myrecomp[i].key) % nelem(vals));
+ maxchain = 0;
+ while(p->key != 0){
+ maxchain++;
+ if(p->next == 0){
+ p->next = over;
+ p = coll + over - 1;
+ over++;
+ } else
+ p = coll + p->next - 1;
+ }
+ p->key = myrecomp[i].key;
+ p->val = myrecomp[i].val;
+ }
+ fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
+ fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+ for(p = vals, i = 0;; i++){
+ assert(p->val < 0xFFFF);
+ assert(p->next < 0xFFFF);
+ fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
+ if((i+1) % 8 == 0)
+ fprint(fd, "\n\t");
+
+ if(p == vals+nelem(vals)-1)
+ p = coll;
+ else if(p == coll + over - 2)
+ break;
+ else
+ p++;
+ }
+ fprint(fd, "\n};\n");
+ fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
+}
+
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+
+ NFC_QC_No = 1, NFC_QC_Maybe = 2, NFD_QC_No = 4, NFD_QC_Maybe = 8,
+
+};
+
+static void
+mktables(void)
+{
+ Param p;
+ int tofd, isfd, normfd, breakfd;
+ int size;
+
+ tofd = create("runetotypedata", OWRITE, 0664);
+ if(tofd < 0)
+ sysfatal("could not create runetotypedata: %r");
+ param(&p, 10, 7);
+ size = mklkup(tofd, "upper", mytoupper, &p);
+ fprint(2, "%s: %d\n", "upper", size);
+
+ size = mklkup(tofd, "lower", mytolower, &p);
+ fprint(2, "%s: %d\n", "lower", size);
+
+ size = mklkup(tofd, "title", mytotitle, &p);
+ fprint(2, "%s: %d\n", "title", size);
+ close(tofd);
+
+ isfd = create("runeistypedata", OWRITE, 0664);
+ if(isfd < 0)
+ sysfatal("could not create runeistypedata: %r");
+ param(&p, 11, 6);
+ size = mklkup(isfd, "merged", myismerged, &p);
+ fprint(2, "%s: %d\n", "merged", size);
+ fprint(isfd, "static\nenum {\n");
+ fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+ fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+ fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+ fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+ fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+ fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+ fprint(isfd, "};\n");
+ close(isfd);
+
+ normfd = create("runenormdata", OWRITE, 0664);
+ if(normfd < 0)
+ sysfatal("could not create runenormdata: %r");
+ param(&p, 10, 7);
+ size = mklkup(normfd, "decomp", mydecomp, &p);
+ fprint(2, "%s: %d\n", "decomp", size);
+ fprint(normfd, "static enum { Maxdecomp = %d };\n\n", maxdchain);
+
+ param(&p, 9, 7);
+ size = mklkup(normfd, "ccc", myccc, &p);
+ fprint(2, "%s: %d\n", "ccc", size);
+
+ param(&p, 10, 6);
+ size = mklkup(normfd, "qc", myqc, &p);
+ fprint(2, "%s: %d\n", "qc", size);
+ fprint(normfd, "static\nenum {\n");
+ fprint(normfd, "\t%s = %d,\n", "Qnfcno", NFC_QC_No);
+ fprint(normfd, "\t%s = %d,\n", "Qnfcmay", NFC_QC_Maybe);
+ fprint(normfd, "\t%s = %d,\n", "Qnfdno", NFD_QC_No);
+ fprint(normfd, "\t%s = %d,\n", "Qnfdmay", NFD_QC_Maybe);
+ fprint(normfd, "};\n");
+
+ mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+ mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+ mkrecomp(normfd);
+ close(normfd);
+
+ param(&p, 10, 6);
+ breakfd = create("runebreakdata", OWRITE, 0644);
+ if(breakfd < 0)
+ sysfatal("could not create runebreakdata: %r");
+ size = mklkup(breakfd, "break", mybreak, &p);
+ fprint(2, "%s: %d\n", "break", size);
+}
+
+enum {
+ FIELD_CODE,
+ FIELD_NAME,
+ FIELD_CATEGORY,
+ FIELD_COMBINING,
+ FIELD_BIDIR,
+ FIELD_DECOMP,
+ FIELD_DECIMAL_DIG,
+ FIELD_DIG,
+ FIELD_NUMERIC_VAL,
+ FIELD_MIRRORED,
+ FIELD_UNICODE_1_NAME,
+ FIELD_COMMENT,
+ FIELD_UPPER,
+ FIELD_LOWER,
+ FIELD_TITLE,
+ NFIELDS,
+};
+
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+ char *p;
+
+ if((p = Brdline(in, '\n')) == nil)
+ return 0;
+
+ p[Blinelen(in)-1] = '\0';
+
+ if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS)
+ sysfatal("bad number of fields");
+
+ return 1;
+}
+
+static int
+estrtoul(char *s, int base)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, base);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+static char*
+getextraline(Biobuf *b, int *s, int *e)
+{
+ char *dot, *p;
+
+again:
+ p = Brdline(b, '\n');
+ if(p == nil)
+ return nil;
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ goto again;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ *s = estrtoul(p, 16);
+ *e = estrtoul(dot, 16);
+ } else {
+ *s = *e = estrtoul(p, 16);
+ dot = p;
+ }
+ return dot;
+}
+
+static void	/* fills mybreak[] from the word, grapheme and emoji property files, and myqc[] from DerivedNormalizationProps.txt */
+markbreak(void)
+{
+	Biobuf *b;
+	char *dot;
+	int i, s, e;
+	uchar v;
+
+	b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load word breaks: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){
+		v = 0;
+		if(strstr(dot, "ExtendNumLet") != nil)	/* tested before "Extend", which is a substring of it */
+			v = ExtendNumLet;
+		else if(strstr(dot, "Hebrew_Letter") != nil)
+			v = Hebrew_Letter;
+		else if(strstr(dot, "Newline") != nil)
+			v = Newline;
+		else if(strstr(dot, "Extend") != nil)
+			v = Extend;
+		else if(strstr(dot, "Format") != nil)
+			v = Format;
+		else if(strstr(dot, "Katakana") != nil)
+			v = Katakana;
+		else if(strstr(dot, "ALetter") != nil)
+			v = ALetter;
+		else if(strstr(dot, "MidLetter") != nil)
+			v = MidLetter;
+		else if(strstr(dot, "MidNum") != nil)	/* "MidNum" also matches "MidNumLet", so tell the two apart */
+			v = strstr(dot, "MidNumLet") != nil ? MidNumLet : MidNum;
+		else if(strstr(dot, "Numeric") != nil)
+			v = Numeric;
+		else if(strstr(dot, "WSegSpace") != nil)
+			v = WSegSpace;
+		for(i = s; i <= e; i++)
+			mybreak[i] = v;
+	}
+	Bterm(b);
+	b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load Grapheme breaks: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){
+		v = 0;
+		if(strstr(dot, "; Prepend #") != nil)
+			v = PREPEND;
+		else if(strstr(dot, "; Control #") != nil)
+			v = CONTROL;
+		else if(strstr(dot, "; Extend #") != nil)
+			v = EXTEND;
+		else if(strstr(dot, "; Regional_Indicator #") != nil)
+			v = REGION;
+		else if(strstr(dot, "; SpacingMark #") != nil)
+			v = SPACEMK;
+		else if(strstr(dot, "; L #") != nil)
+			v = L;
+		else if(strstr(dot, "; V #") != nil)
+			v = V;
+		else if(strstr(dot, "; T #") != nil)
+			v = T;
+		else if(strstr(dot, "; LV #") != nil)
+			v = LV;
+		else if(strstr(dot, "; LVT #") != nil)
+			v = LVT;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;	/* or-ed in: grapheme values use the high nibble, word values the low one */
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load emoji-data: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){
+		v = 0;
+		if(strstr(dot, "; Extended_Pictographic") != nil)
+			v = EMOJIEX;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/DerivedNormalizationProps.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load normalization props: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){
+		v = 0;
+		if(strstr(dot, "; NFC_QC; N") != nil)
+			v = NFC_QC_No;
+		else if(strstr(dot, "; NFC_QC; M") != nil)
+			v = NFC_QC_Maybe;
+		else if(strstr(dot, "; NFD_QC; N") != nil)
+			v = NFD_QC_No;
+		else if(strstr(dot, "; NFD_QC; M") != nil)
+			v = NFD_QC_Maybe;
+
+		for(i = s; i <= e; i++)
+			myqc[i] |= v;
+	}
+	Bterm(b);
+}
+
+static void
+markexclusions(void)
+{
+ Biobuf *b;
+ char *p;
+ int i;
+ uint x;
+
+ b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ x = estrtoul(p, 16);
+ for(i = 0; i < nrecomp; i++){
+ if(myrecomp[i].val == x){
+ myrecomp[i].val = 0;
+ break;
+ }
+ }
+ if(i == nrecomp){
+ for(i = 0; i < nrecompext; i++){
+ if(recompext[i*3] == x){
+ recompext[i*3] = 0;
+ break;
+ }
+ }
+ }
+ }
+ Bterm(b);
+}
+
+static void
+findlongchain(void)
+{
+ int i, n, x, r1;
+
+ for(i = 0; i < NRUNES; i++)
+ for(x = i, n = 0; r1 = mydecomp[x]>>16; x = r1){
+ if(++n > maxdchain)
+ maxdchain = n;
+ if(r1 >= DSTART && r1 <0xF8FF)
+ r1 -= DSTART;
+ }
+ maxdchain *= 2;
+}
+
+void
+main(int, char)
+{
+ static char myisspace[NRUNES];
+ static char myisalpha[NRUNES];
+ static char myisdigit[NRUNES];
+ static char myisupper[NRUNES];
+ static char myislower[NRUNES];
+ static char myistitle[NRUNES];
+ Biobuf *in;
+ char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+ char *p, *d;
+ int i, code, last;
+ int decomp[2], *ip;
+
+ in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+ if(in == nil)
+ sysfatal("can't open UnicodeData.txt: %r");
+
+ for(i = 0; i < NRUNES; i++){
+ mytoupper[i] = -1;
+ mytolower[i] = -1;
+ mytotitle[i] = -1;
+ mydecomp[i] = 0;
+ myccc[i] = 0;
+ mybreak[i] = 0;
+ }
+
+ myisspace['\t'] = 1;
+ myisspace['\n'] = 1;
+ myisspace['\r'] = 1;
+ myisspace['\f'] = 1;
+ myisspace['\v'] = 1;
+ myisspace[0x85] = 1; /* control char, "next line" */
+ myisspace[0xfeff] = 1; /* zero-width non-break space */
+
+ last = -1;
+ nspecial = nrecomp = nrecompext = 0;
+ while(getunicodeline(in, fields)){
+ code = estrtoul(fields[FIELD_CODE], 16);
+ if (code >= NRUNES)
+ sysfatal("code-point value too big: %x", code);
+ if(code <= last)
+ sysfatal("bad code sequence: %x then %x", last, code);
+ last = code;
+
+ p = fields[FIELD_CATEGORY];
+ if(strstr(fields[FIELD_NAME], ", First>") != nil){
+ if(!getunicodeline(in, fields2))
+ sysfatal("range start at eof");
+ if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+ sysfatal("range start not followed by range end");
+ last = estrtoul(fields2[FIELD_CODE], 16);
+ if(last <= code)
+ sysfatal("range out of sequence: %x then %x", code, last);
+ if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+ sysfatal("range with mismatched category");
+ }
+
+ d = fields[FIELD_DECOMP];
+ if(strlen(d) > 0 && strstr(d, "<") == nil){
+ decomp[0] = estrtoul(d, 16);
+ d = strstr(d, " ");
+ if(d == nil){
+ /* singleton recompositions are verboden */
+ decomp[1] = 0;
+ if(decomp[0] > 0xFFFF){
+ ip = mydespecial + nspecial*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = 0;
+ mydecomp[code] = (DSTART+nspecial)<<16;
+ nspecial++;
+ } else
+ mydecomp[code] = decomp[0]<<16;
+ } else {
+ d++;
+ decomp[1] = estrtoul(d, 16);
+ if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+ ip = mydespecial + nspecial*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = decomp[1];
+ mydecomp[code] = (DSTART+nspecial)<<16;
+ nspecial++;
+ ip = recompext + nrecompext*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = decomp[1];
+ nrecompext++;
+ } else {
+ mydecomp[code] = decomp[0]<<16 | decomp[1];
+ myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+ }
+ }
+ }
+
+ for (; code <= last; code++){
+ if(p[0] == 'L')
+ myisalpha[code] = 1;
+ if(p[0] == 'Z')
+ myisspace[code] = 1;
+
+ if(strcmp(p, "Lu") == 0)
+ myisupper[code] = 1;
+ if(strcmp(p, "Ll") == 0)
+ myislower[code] = 1;
+
+ if(strcmp(p, "Lt") == 0)
+ myistitle[code] = 1;
+
+ if(strcmp(p, "Nd") == 0)
+ myisdigit[code] = 1;
+
+ if(fields[FIELD_UPPER][0] != '\0')
+ mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+ if(fields[FIELD_LOWER][0] != '\0')
+ mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+ if(fields[FIELD_TITLE][0] != '\0')
+ mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+ myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+ }
+ }
+
+ Bterm(in);
+ findlongchain();
+ markexclusions();
+
+ /*
+ * according to standard, if totitle(x) is not defined in ucd
+ * but toupper(x) is, then totitle is defined to be toupper(x)
+ */
+ for(i = 0; i < NRUNES; i++){
+ if(mytotitle[i] == -1
+ && mytoupper[i] != -1
+ && !myistitle[i])
+ mytotitle[i] = mytoupper[i];
+ }
+
+ /*
+ * A couple corrections:
+ * is*(to*(x)) should be true.
+ * restore undefined transformations.
+ * store offset instead of value, makes them sparse.
+ */
+ for(i = 0; i < NRUNES; i++){
+ if(mytoupper[i] != -1)
+ myisupper[mytoupper[i]] = 1;
+ else
+ mytoupper[i] = i;
+
+ if(mytolower[i] != -1)
+ myislower[mytolower[i]] = 1;
+ else
+ mytolower[i] = i;
+
+ if(mytotitle[i] != -1)
+ myistitle[mytotitle[i]] = 1;
+ else
+ mytotitle[i] = i;
+
+ mytoupper[i] = mytoupper[i] - i;
+ mytolower[i] = mytolower[i] - i;
+ mytotitle[i] = mytotitle[i] - i;
+ }
+
+ uchar b;
+ for(i = 0; i < NRUNES; i++){
+ b = 0;
+ if(myisspace[i])
+ b |= 1<<0;
+ if(myisalpha[i])
+ b |= 1<<1;
+ if(myisdigit[i])
+ b |= 1<<2;
+ if(myisupper[i])
+ b |= 1<<3;
+ if(myislower[i])
+ b |= 1<<4;
+ if(myistitle[i])
+ b |= 1<<5;
+
+ myismerged[i] = b;
+ }
+
+ markbreak();
+ mktables();
+ exits(nil);
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runebreak.c
@@ -1,0 +1,293 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runebreakdata"
+
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+
+ ZWJ = 0x200DU,
+ LINETAB = 0xB,
+};
+
+#define IS(x, y) ((x&0xf) == y)
+#define ISG(x, y) ((x&0xf0) == y)
+
+Rune*
+runegbreak(Rune *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ Rune *p;
+
+ p = s;
+ if((l = *p++) == 0)
+ return s;
+ if((r = *p) == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n')
+ goto Done;
+ if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+ return p;
+ if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+ return p;
+ if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+ goto Done;
+ if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+ goto Done;
+ if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+ goto Done;
+ if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+ goto Done;
+ if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+ while(ISG(rt, EXTEND)){
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ }
+ if(r != ZWJ)
+ return p;
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, EMOJIEX))
+ goto Done;
+ return p;
+ }
+ if(ISG(rt, EXTEND) || r == ZWJ)
+ goto Done;
+ if(ISG(lt, REGION) && ISG(rt, REGION))
+ goto Done;
+
+ return p;
+
+Done:
+ if(p[1] == 0)
+ return s;
+ return p + 1;
+}
+
+char*
+utfgbreak(char *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ char *p;
+
+ p = s;
+ p += chartorune(&l, p);
+ if(l == 0)
+ return s;
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n')
+ goto Done;
+ if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+ return p;
+ if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+ return p;
+ if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+ goto Done;
+ if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+ goto Done;
+ if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+ goto Done;
+ if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+ goto Done;
+ if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+ while(ISG(rt, EXTEND)){
+ p += chartorune(&r, p);
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ rt = breaklkup(r);
+ }
+ if(r != ZWJ)
+ return p;
+
+ p += chartorune(&r, p);
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, EMOJIEX))
+ goto Done;
+ return p;
+ }
+ if(ISG(rt, EXTEND) || r == ZWJ)
+ goto Done;
+ if(ISG(lt, REGION) && ISG(rt, REGION))
+ goto Done;
+
+ return p;
+
+Done:
+ p += chartorune(&r, p);
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ return p;
+}
+
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
+
+Rune*	/* returns a pointer just past the first word segment of s, or s when the string ends before a break is found */
+runewbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')	/* keep CR LF together */
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)	/* break after a line terminator */
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)	/* break before a line terminator */
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		if((IS(rt, MidLetter) || IS(rt, MidNumLet) || r == '\'') && p[1] != 0 && AH(breaklkup(p[1])))	/* the quote is matched on the rune, not on its class */
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || IS(rt, MidNumLet) || r == '\'') && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p++;	/* step over the ZWJ and look at the rune after it */
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:	/* no break between l and r: the segment also covers the rune at p */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+char*	/* returns a pointer just past the first word segment of the UTF-8 string s, or s when the string ends before a break is found */
+utfwbreak(char *s)
+{
+	Rune l, r;
+	Rune peek;
+	uchar lt, rt;
+	char *p;
+
+	p = s;
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	if(*p == 0)	/* checked before decoding so peek is never read from beyond the terminator */
+		return s;
+	chartorune(&peek, p+chartorune(&r, p));	/* r is the rune at p, peek the one after it */
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')	/* keep CR LF together */
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)	/* break after a line terminator */
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)	/* break before a line terminator */
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		if(IS(rt, MidLetter) || IS(rt, MidNumLet) || r == '\'')	/* the quote is matched on the rune, not on its class */
+			if(peek != 0 && AH(breaklkup(peek)))
+				goto Done;
+
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+
+		if(IS(lt, Hebrew_Letter) && r == '"')
+			if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
+				goto Done;
+
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || IS(rt, MidNumLet) || r == '\'') && peek != 0 && IS(breaklkup(peek), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p += chartorune(&r, p);	/* step over the ZWJ and look at the rune after it */
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:	/* no break between l and r: the segment also covers the rune at p */
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runeistype.c
@@ -1,0 +1,52 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runeistypedata"
+
+int
+isspacerune(Rune c)
+{
+ if(c > Runemax)
+ return 0;
+ return (mergedlkup(c) & Lspace) == Lspace;
+}
+
+int
+isalpharune(Rune c)
+{
+ if(c > Runemax)
+ return 0;
+ return (mergedlkup(c) & Lalpha) == Lalpha;
+}
+
+int
+isdigitrune(Rune c)
+{
+ if(c > Runemax)
+ return 0;
+ return (mergedlkup(c) & Ldigit) == Ldigit;
+}
+
+int
+isupperrune(Rune c)
+{
+ if(c > Runemax)
+ return 0;
+ return (mergedlkup(c) & Lupper) == Lupper;
+}
+
+int
+islowerrune(Rune c)
+{
+ if(c > Runemax)
+ return 0;
+ return (mergedlkup(c) & Llower) == Llower;
+}
+
+int
+istitlerune(Rune c)
+{
+ if(c > Runemax)
+ return 0;
+ return (mergedlkup(c) & Ltitle) == Ltitle;
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runenorm.c
@@ -1,0 +1,444 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runenormdata"
+
+//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+enum {
+ SBase = 0xAC00,
+ LBase = 0x1100,
+ VBase = 0x1161,
+ TBase = 0x11A7,
+
+ LCount = 19,
+ VCount = 21,
+ TCount = 28,
+ NCount = VCount * TCount,
+ SCount = LCount * NCount,
+
+ LLast = LBase + LCount - 1,
+ SLast = SBase + SCount - 1,
+ VLast = VBase + VCount - 1,
+ TLast = TBase + TCount - 1,
+};
+
+/*
+ * Most runes decompose in to one/two
+ * other runes with codepoints < 0xFFFF,
+ * however there are some exceptions.
+ * To keep the table size down we instead
+ * store an index in to an exception range
+ * within the private use section and use
+ * an exception table.
+ */
+enum {
+ Estart = 0xEEEE,
+ Estop = 0xF8FF,
+};
+
+static Rune
+_runedecomp(Rune c, Rune *r2)
+{
+ uint x;
+
+ if(c < Runeself){
+ *r2 = 0;
+ return 0;
+ }
+
+ //korean
+ if(c >= SBase && c <= SLast){
+ c -= SBase;
+ x = c % TCount;
+ if(x){
+ *r2 = TBase + x;
+ return SBase + (c - x);
+ }
+ *r2 = VBase + ((c % NCount) / TCount);
+ return LBase + (c / NCount);
+ }
+
+ x = decomplkup(c);
+ if((x & 0xFFFF) != 0){
+ *r2 = x & 0xFFFF;
+ return x>>16;
+ }
+ x >>= 16;
+ if(x >= Estart && x < Estop){
+ Rune *r;
+ r = _decompexceptions[x - Estart];
+ *r2 = r[1];
+ return r[0];
+ }
+ *r2 = 0;
+ return x;
+}
+
+static Rune
+_runerecomp(Rune r0, Rune r1)
+{
+ uint x, y, *p, next;
+
+ if(r0 >= LBase && r0 <= LLast){
+ if(r1 < VBase || r1 > VLast)
+ return 0;
+ x = (r0 - LBase) * NCount + (r1 - VBase) * TCount;
+ return SBase + x;
+ }
+ if(r0 >= SBase && r0 <= SLast && (r0 - SBase) % TCount == 0){
+ if(r1 > TBase && r1 <= TLast)
+ return r0 + (r1 - TBase);
+ return 0;
+ }
+ if(r0 > 0xFFFF || r1 > 0xFFFF){
+ for(x = 0; x < nelem(_recompexceptions); x++)
+ if(r0 == _recompexceptions[x][1] && r1 == _recompexceptions[x][2])
+ return _recompexceptions[x][0];
+ return 0;
+ }
+ y = x = r0<<16 | r1;
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ p = _recompdata + (x%512)*2;
+ while(p[0] != y){
+ next = p[1]>>16;
+ if(!next)
+ return 0;
+ p = _recompcoll + (next-1)*2;
+ }
+ return p[1] & 0xFFFF;
+}
+
+static void
+runecccsort(Rune *a, int len)
+{
+ Rune r;
+ int i, j;
+
+ for(i = 1; i < len; i++){
+ r = a[i];
+ for(j = i; j > 0 && ccclkup(a[j-1]) > ccclkup(r); j--)
+ a[j] = a[j-1];
+ a[j] = r;
+ }
+}
+
+static int
+boundary(Rune r)
+{
+ return !(qclkup(r) & (Qnfcno|Qnfcmay));
+}
+
+/*
+ * Stk stores the entire context for a chunk of
+ * an input string that is being normalized.
+ * In accordance to the standard, Unicode text
+ * has no upper bound for the amount of conjoining
+ * (also called non-starter) elements associated with
+ * a base rune. Thus to implement normalization within
+ * reasonable memory constraints we implement the
+ * "Stream-Safe Text Format" as defined in UAX #15 § 13.
+ */
+typedef struct {
+ Rune a[Maxnormctx];
+ Rune *e;
+} Stk;
+
+static int
+push(Stk *s, Rune c)
+{
+ int n, l;
+ Rune r2, b[Maxdecomp];
+ Rune *p = b + nelem(b) - 1;
+
+ for(*p = c; c = _runedecomp(c, &r2); *p = c){
+ assert(p > b);
+ if(r2 != 0)
+ *p-- = r2;
+ }
+
+ n = b + nelem(b) - p;
+ l = nelem(s->a) - (s->e - s->a);
+ if(n > l){
+ werrstr("runenorm: buffer overflow");
+ return -1;
+ }
+ l -= n;
+ for(; n > 0; n--)
+ *s->e++ = *p++;
+ return l;
+}
+
+/*
+ * Worst case recomposition, this happens when we have to compose
+ * two runes who both have a CCC of zero.
+ */
+static void
+worstrecomp(Stk *s)
+{
+ int done;
+ Rune c, *p, *rp;
+
+ for(done = 0; done == 0;){
+ done = 1;
+ for(p = s->a; p+1 < s->e; p++){
+ c = _runerecomp(p[0], p[1]);
+ if(c == 0)
+ continue;
+ done = 0;
+ *p = c;
+ for(rp = p+1; rp < s->e-1; rp++)
+ rp[0] = rp[1];
+ s->e--;
+ p--;
+ }
+ }
+}
+
+static void
+cccrecomp(Stk *s)
+{
+ Rune c, *p, *rp;
+
+ for(p = s->a + 1; p < s->e; p++){
+ c = _runerecomp(s->a[0], *p);
+ if(c != 0){
+ s->a[0] = c;
+ for(rp = p; rp < s->e-1; rp++){
+ rp[0] = rp[1];
+ }
+ s->e--;
+ p--;
+ } else while(p + 1 < s->e && ccclkup(p[0]) == ccclkup(p[1]))
+ p++;
+ }
+}
+
+void
+norminit(Norm *n, int compose, void *ctx, long (*getrune)(void*))
+{
+ memset(n, 0, sizeof *n);
+ n->ctx = ctx;
+ n->getrune = getrune;
+ n->compose = compose;
+ n->obuf.e = n->obuf.a;
+ n->ibuf.e = n->ibuf.a;
+}
+
+int NORMDEBUG;
+
+static long
+peekrune(Norm *n)
+{
+ long r;
+
+ if(n->ibuf.e > n->ibuf.a)
+ return n->ibuf.e[-1];
+
+ r = n->getrune(n->ctx);
+ if(r >= 0)
+ *n->ibuf.e++ = r;
+ return r;
+}
+
+static long
+getrune(Norm *n)
+{
+ if(n->ibuf.e > n->ibuf.a)
+ return *--n->ibuf.e;
+ return n->getrune(n->ctx);
+}
+
+long
+normpull(Norm *n, Rune *rdst, long max, int flush)
+{
+ Rune *rp, *re;
+ Stk stk;
+ Rune *dot;
+ int r;
+ long c;
+
+ rp = rdst;
+ re = rdst + max;
+ dot = nil;
+ c = 0;
+ while(rp < re){
+ if(n->obuf.e != n->obuf.a){
+ memcpy(stk.a, n->obuf.a, (n->obuf.e - n->obuf.a)*sizeof(Rune));
+ stk.e = stk.a + (n->obuf.e - n->obuf.a);
+ n->obuf.e = n->obuf.a;
+ c = stk.a[0];
+ goto Flush;
+ }
+
+ stk.e = stk.a;
+ c = getrune(n);
+ if(c < 0)
+ break;
+ push(&stk, c);
+ c = peekrune(n);
+ if(stk.e == stk.a+1 && stk.a[0] < Runeself && c < Runeself && c >= 0)
+ goto Flush;
+ while(c >= 0 && ccclkup(c) != 0){
+ r = push(&stk, getrune(n));
+ c = peekrune(n);
+ if(r > 2)
+ continue;
+ if(ccclkup(stk.a[0]) != 0){
+ assert(r > 0);
+ r--;
+ } else
+ assert(r >= 0);
+ if(r == 0 || (c == 0x0344 && r < 2)){
+ /* in reverse */
+ if(r > 0){
+ getrune(n);
+ *n->ibuf.e++ = 0x301;
+ *n->ibuf.e++ = 0x308;
+ }
+ *n->ibuf.e++ = 0x034F;
+ break;
+ }
+ }
+ if(stk.e - stk.a > 1)
+ runecccsort(stk.a, stk.e - stk.a);
+
+ if(!n->compose)
+ goto Flush;
+
+ if(ccclkup(stk.e[-1]) == 0){
+ Rune tmp;
+ while(c >= 0 && (!boundary(c) || !boundary(_runedecomp(c, &tmp)))){
+ if(push(&stk, getrune(n)) == -1){
+ *n->ibuf.e++ = c;
+ for(r = 0; r < Maxdecomp; r++)
+ *n->ibuf.e++ = *--stk.e;
+ break;
+ }
+ c = peekrune(n);
+ }
+ worstrecomp(&stk);
+ } else if(ccclkup(stk.a[0]) == 0)
+ cccrecomp(&stk);
+
+Flush:
+ if(flush || c >= 0)
+ for(dot = stk.a; dot < stk.e; dot++){
+ if(rp == re)
+ goto Out;
+ *rp++ = *dot;
+ }
+ dot = nil;
+ if(c < 0)
+ break;
+ }
+Out:
+ if(c < 0 && !flush){
+ while(stk.e > stk.a)
+ *n->ibuf.e++ = *--stk.e;
+ }
+ if(dot != nil){
+ memcpy(n->obuf.a, dot, (stk.e - dot) * sizeof(Rune));
+ n->obuf.e = n->obuf.a + (stk.e - dot);
+ }
+
+ return rp - rdst;
+}
+
+typedef struct {
+ Rune *s, *p;
+ int n;
+} Rctx;
+
+static long
+runegetrune(void *ctx)
+{
+ Rctx *c;
+
+ c = ctx;
+ if(c->p >= c->s + c->n)
+ return -1;
+ return *c->p++;
+}
+
+static long
+runedostr(Rune *dst, long ndst, Rune *src, long nsrc, int comp)
+{
+ Rctx c;
+ Norm n;
+
+ c.s = c.p = src;
+ c.n = nsrc;
+ norminit(&n, comp, &c, runegetrune);
+ return normpull(&n, dst, ndst, 1);
+}
+
+long
+runecomp(Rune *dst, long ndst, Rune *src, long nsrc)
+{
+ return runedostr(dst, ndst, src, nsrc, 1);
+}
+
+long
+runedecomp(Rune *dst, long ndst, Rune *src, long nsrc)
+{
+ return runedostr(dst, ndst, src, nsrc, 0);
+}
+
+typedef struct {
+ char *s, *p;
+ int n;
+} Uctx;
+
+static long
+utfgetrune(void *ctx)
+{
+ Uctx *c;
+ Rune r;
+
+ c = ctx;
+ if(c->p >= c->s + c->n)
+ return -1;
+ c->p += chartorune(&r, c->p);
+ return r;
+}
+
+static long	/* normalizes nsrc bytes of src into dst (at most ndst bytes); returns the number of bytes written */
+utfdostr(char *dst, long ndst, char *src, long nsrc, int comp)
+{
+	Uctx c;
+	Norm n;
+	Rune buf[Maxnormctx];
+	long i, w;
+	char *e, *p;
+
+	c.s = c.p = src;
+	c.n = nsrc;
+	norminit(&n, comp, &c, utfgetrune);
+	for(p = dst, e = dst + ndst; p < e;){
+		w = normpull(&n, buf, nelem(buf), 1);
+		if(w == 0)
+			break;
+		for(i = 0; i < w; i++){
+			if(p + runelen(buf[i]) >= e)	/* out of room: stop here so no later, shorter rune is written after a gap */
+				return p - dst;
+			p += runetochar(p, buf+i);
+		}
+	}
+	return p - dst;
+}
+
+long
+utfcomp(char *dst, long ndst, char *src, long nsrc)
+{
+ return utfdostr(dst, ndst, src, nsrc, 1);
+}
+
+long
+utfdecomp(char *dst, long ndst, char *src, long nsrc)
+{
+ return utfdostr(dst, ndst, src, nsrc, 0);
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runetotype.c
@@ -1,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runetotypedata"
+
+Rune
+toupperrune(Rune c)
+{
+ return c + upperlkup(c);
+}
+
+Rune
+tolowerrune(Rune c)
+{
+ return c + lowerlkup(c);
+}
+
+Rune
+totitlerune(Rune c)
+{
+ return c + titlelkup(c);
+}
--
⑨