shithub: front

Download patch

ref: e8d4ad93771196cfb6a28d396e14b6fdc5e73f39
parent: bada775b5e90bff1f76034de4bd05c558ecb0848
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Wed Aug 20 13:22:08 EDT 2025

libc: move unicode data stuff from port/ to ucd/

This gets rid of the mk extra stuff in port/
and also handles nuke now: nuke removes the
generated data tables, forcing them to be
regenerated on the next build.

--- a/sys/src/libc/mkfile
+++ b/sys/src/libc/mkfile
@@ -1,16 +1,15 @@
 </$objtype/mkfile
 
 PORTDIRS=9sys 9syscall fmt port
-DIRS=$PORTDIRS $CPUS
+DIRS=$PORTDIRS $CPUS ucd
 OLDCPUS=68000 68020 sparc
 
 all install:V:
-	for(i in $PORTDIRS $objtype)@{
+	for(i in $PORTDIRS $objtype ucd)@{
 		echo $i
 		cd $i
 		mk $MKFLAGS install
 	}
-	@{ cd port && mk extra }
 
 clean:V:
 	for(i in $DIRS $OLDCPUS test)@{
@@ -20,7 +19,7 @@
 	}
 
 nuke:V:
-	for(i in $PORTDIRS $objtype)@{
+	for(i in $PORTDIRS $objtype ucd)@{
 		echo $i
 		cd $i
 		mk $MKFLAGS nuke
--- a/sys/src/libc/port/mkfile
+++ b/sys/src/libc/port/mkfile
@@ -126,43 +126,3 @@
 profile.$O: /sys/include/tos.h
 
 malloc.$O pool.$O: /sys/include/pool.h
-
-runenorm.$O:	runenormdata runenorm.c
-runetotype.$O:	runetotypedata runetotype.c
-runeistype.$O:	runeistypedata runeistype.c
-runebreak.$O:	runebreakdata runebreak.c
-
-UCD=\
-	/lib/ucd/CompositionExclusions.txt\
-	/lib/ucd/DerivedNormalizationProps.txt\
-	/lib/ucd/GraphemeBreakProperty.txt\
-	/lib/ucd/UnicodeData.txt\
-	/lib/ucd/WordBreakProperty.txt\
-	/lib/ucd/emoji-data.txt\
-
-EXTRA=\
-	runebreak.$O\
-	runeistype.$O\
-	runenorm.$O\
-	runetotype.$O\
-
-GEN=\
-	runenormdata\
-	runetotypedata\
-	runeistypedata\
-	runebreakdata\
-
-$GEN:	$UCD
-	@{
-		eval `{grep '^[A-Z]' /$cputype/mkfile}
-		$CC $CFLAGS -o mkrunetype.$O mkrunetype.c
-		$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
-		./$O.mkrunetype
-	}
-
-$EXTRA:		$GEN
-
-extra:V:	$EXTRA
-	ar vr $LIB $prereq
-
-regen:V:	$GEN
--- a/sys/src/libc/port/mkrunetype.c
+++ /dev/null
@@ -1,789 +1,0 @@
-#include <u.h>
-#include <libc.h>
-#include <bio.h>
-
-enum{
-	NRUNES = 1<<21
-};
-
-typedef struct Param Param;
-typedef struct Lvl Lvl;
-struct Lvl{
-	int bits;
-	int max;
-	int mask;
-};
-struct Param{
-	Lvl idx1;
-	Lvl idx2;
-	Lvl data;
-
-	int round1max;
-};
-
-static void
-derive(Lvl *l)
-{
-	l->max = 1 << l->bits;
-	l->mask = l->max - 1;
-}
-
-static void
-param(Param *p, int idx1, int idx2)
-{
-
-	assert(idx1 + idx2 < 21);
-	p->idx1.bits = idx1;
-	p->idx2.bits = idx2;
-	p->data.bits = 21 - idx1 - idx2;
-	derive(&p->idx1);
-	derive(&p->idx2);
-	derive(&p->data);
-
-	p->round1max = NRUNES/p->data.max;
-}
-
-static int
-lkup(Param *p, int *idx1, int *idx2, int *data, int x)
-{
-	int y, z;
-
-	y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
-	z = (((x)>>p->data.bits)&p->idx2.mask);
-	return data[idx2[idx1[y] + z] + (x&p->data.mask)];
-}
-
-static int
-mkarrvar(int fd, char *name, int *d, int len)
-{
-	int i, sz;
-	int max, min;
-	char *t;
-
-	max = min = 0;
-	for(i = 0; i < len; i++){
-		if(d[i] > max)
-			max = d[i];
-		if(d[i] < min)
-			min = d[i];
-	}
-	if(min == 0){
-		if(max < 0xFF)
-			t = "uchar", sz = 1;
-		else if(max < 0xFFFF)
-			t = "ushort", sz = 2;
-		else
-			t = "uint", sz = 4;
-	} else {
-		if(max < 1<<7)
-			t = "char", sz = 1;
-		else if(max < 1<<15)
-			t = "short", sz = 2;
-		else
-			t = "int", sz = 4;
-	}
-	if(fd < 0)
-		return sz * len;
-
-	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
-	for(i = 0; i < len; i++){
-		fprint(fd, "%d,", d[i]);
-		if((i+1) % 16 == 0)
-			fprint(fd, "\n\t");
-	}
-	fprint(fd, "\n};\n");
-
-	return sz * len;
-}
-
-static int
-mkexceptarr(int fd, char *name, int *d, int n, int all)
-{
-	int i;
-	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
-	for(i = 0; i < n*3; i += 3){
-		if(all && d[i] != 0)
-			fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
-		else if(!all)
-			fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);	
-		if((i+3) % (8*3) == 0)
-			fprint(fd, "\n\t");
-	}
-	fprint(fd, "\n};\n");
-	return n * sizeof(Rune) * 2;
-}
-
-static int
-compact(int *data, int *idx, int nidx, int *src, int chunksize)
-{
-	int i, n, ndata, best;
-	int *dot, *lp, *rp;
-
-	dot = src;
-	ndata = 0;
-	idx[0] = 0;
-	for(i = 1; i <= nidx; i++){
-		rp = dot + chunksize;
-		lp = rp - 1;
-
-		for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
-			if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
-				best = n+1;
-		}
-		memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
-		ndata += (chunksize - best);
-		idx[i] = idx[i - 1] + (chunksize - best);
-		dot = rp;
-	}
-	return ndata;
-}
-
-
-static int
-mklkup(int fd, char *label, int *map, Param *p)
-{
-	static int data[NRUNES];
-	static int idx2[NRUNES];
-	static int idx2dest[NRUNES];
-	static int idx1[NRUNES];
-	int i, nidx2, ndata;
-	int size;
-
-	ndata = compact(data, idx2, p->round1max, map, p->data.max);
-	nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
-
-	if(fd >= 0){
-		for(i = 0; i < NRUNES; i++)
-			if(map[i] != lkup(p, idx1, idx2dest, data, i))
-				sysfatal("mismatch in %s at %d %d %d", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
-	}
-
-	size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
-	size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
-	size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
-	if(fd >= 0){
-		fprint(fd, "\n");
-		fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
-		fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
-		fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
-		fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
-			label, label, label, label, label, label, label);
-	}
-	return size;
-}
-
-static int
-mklkupmatrix(int, char *label, int *map, Param *p)
-{
-	int bestsize, size, bestx, besty;
-	int x, y;
-
-	bestsize = bestx = besty = -1;
-	for(x = 4; x <= 12; x++)
-		for(y=4; y <= (19 - x); y++){
-			param(p, x, y);
-			size = mklkup(-1, label, map, p);
-			if(bestsize == -1 || size < bestsize){
-				bestx = x;
-				besty = y;
-				bestsize = size;
-			}
-		}
-
-	assert(bestsize != -1);
-	fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
-	param(p, bestx, besty);
-	return bestsize;
-}
-
-static int myismerged[NRUNES];
-static int mytoupper[NRUNES];
-static int mytolower[NRUNES];
-static int mytotitle[NRUNES];
-static int mybreak[NRUNES];
-
-enum{ DSTART = 0xEEEE };
-static int mydecomp[NRUNES];
-static int mydespecial[256*3];
-static int nspecial;
-static int maxdchain;
-static int myccc[NRUNES];
-static int myqc[NRUNES];
-
-typedef struct KV KV;
-struct KV{
-	uint key;
-	uint val;
-	ushort next;
-};
-
-static KV myrecomp[2000];
-static int nrecomp;
-
-static int recompext[256*3];
-static int nrecompext;
-
-static uint
-hash(uint x)
-{
-	x ^= x >> 16;
-	x *= 0x21f0aaad;
-	x ^= x >> 15;
-	x *= 0xd35a2d97;
-	x ^= x >> 15;
-	return x;
-}
-
-static void
-mkrecomp(int fd)
-{
-	int i;
-	KV *p;
-	static KV vals[512];
-	static KV coll[1000];
-	int over;
-	int maxchain;
-
-	for(i = 0; i < nelem(vals); i++)
-		vals[i] = (KV){0, 0, 0};
-	for(i = 0; i < nelem(coll); i++)
-		coll[i] = (KV){0, 0, 0};
-	over = 1;
-	for(i = 0; i < nrecomp; i++){
-		p = vals + (hash(myrecomp[i].key) % nelem(vals));
-		maxchain = 0;
-		while(p->key != 0){
-			maxchain++;
-			if(p->next == 0){
-				p->next = over;
-				p = coll + over - 1;
-				over++;
-			} else
-				p = coll + p->next - 1;
-		}
-		p->key = myrecomp[i].key;
-		p->val = myrecomp[i].val;
-	}
-	fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
-	fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
-	for(p = vals, i = 0;; i++){
-		assert(p->val < 0xFFFF);
-		assert(p->next < 0xFFFF);
-		fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
-		if((i+1) % 8 == 0)
-			fprint(fd, "\n\t");
-
-		if(p == vals+nelem(vals)-1)
-			p = coll;
-		else if(p == coll + over - 2)
-			break;
-		else
-			p++;
-	}
-	fprint(fd, "\n};\n");
-	fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
-}
-
-enum {
-	OTHER, 
-	Hebrew_Letter, Newline, Extend, Format,
-	Katakana, ALetter, MidLetter, MidNum,
-	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
-	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
-	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
-	EMOJIEX = 0xB0,
-
-	NFC_QC_No = 1, NFC_QC_Maybe = 2, NFD_QC_No = 4, NFD_QC_Maybe = 8,
-	
-};
-
-static void
-mktables(void)
-{
-	Param p;
-	int tofd, isfd, normfd, breakfd;
-	int size;
-
-	tofd = create("runetotypedata", OWRITE, 0664);
-	if(tofd < 0)
-		sysfatal("could not create runetotypedata: %r");
-	param(&p, 10, 7);
-	size = mklkup(tofd, "upper", mytoupper, &p);
-	fprint(2, "%s: %d\n", "upper", size);
-
-	size = mklkup(tofd, "lower", mytolower, &p);
-	fprint(2, "%s: %d\n", "lower", size);
-
-	size = mklkup(tofd, "title", mytotitle, &p);
-	fprint(2, "%s: %d\n", "title", size);
-	close(tofd);
-
-	isfd = create("runeistypedata", OWRITE, 0664);
-	if(isfd < 0)
-		sysfatal("could not create runeistypedata: %r");
-	param(&p, 11, 6);
-	size = mklkup(isfd, "merged", myismerged, &p);
-	fprint(2, "%s: %d\n", "merged", size);
-	fprint(isfd, "static\nenum {\n");
-	fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
-	fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
-	fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
-	fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
-	fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
-	fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
-	fprint(isfd, "};\n");
-	close(isfd);
-
-	normfd = create("runenormdata", OWRITE, 0664);
-	if(normfd < 0)
-		sysfatal("could not create runenormdata: %r");
-	param(&p, 10, 7);
-	size = mklkup(normfd, "decomp", mydecomp, &p);
-	fprint(2, "%s: %d\n", "decomp", size);
-	fprint(normfd, "static enum { Maxdecomp = %d };\n\n", maxdchain);
-
-	param(&p, 9, 7);
-	size = mklkup(normfd, "ccc", myccc, &p);
-	fprint(2, "%s: %d\n", "ccc", size);
-
-	param(&p, 10, 6);
-	size = mklkup(normfd, "qc", myqc, &p);
-	fprint(2, "%s: %d\n", "qc", size);
-	fprint(normfd, "static\nenum {\n");
-	fprint(normfd, "\t%s = %d,\n", "Qnfcno", NFC_QC_No);
-	fprint(normfd, "\t%s = %d,\n", "Qnfcmay", NFC_QC_Maybe);
-	fprint(normfd, "\t%s = %d,\n", "Qnfdno", NFD_QC_No);
-	fprint(normfd, "\t%s = %d,\n", "Qnfdmay", NFD_QC_Maybe);
-	fprint(normfd, "};\n");
-
-	mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
-	mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
-	mkrecomp(normfd);
-	close(normfd);
-
-	param(&p, 10, 6);
-	breakfd = create("runebreakdata", OWRITE, 0644);
-	if(breakfd < 0)
-		sysfatal("could not create runebreakdata: %r");
-	size = mklkup(breakfd, "break", mybreak, &p);
-	fprint(2, "%s: %d\n", "break", size);
-}
-
-enum {
-	FIELD_CODE,
-	FIELD_NAME,
-	FIELD_CATEGORY,
-	FIELD_COMBINING,
-	FIELD_BIDIR,
-	FIELD_DECOMP,
-	FIELD_DECIMAL_DIG,
-	FIELD_DIG,
-	FIELD_NUMERIC_VAL,
-	FIELD_MIRRORED,
-	FIELD_UNICODE_1_NAME,
-	FIELD_COMMENT,
-	FIELD_UPPER,
-	FIELD_LOWER,
-	FIELD_TITLE,
-	NFIELDS,
-};
-
-static int
-getunicodeline(Biobuf *in, char **fields)
-{
-	char *p;
-
-	if((p = Brdline(in, '\n')) == nil)
-		return 0;
-
-	p[Blinelen(in)-1] = '\0';
-
-	if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS)
-		sysfatal("bad number of fields");
-
-	return 1;
-}
-
-static int
-estrtoul(char *s, int base)
-{
-	char *epr;
-	Rune code;
-
-	code = strtoul(s, &epr, base);
-	if(s == epr)
-		sysfatal("bad code point hex string");
-	return code;
-}
-
-static char*
-getextraline(Biobuf *b, int *s, int *e)
-{
-	char *dot, *p;
-
-again:
-	p = Brdline(b, '\n');
-	if(p == nil)
-		return nil;
-	p[Blinelen(b)-1] = 0;
-	if(p[0] == 0 || p[0] == '#')
-		goto again;
-	if((dot = strstr(p, "..")) != nil){
-		*dot = 0;
-		dot += 2;
-		*s = estrtoul(p, 16);
-		*e = estrtoul(dot, 16);
-	} else {
-		*s = *e = estrtoul(p, 16);
-		dot = p;
-	}
-	return dot;
-}
-
-static void
-markbreak(void)
-{
-	Biobuf *b;
-	char *dot;
-	int i, s, e;
-	uchar v;
-
-	b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
-	if(b == nil)
-		sysfatal("could not load word breaks: %r");
-
-	while((dot = getextraline(b, &s, &e)) != nil){
-		v = 0;
-		if(strstr(dot, "ExtendNumLet") != nil)
-			v = ExtendNumLet;
-		else if(strstr(dot, "Hebrew_Letter") != nil)
-			v = Hebrew_Letter;
-		else if(strstr(dot, "Newline") != nil)
-			v = Newline;
-		else if(strstr(dot, "Extend") != nil)
-			v = Extend;
-		else if(strstr(dot, "Format") != nil)
-			v = Format;
-		else if(strstr(dot, "Katakana") != nil)
-			v = Katakana;
-		else if(strstr(dot, "ALetter") != nil)
-			v = ALetter;
-		else if(strstr(dot, "MidLetter") != nil)
-			v = MidLetter;
-		else if(strstr(dot, "MidNum") != nil)
-			v = MidNum;
-		else if(strstr(dot, "Numeric") != nil)
-			v = Numeric;
-		else if(strstr(dot, "WSegSpace") != nil)
-			v = WSegSpace;
-		for(i = s; i <= e; i++)
-			mybreak[i] = v;
-	}
-	Bterm(b);
-	b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
-	if(b == nil)
-		sysfatal("could not load Grapheme breaks: %r");
-
-	while((dot = getextraline(b, &s, &e)) != nil){
-		v = 0;
-		if(strstr(dot, "; Prepend #") != nil)
-			v = PREPEND;
-		else if(strstr(dot, "; Control #") != nil)
-			v = CONTROL;
-		else if(strstr(dot, "; Extend #") != nil)
-			v = EXTEND;
-		else if(strstr(dot, "; Regional_Indicator #") != nil)
-			v = REGION;
-		else if(strstr(dot, "; SpacingMark #") != nil)
-			v = SPACEMK;
-		else if(strstr(dot, "; L #") != nil)
-			v = L;
-		else if(strstr(dot, "; V #") != nil)
-			v = V;
-		else if(strstr(dot, "; T #") != nil)
-			v = T;
-		else if(strstr(dot, "; LV #") != nil)
-			v = LV;
-		else if(strstr(dot, "; LVT #") != nil)
-			v = LVT;
-		for(i = s; i <= e; i++)
-			mybreak[i] |= v;
-	}
-	Bterm(b);
-
-	b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
-	if(b == nil)
-		sysfatal("could not load emoji-data: %r");
-
-	while((dot = getextraline(b, &s, &e)) != nil){
-		v = 0;
-		if(strstr(dot, "; Extended_Pictographic") != nil)
-			v = EMOJIEX;
-		for(i = s; i <= e; i++)
-			mybreak[i] |= v;
-	}
-	Bterm(b);
-
-	b = Bopen("/lib/ucd/DerivedNormalizationProps.txt", OREAD);
-	if(b == nil)
-		sysfatal("could not load emoji-data: %r");
-
-	while((dot = getextraline(b, &s, &e)) != nil){
-		v = 0;
-		if(strstr(dot, "; NFC_QC; N") != nil)
-			v = NFC_QC_No;
-		else if(strstr(dot, "; NFC_QC; M") != nil)
-			v = NFC_QC_Maybe;
-		else if(strstr(dot, "; NFD_QC; N") != nil)
-			v = NFD_QC_No;
-		else if(strstr(dot, "; NFD_QC; M") != nil)
-			v = NFD_QC_Maybe;
-
-		for(i = s; i <= e; i++)
-			myqc[i] |= v;
-	}
-	Bterm(b);
-}
-
-static void
-markexclusions(void)
-{
-	Biobuf *b;
-	char *p;
-	int i;
-	uint x;
-
-	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
-	if(b == nil)
-		sysfatal("could not load composition exclusions: %r");
-
-	while((p = Brdline(b, '\n')) != nil){
-		p[Blinelen(b)-1] = 0;
-		if(p[0] == 0 || p[0] == '#')
-			continue;
-		x = estrtoul(p, 16);
-		for(i = 0; i < nrecomp; i++){
-			if(myrecomp[i].val == x){
-				myrecomp[i].val = 0;
-				break;
-			}
-		}
-		if(i == nrecomp){
-			for(i = 0; i < nrecompext; i++){
-				if(recompext[i*3] == x){
-					recompext[i*3] = 0;
-					break;
-				}
-			}
-		}
-	}
-	Bterm(b);
-}
-
-static void
-findlongchain(void)
-{
-	int i, n, x, r1;
-
-	for(i = 0; i < NRUNES; i++)
-	for(x = i, n = 0; r1 = mydecomp[x]>>16; x = r1){
-		if(++n > maxdchain)
-			maxdchain = n;
-		if(r1 >= DSTART && r1 <0xF8FF)
-			r1 -= DSTART;
-	}
-	maxdchain *= 2;
-}
-
-void
-main(int, char)
-{
-	static char myisspace[NRUNES];
-	static char myisalpha[NRUNES];
-	static char myisdigit[NRUNES];
-	static char myisupper[NRUNES];
-	static char myislower[NRUNES];
-	static char myistitle[NRUNES];
-	Biobuf *in;
-	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
-	char *p, *d;
-	int i, code, last;
-	int decomp[2], *ip;
-
-	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
-	if(in == nil)
-		sysfatal("can't open UnicodeData.txt: %r");
-
-	for(i = 0; i < NRUNES; i++){
-		mytoupper[i] = -1;
-		mytolower[i] = -1;
-		mytotitle[i] = -1;
-		mydecomp[i] = 0;
-		myccc[i] = 0;
-		mybreak[i] = 0;
-	}
-
-	myisspace['\t'] = 1;
-	myisspace['\n'] = 1;
-	myisspace['\r'] = 1;
-	myisspace['\f'] = 1;
-	myisspace['\v'] = 1;
-	myisspace[0x85] = 1;	/* control char, "next line" */
-	myisspace[0xfeff] = 1;	/* zero-width non-break space */
-
-	last = -1;
-	nspecial = nrecomp = nrecompext =  0;
-	while(getunicodeline(in, fields)){
-		code = estrtoul(fields[FIELD_CODE], 16);
-		if (code >= NRUNES)
-			sysfatal("code-point value too big: %x", code);
-		if(code <= last)
-			sysfatal("bad code sequence: %x then %x", last, code);
-		last = code;
-
-		p = fields[FIELD_CATEGORY];
-		if(strstr(fields[FIELD_NAME], ", First>") != nil){
-			if(!getunicodeline(in, fields2))
-				sysfatal("range start at eof");
-			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
-				sysfatal("range start not followed by range end");
-			last = estrtoul(fields2[FIELD_CODE], 16);
-			if(last <= code)
-				sysfatal("range out of sequence: %x then %x", code, last);
-			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
-				sysfatal("range with mismatched category");
-		}
-
-		d = fields[FIELD_DECOMP];
-		if(strlen(d) > 0 && strstr(d, "<") == nil){
-			decomp[0] = estrtoul(d, 16);
-			d = strstr(d, " ");
-			if(d == nil){
-				/* singleton recompositions are verboden */
-				decomp[1] = 0;
-				if(decomp[0] > 0xFFFF){
-					ip = mydespecial + nspecial*3;
-					ip[0] = code;
-					ip[1] = decomp[0];
-					ip[2] = 0;
-					mydecomp[code] = (DSTART+nspecial)<<16;
-					nspecial++;
-				} else
-					mydecomp[code] = decomp[0]<<16;
-			} else {
-				d++;
-				decomp[1] = estrtoul(d, 16);
-				if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
-					ip = mydespecial + nspecial*3;
-					ip[0] = code;
-					ip[1] = decomp[0];
-					ip[2] = decomp[1];
-					mydecomp[code] = (DSTART+nspecial)<<16;
-					nspecial++;
-					ip = recompext + nrecompext*3;
-					ip[0] = code;
-					ip[1] = decomp[0];
-					ip[2] = decomp[1];
-					nrecompext++;
-				} else {
-					mydecomp[code] = decomp[0]<<16 | decomp[1];
-					myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
-				}
-			}
-		}
-
-		for (; code <= last; code++){
-			if(p[0] == 'L')
-				myisalpha[code] = 1;
-			if(p[0] == 'Z')
-				myisspace[code] = 1;
-
-			if(strcmp(p, "Lu") == 0)
-				myisupper[code] = 1;
-			if(strcmp(p, "Ll") == 0)
-				myislower[code] = 1;
-
-			if(strcmp(p, "Lt") == 0)
-				myistitle[code] = 1;
-
-			if(strcmp(p, "Nd") == 0)
-				myisdigit[code] = 1;
-
-			if(fields[FIELD_UPPER][0] != '\0')
-				mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
-
-			if(fields[FIELD_LOWER][0] != '\0')
-				mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
-
-			if(fields[FIELD_TITLE][0] != '\0')
-				mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
-
-			myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
-		}
-	}
-
-	Bterm(in);
-	findlongchain();
-	markexclusions();
-
-	/*
-	 * according to standard, if totitle(x) is not defined in ucd
-	 * but toupper(x) is, then totitle is defined to be toupper(x)
-	 */
-	for(i = 0; i < NRUNES; i++){
-		if(mytotitle[i] == -1
-		&& mytoupper[i] != -1
-		&& !myistitle[i])
-			mytotitle[i] = mytoupper[i];
-	}
-
-	/*
-	 * A couple corrections:
-	 * is*(to*(x)) should be true.
-	 * restore undefined transformations.
-	 * store offset instead of value, makes them sparse.
-	 */
-	for(i = 0; i < NRUNES; i++){
-		if(mytoupper[i] != -1)
-			myisupper[mytoupper[i]] = 1;
-		else
-			mytoupper[i] = i;
-
-		if(mytolower[i] != -1)
-			myislower[mytolower[i]] = 1;
-		else
-			mytolower[i] = i;
-
-		if(mytotitle[i] != -1)
-			myistitle[mytotitle[i]] = 1;
-		else
-			mytotitle[i] = i;
-
-		mytoupper[i] = mytoupper[i] - i;
-		mytolower[i] = mytolower[i] - i;
-		mytotitle[i] = mytotitle[i] - i;
-	}
-
-	uchar b;
-	for(i = 0; i < NRUNES; i++){
-		b = 0;
-		if(myisspace[i])
-			b |= 1<<0;
-		if(myisalpha[i])
-			b |= 1<<1;
-		if(myisdigit[i])
-			b |= 1<<2;
-		if(myisupper[i])
-			b |= 1<<3;
-		if(myislower[i])
-			b |= 1<<4;
-		if(myistitle[i])
-			b |= 1<<5;
-
-		myismerged[i] = b;
-	}
-
-	markbreak();
-	mktables();
-	exits(nil);
-}
--- a/sys/src/libc/port/runebreak.c
+++ /dev/null
@@ -1,293 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runebreakdata"
-
-enum {
-	OTHER, 
-	Hebrew_Letter, Newline, Extend, Format,
-	Katakana, ALetter, MidLetter, MidNum,
-	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
-	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
-	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
-	EMOJIEX = 0xB0,
-
-	ZWJ = 0x200DU,
-	LINETAB = 0xB,
-};
-
-#define IS(x, y) ((x&0xf) == y)
-#define ISG(x, y) ((x&0xf0) == y)
-
-Rune*
-runegbreak(Rune *s)
-{
-	Rune l, r;
-	uchar lt, rt;
-	Rune *p;
-
-	p = s;
-	if((l = *p++) == 0)
-		return s;
-	if((r = *p) == 0)
-		return s;
-	lt = breaklkup(l);
-	rt = breaklkup(r);
-	if(l == '\r' && r == '\n')
-		goto Done;
-	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
-		return p;
-	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
-		return p;
-	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
-		goto Done;
-	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
-		goto Done;
-	if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
-		goto Done;
-	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
-		goto Done;
-	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
-		while(ISG(rt, EXTEND)){
-			p++;
-			if((r = *p) == 0)
-				return s;
-			rt = breaklkup(r);
-		}
-		if(r != ZWJ)
-			return p;
-		p++;
-		if((r = *p) == 0)
-			return s;
-		rt = breaklkup(r);
-		if(ISG(rt, EMOJIEX))
-			goto Done;
-		return p;
-	}
-	if(ISG(rt, EXTEND) || r == ZWJ)
-		goto Done;
-	if(ISG(lt, REGION) && ISG(rt, REGION))
-		goto Done;
-
-	return p;
-
-Done:
-	if(p[1] == 0)
-		return s;
-	return p + 1;
-}
-
-char*
-utfgbreak(char *s)
-{
-	Rune l, r;
-	uchar lt, rt;
-	char *p;
-
-	p = s;
-	p += chartorune(&l, p);
-	if(l == 0)
-		return s;
-	chartorune(&r, p);
-	if(r == 0)
-		return s;
-	lt = breaklkup(l);
-	rt = breaklkup(r);
-	if(l == '\r' && r == '\n')
-		goto Done;
-	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
-		return p;
-	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
-		return p;
-	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
-		goto Done;
-	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
-		goto Done;
-	if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
-		goto Done;
-	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
-		goto Done;
-	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
-		while(ISG(rt, EXTEND)){
-			p += chartorune(&r, p);
-			chartorune(&r, p);
-			if(r == 0)
-				return s;
-			rt = breaklkup(r);
-		}
-		if(r != ZWJ)
-			return p;
-
-		p += chartorune(&r, p);
-		chartorune(&r, p);
-		if(r == 0)
-			return s;
-		rt = breaklkup(r);
-		if(ISG(rt, EMOJIEX))
-			goto Done;
-		return p;
-	}
-	if(ISG(rt, EXTEND) || r == ZWJ)
-		goto Done;
-	if(ISG(lt, REGION) && ISG(rt, REGION))
-		goto Done;
-
-	return p;
-
-Done:
-	p += chartorune(&r, p);
-	chartorune(&r, p);
-	if(r == 0)
-		return s;
-	return p;
-}
-
-#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
-#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
-
-Rune*
-runewbreak(Rune *s)
-{
-	Rune l, r;
-	uchar lt, rt;
-	Rune *p;
-
-	p = s;
-	if((l = *p++) == 0)
-		return s;
-	if((r = *p) == 0)
-		return s;
-	lt = breaklkup(l);
-	rt = breaklkup(r);
-	if(l == '\r' && r == '\n')
-		goto Done;
-	if(l == '\r' || l == '\n' || l == LINETAB)
-		return p;
-	if(r == '\r' || r == '\n' || l == LINETAB)
-		return p;
-	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
-		goto Done;
-	if(IS(rt, Format) || IS(rt, Extend))
-		goto Done;
-	if(AH(lt)){
-		if(AH(rt))
-			goto Done;
-		if((IS(rt, MidLetter) || MNLQ(rt)) && p[1] != 0 && AH(breaklkup(p[1])))
-			goto Done;
-		if(IS(lt, Hebrew_Letter) && r == '\'')
-			goto Done;
-		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
-			goto Done;
-		if(IS(rt, Numeric))
-			goto Done;
-	}
-	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
-		goto Done;
-	if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
-		goto Done;
-	if(IS(lt, Katakana) && IS(rt, Katakana))
-		goto Done;
-	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
-		if(IS(rt, ExtendNumLet))
-			goto Done;
-	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
-		goto Done;
-	if(ISG(lt, REGION)){
-		if(ISG(rt, REGION))
-			goto Done;
-		if(r != ZWJ)
-			return p;
-		p++;
-		if((r = *p) == 0)
-			return s;
-		rt = breaklkup(r);
-		if(ISG(rt, REGION))
-			goto Done;
-	}
-
-	return p;
-
-Done:
-	if(p[1] == 0)
-		return s;
-	return p + 1;
-}
-
-char*
-utfwbreak(char *s)
-{
-	Rune l, r;
-	Rune peek;
-	uchar lt, rt;
-	char *p;
-
-	p = s;
-	p += chartorune(&l, p);
-	if(l == 0)
-		return s;
-	chartorune(&peek, p+chartorune(&r, p));
-	if(r == 0)
-		return s;
-	lt = breaklkup(l);
-	rt = breaklkup(r);
-	if(l == '\r' && r == '\n')
-		goto Done;
-	if(l == '\r' || l == '\n' || l == LINETAB)
-		return p;
-	if(r == '\r' || r == '\n' || l == LINETAB)
-		return p;
-	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
-		goto Done;
-	if(IS(rt, Format) || IS(rt, Extend))
-		goto Done;
-	if(AH(lt)){
-		if(AH(rt))
-			goto Done;
-		if(IS(rt, MidLetter) || MNLQ(rt))
-		if(peek != 0 && AH(breaklkup(peek)))
-			goto Done;
-
-		if(IS(lt, Hebrew_Letter) && r == '\'')
-			goto Done;
-
-		if(IS(lt, Hebrew_Letter) && r == '"')
-		if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
-			goto Done;
-
-		if(IS(rt, Numeric))
-			goto Done;
-	}
-	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
-		goto Done;
-	if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && peek != 0 && IS(breaklkup(peek), Numeric))
-		goto Done;
-	if(IS(lt, Katakana) && IS(rt, Katakana))
-		goto Done;
-	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
-		if(IS(rt, ExtendNumLet))
-			goto Done;
-	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
-		goto Done;
-	if(ISG(lt, REGION)){
-		if(ISG(rt, REGION))
-			goto Done;
-		if(r != ZWJ)
-			return p;
-		p += chartorune(&r, p);
-		chartorune(&r, p);
-		if(r == 0)
-			return s;
-		rt = breaklkup(r);
-		if(ISG(rt, REGION))
-			goto Done;
-	}
-
-	return p;
-
-Done:
-	p += chartorune(&r, p);
-	chartorune(&r, p);
-	if(r == 0)
-		return s;
-	return p;
-}
--- a/sys/src/libc/port/runeistype.c
+++ /dev/null
@@ -1,52 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runeistypedata"
-
-int
-isspacerune(Rune c)
-{
-	if(c > Runemax)
-		return 0;
-	return (mergedlkup(c) & Lspace) == Lspace;
-}
-
-int
-isalpharune(Rune c)
-{
-	if(c > Runemax)
-		return 0;
-	return (mergedlkup(c) & Lalpha) == Lalpha;
-}
-
-int
-isdigitrune(Rune c)
-{
-	if(c > Runemax)
-		return 0;
-	return (mergedlkup(c) & Ldigit) == Ldigit;
-}
-
-int
-isupperrune(Rune c)
-{
-	if(c > Runemax)
-		return 0;
-	return (mergedlkup(c) & Lupper) == Lupper;
-}
-
-int
-islowerrune(Rune c)
-{
-	if(c > Runemax)
-		return 0;
-	return (mergedlkup(c) & Llower) == Llower;
-}
-
-int
-istitlerune(Rune c)
-{
-	if(c > Runemax)
-		return 0;
-	return (mergedlkup(c) & Ltitle) == Ltitle;
-}
--- a/sys/src/libc/port/runenorm.c
+++ /dev/null
@@ -1,444 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runenormdata"
-
-//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
-enum {
-	SBase = 0xAC00,
-	LBase = 0x1100,
-	VBase = 0x1161,
-	TBase = 0x11A7,
-
-	LCount = 19,
-	VCount = 21,
-	TCount = 28,
-	NCount = VCount * TCount,
-	SCount = LCount * NCount,
-
-	LLast = LBase + LCount - 1,
-	SLast = SBase + SCount - 1,
-	VLast = VBase + VCount - 1,
-	TLast = TBase + TCount - 1,
-};
-
-/*
- * Most runes decompose in to one/two
- * other runes with codepoints < 0xFFFF,
- * however there are some exceptions.
- * To keep the table size down we instead
- * store an index in to an exception range
- * within the private use section and use
- * an exception table.
- */
-enum {
-	Estart = 0xEEEE,
-	Estop = 0xF8FF,
-};
-
-static Rune
-_runedecomp(Rune c, Rune *r2)
-{
-	uint x;
-
-	if(c < Runeself){
-		*r2 = 0;
-		return 0;
-	}
-
-	//korean
-	if(c >= SBase && c <= SLast){
-		c -= SBase;
-		x = c % TCount;
-		if(x){
-			*r2 = TBase + x;
-			return SBase + (c - x);
-		}
-		*r2 = VBase + ((c % NCount) / TCount);
-		return LBase + (c / NCount);
-	}
-
-	x = decomplkup(c);
-	if((x & 0xFFFF) != 0){
-		*r2 = x & 0xFFFF;
-		return x>>16;
-	}
-	x >>= 16;
-	if(x >= Estart && x < Estop){
-		Rune *r;
-		r = _decompexceptions[x - Estart];
-		*r2 = r[1];
-		return r[0];
-	}
-	*r2 = 0;
-	return x;
-}
-
-static Rune
-_runerecomp(Rune r0, Rune r1)
-{
-	uint x, y, *p, next;
-
-	if(r0 >= LBase && r0 <= LLast){
-		if(r1 < VBase || r1 > VLast)
-			return 0;
-		x = (r0 - LBase) * NCount + (r1 - VBase) * TCount;
-		return SBase + x;
-	}
-	if(r0 >= SBase && r0 <= SLast && (r0 - SBase) % TCount == 0){
-		if(r1 > TBase && r1 <= TLast)
-			return r0 + (r1 - TBase);
-		return 0;
-	}
-	if(r0 > 0xFFFF || r1 > 0xFFFF){
-		for(x = 0; x < nelem(_recompexceptions); x++)
-			if(r0 == _recompexceptions[x][1] && r1 == _recompexceptions[x][2])
-				return  _recompexceptions[x][0];
-		return 0;
-	}
-	y = x = r0<<16 | r1;
-	x ^= x >> 16;
-	x *= 0x21f0aaad;
-	x ^= x >> 15;
-	x *= 0xd35a2d97;
-	x ^= x >> 15;
-	p = _recompdata + (x%512)*2;
-	while(p[0] != y){
-		next = p[1]>>16;
-		if(!next)
-			return 0;
-		p = _recompcoll + (next-1)*2;
-	}
-	return p[1] & 0xFFFF;
-}
-
-static void
-runecccsort(Rune *a, int len)
-{
-	Rune r;
-	int i, j;
-
-	for(i = 1; i < len; i++){
-		r = a[i];
-		for(j = i; j > 0 && ccclkup(a[j-1]) > ccclkup(r); j--)
-			a[j] = a[j-1];
-		a[j] = r;
-	}
-}
-
-static int
-boundary(Rune r)
-{
-	return !(qclkup(r) & (Qnfcno|Qnfcmay));
-}
-
-/*
- * Stk stores the entire context for a chunk of
- * an input string that is being normalized.
- * In accordance to the standard, Unicode text
- * has no upper bound for the amount of conjoining
- * (also called non-starter) elements associated with
- * a base rune. Thus to implement normalization within
- * reasonable memory constraints we implement the
- * "Stream-Safe Text Format" as defined in UAX #15 § 13.
- */
-typedef struct {
-	Rune a[Maxnormctx];
-	Rune *e;
-} Stk;
-
-static int
-push(Stk *s, Rune c)
-{
-	int n, l;
-	Rune r2, b[Maxdecomp];
-	Rune *p = b + nelem(b) - 1;
-
-	for(*p = c; c = _runedecomp(c, &r2); *p = c){
-		assert(p > b);
-		if(r2 != 0)
-			*p-- = r2;
-	}
-
-	n = b + nelem(b) - p;
-	l = nelem(s->a) - (s->e - s->a);
-	if(n > l){
-		werrstr("runenorm: buffer overflow");
-		return -1;
-	}
-	l -= n;
-	for(; n > 0; n--)
-		*s->e++ = *p++;
-	return l;
-}
-
-/*
- * Worst case recomposition, this happens when we have to compose
- * two runes who both have a CCC of zero.
- */
-static void
-worstrecomp(Stk *s)
-{
-	int done;
-	Rune c, *p, *rp;
-
-	for(done = 0; done == 0;){
-		done = 1;
-		for(p = s->a; p+1 < s->e; p++){
-			c = _runerecomp(p[0], p[1]);
-			if(c == 0)
-				continue;
-			done = 0;
-			*p = c;
-			for(rp = p+1; rp < s->e-1; rp++)
-				rp[0] = rp[1];
-			s->e--;
-			p--;
-		}
-	}
-}
-
-static void
-cccrecomp(Stk *s)
-{
-	Rune c, *p, *rp;
-
-	for(p = s->a + 1; p < s->e; p++){
-		c  = _runerecomp(s->a[0], *p);
-		if(c != 0){
-			s->a[0] = c;
-			for(rp = p; rp < s->e-1; rp++){
-				rp[0] = rp[1];
-			}
-			s->e--;
-			p--;
-		} else while(p + 1 < s->e && ccclkup(p[0]) == ccclkup(p[1]))
-			p++;
-	}
-}
-
-void
-norminit(Norm *n, int compose, void *ctx, long (*getrune)(void*))
-{
-	memset(n, 0, sizeof *n);
-	n->ctx = ctx;
-	n->getrune = getrune;
-	n->compose = compose;
-	n->obuf.e = n->obuf.a;
-	n->ibuf.e = n->ibuf.a;
-}
-
-int NORMDEBUG;
-
-static long
-peekrune(Norm *n)
-{
-	long r;
-
-	if(n->ibuf.e > n->ibuf.a)
-		return n->ibuf.e[-1];
-
-	r = n->getrune(n->ctx);
-	if(r >= 0)
-		*n->ibuf.e++ = r;
-	return r;
-}
-
-static long
-getrune(Norm *n)
-{
-	if(n->ibuf.e > n->ibuf.a)
-		return *--n->ibuf.e;
-	return n->getrune(n->ctx);
-}
-
-long
-normpull(Norm *n, Rune *rdst, long max, int flush)
-{
-	Rune *rp, *re;
-	Stk stk;
-	Rune *dot;
-	int r;
-	long c;
-
-	rp = rdst;
-	re = rdst + max;
-	dot = nil;
-	c = 0;
-	while(rp < re){
-		if(n->obuf.e != n->obuf.a){
-			memcpy(stk.a, n->obuf.a, (n->obuf.e - n->obuf.a)*sizeof(Rune));
-			stk.e = stk.a + (n->obuf.e - n->obuf.a);
-			n->obuf.e = n->obuf.a;
-			c = stk.a[0];
-			goto Flush;
-		}
-
-		stk.e = stk.a;
-		c = getrune(n);
-		if(c < 0)
-			break;
-		push(&stk, c);
-		c = peekrune(n);
-		if(stk.e == stk.a+1 && stk.a[0] < Runeself && c < Runeself && c >= 0)
-			goto Flush;
-		while(c >= 0 && ccclkup(c) != 0){
-			r = push(&stk, getrune(n));
-			c = peekrune(n);
-			if(r > 2)
-				continue;
-			if(ccclkup(stk.a[0]) != 0){
-				assert(r > 0);
-				r--;
-			} else
-				assert(r >= 0);
-			if(r == 0 || (c == 0x0344 && r < 2)){
-				/* in reverse */
-				if(r > 0){
-					getrune(n);
-					*n->ibuf.e++ = 0x301;
-					*n->ibuf.e++ = 0x308;
-				}
-				*n->ibuf.e++ = 0x034F;
-				break;
-			}
-		}
-		if(stk.e - stk.a > 1)
-			runecccsort(stk.a, stk.e - stk.a);
-
-		if(!n->compose)
-			goto Flush;
-
-		if(ccclkup(stk.e[-1]) == 0){
-			Rune tmp;
-			while(c >= 0 && (!boundary(c) || !boundary(_runedecomp(c, &tmp)))){
-				if(push(&stk, getrune(n)) == -1){
-					*n->ibuf.e++ = c;
-					for(r = 0; r < Maxdecomp; r++)
-						*n->ibuf.e++ = *--stk.e;
-					break;
-				}
-				c = peekrune(n);
-			}
-			worstrecomp(&stk);
-		} else if(ccclkup(stk.a[0]) == 0)
-			cccrecomp(&stk);
-
-Flush:
-		if(flush || c >= 0)
-			for(dot = stk.a; dot < stk.e; dot++){
-				if(rp == re)
-					goto Out;
-				*rp++ = *dot;
-			}
-		dot = nil;
-		if(c < 0)
-			break;
-	}
-Out:
-	if(c < 0 && !flush){
-		while(stk.e > stk.a)
-			*n->ibuf.e++ = *--stk.e;
-	}
-	if(dot != nil){
-		memcpy(n->obuf.a, dot, (stk.e - dot) * sizeof(Rune));
-		n->obuf.e = n->obuf.a + (stk.e - dot);
-	}
-
-	return rp - rdst;
-}
-
-typedef struct {
-	Rune *s, *p;
-	int n;
-} Rctx;
-
-static long
-runegetrune(void *ctx)
-{
-	Rctx *c;
-
-	c = ctx;
-	if(c->p >= c->s + c->n)
-		return -1;
-	return *c->p++;
-}
-
-static long
-runedostr(Rune *dst, long ndst, Rune *src, long nsrc, int comp)
-{
-	Rctx c;
-	Norm n;
-
-	c.s = c.p = src;
-	c.n = nsrc;
-	norminit(&n, comp, &c, runegetrune);
-	return normpull(&n, dst, ndst, 1);
-}
-
-long
-runecomp(Rune *dst, long ndst, Rune *src, long nsrc)
-{
-	return runedostr(dst, ndst, src, nsrc, 1);
-}
-
-long
-runedecomp(Rune *dst, long ndst, Rune *src, long nsrc)
-{
-	return runedostr(dst, ndst, src, nsrc, 0);
-}
-
-typedef struct {
-	char *s, *p;
-	int n;
-} Uctx;
-
-static long
-utfgetrune(void *ctx)
-{
-	Uctx *c;
-	Rune r;
-
-	c = ctx;
-	if(c->p >= c->s + c->n)
-		return -1;
-	c->p += chartorune(&r, c->p);
-	return r;
-}
-
-static long
-utfdostr(char *dst, long ndst, char *src, long nsrc, int comp)
-{
-	Uctx c;
-	Norm n;
-	Rune buf[Maxnormctx];
-	long i, w;
-	char *e, *p;
-
-	c.s = c.p = src;
-	c.n = nsrc;
-	norminit(&n, comp, &c, utfgetrune);
-	for(p = dst, e = dst + ndst; p < e;){
-		w = normpull(&n, buf, nelem(buf), 1);
-		if(w == 0)
-			break;
-		for(i = 0; i < w; i++){
-			if(p + runelen(buf[i]) >= e)
-				break;
-			p += runetochar(p, buf+i);
-		}
-	}
-	return p - dst;
-}
-
-long
-utfcomp(char *dst, long ndst, char *src, long nsrc)
-{
-	return utfdostr(dst, ndst, src, nsrc, 1);
-}
-
-long
-utfdecomp(char *dst, long ndst, char *src, long nsrc)
-{
-	return utfdostr(dst, ndst, src, nsrc, 0);
-}
--- a/sys/src/libc/port/runetotype.c
+++ /dev/null
@@ -1,22 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-#include "runetotypedata"
-
-Rune
-toupperrune(Rune c)
-{
-	return c + upperlkup(c);
-}
-
-Rune
-tolowerrune(Rune c)
-{
-	return c + lowerlkup(c);
-}
-
-Rune
-totitlerune(Rune c)
-{
-	return c + titlelkup(c);
-}
--- /dev/null
+++ b/sys/src/libc/ucd/mkfile
@@ -1,0 +1,42 @@
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libc.a
+
+OFILES=\
+	runenorm.$O\
+	runetotype.$O\
+	runeistype.$O\
+	runebreak.$O\
+
+CLEANFILES=$OFILES
+
+HFILES=/sys/include/libc.h
+
+</sys/src/cmd/mksyslib
+
+DATA=\
+	runenormdata\
+	runetotypedata\
+	runeistypedata\
+	runebreakdata\
+
+$OFILES: $DATA
+
+UCD=\
+	/lib/ucd/CompositionExclusions.txt\
+	/lib/ucd/DerivedNormalizationProps.txt\
+	/lib/ucd/GraphemeBreakProperty.txt\
+	/lib/ucd/UnicodeData.txt\
+	/lib/ucd/WordBreakProperty.txt\
+	/lib/ucd/emoji-data.txt\
+
+$DATA:	$UCD mkrunetype.c
+	@{
+		eval `{grep '^[A-Z]' /$cputype/mkfile}
+		$CC $CFLAGS -o mkrunetype.$O mkrunetype.c
+		$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+		./$O.mkrunetype
+	}
+
+nuke:V:
+	rm -f $DATA *.mkrunetype
--- /dev/null
+++ b/sys/src/libc/ucd/mkrunetype.c
@@ -1,0 +1,789 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+enum{
+	NRUNES = 1<<21
+};
+
+typedef struct Param Param;
+typedef struct Lvl Lvl;
+struct Lvl{
+	int bits;
+	int max;
+	int mask;
+};
+struct Param{
+	Lvl idx1;
+	Lvl idx2;
+	Lvl data;
+
+	int round1max;
+};
+
+static void
+derive(Lvl *l)
+{
+	l->max = 1 << l->bits;
+	l->mask = l->max - 1;
+}
+
+static void
+param(Param *p, int idx1, int idx2)
+{
+
+	assert(idx1 + idx2 < 21);
+	p->idx1.bits = idx1;
+	p->idx2.bits = idx2;
+	p->data.bits = 21 - idx1 - idx2;
+	derive(&p->idx1);
+	derive(&p->idx2);
+	derive(&p->data);
+
+	p->round1max = NRUNES/p->data.max;
+}
+
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+	int y, z;
+
+	y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
+	z = (((x)>>p->data.bits)&p->idx2.mask);
+	return data[idx2[idx1[y] + z] + (x&p->data.mask)];
+}
+
+static int
+mkarrvar(int fd, char *name, int *d, int len)	/* emit d[0..len-1] as a static C array called name; returns its size in bytes */
+{
+	int i, sz;
+	int max, min;
+	char *t;
+
+	max = min = 0;	/* range of the values decides the element type */
+	for(i = 0; i < len; i++){
+		if(d[i] > max)
+			max = d[i];
+		if(d[i] < min)
+			min = d[i];
+	}
+	if(min == 0){	/* no negative entries: narrowest unsigned type */
+		if(max < 0xFF)
+			t = "uchar", sz = 1;
+		else if(max < 0xFFFF)
+			t = "ushort", sz = 2;
+		else
+			t = "uint", sz = 4;
+	} else {	/* signed: both extremes have to fit, not just max */
+		if(max < 1<<7 && min >= -(1<<7))
+			t = "char", sz = 1;
+		else if(max < 1<<15 && min >= -(1<<15))
+			t = "short", sz = 2;
+		else
+			t = "int", sz = 4;
+	}
+	if(fd < 0)	/* size query only: nothing is written */
+		return sz * len;
+
+	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+	for(i = 0; i < len; i++){
+		fprint(fd, "%d,", d[i]);
+		if((i+1) % 16 == 0)	/* 16 values per output line */
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+
+	return sz * len;
+}
+
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+	int i;
+	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
+	for(i = 0; i < n*3; i += 3){
+		if(all && d[i] != 0)
+			fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
+		else if(!all)
+			fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);	
+		if((i+3) % (8*3) == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	return n * sizeof(Rune) * 2;
+}
+
+static int
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+	int i, n, ndata, best;
+	int *dot, *lp, *rp;
+
+	dot = src;
+	ndata = 0;
+	idx[0] = 0;
+	for(i = 1; i <= nidx; i++){
+		rp = dot + chunksize;
+		lp = rp - 1;
+
+		for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
+			if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
+				best = n+1;
+		}
+		memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
+		ndata += (chunksize - best);
+		idx[i] = idx[i - 1] + (chunksize - best);
+		dot = rp;
+	}
+	return ndata;
+}
+
+
+static int
+mklkup(int fd, char *label, int *map, Param *p)
+{
+	static int data[NRUNES];
+	static int idx2[NRUNES];
+	static int idx2dest[NRUNES];
+	static int idx1[NRUNES];
+	int i, nidx2, ndata;
+	int size;
+
+	ndata = compact(data, idx2, p->round1max, map, p->data.max);
+	nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
+
+	if(fd >= 0){
+		for(i = 0; i < NRUNES; i++)
+			if(map[i] != lkup(p, idx1, idx2dest, data, i))
+				sysfatal("mismatch in %s at %d %d %d", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
+	}
+
+	size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
+	size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
+	size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
+	if(fd >= 0){
+		fprint(fd, "\n");
+		fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
+		fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
+		fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+		fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
+			label, label, label, label, label, label, label);
+	}
+	return size;
+}
+
+static int
+mklkupmatrix(int, char *label, int *map, Param *p)
+{
+	int bestsize, size, bestx, besty;
+	int x, y;
+
+	bestsize = bestx = besty = -1;
+	for(x = 4; x <= 12; x++)
+		for(y=4; y <= (19 - x); y++){
+			param(p, x, y);
+			size = mklkup(-1, label, map, p);
+			if(bestsize == -1 || size < bestsize){
+				bestx = x;
+				besty = y;
+				bestsize = size;
+			}
+		}
+
+	assert(bestsize != -1);
+	fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
+	param(p, bestx, besty);
+	return bestsize;
+}
+
+static int myismerged[NRUNES];
+static int mytoupper[NRUNES];
+static int mytolower[NRUNES];
+static int mytotitle[NRUNES];
+static int mybreak[NRUNES];
+
+enum{ DSTART = 0xEEEE };
+static int mydecomp[NRUNES];
+static int mydespecial[256*3];
+static int nspecial;
+static int maxdchain;
+static int myccc[NRUNES];
+static int myqc[NRUNES];
+
+typedef struct KV KV;
+struct KV{
+	uint key;
+	uint val;
+	ushort next;
+};
+
+static KV myrecomp[2000];
+static int nrecomp;
+
+static int recompext[256*3];
+static int nrecompext;
+
+static uint
+hash(uint x)
+{
+	x ^= x >> 16;
+	x *= 0x21f0aaad;
+	x ^= x >> 15;
+	x *= 0xd35a2d97;
+	x ^= x >> 15;
+	return x;
+}
+
+static void
+mkrecomp(int fd)
+{
+	int i;
+	KV *p;
+	static KV vals[512];
+	static KV coll[1000];
+	int over;
+	int maxchain;
+
+	for(i = 0; i < nelem(vals); i++)
+		vals[i] = (KV){0, 0, 0};
+	for(i = 0; i < nelem(coll); i++)
+		coll[i] = (KV){0, 0, 0};
+	over = 1;
+	for(i = 0; i < nrecomp; i++){
+		p = vals + (hash(myrecomp[i].key) % nelem(vals));
+		maxchain = 0;
+		while(p->key != 0){
+			maxchain++;
+			if(p->next == 0){
+				p->next = over;
+				p = coll + over - 1;
+				over++;
+			} else
+				p = coll + p->next - 1;
+		}
+		p->key = myrecomp[i].key;
+		p->val = myrecomp[i].val;
+	}
+	fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
+	fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+	for(p = vals, i = 0;; i++){
+		assert(p->val < 0xFFFF);
+		assert(p->next < 0xFFFF);
+		fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
+		if((i+1) % 8 == 0)
+			fprint(fd, "\n\t");
+
+		if(p == vals+nelem(vals)-1)
+			p = coll;
+		else if(p == coll + over - 2)
+			break;
+		else
+			p++;
+	}
+	fprint(fd, "\n};\n");
+	fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
+}
+
+enum {
+	OTHER, 
+	Hebrew_Letter, Newline, Extend, Format,
+	Katakana, ALetter, MidLetter, MidNum,
+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+	EMOJIEX = 0xB0,
+
+	NFC_QC_No = 1, NFC_QC_Maybe = 2, NFD_QC_No = 4, NFD_QC_Maybe = 8,
+	
+};
+
+static void
+mktables(void)
+{
+	Param p;
+	int tofd, isfd, normfd, breakfd;
+	int size;
+
+	tofd = create("runetotypedata", OWRITE, 0664);
+	if(tofd < 0)
+		sysfatal("could not create runetotypedata: %r");
+	param(&p, 10, 7);
+	size = mklkup(tofd, "upper", mytoupper, &p);
+	fprint(2, "%s: %d\n", "upper", size);
+
+	size = mklkup(tofd, "lower", mytolower, &p);
+	fprint(2, "%s: %d\n", "lower", size);
+
+	size = mklkup(tofd, "title", mytotitle, &p);
+	fprint(2, "%s: %d\n", "title", size);
+	close(tofd);
+
+	isfd = create("runeistypedata", OWRITE, 0664);
+	if(isfd < 0)
+		sysfatal("could not create runeistypedata: %r");
+	param(&p, 11, 6);
+	size = mklkup(isfd, "merged", myismerged, &p);
+	fprint(2, "%s: %d\n", "merged", size);
+	fprint(isfd, "static\nenum {\n");
+	fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+	fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+	fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+	fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+	fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+	fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+	fprint(isfd, "};\n");
+	close(isfd);
+
+	normfd = create("runenormdata", OWRITE, 0664);
+	if(normfd < 0)
+		sysfatal("could not create runenormdata: %r");
+	param(&p, 10, 7);
+	size = mklkup(normfd, "decomp", mydecomp, &p);
+	fprint(2, "%s: %d\n", "decomp", size);
+	fprint(normfd, "static enum { Maxdecomp = %d };\n\n", maxdchain);
+
+	param(&p, 9, 7);
+	size = mklkup(normfd, "ccc", myccc, &p);
+	fprint(2, "%s: %d\n", "ccc", size);
+
+	param(&p, 10, 6);
+	size = mklkup(normfd, "qc", myqc, &p);
+	fprint(2, "%s: %d\n", "qc", size);
+	fprint(normfd, "static\nenum {\n");
+	fprint(normfd, "\t%s = %d,\n", "Qnfcno", NFC_QC_No);
+	fprint(normfd, "\t%s = %d,\n", "Qnfcmay", NFC_QC_Maybe);
+	fprint(normfd, "\t%s = %d,\n", "Qnfdno", NFD_QC_No);
+	fprint(normfd, "\t%s = %d,\n", "Qnfdmay", NFD_QC_Maybe);
+	fprint(normfd, "};\n");
+
+	mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+	mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+	mkrecomp(normfd);
+	close(normfd);
+
+	param(&p, 10, 6);
+	breakfd = create("runebreakdata", OWRITE, 0644);
+	if(breakfd < 0)
+		sysfatal("could not create runebreakdata: %r");
+	size = mklkup(breakfd, "break", mybreak, &p);
+	fprint(2, "%s: %d\n", "break", size);
+}
+
+enum {
+	FIELD_CODE,
+	FIELD_NAME,
+	FIELD_CATEGORY,
+	FIELD_COMBINING,
+	FIELD_BIDIR,
+	FIELD_DECOMP,
+	FIELD_DECIMAL_DIG,
+	FIELD_DIG,
+	FIELD_NUMERIC_VAL,
+	FIELD_MIRRORED,
+	FIELD_UNICODE_1_NAME,
+	FIELD_COMMENT,
+	FIELD_UPPER,
+	FIELD_LOWER,
+	FIELD_TITLE,
+	NFIELDS,
+};
+
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+	char *p;
+
+	if((p = Brdline(in, '\n')) == nil)
+		return 0;
+
+	p[Blinelen(in)-1] = '\0';
+
+	if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS)
+		sysfatal("bad number of fields");
+
+	return 1;
+}
+
+static int
+estrtoul(char *s, int base)
+{
+	char *epr;
+	Rune code;
+
+	code = strtoul(s, &epr, base);
+	if(s == epr)
+		sysfatal("bad code point hex string");
+	return code;
+}
+
+static char*
+getextraline(Biobuf *b, int *s, int *e)
+{
+	char *dot, *p;
+
+again:
+	p = Brdline(b, '\n');
+	if(p == nil)
+		return nil;
+	p[Blinelen(b)-1] = 0;
+	if(p[0] == 0 || p[0] == '#')
+		goto again;
+	if((dot = strstr(p, "..")) != nil){
+		*dot = 0;
+		dot += 2;
+		*s = estrtoul(p, 16);
+		*e = estrtoul(dot, 16);
+	} else {
+		*s = *e = estrtoul(p, 16);
+		dot = p;
+	}
+	return dot;
+}
+
+static void
+markbreak(void)	/* fill mybreak[] and myqc[] from the UCD property files */
+{
+	Biobuf *b;
+	char *dot;
+	int i, s, e;
+	uchar v;
+
+	b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load word breaks: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){	/* word break class: assigned, values below 0x10 */
+		v = 0;
+		if(strstr(dot, "ExtendNumLet") != nil)	/* must precede the plain "Extend" test */
+			v = ExtendNumLet;
+		else if(strstr(dot, "Hebrew_Letter") != nil)
+			v = Hebrew_Letter;
+		else if(strstr(dot, "Newline") != nil)
+			v = Newline;
+		else if(strstr(dot, "Extend") != nil)
+			v = Extend;
+		else if(strstr(dot, "Format") != nil)
+			v = Format;
+		else if(strstr(dot, "Katakana") != nil)
+			v = Katakana;
+		else if(strstr(dot, "ALetter") != nil)
+			v = ALetter;
+		else if(strstr(dot, "MidLetter") != nil)
+			v = MidLetter;
+		else if(strstr(dot, "MidNum") != nil)	/* substring also matches MidNumLet lines */
+			v = strstr(dot, "MidNumLet") != nil ? MidNumLet : MidNum;
+		else if(strstr(dot, "Numeric") != nil)
+			v = Numeric;
+		else if(strstr(dot, "WSegSpace") != nil)
+			v = WSegSpace;
+		for(i = s; i <= e; i++)
+			mybreak[i] = v;
+	}
+	Bterm(b);
+	b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load Grapheme breaks: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){	/* grapheme class: or'ed in, multiples of 0x10 */
+		v = 0;
+		if(strstr(dot, "; Prepend #") != nil)
+			v = PREPEND;
+		else if(strstr(dot, "; Control #") != nil)
+			v = CONTROL;
+		else if(strstr(dot, "; Extend #") != nil)
+			v = EXTEND;
+		else if(strstr(dot, "; Regional_Indicator #") != nil)
+			v = REGION;
+		else if(strstr(dot, "; SpacingMark #") != nil)
+			v = SPACEMK;
+		else if(strstr(dot, "; L #") != nil)
+			v = L;
+		else if(strstr(dot, "; V #") != nil)
+			v = V;
+		else if(strstr(dot, "; T #") != nil)
+			v = T;
+		else if(strstr(dot, "; LV #") != nil)
+			v = LV;
+		else if(strstr(dot, "; LVT #") != nil)
+			v = LVT;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load emoji-data: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){	/* only Extended_Pictographic is recorded */
+		v = 0;
+		if(strstr(dot, "; Extended_Pictographic") != nil)
+			v = EMOJIEX;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/DerivedNormalizationProps.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load normalization props: %r");
+
+	while((dot = getextraline(b, &s, &e)) != nil){	/* quick check flags go to myqc[], not mybreak[] */
+		v = 0;
+		if(strstr(dot, "; NFC_QC; N") != nil)
+			v = NFC_QC_No;
+		else if(strstr(dot, "; NFC_QC; M") != nil)
+			v = NFC_QC_Maybe;
+		else if(strstr(dot, "; NFD_QC; N") != nil)
+			v = NFD_QC_No;
+		else if(strstr(dot, "; NFD_QC; M") != nil)
+			v = NFD_QC_Maybe;
+
+		for(i = s; i <= e; i++)
+			myqc[i] |= v;
+	}
+	Bterm(b);
+}
+
+static void
+markexclusions(void)
+{
+	Biobuf *b;
+	char *p;
+	int i;
+	uint x;
+
+	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load composition exclusions: %r");
+
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		x = estrtoul(p, 16);
+		for(i = 0; i < nrecomp; i++){
+			if(myrecomp[i].val == x){
+				myrecomp[i].val = 0;
+				break;
+			}
+		}
+		if(i == nrecomp){
+			for(i = 0; i < nrecompext; i++){
+				if(recompext[i*3] == x){
+					recompext[i*3] = 0;
+					break;
+				}
+			}
+		}
+	}
+	Bterm(b);
+}
+
+static void
+findlongchain(void)	/* set maxdchain to twice the longest chain of first-rune decompositions */
+{
+	int i, n, x, r1;
+
+	for(i = 0; i < NRUNES; i++)
+	for(x = i, n = 0; r1 = (uint)mydecomp[x]>>16; x = r1){	/* unsigned shift: entries may have the top bit set */
+		if(++n > maxdchain)
+			maxdchain = n;
+		if(r1 >= DSTART && r1 <0xF8FF)	/* exception index: continue with the first rune stored in mydespecial */
+			r1 = mydespecial[(r1 - DSTART)*3 + 1];
+	}
+	maxdchain *= 2;
+}
+
+void
+main(int, char)
+{
+	static char myisspace[NRUNES];
+	static char myisalpha[NRUNES];
+	static char myisdigit[NRUNES];
+	static char myisupper[NRUNES];
+	static char myislower[NRUNES];
+	static char myistitle[NRUNES];
+	Biobuf *in;
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p, *d;
+	int i, code, last;
+	int decomp[2], *ip;
+
+	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+	if(in == nil)
+		sysfatal("can't open UnicodeData.txt: %r");
+
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = -1;
+		mytolower[i] = -1;
+		mytotitle[i] = -1;
+		mydecomp[i] = 0;
+		myccc[i] = 0;
+		mybreak[i] = 0;
+	}
+
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	nspecial = nrecomp = nrecompext =  0;
+	while(getunicodeline(in, fields)){
+		code = estrtoul(fields[FIELD_CODE], 16);
+		if (code >= NRUNES)
+			sysfatal("code-point value too big: %x", code);
+		if(code <= last)
+			sysfatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		p = fields[FIELD_CATEGORY];
+		if(strstr(fields[FIELD_NAME], ", First>") != nil){
+			if(!getunicodeline(in, fields2))
+				sysfatal("range start at eof");
+			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+				sysfatal("range start not followed by range end");
+			last = estrtoul(fields2[FIELD_CODE], 16);
+			if(last <= code)
+				sysfatal("range out of sequence: %x then %x", code, last);
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				sysfatal("range with mismatched category");
+		}
+
+		d = fields[FIELD_DECOMP];
+		if(strlen(d) > 0 && strstr(d, "<") == nil){
+			decomp[0] = estrtoul(d, 16);
+			d = strstr(d, " ");
+			if(d == nil){
+				/* singleton recompositions are verboden */
+				decomp[1] = 0;
+				if(decomp[0] > 0xFFFF){
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = 0;
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+				} else
+					mydecomp[code] = decomp[0]<<16;
+			} else {
+				d++;
+				decomp[1] = estrtoul(d, 16);
+				if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+					ip = recompext + nrecompext*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					nrecompext++;
+				} else {
+					mydecomp[code] = decomp[0]<<16 | decomp[1];
+					myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+				}
+			}
+		}
+
+		for (; code <= last; code++){
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			if(fields[FIELD_UPPER][0] != '\0')
+				mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+			if(fields[FIELD_LOWER][0] != '\0')
+				mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+			if(fields[FIELD_TITLE][0] != '\0')
+				mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+			myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+		}
+	}
+
+	Bterm(in);
+	findlongchain();
+	markexclusions();
+
+	/*
+	 * according to standard, if totitle(x) is not defined in ucd
+	 * but toupper(x) is, then totitle is defined to be toupper(x)
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == -1
+		&& mytoupper[i] != -1
+		&& !myistitle[i])
+			mytotitle[i] = mytoupper[i];
+	}
+
+	/*
+	 * A couple corrections:
+	 * is*(to*(x)) should be true.
+	 * restore undefined transformations.
+	 * store offset instead of value, makes them sparse.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytoupper[i] != -1)
+			myisupper[mytoupper[i]] = 1;
+		else
+			mytoupper[i] = i;
+
+		if(mytolower[i] != -1)
+			myislower[mytolower[i]] = 1;
+		else
+			mytolower[i] = i;
+
+		if(mytotitle[i] != -1)
+			myistitle[mytotitle[i]] = 1;
+		else
+			mytotitle[i] = i;
+
+		mytoupper[i] = mytoupper[i] - i;
+		mytolower[i] = mytolower[i] - i;
+		mytotitle[i] = mytotitle[i] - i;
+	}
+
+	uchar b;
+	for(i = 0; i < NRUNES; i++){
+		b = 0;
+		if(myisspace[i])
+			b |= 1<<0;
+		if(myisalpha[i])
+			b |= 1<<1;
+		if(myisdigit[i])
+			b |= 1<<2;
+		if(myisupper[i])
+			b |= 1<<3;
+		if(myislower[i])
+			b |= 1<<4;
+		if(myistitle[i])
+			b |= 1<<5;
+
+		myismerged[i] = b;
+	}
+
+	markbreak();
+	mktables();
+	exits(nil);
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runebreak.c
@@ -1,0 +1,293 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runebreakdata"
+
+enum {
+	OTHER, 
+	Hebrew_Letter, Newline, Extend, Format,
+	Katakana, ALetter, MidLetter, MidNum,
+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+	EMOJIEX = 0xB0,
+
+	ZWJ = 0x200DU,
+	LINETAB = 0xB,
+};
+
+#define IS(x, y) ((x&0xf) == y)
+#define ISG(x, y) ((x&0xf0) == y)
+
+Rune*
+runegbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')
+		goto Done;
+	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+		return p;
+	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+		return p;
+	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+		goto Done;
+	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+		goto Done;
+	if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+		goto Done;
+	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+		goto Done;
+	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+		while(ISG(rt, EXTEND)){
+			p++;
+			if((r = *p) == 0)
+				return s;
+			rt = breaklkup(r);
+		}
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, EMOJIEX))
+			goto Done;
+		return p;
+	}
+	if(ISG(rt, EXTEND) || r == ZWJ)
+		goto Done;
+	if(ISG(lt, REGION) && ISG(rt, REGION))
+		goto Done;
+
+	return p;
+
+Done:
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+char*
+utfgbreak(char *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	char *p;
+
+	p = s;
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')
+		goto Done;
+	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+		return p;
+	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+		return p;
+	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+		goto Done;
+	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+		goto Done;
+	if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+		goto Done;
+	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+		goto Done;
+	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+		while(ISG(rt, EXTEND)){
+			p += chartorune(&r, p);
+			chartorune(&r, p);
+			if(r == 0)
+				return s;
+			rt = breaklkup(r);
+		}
+		if(r != ZWJ)
+			return p;
+
+		p += chartorune(&r, p);
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, EMOJIEX))
+			goto Done;
+		return p;
+	}
+	if(ISG(rt, EXTEND) || r == ZWJ)
+		goto Done;
+	if(ISG(lt, REGION) && ISG(rt, REGION))
+		goto Done;
+
+	return p;
+
+Done:
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}
+
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
+
+Rune*
+runewbreak(Rune *s)	/* returns a pointer past the first word segment of s, or s when the end of the string is reached first */
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')	/* never split CR LF */
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)	/* break after a newline */
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)	/* break before a newline */
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		if((IS(rt, MidLetter) || MNLQ(rt)) && p[1] != 0 && AH(breaklkup(p[1])))	/* letter, mid, letter: needs one rune of lookahead */
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){	/* regional indicators, optionally joined through a ZWJ */
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:	/* l and r stay together; the segment ends after r unless nothing follows it */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+char*
+utfwbreak(char *s)	/* UTF-8 counterpart of runewbreak: pointer past the first word segment of s, or s at end of string */
+{
+	Rune l, r;
+	Rune peek;
+	uchar lt, rt;
+	char *p;
+
+	p = s;
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	if(*p == 0)	/* test for the end first so peek is never decoded past the NUL */
+		return s;
+	chartorune(&peek, p+chartorune(&r, p));	/* r is the rune at p, peek the one after it */
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')	/* never split CR LF */
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)	/* break after a newline */
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)	/* break before a newline */
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		if(IS(rt, MidLetter) || MNLQ(rt))
+		if(peek != 0 && AH(breaklkup(peek)))
+			goto Done;
+
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+
+		if(IS(lt, Hebrew_Letter) && r == '"')
+		if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
+			goto Done;
+
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && peek != 0 && IS(breaklkup(peek), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){	/* regional indicators, optionally joined through a ZWJ */
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p += chartorune(&r, p);
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:	/* l and r stay together: step over r, then report s if the string ends there */
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runeistype.c
@@ -1,0 +1,52 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runeistypedata"
+
+int
+isspacerune(Rune c)
+{
+	if(c > Runemax)
+		return 0;
+	return (mergedlkup(c) & Lspace) == Lspace;
+}
+
+int
+isalpharune(Rune c)
+{
+	if(c > Runemax)
+		return 0;
+	return (mergedlkup(c) & Lalpha) == Lalpha;
+}
+
+int
+isdigitrune(Rune c)
+{
+	if(c > Runemax)
+		return 0;
+	return (mergedlkup(c) & Ldigit) == Ldigit;
+}
+
+int
+isupperrune(Rune c)
+{
+	if(c > Runemax)
+		return 0;
+	return (mergedlkup(c) & Lupper) == Lupper;
+}
+
+int
+islowerrune(Rune c)
+{
+	if(c > Runemax)
+		return 0;
+	return (mergedlkup(c) & Llower) == Llower;
+}
+
+int
+istitlerune(Rune c)
+{
+	if(c > Runemax)
+		return 0;
+	return (mergedlkup(c) & Ltitle) == Ltitle;
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runenorm.c
@@ -1,0 +1,444 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runenormdata"
+
+//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+enum {
+	SBase = 0xAC00,
+	LBase = 0x1100,
+	VBase = 0x1161,
+	TBase = 0x11A7,
+
+	LCount = 19,
+	VCount = 21,
+	TCount = 28,
+	NCount = VCount * TCount,
+	SCount = LCount * NCount,
+
+	LLast = LBase + LCount - 1,
+	SLast = SBase + SCount - 1,
+	VLast = VBase + VCount - 1,
+	TLast = TBase + TCount - 1,
+};
+
+/*
+ * Most runes decompose in to one/two
+ * other runes with codepoints < 0xFFFF,
+ * however there are some exceptions.
+ * To keep the table size down we instead
+ * store an index in to an exception range
+ * within the private use section and use
+ * an exception table.
+ */
+enum {
+	Estart = 0xEEEE,
+	Estop = 0xF8FF,
+};
+
+static Rune
+_runedecomp(Rune c, Rune *r2)
+{
+	uint x;
+
+	if(c < Runeself){
+		*r2 = 0;
+		return 0;
+	}
+
+	//korean
+	if(c >= SBase && c <= SLast){
+		c -= SBase;
+		x = c % TCount;
+		if(x){
+			*r2 = TBase + x;
+			return SBase + (c - x);
+		}
+		*r2 = VBase + ((c % NCount) / TCount);
+		return LBase + (c / NCount);
+	}
+
+	x = decomplkup(c);
+	if((x & 0xFFFF) != 0){
+		*r2 = x & 0xFFFF;
+		return x>>16;
+	}
+	x >>= 16;
+	if(x >= Estart && x < Estop){
+		Rune *r;
+		r = _decompexceptions[x - Estart];
+		*r2 = r[1];
+		return r[0];
+	}
+	*r2 = 0;
+	return x;
+}
+
+static Rune
+_runerecomp(Rune r0, Rune r1)
+{
+	uint x, y, *p, next;
+
+	if(r0 >= LBase && r0 <= LLast){
+		if(r1 < VBase || r1 > VLast)
+			return 0;
+		x = (r0 - LBase) * NCount + (r1 - VBase) * TCount;
+		return SBase + x;
+	}
+	if(r0 >= SBase && r0 <= SLast && (r0 - SBase) % TCount == 0){
+		if(r1 > TBase && r1 <= TLast)
+			return r0 + (r1 - TBase);
+		return 0;
+	}
+	if(r0 > 0xFFFF || r1 > 0xFFFF){
+		for(x = 0; x < nelem(_recompexceptions); x++)
+			if(r0 == _recompexceptions[x][1] && r1 == _recompexceptions[x][2])
+				return  _recompexceptions[x][0];
+		return 0;
+	}
+	y = x = r0<<16 | r1;
+	x ^= x >> 16;
+	x *= 0x21f0aaad;
+	x ^= x >> 15;
+	x *= 0xd35a2d97;
+	x ^= x >> 15;
+	p = _recompdata + (x%512)*2;
+	while(p[0] != y){
+		next = p[1]>>16;
+		if(!next)
+			return 0;
+		p = _recompcoll + (next-1)*2;
+	}
+	return p[1] & 0xFFFF;
+}
+
+static void
+runecccsort(Rune *a, int len)
+{
+	Rune r;
+	int i, j;
+
+	for(i = 1; i < len; i++){
+		r = a[i];
+		for(j = i; j > 0 && ccclkup(a[j-1]) > ccclkup(r); j--)
+			a[j] = a[j-1];
+		a[j] = r;
+	}
+}
+
+static int
+boundary(Rune r)
+{
+	return !(qclkup(r) & (Qnfcno|Qnfcmay));
+}
+
+/*
+ * Stk stores the entire context for a chunk of
+ * an input string that is being normalized.
+ * In accordance to the standard, Unicode text
+ * has no upper bound for the amount of conjoining
+ * (also called non-starter) elements associated with
+ * a base rune. Thus to implement normalization within
+ * reasonable memory constraints we implement the
+ * "Stream-Safe Text Format" as defined in UAX #15 § 13.
+ */
+typedef struct {
+	Rune a[Maxnormctx];
+	Rune *e;
+} Stk;
+
+static int
+push(Stk *s, Rune c)
+{
+	int n, l;
+	Rune r2, b[Maxdecomp];
+	Rune *p = b + nelem(b) - 1;
+
+	for(*p = c; c = _runedecomp(c, &r2); *p = c){
+		assert(p > b);
+		if(r2 != 0)
+			*p-- = r2;
+	}
+
+	n = b + nelem(b) - p;
+	l = nelem(s->a) - (s->e - s->a);
+	if(n > l){
+		werrstr("runenorm: buffer overflow");
+		return -1;
+	}
+	l -= n;
+	for(; n > 0; n--)
+		*s->e++ = *p++;
+	return l;
+}
+
+/*
+ * Worst case recomposition, this happens when we have to compose
+ * two runes who both have a CCC of zero.
+ */
+static void
+worstrecomp(Stk *s)
+{
+	int done;
+	Rune c, *p, *rp;
+
+	for(done = 0; done == 0;){
+		done = 1;
+		for(p = s->a; p+1 < s->e; p++){
+			c = _runerecomp(p[0], p[1]);
+			if(c == 0)
+				continue;
+			done = 0;
+			*p = c;
+			for(rp = p+1; rp < s->e-1; rp++)
+				rp[0] = rp[1];
+			s->e--;
+			p--;
+		}
+	}
+}
+
+static void
+cccrecomp(Stk *s)
+{
+	Rune c, *p, *rp;
+
+	for(p = s->a + 1; p < s->e; p++){
+		c  = _runerecomp(s->a[0], *p);
+		if(c != 0){
+			s->a[0] = c;
+			for(rp = p; rp < s->e-1; rp++){
+				rp[0] = rp[1];
+			}
+			s->e--;
+			p--;
+		} else while(p + 1 < s->e && ccclkup(p[0]) == ccclkup(p[1]))
+			p++;
+	}
+}
+
+void
+norminit(Norm *n, int compose, void *ctx, long (*getrune)(void*))
+{
+	memset(n, 0, sizeof *n);
+	n->ctx = ctx;
+	n->getrune = getrune;
+	n->compose = compose;
+	n->obuf.e = n->obuf.a;
+	n->ibuf.e = n->ibuf.a;
+}
+
+int NORMDEBUG;
+
+static long
+peekrune(Norm *n)
+{
+	long r;
+
+	if(n->ibuf.e > n->ibuf.a)
+		return n->ibuf.e[-1];
+
+	r = n->getrune(n->ctx);
+	if(r >= 0)
+		*n->ibuf.e++ = r;
+	return r;
+}
+
+static long
+getrune(Norm *n)
+{
+	if(n->ibuf.e > n->ibuf.a)
+		return *--n->ibuf.e;
+	return n->getrune(n->ctx);
+}
+
+long
+normpull(Norm *n, Rune *rdst, long max, int flush)
+{
+	Rune *rp, *re;
+	Stk stk;
+	Rune *dot;
+	int r;
+	long c;
+
+	rp = rdst;
+	re = rdst + max;
+	dot = nil;
+	c = 0;
+	while(rp < re){
+		if(n->obuf.e != n->obuf.a){
+			memcpy(stk.a, n->obuf.a, (n->obuf.e - n->obuf.a)*sizeof(Rune));
+			stk.e = stk.a + (n->obuf.e - n->obuf.a);
+			n->obuf.e = n->obuf.a;
+			c = stk.a[0];
+			goto Flush;
+		}
+
+		stk.e = stk.a;
+		c = getrune(n);
+		if(c < 0)
+			break;
+		push(&stk, c);
+		c = peekrune(n);
+		if(stk.e == stk.a+1 && stk.a[0] < Runeself && c < Runeself && c >= 0)
+			goto Flush;
+		while(c >= 0 && ccclkup(c) != 0){
+			r = push(&stk, getrune(n));
+			c = peekrune(n);
+			if(r > 2)
+				continue;
+			if(ccclkup(stk.a[0]) != 0){
+				assert(r > 0);
+				r--;
+			} else
+				assert(r >= 0);
+			if(r == 0 || (c == 0x0344 && r < 2)){
+				/* in reverse */
+				if(r > 0){
+					getrune(n);
+					*n->ibuf.e++ = 0x301;
+					*n->ibuf.e++ = 0x308;
+				}
+				*n->ibuf.e++ = 0x034F;
+				break;
+			}
+		}
+		if(stk.e - stk.a > 1)
+			runecccsort(stk.a, stk.e - stk.a);
+
+		if(!n->compose)
+			goto Flush;
+
+		if(ccclkup(stk.e[-1]) == 0){
+			Rune tmp;
+			while(c >= 0 && (!boundary(c) || !boundary(_runedecomp(c, &tmp)))){
+				if(push(&stk, getrune(n)) == -1){
+					*n->ibuf.e++ = c;
+					for(r = 0; r < Maxdecomp; r++)
+						*n->ibuf.e++ = *--stk.e;
+					break;
+				}
+				c = peekrune(n);
+			}
+			worstrecomp(&stk);
+		} else if(ccclkup(stk.a[0]) == 0)
+			cccrecomp(&stk);
+
+Flush:
+		if(flush || c >= 0)
+			for(dot = stk.a; dot < stk.e; dot++){
+				if(rp == re)
+					goto Out;
+				*rp++ = *dot;
+			}
+		dot = nil;
+		if(c < 0)
+			break;
+	}
+Out:
+	if(c < 0 && !flush){
+		while(stk.e > stk.a)
+			*n->ibuf.e++ = *--stk.e;
+	}
+	if(dot != nil){
+		memcpy(n->obuf.a, dot, (stk.e - dot) * sizeof(Rune));
+		n->obuf.e = n->obuf.a + (stk.e - dot);
+	}
+
+	return rp - rdst;
+}
+
+typedef struct {
+	Rune *s, *p;
+	int n;
+} Rctx;
+
+static long
+runegetrune(void *ctx)
+{
+	Rctx *c;
+
+	c = ctx;
+	if(c->p >= c->s + c->n)
+		return -1;
+	return *c->p++;
+}
+
+static long
+runedostr(Rune *dst, long ndst, Rune *src, long nsrc, int comp)
+{
+	Rctx c;
+	Norm n;
+
+	c.s = c.p = src;
+	c.n = nsrc;
+	norminit(&n, comp, &c, runegetrune);
+	return normpull(&n, dst, ndst, 1);
+}
+
+long
+runecomp(Rune *dst, long ndst, Rune *src, long nsrc)
+{
+	return runedostr(dst, ndst, src, nsrc, 1);
+}
+
+long
+runedecomp(Rune *dst, long ndst, Rune *src, long nsrc)
+{
+	return runedostr(dst, ndst, src, nsrc, 0);
+}
+
+typedef struct {
+	char *s, *p;
+	int n;
+} Uctx;
+
+static long
+utfgetrune(void *ctx)
+{
+	Uctx *c;
+	Rune r;
+
+	c = ctx;
+	if(c->p >= c->s + c->n)
+		return -1;
+	c->p += chartorune(&r, c->p);
+	return r;
+}
+
+static long
+utfdostr(char *dst, long ndst, char *src, long nsrc, int comp)	/* normalize nsrc bytes of src into dst; returns bytes written */
+{
+	Uctx c;
+	Norm n;
+	Rune buf[Maxnormctx];
+	long i, w;
+	char *e, *p;
+
+	c.s = c.p = src;
+	c.n = nsrc;
+	norminit(&n, comp, &c, utfgetrune);
+	for(p = dst, e = dst + ndst; p < e;){
+		w = normpull(&n, buf, nelem(buf), 1);
+		if(w == 0)	/* input exhausted */
+			break;
+		for(i = 0; i < w; i++){
+			if(p + runelen(buf[i]) >= e)	/* NOTE(review): >= keeps one byte of dst unused, presumably for a terminator; confirm */
+				return p - dst;	/* stop here: pulling more would skip this rune and emit later ones */
+			p += runetochar(p, buf+i);
+		}
+	}
+	return p - dst;
+}
+
+long
+utfcomp(char *dst, long ndst, char *src, long nsrc)
+{
+	return utfdostr(dst, ndst, src, nsrc, 1);
+}
+
+long
+utfdecomp(char *dst, long ndst, char *src, long nsrc)
+{
+	return utfdostr(dst, ndst, src, nsrc, 0);
+}
--- /dev/null
+++ b/sys/src/libc/ucd/runetotype.c
@@ -1,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runetotypedata"
+
+Rune
+toupperrune(Rune c)
+{
+	return c > Runemax ? c : c + upperlkup(c);	/* table stores offsets; out-of-range values are returned as is, like the is*rune guards */
+}
+
+Rune
+tolowerrune(Rune c)
+{
+	return c > Runemax ? c : c + lowerlkup(c);	/* table stores offsets; out-of-range values are returned as is, like the is*rune guards */
+}
+
+Rune
+totitlerune(Rune c)
+{
+	return c > Runemax ? c : c + titlelkup(c);	/* table stores offsets; out-of-range values are returned as is, like the is*rune guards */
+}
--