shithub: front

Download patch

ref: 4c583d2f6d739b044385a429eca85e7a2dd8333c
parent: 0b58d6e95f009cfb009949f93c4f97e9bf3b8c18
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sun Jul 27 12:48:34 EDT 2025

kernel: New page cache design

Image.pghash had a fixed size, but we
use images of very different sizes
(e.g. fscache).

Most binaries are small, so we are
wasting kernel memory for most images.

So allocate images dynamically for the expected
maximum size: Adding newimage(pages).

Each image gets exactly the pghash size
it needs. Large images (>1024 entries)
get a reduced hash table, but with only 16 collisions max.

With that, we can get rid of the collision
check in pagereclaim(), as images should be
balanced now.

The next thing is that reclaiming idle
images (binaries) was done randomly.

Instead, we now maintain an ordered "idle list"
of images that is sorted by least-frequent use
first, then by decreasing page usage.

We also observe that reclaiming pages from
active images is not worth it. (Unless one
wants to execute binaries larger than the
available physical memory).

Paging in binaries now uses iounit reads,
and cachepage() ensures there are no duplicate
pages in the cache, not adding a page to the
cache when already present.

--- a/sys/src/9/port/cache.c
+++ b/sys/src/9/port/cache.c
@@ -53,15 +53,10 @@
 	Mntcache	*hash[NHASH];
 };
 
-Image fscache = {
-	{
-		.ref = 1,
-	},
-	.notext = 1,
-};
-
 static Cache cache;
 
+Image *fscache;
+
 void
 cinit(void)
 {
@@ -69,9 +64,11 @@
 	Mntcache *m;
 
 	m = xalloc(sizeof(Mntcache)*NFILE);
-	if (m == nil)
+	if(m == nil || (fscache = newimage(TOTALPAGES)) == nil)
 		panic("cinit: no memory");
 
+	fscache->notext = 1;
+
 	cache.alloc = m;
 	cache.head = m;
 
@@ -276,7 +273,7 @@
 	b = 1 << (pn%MAPBITS);
 	if((m->bitmap[pn/MAPBITS] & b) == 0)
 		return nil;
-	p = lookpage(&fscache, cacheaddr(m, pn));
+	p = lookpage(fscache, cacheaddr(m, pn));
 	if(p == nil){
 		m->bitmap[pn/MAPBITS] &= ~b;
 		return nil;
@@ -427,12 +424,12 @@
 				invalidate(m, offset + pn*BY2PG, len);
 				break;
 			}
-			if(fscache.pgref > TOTALPAGES)
-				pagereclaim(&fscache);
+			if(fscache->pgref > TOTALPAGES)
+				pagereclaim(fscache);
 			p = newpage(0, nil, pn*BY2PG);
 			p->daddr = cacheaddr(m, pn);
-			cachedel(&fscache, p->daddr);
-			cachepage(p, &fscache);
+			cachedel(fscache, p->daddr);
+			cachepage(p, fscache);
 			m->bitmap[pn/MAPBITS] |= 1 << (pn%MAPBITS);
 
 			po = offset;
--- a/sys/src/9/port/devswap.c
+++ b/sys/src/9/port/devswap.c
@@ -14,12 +14,7 @@
 static void	pagepte(Segment*, Page**);
 static void	pager(void*);
 
-Image 	swapimage = {
-	{
-		.ref = 1
-	},
-	.notext = 1,
-};
+Image 		*swapimage;
 
 static Chan	*swapchan;
 static uchar	*swapbuf;
@@ -137,16 +132,17 @@
 static int
 reclaim(void)
 {
+	enum {
+		Target = 4*MB/BY2PG,
+	};
 	ulong np;
 
 	for(;;){
-		if((np = pagereclaim(&fscache) + imagereclaim(0)) > 0){
-			if(0) print("reclaim: %lud fscache + inactive image\n", np);
-		} else if((np = pagereclaim(&swapimage)) > 0) {
-			if(0) print("reclaim: %lud swap\n", np);
-		} else if((np = imagereclaim(1)) > 0) {
-			if(0) print("reclaim: %lud active image\n", np);
-		}
+		np = pagereclaim(fscache);
+		if(np < Target)
+			np += imagereclaim(Target-np);
+		if(np < Target)
+			np += pagereclaim(swapimage);
 		if(!needpages(nil))
 			return 1;	/* have pages, done */
 		if(np == 0)
@@ -176,7 +172,7 @@
 			continue;
 		}
 
-		if(swapimage.c == nil || swapalloc.free == 0){
+		if(swapimage == nil || swapimage->c == nil || swapalloc.free == 0){
 		Killbig:
 			if(!freebroken())
 				killbig();
@@ -197,6 +193,12 @@
 			if((s = p->seg[i]) != nil) {
 				switch(s->type&SG_TYPE) {
 				case SG_TEXT:
+					/*
+					 *  imagereclaim() does not reclaim active images anymore,
+					 *  so there is no point in paging out text. it won't
+					 *  recover any pages.
+					 */
+					continue;
 				case SG_DATA:
 				case SG_BSS:
 				case SG_STACK:
@@ -240,7 +242,7 @@
 			continue;
 		for(pg = l->first; pg <= l->last; pg++) {
 			entry = *pg;
-			if(pagedout(entry) || entry->modref & PG_PRIV)
+			if(pagedout(entry) || entry->modref & PG_PRIV || entry->image != nil)
 				continue;
 			if(entry->modref & PG_REF) {
 				entry->modref &= ~PG_REF;
@@ -288,49 +290,37 @@
 	uintptr daddr;
 	Page *outp;
 
-	outp = *pg;
-	switch(s->type & SG_TYPE) {
-	case SG_TEXT:				/* Revert to demand load */
-		*pg = nil;
-		s->used--;
-		putpage(outp);
-		break;
+	if(ioptr >= conf.nswppo)
+		return;
 
-	case SG_DATA:
-	case SG_BSS:
-	case SG_STACK:
-	case SG_SHARED:
-		if(ioptr >= conf.nswppo)
-			break;
+	/*
+	 *  get a new swap address with swapcount 2, one for the pte
+	 *  and one extra ref for us while we write the page to disk
+	 */
+	daddr = newswap();
+	if(daddr == ~0)
+		return;
 
-		/*
-		 *  get a new swap address with swapcount 2, one for the pte
-		 *  and one extra ref for us while we write the page to disk
-		 */
-		daddr = newswap();
-		if(daddr == ~0)
-			break;
+	/* clear any pages referring to it from the cache */
+	cachedel(swapimage, daddr);
 
-		/* clear any pages referring to it from the cache */
-		cachedel(&swapimage, daddr);
+	outp = *pg;
 
-		/* forget anything that it used to cache */
-		uncachepage(outp);
+	/* forget anything that it used to cache */
+	uncachepage(outp);
 
-		/*
-		 *  enter it into the cache so that a fault happening
-		 *  during the write will grab the page from the cache
-		 *  rather than one partially written to the disk
-		 */
-		outp->daddr = daddr;
-		cachepage(outp, &swapimage);
-		*pg = (Page*)(daddr|PG_ONSWAP);
-		s->swapped++;
+	/*
+	 *  enter it into the cache so that a fault happening
+	 *  during the write will grab the page from the cache
+	 *  rather than one partially written to the disk
+	 */
+	outp->daddr = daddr;
+	cachepage(outp, swapimage);
+	*pg = (Page*)(daddr|PG_ONSWAP);
+	s->swapped++;
 
-		/* Add page to IO transaction list */
-		iolist[ioptr++] = outp;
-		break;
-	}
+	/* Add page to IO transaction list */
+	iolist[ioptr++] = outp;
 }
 
 static void
@@ -343,12 +333,12 @@
 		outp = iolist[i];
 
 		assert(outp->ref > 0);
-		assert(outp->image == &swapimage);
+		assert(outp->image == swapimage);
 		assert(outp->daddr != ~0);
 
 		/* only write when swap address still in use */
 		if(swapcount(outp->daddr) > 1){
-			Chan *c = swapimage.c;
+			Chan *c = swapimage->c;
 			KMap *k = kmap(outp);
 			if(waserror()){
 				kunmap(k);
@@ -405,12 +395,17 @@
 	if(s < conf.nswppo)
 		error("swap device too small");
 
-	if(swapimage.c != nil) {
+	if(swapimage != nil) {
 		if(swapalloc.free != conf.nswap)
 			error(Einuse);
-		cclose(swapimage.c);
-		swapimage.c = nil;
+		if(swapimage->c != nil)
+			cclose(swapimage->c);
+		free(swapimage);
 	}
+	swapimage = newimage(s);
+	if(swapimage == nil)
+		error(Enomem);
+	swapimage->notext = 1;
 
 	if(s < conf.nswap){
 		conf.nswap = s;
@@ -423,7 +418,7 @@
 	poperror();
 
 	swapchan = c;
-	swapimage.c = namec("#¶/swapfile", Aopen, ORDWR, 0);
+	swapimage->c = namec("#¶/swapfile", Aopen, ORDWR, 0);
 }
 
 enum {
@@ -465,10 +460,10 @@
 	case Qswapfile:
 		if(!iseve() || omode != ORDWR)
 			error(Eperm);
-		if(swapimage.c != nil)
-			error(Einuse);
-		if(swapchan == nil)
+		if(swapchan == nil || swapimage == nil)
 			error(Egreg);
+		if(swapimage->c != nil)
+			error(Einuse);
 
 		c->mode = openmode(omode);
 		c->flag |= COPEN;
@@ -517,7 +512,11 @@
 	case Qdir:
 		return devdirread(c, va, n, swapdir, nelem(swapdir), devgen);
 	case Qswap:
-		reclaim = imagecached() + fscache.pgref + swapimage.pgref;
+		reclaim = imagecached();
+		if(fscache != nil)
+			reclaim += fscache->pgref;
+		if(swapimage != nil)
+			reclaim += swapimage->pgref;
 		snprint(tmp, sizeof tmp,
 			"%llud memory\n"
 			"%llud pagesize\n"
--- a/sys/src/9/port/fault.c
+++ b/sys/src/9/port/fault.c
@@ -36,14 +36,13 @@
 static void
 pio(Segment *s, uintptr addr, uintptr soff, Page **p)
 {
-	Page *new;
 	KMap *k;
 	Chan *c;
 	int n, ask;
-	char *kaddr;
-	uintptr daddr;
-	Page *loadrec;
+	uintptr o, daddr, paddr;
+	Page *loadrec, *new;
 	Image *image;
+	Block *b;
 
 retry:
 	loadrec = *p;
@@ -57,15 +56,22 @@
 			return;
 		}
 
-		ask = BY2PG;
-		if(soff >= s->flen)
+		ask = image->c->iounit;
+		if(ask == 0) ask = qiomaxatomic;
+		ask &= -BY2PG;
+		if(ask == 0) ask = BY2PG;
+
+		daddr = soff & -ask;
+		if(daddr >= s->flen)
 			ask = 0;
-		else if((soff+ask) > s->flen)
-			ask = s->flen-soff;
+		else if((daddr+ask) > s->flen)
+			ask = s->flen-daddr;
+		paddr = s->base + daddr;
+		daddr += s->fstart;
 	}
 	else {			/* from a swap image */
 		daddr = swapaddr(loadrec);
-		image = &swapimage;
+		image = swapimage;
 		new = lookpage(image, daddr);
 		if(new != nil) {
 			*p = new;
@@ -73,29 +79,49 @@
 			putswap(loadrec);
 			return;
 		}
-
+		paddr = addr;
 		ask = BY2PG;
 	}
 	qunlock(s);
 
-	new = newpage(0, nil, addr);
-	k = kmap(new);
 	c = image->c;
 	while(waserror()) {
 		if(strcmp(up->errstr, Eintr) == 0)
 			continue;
+		faulterror(Eioload, c);
+	}
+	b = devtab[c->type]->bread(c, ask, daddr);
+	if(waserror()){
+		freeblist(b);
+		nexterror();
+	}
+	for(o = 0; o < ask; o += BY2PG){
+		new = lookpage(image, daddr + o);
+		if(new != nil){
+			putpage(new);
+			continue;
+		}
+		new = newpage(0, nil, paddr + o);
+		new->daddr = daddr + o;
+		k = kmap(new);
+		n = ask - o;
+		if(n > BY2PG)
+			n = BY2PG;
+		else if(n < BY2PG)
+			memset((uchar*)VA(k)+n, 0, BY2PG-n);
+		if(readblist(b, (uchar*)VA(k), n, o) != n){
+			kunmap(k);
+			putpage(new);
+			error(Eshort);
+		}
 		kunmap(k);
+		settxtflush(new, s->flushme);
+		cachepage(new, image);
 		putpage(new);
-		faulterror(Eioload, c);
 	}
-	kaddr = (char*)VA(k);
-	n = devtab[c->type]->read(c, kaddr, ask, daddr);
-	if(n != ask)
-		error(Eshort);
-	if(ask < BY2PG)
-		memset(kaddr+ask, 0, BY2PG-ask);
+	freeblist(b);
 	poperror();
-	kunmap(k);
+	poperror();
 
 	qlock(s);
 	/*
@@ -103,34 +129,9 @@
 	 *  (and the pager may have run on that page) while
 	 *  s was unlocked
 	 */
-	if(*p != loadrec) {
-		putpage(new);
-
-		/* another process did it for me */
-		if(!pagedout(*p))
-			return;
-
-		/* another process or the pager got in */
-		goto retry;
-	}
-
-	/*
-	 *  check the cache again to avoid double caching.
-	 */
-	if((*p = lookpage(image, daddr)) != nil)
-		putpage(new);
-	else {
-		new->daddr = daddr;
-		settxtflush(new, s->flushme);
-		cachepage(new, image);
-		*p = new;
-	}
-	if(loadrec == nil)
-		s->used++;
-	else {
-		s->swapped--;
-		putswap(loadrec);
-	}
+	if(*p != loadrec && !pagedout(*p))
+		return;
+	goto retry;
 }
 
 static int
@@ -200,7 +201,7 @@
 		}
 
 		old = *pg;
-		if(old->image == &swapimage && (old->ref + swapcount(old->daddr)) == 1)
+		if(swapimage != nil && old->image == swapimage && (old->ref + swapcount(old->daddr)) == 1)
 			uncachepage(old);
 		if(old->ref > 1 || old->image != nil) {
 			new = newpage(0, &s, addr);
--- a/sys/src/9/port/page.c
+++ b/sys/src/9/port/page.c
@@ -110,30 +110,28 @@
 ulong
 pagereclaim(Image *i)
 {
-	Page **h, **l, **x, *p;
+	Page **h, **e, **l, **x, *p;
 	Page *fh, *ft;
-	ulong np;
-	int c;
+	ulong mp, np;
 
+	if(i == nil)
+		return 0;
+
 	lock(i);
-	if(i->pgref == 0){
+	mp = i->pgref;
+	if(mp == 0){
 		unlock(i);
 		return 0;
 	}
 	np = 0;
 	fh = ft = nil;
-	for(h = i->pghash; h < &i->pghash[PGHSIZE]; h++){
+	e = &i->pghash[i->pghsize];
+	for(h = i->pghash; h < e; h++){
 		l = h;
 		x = nil;
-		c = 1;
 		for(p = *l; p != nil; p = p->next){
-			if(p->ref == 0){
+			if(p->ref == 0)
 				x = l;
-				/* too many collisions, take what we have */
-				if(c >= 64)
-					break;
-			}
-			c++;
 			l = &p->next;
 		}
 		if(x == nil)
@@ -288,15 +286,23 @@
 void
 cachepage(Page *p, Image *i)
 {
-	Page **h;
+	Page *x, **h;
+	uintptr daddr;
 
+	daddr = p->daddr;
+	h = &PGHASH(i, daddr);
 	lock(i);
+	for(x = *h; x != nil; x = x->next)
+		if(x->daddr == daddr)
+			goto done;
+	if(p->image != nil)
+		goto done;
 	p->image = i;
-	h = &PGHASH(i, p->daddr);
 	p->next = *h;
 	*h = p;
 	incref(i);
 	i->pgref++;
+done:
 	unlock(i);
 }
 
@@ -309,13 +315,10 @@
 	i = p->image;
 	if(i == nil)
 		return;
-
-	lock(i);
-	if(p->image != i){
-		unlock(i);
-		return;
-	}
 	l = &PGHASH(i, p->daddr);
+	lock(i);
+	if(p->image != i)
+		goto done;
 	for(x = *l; x != nil; x = x->next) {
 		if(x == p){
 			*l = p->next;
@@ -328,6 +331,7 @@
 		}
 		l = &x->next;
 	}
+done:
 	unlock(i);
 }
 
@@ -336,8 +340,8 @@
 {
 	Page *p, **h, **l;
 
-	lock(i);
 	l = h = &PGHASH(i, daddr);
+	lock(i);
 	for(p = *l; p != nil; p = p->next){
 		if(p->daddr == daddr){
 			*l = p->next;
@@ -359,7 +363,7 @@
 {
 	Page *p;
 
-	while((p = lookpage(i, daddr)) != nil){
+	if((p = lookpage(i, daddr)) != nil){
 		uncachepage(p);
 		putpage(p);
 	}
--- a/sys/src/9/port/portdat.h
+++ b/sys/src/9/port/portdat.h
@@ -473,32 +473,39 @@
 	MNTLOG	=	5,
 	MNTHASH =	1<<MNTLOG,	/* Hash to walk mount table */
 	NFD =		100,		/* per process file descriptors */
-	PGHLOG  =	10,
-	PGHSIZE	=	1<<PGHLOG,	/* Page hash for image lookup */
 	ENVLOG =	5,
 	ENVHASH =	1<<ENVLOG,	/* Egrp hash for variable lookup */
 };
 #define REND(p,s)	((p)->rendhash[(s)&((1<<RENDLOG)-1)])
 #define MOUNTH(p,qid)	((p)->mnthash[(qid).path&((1<<MNTLOG)-1)])
-#define PGHASH(i,daddr)	((i)->pghash[((daddr)>>PGSHIFT)&(PGHSIZE-1)])
+#define PGHASH(i,daddr)	((i)->pghash[((daddr)>>PGSHIFT)&((i)->pghsize-1)])
 
 struct Image
 {
 	Ref;
 	Lock;
+
+	long	pgref;			/* number of cached pages (pgref <= ref) */
+
+	ulong	nattach;		/* usage frequency */
+
+	Image	**link;			/* idle list */
+	Image	*next;			/* idle list */
+
+	Image	*hash;			/* Qid hash chains */
+
+	Segment *s;			/* TEXT segment for image if running */
+
 	Chan	*c;			/* channel to text file, nil when not used */
 	Qid 	qid;			/* Qid for page cache coherence */
 	ulong	dev;			/* Device id of owning channel */
 	ushort	type;			/* Device type of owning channel */
 	char	notext;			/* no file associated */
-	Segment *s;			/* TEXT segment for image if running */
-	Image	*hash;			/* Qid hash chains */
-	Image	*next;			/* Free list */
-	long	pgref;			/* number of cached pages (pgref <= ref) */
-	Page	*pghash[PGHSIZE];	/* page cache */
+
+	ulong	pghsize;
+	Page	*pghash[];		/* page cache */
 };
 
-
 struct Pgrp
 {
 	Ref;
@@ -823,8 +830,8 @@
 extern	int	panicking;
 extern	Queue*	serialoq;
 extern	char*	statename[];
-extern	Image	swapimage;
-extern	Image	fscache;
+extern	Image*	swapimage;
+extern	Image*	fscache;
 extern	char*	sysname;
 extern	uint	qiomaxatomic;
 extern	char*	sysctab[];
--- a/sys/src/9/port/portfns.h
+++ b/sys/src/9/port/portfns.h
@@ -9,7 +9,7 @@
 Block*		allocb(int);
 int		anyhigher(void);
 int		anyready(void);
-Image*		attachimage(Chan*);
+Image*		attachimage(Chan*, ulong size);
 ulong		beswal(ulong);
 uvlong		beswav(uvlong);
 int		blocklen(Block*);
@@ -148,7 +148,7 @@
 _Noreturn void	interrupted(void);
 void		iunlock(Lock*);
 ulong		imagecached(void);
-ulong		imagereclaim(int);
+ulong		imagereclaim(ulong);
 long		incref(Ref*);
 void		init0(void);
 void		initseg(void);
@@ -221,6 +221,7 @@
 int		newfd(Chan*, int);
 Mhead*		newmhead(Chan*);
 Mount*		newmount(Chan*, int, char*);
+Image*		newimage(ulong);
 Page*		newpage(int, Segment **, uintptr);
 Path*		newpath(char*);
 Pgrp*		newpgrp(void);
--- a/sys/src/9/port/proc.c
+++ b/sys/src/9/port/proc.c
@@ -192,7 +192,6 @@
 		if(up->state == Running)
 		if(up->delaysched < 20
 		|| palloc.Lock.p == up
-		|| fscache.Lock.p == up
 		|| procalloc.Lock.p == up){
 			up->delaysched++;
  			delayedscheds++;
--- a/sys/src/9/port/segment.c
+++ b/sys/src/9/port/segment.c
@@ -21,27 +21,50 @@
 static struct Imagealloc
 {
 	Lock;
-	Image	*list;
-	Image	*free;
+
+	QLock	ireclaim;	/* mutex on reclaiming idle images */
+
+	ulong	pgidle;		/* pages in idle list (reclaimable) */
+
+	ulong	nidle;
+	Image	*idle;
 	Image	*hash[IHASHSIZE];
-	QLock	ireclaim;	/* mutex on reclaiming free images */
+
 }imagealloc;
 
 Segment* (*_globalsegattach)(char*);
 
+Image*
+newimage(ulong pages)
+{
+	ulong pghsize;
+	Image *i;
+
+	/* make power of two */
+	pghsize = pages-1;
+	pghsize |= pghsize >> 16;
+	pghsize |= pghsize >> 8;
+	pghsize |= pghsize >> 4;
+	pghsize |= pghsize >> 2;
+	pghsize |= pghsize >> 1;
+	pghsize++;
+
+	if(pghsize > 1024)
+		pghsize >>= 4;
+
+	i = malloc(sizeof(Image) + pghsize * sizeof(Page*));
+	if(i == nil)
+		return nil;
+
+	i->ref = 1;
+	i->pghsize = pghsize;
+
+	return i;
+}
+
 void
 initseg(void)
 {
-	Image *i, *ie;
-
-	imagealloc.list = xalloc(conf.nimage*sizeof(Image));
-	if(imagealloc.list == nil)
-		panic("initseg: no memory for Image");
-	ie = &imagealloc.list[conf.nimage-1];
-	for(i = imagealloc.list; i < ie; i++)
-		i->next = i+1;
-	i->next = nil;
-	imagealloc.free = imagealloc.list;
 }
 
 Segment *
@@ -338,7 +361,7 @@
 
 
 Image*
-attachimage(Chan *c)
+attachimage(Chan *c, ulong pages)
 {
 	Image *i, **l;
 
@@ -350,32 +373,29 @@
 	 * or currently running incarnation
 	 */
 	for(i = ihash(c->qid.path); i != nil; i = i->hash){
-		if(eqchantdqid(c, i->type, i->dev, i->qid, 0))
+		if(eqchantdqid(c, i->type, i->dev, i->qid, 0)){
+			incref(i);
 			goto found;
+		}
 	}
-
-	/* dump pages of inactive images to free image structures */
-	if((i = imagealloc.free) == nil) {
+	if(imagealloc.nidle > conf.nimage
+	|| (i = newimage(pages)) == nil) {
 		unlock(&imagealloc);
-		if(imagereclaim(0) == 0 && imagealloc.free == nil){
+		if(imagealloc.nidle == 0)
+			error(Enomem);
+		if(imagereclaim(0) == 0)
 			freebroken();		/* can use the memory */
-			resrcwait("no image after reclaim");
-		}
 		goto retry;
 	}
-	imagealloc.free = i->next;
-
 	i->type = c->type;
 	i->dev = c->dev;
 	i->qid = c->qid;
-
 	l = &ihash(c->qid.path);
 	i->hash = *l;
 	*l = i;
 found:
-	incref(i);
+	i->nattach++;
 	unlock(&imagealloc);
-
 	lock(i);
 	if(i->c == nil){
 		i->c = c;
@@ -384,11 +404,59 @@
 	return i;
 }
 
+/* remove from idle list */
+static void
+busyimage(Image *i)
+{
+	/* not on idle list? */
+	if(i->link == nil)
+		return;
+
+	lock(&imagealloc);
+	if((*i->link = i->next) != nil)
+		i->next->link  = i->link;
+	i->link = nil;
+	i->next = nil;
+	imagealloc.pgidle -= i->pgref;
+	imagealloc.nidle--;
+	unlock(&imagealloc);
+}
+
+/* insert into idle list */
+static void
+idleimage(Image *i)
+{
+	Image **l, *j;
+
+	/* already on idle list? */
+	if(i->link != nil)
+		return;
+
+	lock(&imagealloc);
+	l = &imagealloc.idle;
+	j = imagealloc.idle;
+	/* sort by least frequently and most pages used first */
+	for(; j != nil; l = &j->next, j = j->next){
+		long c = j->nattach - i->nattach;
+		if(c < 0)
+			continue;
+		if(c > 0)
+			break;
+		if(j->pgref < i->pgref)
+			break;
+	}
+	if((i->next = j) != nil)
+		j->link = &i->next;
+	*(i->link = l) = i;
+	imagealloc.pgidle += i->pgref;
+	imagealloc.nidle++;
+	unlock(&imagealloc);
+}
+
 /* putimage(): called with image locked and unlocks */
 void
 putimage(Image *i)
 {
-	Image *f, **l;
 	Chan *c;
 	long r;
 
@@ -397,24 +465,17 @@
 		unlock(i);
 		return;
 	}
-
-	/*
-	 * all remaining references to this image are from the
-	 * page cache, so close the chan.
-	 */
-	if(r == i->pgref){
-		c = i->c;
-		i->c = nil;
-	} else
-		c = nil;
-
 	if(r == 0){
 		assert(i->pgref == 0);
-		assert(i->c == nil);
 		assert(i->s == nil);
-
+		c = i->c;
+		i->c = nil;
+		busyimage(i);
 		lock(&imagealloc);
-		if(i->ref == 0){
+		r = i->ref;
+		if(r == 0){
+			Image *f, **l;
+
 			l = &ihash(i->qid.path);
 			for(f = *l; f != nil; f = f->hash) {
 				if(f == i) {
@@ -423,13 +484,23 @@
 				}
 				l = &f->hash;
 			}
-			i->next = imagealloc.free;
-			imagealloc.free = i;
 		}
 		unlock(&imagealloc);
+	} else if(r == i->pgref) {
+		assert(i->pgref > 0);
+		assert(i->s == nil);
+		c = i->c;
+		i->c = nil;
+		idleimage(i);
+	} else {
+		c = nil;
+		busyimage(i);
 	}
 	unlock(i);
 
+	if(r == 0)
+		free(i);
+
 	if(c != nil)
 		ccloseq(c);	/* does not block */
 }
@@ -437,57 +508,37 @@
 ulong
 imagecached(void)
 {
-	Image *i, *ie;
-	ulong np;
-
-	np = 0;
-	ie = &imagealloc.list[conf.nimage];
-	for(i = imagealloc.list; i < ie; i++)
-		np += i->pgref;
-	return np;
+	return imagealloc.pgidle;
 }
 
 ulong
-imagereclaim(int active)
+imagereclaim(ulong pages)
 {
-	static int x, y;
 	ulong np;
 	Image *i;
-	int j;
 
+	eqlock(&imagealloc.ireclaim);
+	
+	lock(&imagealloc);
 	np = 0;
+	while(np < pages || imagealloc.nidle > conf.nimage) {
+		i = imagealloc.idle;
+		if(i == nil)
+			break;
+		incref(i);
+		unlock(&imagealloc);
 
-	eqlock(&imagealloc.ireclaim);
-
-	/* try reclaim idle images */
-	for(j = 0; j < conf.nimage; j++, x++) {
-		if(x >= conf.nimage)
-			x = 0;
-		i = &imagealloc.list[x];
-		if(i->ref == 0)
-			continue;
-		if(i->s != nil || i->ref != i->pgref)
-			continue;
 		np += pagereclaim(i);
-		if(np >= 1000)
-			goto Done;
-	}
 
-	if(!active)
-		goto Done;
+		lock(i);
+		busyimage(i);	/* force re-insert into idle list */
+		putimage(i);
 
-	/* try reclaim active images */
-	for(j = 0; j < conf.nimage; j++, y++) {
-		if(y >= conf.nimage)
-			y = 0;
-		i = &imagealloc.list[y];
-		if(i->ref == 0)
-			continue;
-		np += pagereclaim(i);
-		if(np >= 1000)
-			goto Done;
+		lock(&imagealloc);
 	}
-Done:
+	imagealloc.pgidle -= np;
+	unlock(&imagealloc);
+
 	qunlock(&imagealloc.ireclaim);
 
 	return np;
--- a/sys/src/9/port/sysproc.c
+++ b/sys/src/9/port/sysproc.c
@@ -561,7 +561,7 @@
 
 	/* Attach text segment */
 	/* attachimage returns a locked cache image */
-	img = attachimage(tc);
+	img = attachimage(tc, (b-t)>>PGSHIFT);
 	if((ts = img->s) != nil && ts->flen == text){
 		assert(ts->image == img);
 		incref(ts);
--