shithub: neoventi

Download patch

ref: a24a074bc3809078435fbcb45250967ec9a5891e
parent: a1aaaac69337a484bb66c9c5b3ffc581b1319dac
author: Noam Preil <noam@pixelhero.dev>
date: Sat Mar 23 07:22:59 EDT 2024

cache optimizations; 2x faster than venti!

--- a/cache.c
+++ b/cache.c
@@ -18,10 +18,10 @@
 static Lock freelock;
 
 #define NENTRIES 8
-#define ENTRIES 0x10000
+#define ENTRIES 0x80000
 // Oversizes, but ensures power-of-two for fibonacci hashing
-#define BUCKETS (ENTRIES/4)
-#define BITS 56
+#define BUCKETS (ENTRIES/8)
+#define BITS 48
 
 void
 cacheinit(void)
@@ -29,9 +29,10 @@
 	usize displacement;
 	assert(sizeof(Lock) == 8);
 	assert(sizeof(bucket_t) == 64);
-	data = sbrk(8192 * ENTRIES);
-	// 18725 buckets is the minimum for 128K entries; we round up to 32,768. That "wastes" ~1MiB, but spreads entries out further,
-	// and gives a nice power of two for fibonacci hashing.
+	data = sbrk((usize)0x2000 * ENTRIES);
+	if(data == (void*)-1)
+		sysfatal("failed to allocate cache space of %lux", 8192*ENTRIES/2);
+	print("data %llx, end %llx\n", data, data + 8192*ENTRIES);
 	buckets = sbrk(64 + 64*BUCKETS);
 	freelist = sbrk(4 * ENTRIES);
 	displacement = ((usize)buckets & 0b111111);
@@ -67,30 +68,44 @@
 	return result;
 }
 
-// Called to indicate that a request has been finished.
-// See documentation at top of file for more details.
-void
-cachedone(u16int arenaindex, u16int blockindex)
+static bucket_t*
+getbucket(u32int key)
 {
-	u64int bucketindex;
-	bucket_t *bucket;
-	u32int key;
-	key = arenaindex<<16 | blockindex;
-	// We have 32K buckets, so we want 15 bits
-	bucketindex = ((u64int)key * 11400714819323198485LL) >> BITS;
-	bucket = &buckets[bucketindex];
-	// Need to do linear probing to find the right bucket to unlock
-	for(;;){
-		for(int i = 0; i < NENTRIES; i += 1){
-			if(bucket->index[i] == key){
-				unlock(bucket);
-				return;
-			}
-		}
-		bucket++;
+	// Fibonacci hashing!
+	return &buckets[key * 11400714819323198485LL >> BITS];
+}
+
+static void
+put(bucket_t *bucket, int i, u32int key, u32int cacheindex)
+{
+	bucket->index[i] = key;
+	bucket->values[i*3] = cacheindex&0xFF;
+	bucket->values[i*3+1] = (cacheindex>>8)&0xFF;
+	bucket->values[i*3+2] = cacheindex>>16;
+}
+
+static void
+get(bucket_t *bucket, int i, char **buf)
+{
+	usize cacheindex = bucket->values[i*3] | (bucket->values[i*3+1]<<8) | (bucket->values[i*3+2]<<16);
+	*buf = data + cacheindex * 8192;
+	if(*buf == (void*)0x7fffdfff){
+		print("data %llx, cacheindex %lld, data+cacheindex*0x2000 %llx\n", data, cacheindex, data+cacheindex*0x2000);
 	}
 }
 
+static int
+trybucket(bucket_t *bucket, int *i, u32int key)
+{
+	for(*i = 0; *i < NENTRIES; *i += 1){
+		if(bucket->index[*i] == key)
+			return 1;
+		if(bucket->index[*i] == 0)
+			return 0;
+	}
+	return -1;
+}
+
 // Looks up a cache entry. If the entry is not present in cache,
 // this grabs a free buffer, inserts it into cache, and yields
 // its index so that the caller may fill it up.
@@ -98,38 +113,30 @@
 int
 cachelookup(char **buf, u16int arenaindex, u32int blockindex)
 {
-	u64int bucketindex;
 	u32int cacheindex;
-	bucket_t *bucket;
-	u32int key;
-	assert(blockindex < 0x10000);
-	key = arenaindex << 16 | (blockindex&0xFFFF);
-	// We have 32K buckets, so we want 15 bits
-	bucketindex = ((u64int)key * 11400714819323198485LL) >> BITS;
-	bucket = &buckets[bucketindex];
+	int i;
+	u32int key = (arenaindex << 16) | blockindex;
+	bucket_t *bucket = getbucket(key);
 	while(bucket != bucketend){
-		lock(bucket);
-		for(int i = 0; i < NENTRIES; i += 1){
-			if(bucket->index[i] == 0){
-				// Key does not exist
-				cacheindex = grabfree();
-				*buf = &data[cacheindex*8192];
-				assert((cacheindex & 0xFF000000) == 0);
-				bucket->index[i] = key;
-				bucket->values[i*3] = cacheindex&0xFF;
-				bucket->values[i*3+1] = (cacheindex>>8)&0xFF;
-				bucket->values[i*3+2] = cacheindex>>16;
-				// Leaves bucket locked intentionally!
-				return 0;
-			}
-			if(bucket->index[i] == key){
-				u32int cacheindex = bucket->values[i*3] | (bucket->values[i*3+1]<<8) | (bucket->values[i*3+2]<<16);
-				*buf = &data[cacheindex * 8192];
-				return 1;
-			}
+		switch(trybucket(bucket, &i, key)){
+		case -1:
+			// miss. linear probe.
+			bucket++;
+			continue;
+		case 0:
+			// found an empty slot
+			cacheindex = grabfree();
+			*buf = data+(usize)cacheindex*8192;
+	if(*buf == (void*)0x7fffdfff){
+		print("data %llx, cacheindex %lld, data+cacheindex*0x2000 %llx\n", data, cacheindex, data+cacheindex*0x2000);
+	}
+			put(bucket, i, key, cacheindex);
+			return 0;
+		case 1:
+			// already exists
+			get(bucket, i, buf);
+			return 1;
 		}
-		unlock(bucket);
-		bucket++;
 	}
 //FIXME not unreachable
 //if the final bucket is full, we must handle it
--- a/disk.c
+++ b/disk.c
@@ -46,18 +46,6 @@
 	return 0;
 }
 
-static VtArena*
-arenafromindex(u64int aindex)
-{
-	u64int i;
-	for(i = 0; i < numarenas; i += 1){
-		if(strcmp(arenas[i].name, index.amap[aindex].name) == 0)
-			return &arenas[i];
-	}
-	sysfatal("arena not found");
-	return nil;
-}
-
 static u64int
 aindexfromaddr(u64int addr)
 {
@@ -72,14 +60,17 @@
 int
 vtreadlookup(u8int *score, VtAddress *addr)
 {
-	u8int buf[0x2000];
+	u8int *buf;
 	u16int entry;
 	u64int aindex;
 	u64int bucket = U32GET(score) / index.div;
 	VtISect *s_sect = &index.sects[isectforbucket(bucket)];
 	bucket -= s_sect->start;
-	if(pread(s_sect->fd, (char*)buf, s_sect->blocksize, s_sect->blockbase + (bucket << s_sect->blocklog)) != s_sect->blocksize)
-		sysfatal("Failed to read bucket");
+	u16int key = s_sect->cacheindex + (bucket >> 16);
+	if(!cachelookup((char**)&buf, key, bucket & 0xffff)){
+		if(pread(s_sect->fd, (char*)buf, s_sect->blocksize, s_sect->blockbase + (bucket << s_sect->blocklog)) != s_sect->blocksize)
+			sysfatal("Failed to read bucket");
+	}
 	if(s_sect->bucketmagic && U32GET(buf + 2) != s_sect->bucketmagic)
 		sysfatal("index is corrupt: invalid bucket magic: sect %ux, buck %ux", s_sect->bucketmagic, U32GET(buf + 2));
 	if(!bucketlookup(buf, score, &entry))
@@ -88,7 +79,7 @@
 	addr->size = U16GET((buf + 6 + (entry * IEntrySize) + 34));
 	addr->blocks = buf[6 + (entry*IEntrySize) + 37];
 	aindex = aindexfromaddr(addr->offset);
-	addr->s_arena = arenafromindex(aindex);
+	addr->s_arena = index.amap[aindex].arena;
 	addr->offset -= index.amap[aindex].start;
 	return 1;
 }
@@ -125,7 +116,6 @@
 		sysfatal("invalid blocksize %d\n", arena->blocksize);
 	if(!cachelookup(&buf, arena->index, blockindex)){
 		if(pread(arena->fd, buf, arena->blocksize, arena->base+(blockindex*arena->blocksize)) != arena->blocksize){
-			cachedone(arena->index, blockindex);
 			return nil;
 		}
 	}
@@ -154,7 +144,6 @@
 		if(m > size - n)
 			m = size - n;
 		memcpy(&dbuf[n], &buf[off], m);
-		cachedone(arena->index, addr/arena->blocksize);
 		n += m;
 		off = 0;
 		addr += arena->blocksize;
@@ -166,7 +155,7 @@
 readclump(uchar *dst, VtAddress addr)
 {
 	u16int size = addr.blocks<<ABlockLog;
-	uchar *buf = malloc(size);
+	uchar buf[0x10000];
 	vtreadarena(addr.s_arena, addr.offset, buf, size);
 	size = U16GET(buf+7);
 	if(buf[29] == 2){
@@ -177,7 +166,6 @@
 		}
 	} else if(buf[29] == 1)
 		memcpy(dst, buf+38, size);
-	free(buf);
 	return 1;
 }
 
@@ -322,6 +310,7 @@
 	sect->blocklog = u64log2(sect->blocksize);
 	sect->tabbase = (PartBlank + HeadSize + sect->blocksize - 1) & ~(sect->blocksize - 1);
 	sect->tabsize = sect->blockbase - sect->tabbase;
+	sect->cacheindex = numarenas;
 }
 
 static void
@@ -404,4 +393,15 @@
 {
 	initisectpart("/dev/" MACHINE "/isect");
 	parseindex();
+	for(int i = 0; i < index.namap; i += 1){
+		int found = 0;
+		for(int j = 0; j < numarenas; j += 1)
+			if(strcmp(arenas[j].name, index.amap[i].name) == 0){
+				found = 1;
+				index.amap[i].arena = &arenas[j];
+				break;
+			}
+		if(!found)
+			sysfatal("unable to build arena map");
+	}
 }
--- a/neoventi.c
+++ b/neoventi.c
@@ -1,6 +1,5 @@
 #include <u.h>
 #include <libc.h>
-#include <pool.h>
 #include <bio.h>
 #include <thread.h>
 #include "neoventi.h"
@@ -26,9 +25,6 @@
 validate(void)
 {
 	fprint(2, "TODO: validate initial state");
-	for(int i = 0; i < 1000; i += 1){
-		poolcheck(mainmem);
-	}
 }
 
 void
--- a/neoventi.h
+++ b/neoventi.h
@@ -92,11 +92,15 @@
 	u32int		start;			/* first bucket in this section */
 	u32int		stop;			/* limit of buckets in this section */
 	int fd;
+
+	/* used for the cache key */
+	u16int cacheindex;
 } VtISect;
 
 typedef struct {
 	char name[NameSize];
 	u64int start, stop;
+	VtArena *arena;
 } MapEntry;
 
 enum
--- a/notebook
+++ b/notebook
@@ -887,3 +887,199 @@
 
 With the cache enabled, I get a compression error >_>
 
+Lots of pain and a month later, figured out the issue (the pain wasn't related to neoventi, mostly, just depression and grief. YAYYY!)
+
+Short story: the assumptions were wrong. The fd was useless as part of the key, because all the arenas are in the same file, and the block index is _arena-local_. Drop the fd, use a bespoke cache-specific arena index alongside the block index, and voila, the key goes from five bytes to four and everything works!
+
+(With the cachedone() function modified to probe linearly.)
+
+Bonus: since the value is three bytes, the entry size drops from eight bytes to seven; the bucket's 56 non-lock bytes are now enough for eight entries instead of seven :D
+
+(and, if the lock is reduced from eight bytes to one, we can trivially add a ninth entry per cache line!)
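+
+For reference, this is roughly what the 64-byte bucket works out to (a sketch reconstructed from the asserts and the index/values accesses above; the real declaration in cache.c may differ slightly):
+
+typedef struct {
+	Lock;				/* 8 bytes today; shrinking it to 1 makes room for a 9th entry */
+	u32int index[NENTRIES];		/* 8 keys: (arena cache index << 16) | arena-local block index */
+	uchar values[NENTRIES*3];	/* 8 values: 3-byte index of an 8KiB buffer in the data region */
+} bucket_t;				/* 8 + 32 + 24 = 64 bytes: exactly one cache line */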
+
+This reduces the _cold_ time for walk /n/vac from ~30s to ~23s, and warm time from ~30s to ~19s. Profiling indicates that >70% of time is still spent in read() - including on the warm run when all data is in cache! - which heavily implies that the index access is the current bottleneck. So, we need to stuff the index entries into the disk cache as well as a first step.
+
+First idea is to give index sections a cache index as with arenas - there's definitely room in the address space for that! - and then use the index bucket as the block index (which it is!)
+
+The problem with that is that, unlike arenas, where 16 bits is enough for the block index, a typical venti has one single large index section rather than many small ones, so the bucket number cannot fit into 16 bits.
+
+A few approaches are obvious:
+
+- Split the index section on disk. Adjust the formatter to default to generating many smaller sections.
+	- This requires regenerating the index, and breaks compatibility. hard pass.
+- _Fake_ splitting the index section. Generate many index sections in memory that all correspond to the same real index section, each with a subset of buckets.
+	- This results in roughly one index section per 10GiB of storage. For 32TiB - the maximum arena storage that can fit in the cache as-is, as specced out earlier - we'd need ~3200 index sections. This won't pollute the cache key space at all. (For comparison, for arenas, we definitionally need ~1 entry per 512MiB!)
+	- This can be approximated by faking the key space. Each index section is given a _base_ cache index, and its range is calculated (so that future sections can be added with a higher base). For my system, I'd need ~75 fake index sections.
+	- This can be accomplished by splitting the bucket number into its lower sixteen bits (the simulated block index) and the rest; at _most_ 12 bits are needed for the index base offset, _and that's if 32TiB are represented in a single index section!_ (See the sketch just after this list.)
+	- This seems like a simple enough solution.
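+
+Concretely, the lookup in vtreadlookup() comes out something like this (a sketch of the disk.c side; s_sect->cacheindex is the per-section base assigned at init):
+
+	/* fake arena index: this section's base plus the high bits of the bucket number */
+	u16int key = s_sect->cacheindex + (bucket >> 16);
+	/* the low 16 bits of the bucket stand in for the block index */
+	if(!cachelookup((char**)&buf, key, bucket & 0xffff)){
+		/* miss: cachelookup handed back an empty buffer; fill it from disk */
+		if(pread(s_sect->fd, (char*)buf, s_sect->blocksize,
+		    s_sect->blockbase + (bucket << s_sect->blocklog)) != s_sect->blocksize)
+			sysfatal("Failed to read bucket");
+	}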
+
+Did that, and now it's 44s cold and 27s hot. No access to disk is needed on subsequent walks, but it's still slower. Going to do a dozen warm runs and then check the profile...
+
+Oh! and that's 44s/27s _with profiling enabled!!!_ That's kinda bullshit.
+
+Regardless, down to 41s/24s by reducing how long locks are held. Need to revisit the cache locking, because this is still not great. ...it also implies there _is_ some lock contention, somehow??? Or, perhaps, it's no longer having to linearly probe again after getting a cache entry (which saves some cache misses and computation).
+
+35.5% in read(), with 1.2M calls?? That's odd, there shouldn't be that many, I think?
+27.5% in cachelookup, 15% in lock, 7% in unlock, and 6% in cachedone. That's ~55% of cache lookup overhead. 3.2 _billion_ calls to lock and unlock.
+
+Ah right, those read calls are network-related :P They're 1/3rd of execution time, so it might be worth fixing that up (pipelining: start reading the next request while handling the current one! TODO!), but for now, the cache problem is much more severe.
+
+With locking outright removed (we're single-threaded for now anyways!!), cold is down to 28s _with profiling!_, and 11.5s warm _WITH PROFILING_!!!
+
+Down to 27.15s/10.8s by removing some hot allocation.
+
+Venti can do it in ~8s warm. Can we beat that without parallelizing more? ...
+
+74.6% of time is spent on read. A bit of that was the from-disk reads, but most is network; let's call it ~70% network overhead. 15% is the cachelookup function. 8% is strcmp+arenafromindex. 1% is processing index entries from the bucket; adding an index cache would have minimal effect on this workload, at least. 0.6% is decompression. Everything else is trivial.
+
+strcmp can probably be avoided. What are we using it for that produces 1.2B calls in the profile??
+
+Mapping arenas from name to index, apparently. That's silly. We should just have the arena map entry contain the VtArena*. Then we only need to do the indexing at startup.
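+
+That is, give the arena map entry a VtArena* and resolve the names once at init, roughly like this (same loop as the disk.c hunk above):
+
+	for(int i = 0; i < index.namap; i += 1){
+		for(int j = 0; j < numarenas; j += 1)
+			if(strcmp(arenas[j].name, index.amap[i].name) == 0){
+				index.amap[i].arena = &arenas[j];
+				break;
+			}
+	}
+
+After that, vtreadlookup() just does addr->s_arena = index.amap[aindex].arena instead of strcmp'ing every arena on every read.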
+
+That brings us down to 24s cold (WITH PROFILING) and 7.74s warm! venti is BEAT!
+
+Without profiling, 24.4s cold, and... 8s warm???
+
+the hell?
+
+meh. close enough. It's plausible that the profiling is actually faster here for some reason lol
+
+Interestingly, the profile has changed a bit: 62% read, 31% cachelookup, 3.3% vtreadlookup, 1.6% decompression, and then a long tail of who cares.
+
+Going to split cachelookup into pieces so I can see what's actually taking time. Lack of inlining means that might actually make things a tad slower, but the code will be cleaner so I don't really care.
+
+static bucket_t*
+getbucket(u32int key)
+{
+	// Fibonacci hashing!
+	return &buckets[key * 11400714819323198485LL >> BITS];
+}
+
+static void
+put(bucket_t *bucket, int i, u32int key, u32int cacheindex)
+{
+	bucket->index[i] = key;
+	bucket->values[i*3] = cacheindex&0xFF;
+	bucket->values[i*3+1] = (cacheindex>>8)&0xFF;
+	bucket->values[i*3+2] = cacheindex>>16;
+}
+
+// Looks up a cache entry. If the entry is not present in cache,
+// this grabs a free buffer, inserts it into cache, and yields
+// its index so that the caller may fill it up.
+// Returns whether the data was already in cache.
+int
+cachelookup(char **buf, u16int arenaindex, u32int blockindex)
+{
+	u32int cacheindex;
+	u32int key = (arenaindex << 16) | blockindex;
+	bucket_t *bucket = getbucket(key);
+	int existed;
+	while(bucket != bucketend){
+		for(int i = 0; i < NENTRIES; i += 1){
+			if(bucket->index[i] == 0){
+				// Key does not exist
+				cacheindex = grabfree();
+				*buf = &data[cacheindex*8192];
+				put(bucket, i, key, cacheindex);
+				return 0;
+			}
+			if(bucket->index[i] == key){
+				u32int cacheindex = bucket->values[i*3] | (bucket->values[i*3+1]<<8) | (bucket->values[i*3+2]<<16);
+				*buf = &data[cacheindex * 8192];
+				return 1;
+			}
+		}
+		bucket++;
+	}
+//FIXME not unreachable
+//if the final bucket is full, we must handle it
+	sysfatal("unreachable");
+	return 0;
+}
+
+ 0.0    0.028  1102923	getbucket
+ 0.0    0.000    41077	put
+14.5   70.896  1102923	cachelookup
+
+Iiiiinteresting. Picking the bucket and inserting into it are trivial. Let's split this further...
+
+
+static void
+get(bucket_t *bucket, int i, char **buf)
+{
+	u32int cacheindex = bucket->values[i*3] | (bucket->values[i*3+1]<<8) | (bucket->values[i*3+2]<<16);
+	*buf = &data[cacheindex * 8192];
+}
+
+static int
+trybucket(bucket_t *bucket, int *i, u32int key)
+{
+	for(*i = 0; *i < NENTRIES; *i += 1){
+		if(bucket->index[*i] == key)
+			return 1;
+		if(bucket->index[*i] == 0)
+			return 0;
+	}
+	return -1;
+}
+
+// Looks up a cache entry. If the entry is not present in cache,
+// this grabs a free buffer, inserts it into cache, and yields
+// its index so that the caller may fill it up.
+// Returns whether the data was already in cache.
+int
+cachelookup(char **buf, u16int arenaindex, u32int blockindex)
+{
+	u32int cacheindex;
+	int i;
+	u32int key = (arenaindex << 16) | blockindex;
+	bucket_t *bucket = getbucket(key);
+	while(bucket != bucketend){
+		switch(trybucket(bucket, &i, key)){
+		case -1:
+			// miss. linear probe.
+			bucket++;
+			continue;
+		case 0:
+			// found an empty slot
+			cacheindex = grabfree();
+			*buf = &data[cacheindex*8192];
+			put(bucket, i, key, cacheindex);
+			return 0;
+		case 1:
+			// already exists
+			get(bucket, i, buf);
+			return 1;
+		}
+	}
+//FIXME not unreachable
+//if the final bucket is full, we must handle it
+	sysfatal("unreachable");
+	return 0;
+}
+
+I'll be surprised if this isn't noticeably slower again.
+
+Ha, yeah. 32.1s cold, 15.75s warm. Yikes. Knew it would be slower, but 2x slower on the warm path? Should be a good profile, at least.
+
+45.7  224.300  1368400	read
+31.2  153.185 3565574884	trybucket
+19.4   95.175    1427308	cachelookup
+ 1.6    7.741   684185	vtreadlookup
+
+...oh. This is probably the weakness of linear probing with insufficient entries; there's an average of 3000 bucket lookups needed to find a good slot. Oops.
+
+Also, BITS is set wrong. We're able to use all the buckets, but only really _trying_ to use the first 256, because we're shifting over by 56. We have 64K entries, and we were reserving ENTRIES/4 buckets (since we had seven entries per bucket, and /7 would not give a power-of-two bucket count!), so BITS should have been 50.
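+
+In other words, BITS has to track the bucket count: BITS = 64 - log2(BUCKETS). Worth a sanity check in cacheinit() so it can't silently drift again (a sketch; not part of this patch):
+
+	assert(BUCKETS == (u64int)1 << (64 - BITS));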
+
+Changing that gives cold time of 20.7s even with the patches! Warm time is 4.39s.
+
+If I set ENTRIES to 0x40000, things break. That should be ~2GiB of memory; I'd think it was malloc-related, but I'm not using malloc, and the issue occurs later: an invalid address in a pread syscall...
+
+The buffer address 0x7fffdfff is apparently invalid, for some reason?
+
+data is at 10ded90 and ends at 410ded90. Regardless of whether it's valid, that address shouldn't actually be returned anywhere. Weird...
+
+...max index into data should be 0x40000 * 0x2000, which is 2GiB. Are array indices of type int? If so, it might be wrapping >_>
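+
+If they are, the fix is cheap either way: force the size math into 64 bits before it ever touches a pointer, and check sbrk's return while at it (essentially what the cache.c change above does, simplified here):
+
+	data = sbrk((usize)0x2000 * ENTRIES);	/* widen first: 0x2000 * 0x40000 overflows a signed 32-bit int */
+	if(data == (void*)-1)
+		sysfatal("failed to allocate cache space");
+
+	/* and when handing out a buffer: */
+	*buf = data + (usize)cacheindex * 8192;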
+
+Ahhhhh, sbrk is failing to allocate 2GiB?
\ No newline at end of file
--- a/server.c
+++ b/server.c
@@ -1,7 +1,6 @@
 #include <u.h>
 #include <libc.h>
 #include <bio.h>
-#include <pool.h>
 #include <thread.h>
 #include "neoventi.h"
 
@@ -58,7 +57,6 @@
 	if(fprint(conn.fd, "venti-02-neoventi\n") == 18){
 		while(read(conn.fd, &c, 1) == 1){
 			if(c == '\n'){
-				poolcheck(mainmem);
 				return;
 			}
 		}
--