ref: aa74602e266a6443d8b49ef85241f5f1bfbc5d07
parent: 475c090fef34ae2e9949c8783216aa3e77fe6098
author: Noam Preil <noam@pixelhero.dev>
date: Mon Feb 26 03:15:08 EST 2024
read into cache
--- a/disk.c
+++ b/disk.c
@@ -117,14 +117,20 @@
return len;
}
-// Reads one block from disk into buf, which must be big enough
-int
-vtreadarenablock(VtArena *arena, u32int blockindex, char *buf)
+// Reads one block from disk into the cache, returning a pointer into the
+// cache.
+// If the data is already in cache, it will not be read again.
+// Caller is responsible for calling cachedone(buf);
+char*
+vtreadarenablock(VtArena *arena, u32int blockindex)
{
- if(pread(arena->fd, buf, arena->blocksize, arena->base+(blockindex*arena->blocksize)) != arena->blocksize)
- return 0;
- cacheinsertmap(arena->fd, blockindex, buf);
- return 1;
+ char *buf;
+ if(!cachelookup(&buf, arena->fd, blockindex)){
+ if(pread(arena->fd, buf, arena->blocksize, arena->base+(blockindex*arena->blocksize)) != arena->blocksize)
+ return nil;
+ cachewritten(arena->fd, blockindex);
+ }
+ return buf;
}
u16int
@@ -131,8 +137,8 @@
vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int reqsize)
{
u64int end = arena->size - arenadirsize(arena);
- char *buf = malloc(arena->blocksize);
u16int off, n, m, size;
+ char *buf;
size = reqsize;
if(addr + reqsize > end)
size = end - addr;
@@ -140,7 +146,8 @@
addr -= off;
n = 0;
while(n < size){
- if(!vtreadarenablock(arena, addr/arena->blocksize, buf))
+ buf = vtreadarenablock(arena, addr/arena->blocksize);
+ if(buf == nil)
// TODO: I/O error should not crash the disk layer.
// Might be good to be able to recover cached data in this case?
sysfatal("I/O error ☹");
--- a/mkfile
+++ b/mkfile
@@ -3,8 +3,9 @@
TARG=neoventi
BIN=/$objtype/bin
OFILES=unwhack.$O server.$O util.$O disk.$O cache.$O
+CLEANFILES=paper.ps paper.pdf
-HFILES=\
+HFILES=neoventi.h
</sys/src/cmd/mkmany
--- a/neoventi.h
+++ b/neoventi.h
@@ -130,4 +130,5 @@
void initarenas(void);
void initindex(void);
void cacheinit(void);
-int cacheinsertmap(int fd, u32int blockindex, char *data);
+int cachelookup(char **buf, int fd, u32int blockindex);
+void cachewritten(int fd, u32int blockindex);
\ No newline at end of file
--- a/notebook
+++ b/notebook
@@ -806,4 +806,62 @@
For now, going to just extract the block reading, and have it insert into cache...
-That was easy enough.
\ No newline at end of file
+That was easy enough.
+
+Going to use the zero index to indicate an absent entry; the first block in a given partition never has important data anyways :)
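+
+For illustration, a map entry under that convention might look something like this (names and layout are hypothetical, nothing final):
+
+typedef struct {
+	u32int blockindex;	// block index on disk; 0 marks the slot as empty
+	u32int cacheindex;	// which cache buffer currently holds that block
+} MapEntry;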
+
+Quickly changing the API to match intended usage, as well. cacheinsertmap should take the fd, the block index, and a cache entry. We should then have a cachegrab(fd, index) that yields a buffer and an indicator of whether it is free or needs to be read into. Usage would look like:
+
+char *buf;
+if(cachegrab(&buf, fd, index)){
+ // data is in buf
+} else {
+ // read data into buf, and call cacheinsertmap(fd, index, buf)
+}
+
+Ah. Right. Cache insertion needs not the buffer but the _index_. Could do math to get it back but that seems silly. So... cachegrab should yield a cache index?
+
+Also, it'd be nice to be able to use the memory directly within the cache without needing to copy to a destination buffer. Since some chunks can cross blocks, that'd require a way to use multiple sequential blocks in the cache at once, and to ensure they will not be evicted before we're done with them.
+
+The latter is important anyways, and a refcount per entry is likely the way to go. The former requirement is a lot harder. And, to e.g. handle a read request, we'll need to copy from the cache into the destination buffer anyways.
+
+So, easier to design the interface such that you can grab a block, read it straight into the cache, then do a _single_ copy into the destination buffer, and repeat for other needed blocks. Oh! And cachelookup can also set up the cache entry, so that block reading could look something like:
+
+u32int cacheindex;
+if(cachelookup(&cacheindex, fd, blockindex))
+	return cacheindex;
+pread(fd, cachebuf(cacheindex), ...);
+return cacheindex;
+
+And that's it. Cachelookup can, on a miss, grab a spot from the free list / trigger an eviction if needed. The caller can then cacheunref(cacheindex) when done with the entry.
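+
+The miss path could look something like this - every helper here (mapfind, entryref, freelistpop, evictone, entryreset, mapinsert) is a made-up name, and locking is elided:
+
+int
+cachelookup(u32int *cacheindex, int fd, u32int blockindex)
+{
+	u32int index;
+	if(mapfind(fd, blockindex, &index)){
+		entryref(index);	// pin the entry so it cannot be evicted
+		*cacheindex = index;
+		return 1;
+	}
+	// Miss: take a slot off the free list, or evict an unpinned entry
+	if(!freelistpop(&index))
+		index = evictone();
+	entryreset(index, fd, blockindex);	// claim the slot, refcount starts at one
+	mapinsert(fd, blockindex, index);
+	*cacheindex = index;
+	return 0;
+}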
+
+The one downside of reading straight into cache is that under extreme cache pressure, reads will become blocked by the rate of cache evictions - that is, if the cache is so small that the set of active reads cannot fit into it, there will be a major limit to throughput. At sizes just above that, the cache's utility will be negligible as most entries will be evicted quickly.
+
+The upside is that it reduces memory requirements, and it ought to be trivial to ensure that never happens. 1GiB is enough for >100K entries, which is far more than sufficient for purpose. Even just a few megs ought to be plenty to avoid total thrashing.
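+
+(Rough arithmetic, assuming 8KiB blocks: 1GiB / 8KiB = 131072 entries, and even 16MiB still holds 2048 blocks - far more than the handful of concurrent reads we expect. 131072 entries at eight bytes each is also where the 1MiB lock figure below comes from.)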
+
+We also need write locking on reads into the cache. Given two concurrent reads to the same data, one will read from disk into cache, both should yield the same cache index, and the memory-reader must block on the disk-reader to finish.
+
+A per-entry spin lock, held only while writing the entry in, should be sufficient, and requires just eight bytes per entry - 1MiB, given a 1GiB cache. On the other hand, this is bumping the complexity / cost of avoiding copying, so it might be better to just...not? Holding the bucket lock while inserting would be... maybe fine, actually?
+
+u32int cacheindex;
+if(cachelookup(&cacheindex, fd, blockindex))
+	return cachebuf(cacheindex);
+pread(fd, cachebuf(cacheindex), ...);
+// unlocks the bucket
+cachewritten(fd, blockindex);
+return cachebuf(cacheindex);
+
+...except, we no longer need the cache index, since we're no longer responsible for inserting it!
+
+char *buf;
+if(!cachelookup(&buf, fd, index)){
+ pread(fd, buf, ...);
+ cachewritten(fd, index);
+}
+return buf;
+
+We don't even need any sync on the other reader, since one of the parallel cachelookup calls will block on the other thread reading the data :)
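+
+Sketching that out - Bucket, bucketfor, bucketfind, and grabentry are invented names here; the only point is where the bucket lock is held:
+
+int
+cachelookup(char **buf, int fd, u32int blockindex)
+{
+	Bucket *b = bucketfor(fd, blockindex);
+	lock(&b->lk);
+	if(bucketfind(b, fd, blockindex, buf)){
+		unlock(&b->lk);	// hit: data is already present
+		return 1;
+	}
+	*buf = grabentry(b, fd, blockindex);
+	// Miss: leave the bucket locked. A second reader of the same block
+	// spins on lock() above until cachewritten unlocks the bucket, at
+	// which point it takes the hit path and sees the freshly read data.
+	return 0;
+}
+
+void
+cachewritten(int fd, u32int blockindex)
+{
+	Bucket *b = bucketfor(fd, blockindex);
+	unlock(&b->lk);
+}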
+
+We still need a way to track buffer references, though. We can stuff a secret eight bytes before each buffer, and then just atomically increment it? For now, lacking atomics, we can toss a lock and adjust it under the lock. Entries can only be reclaimed when they have no active readers.
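+
+As a placeholder until we have atomics, something like this ought to do - a pure sketch, assuming each buffer is allocated with eight spare bytes in front; cacheref is an invented name (cachedone is the one the disk layer already expects):
+
+static Lock reflock;
+
+void
+cacheref(char *buf)
+{
+	u64int *refs = (u64int*)(buf - 8);	// the hidden header
+	lock(&reflock);
+	*refs += 1;
+	unlock(&reflock);
+}
+
+void
+cachedone(char *buf)
+{
+	u64int *refs = (u64int*)(buf - 8);
+	lock(&reflock);
+	*refs -= 1;
+	unlock(&reflock);
+}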
+
+Ooh, one option for tracking readers without too much locking is to have a list of items being read. Could be designed to fit pretty compactly, and should never be huge, since we don't expect more than a dozen active readers at once. Might be easier to manage.
--
⑨