ref: 6c16dccd849e657c3d11d233541e7da3b2992c5e
parent: 0b99e040e5663403a81df875c8826f01dc596f27
author: Noam Preil <noam@pixelhero.dev>
date: Sat Feb 17 18:18:07 EST 2024
stuff
--- /dev/null
+++ b/badcheck.c
@@ -1,0 +1,63 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "neoventi.h"
+
+extern VtArena *arenas;
+extern u32int numarenas;
+
+/*
+ * Walks the clump log of every loaded arena, comparing each clump
+ * header's magic with the arena's clumpmagic. A zero magic is taken
+ * as the end of the log. Returns nonzero if any arena holds a clump
+ * with the wrong magic, zero otherwise.
+ */
+int
+checkarenas(void)
+{
+	int bad = 0;
+	u8int hdr[ClumpSize];
+	u32int magic;
+	u64int addr;
+	/* numarenas is an exclusive bound: the last valid index is numarenas-1 */
+	for(int i = numarenas-1; i >= 0; i -= 1){
+		addr = 0;
+		fprint(2, ".");
+		if(i % 20 == 19 || i+1 == numarenas)
+			fprint(2, "\n");
+		for(;;){
+			if(vtreadarena(&arenas[i], addr, hdr, ClumpSize) != ClumpSize)
+				sysfatal("corrupt arena");
+			magic = U32GET(hdr);
+			if(magic == 0) // TODO: verify stopping point
+				break;
+			if(magic != arenas[i].clumpmagic){
+				fprint(2, "arena contains invalid clumps!");
+				bad = 1;
+				break;
+			}
+			/* step over the header and the stored data; readclump reads that length from offset 5 */
+			addr += ClumpSize + U16GET(hdr+5);
+		}
+	}
+	return bad;
+}
+
+/*
+ * Entry point: loads the arena partition metadata, scans every
+ * arena, and exits with an error if corruption was found.
+ * Takes no arguments.
+ */
+void
+threadmain(int argc, char **argv)
+{
+	if(argc != 1)
+		sysfatal("i'm a dummy, sorry.");
+	fprint(2, "loading arena partition metadata... ");
+	initarenas();
+	fprint(2, "...scanning.\n");
+	/* checkarenas returns nonzero when it found a bad clump */
+	if(checkarenas())
+		sysfatal("arenas do be corrupt, yo!");
+	fprint(2, "looks like you're good - for now...\n");
+}
--- a/disk.c
+++ b/disk.c
@@ -103,29 +103,64 @@
 	return ((arena->memstats.clumps / (arena->blocksize / 25)) + 1) * arena->blocksize;
 }
-static void
-vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int *size)
+/*
+ * Returns the length in bytes of the partition open on fd; path is
+ * only used in error messages. Exits if the partition cannot be
+ * stat'd or reports a length of zero.
+ */
+static u64int
+partlen(int fd, char *path)
 {
+	Dir *dir = dirfstat(fd);
+	u64int len;
+	if(dir == nil)
+		sysfatal("Cannot stat partition %s", path);
+	if(dir->length == 0)
+		sysfatal("can't determine size of partition %s", path);
+	len = dir->length;
+	free(dir);
+	return len;
+}
+
+/*
+ * Copies up to reqsize bytes starting at arena-relative offset addr
+ * into dbuf, reading whole blocks from arena->fd. The request is
+ * clamped to end (arena->size minus arenadirsize(arena)).
+ * Returns the number of bytes copied; exits if a block read fails.
+ */
+u16int
+vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int reqsize)
+{
 	u64int end = arena->size - arenadirsize(arena);
 	char *buf = malloc(arena->blocksize);
-	u16int off, n, m;
-	if(addr + *size > end)
-		*size = end - addr;
+	u16int off, n, m, size;
+	if(buf == nil)
+		sysfatal("vtreadarena: malloc: %r");
+	size = reqsize;
+	/* addr at or past end would underflow end - addr; nothing is readable there */
+	if(addr >= end)
+		size = 0;
+	else if(addr + reqsize > end)
+		size = end - addr;
 	addr += arena->base;
 	off = addr & (arena->blocksize-1);
 	addr -= off;
 	n = 0;
-	while(n < *size){
-		if(pread(arena->fd, buf, arena->blocksize, addr) != arena->blocksize)
-			sysfatal("pread failed!");
+	while(n < size){
+		long r = pread(arena->fd, buf, arena->blocksize, addr);
+		if(r != arena->blocksize)
+			sysfatal("pread failed: fd %d, r %ld, bsize %ld, partlen %llud!", arena->fd, r, arena->blocksize, partlen(arena->fd, arena->name));
 		m = arena->blocksize - off;
-		if(m > *size - n)
-			m = *size - n;
+		if(m > size - n)
+			m = size - n;
 		memmove(&dbuf[n], &buf[off], m);
 		n += m;
 		off = 0;
 		addr += arena->blocksize;
 	}
+	/* buf is a per-call scratch block; release it so repeated reads don't leak */
+	free(buf);
+	return size;
 }
 int
@@ -133,7 +151,7 @@
{
u16int size = addr.blocks<<ABlockLog;
uchar *buf = malloc(size);
- vtreadarena(addr.s_arena, addr.offset, buf, &size);
+ vtreadarena(addr.s_arena, addr.offset, buf, size);
size = U16GET(buf+7);
if(buf[29] == 2){
if(unwhack(dst, size, buf+38, U16GET(buf+5)) != size)
@@ -169,36 +187,6 @@
return 1;
}
-static int
-partopen(char *path, u64int *size)
-{
- Dir *dir;
- int fd;
- fd = open(path, OREAD);
- if(fd < 0)
- return fd;
- dir = dirfstat(fd);
- if(dir == nil || dir->length == 0)
- sysfatal("cannot determine size of partition '%s'", path);
- *size = dir->length;
- free(dir);
- return fd;
-}
-
-static u64int
-partlen(int fd, char *path)
-{
- Dir *dir = dirfstat(fd);
- u64int len;
- if(dir == nil)
- sysfatal("Cannot stat partition %s", path);
- if(dir->length == 0)
- sysfatal("can't determine size of partition %s", path);
- len = dir->length;
- free(dir);
- return len;
-}
-
static void
loadarena(VtArena *arena)
{
@@ -248,7 +236,7 @@
}
static void
-arenacheck(u32int magic, u32int version, u32int blocksize, u32int arenabase, u32int tabbase)
+arenapartcheck(u32int magic, u32int version, u32int blocksize, u32int arenabase, u32int tabbase)
{
if(magic != ArenaPartMagic)
sysfatal("bad arena partition magic number: %#ux expected ArenaPartMagic (%#ux)", magic, ArenaPartMagic);
@@ -282,7 +270,7 @@
/* Head is not perfectly aligned; table must be aligned as first block */
tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
tabsize = arenabase - tabbase;
- arenacheck(magic, version, blocksize, arenabase, tabbase);
+ arenapartcheck(magic, version, blocksize, arenabase, tabbase);
readarenatable(fd, tabbase, tabsize, blocksize);
}
--- a/main.c
+++ /dev/null
@@ -1,36 +1,0 @@
-#include <u.h>
-#include <libc.h>
-#include <bio.h>
-#include <thread.h>
-#include "neoventi.h"
-
-void
-parseargs(int argc, char **argv)
-{
- if(argc != 1)
- sysfatal("TODO: arg parsing");
-}
-
-static void
-init(void)
-{
- initarenas();
- initindex();
-}
-
-static void
-validate(void)
-{
- fprint(2, "TODO: validate initial state");
-}
-
-void
-threadmain(int argc, char **argv)
-{
- parseargs(argc, argv);
- print("Initializing neoventi build 5... ");
- init();
- validate();
- print("initialized, launching server.\n");
- serve("tcp!127.1!14011");
-}
--- a/mkfile
+++ b/mkfile
@@ -2,16 +2,17 @@
 TARG=neoventi
 BIN=/$objtype/bin
-OFILES=main.$O unwhack.$O server.$O util.$O disk.$O
+OFILES=unwhack.$O server.$O util.$O disk.$O
 HFILES=\
-</sys/src/cmd/mkone
+</sys/src/cmd/mkmany
-vac:VE: $O.out
+vac:VE:
 	vacfs -h tcp!127.1!14011 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
-run:VE: $O.out
+# mkmany names each binary $O.<program>; it does not build $O.out
+run:VE: $O.neoventi
-	./$O.out & pid=$apid
+	./$O.neoventi & pid=$apid
 	sleep 2
 	venti/read -h tcp!127.1!14011 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
--- /dev/null
+++ b/neoventi.c
@@ -1,0 +1,36 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <thread.h>
+#include "neoventi.h"
+
+void
+parseargs(int argc, char **argv)
+{
+ if(argc != 1)
+ sysfatal("TODO: arg parsing");
+}
+
+static void
+init(void)
+{
+ initarenas();
+ initindex();
+}
+
+static void
+validate(void)
+{
+ fprint(2, "TODO: validate initial state");
+}
+
+void
+threadmain(int argc, char **argv)
+{
+ parseargs(argc, argv);
+ print("Initializing neoventi build 5... ");
+ init();
+ validate();
+ print("initialized, launching server.\n");
+ serve("tcp!127.1!14011");
+}
--- a/neoventi.h
+++ b/neoventi.h
@@ -41,6 +41,7 @@
IEntrySize = 38,
MaxAMap = 31*1024,
ClumpInfoSize = 25,
+ ClumpSize = ClumpInfoSize + 13,
ABlockLog = 9, /* All reads are of 512 byte sectors??? Yikes. We should probably use a larger size, FIXME. */
};
@@ -119,6 +120,7 @@
void serve(char *addr);
/* Looks up the address of a score on disk using the index */
int vtreadlookup(u8int *score, VtAddress *addr);
+u16int vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int reqsize);
int readclump(uchar *dst, VtAddress addr);
int Brdu32(Biobufhdr *bio, u32int *u32);
int stru32int(char *s, u32int *r);
--- a/notebook
+++ b/notebook
@@ -397,3 +397,157 @@
Caching / prefetching / some such system will definitely be needed to win on the read side - or, well, *probably* be needed. Parallelized readers might be sufficient, though that may require changes to libventi / vacfs / etc to enable multiple reads in flight at a time. But again, measure before designing.
Worst case, neoventi's prototype has demonstrated that I _can_ do this, even if I have to change the design a bunch more. And, I kinda want to do the final implementation in Hare anyways. I'll need to set up a new neoventi for write testing, because testing it on the root drive would be moronic - I'd have to boot the "real" venti read-only for that, and I've no idea how fossil would handle that. That makes this as good a time as any to actually implement the config handling...
+
+# Mon Dec 25 10:08:18 EST 2023
+
+Actually, ended up focusing on the refactor / cleanup, still. That's sufficient for now, though. And, took a break to fix album art loading in zuke :P which broke during a botched merge.
+
+I should probably also take a break to implement a neoventi formatter, and disk structure checkers, to replace the venti tools. I can test those against existing venti, and it'll be useful for setting up the test system instead of relying on venti's tools.
+
+I also need to take a pause, and reread the original venti paper, in full, before doing so. That will be important for improving neoventi design. Having a better understanding of what I'm replacing is important, and will be critical for writing the paper.
+
+TASK(fossil): neofossil should use spare blocks as cache. After a daily sync, my used block count is 0, which is ridiculous - I should not need to reach out to venti to access e.g. /amd64/bin/rc!
+ Mon Dec 25 15:24:36 EST 2023:
+ Turns out that this is intentional, per the venti paper. "Venti is as fast as disk anyways." Given that their own numbers show it as, at BEST, half as fast, this is ridiculous. I want to make it true, but it's *not* currently true, and it's silly to waste network bandwidth when cache space is so cheap anyways.
+
+
+TASK(paper): look at more modern competitors than WAFL/AFS/etc. Look into gefs and ZFS, in particular.
+
+TASK(neoventi): investigate the possibility of storing large blocks, and e.g. streaming media files directly out of venti. (Also useful for e.g. torrents?)
+
+TASK(neoventi, research): multi-level blocks? e.g. reserve space for a torrent as a giant block, and _also_ for its individual blocks, pointing to the same storage on disk?
+
+TASK(neofossil, research): look into gear chunking more thoroughly. Wanna understand the math on this one :)
+
+TASK(neofossil, research): need to look into improving security, as well. Actually implementing authentication is a good starting point.
+
+Need to shut off as the train is arriving now, but I'm on page 4.
+
+Finished reading the paper; got a few interesting things from it I missed previously, but it's mostly unhelpful. DESIGN(neoventi): need to figure out what to replace the type with / how better to serve the "look up root scores" use case in a secure manner. Per-user lists in read-write storage, with a 'public' bit on each entry, might be acceptable?
+
+Going to implement the checkers _before_ the formatter. That'll ensure I have a total awareness of the disk format when I go to write the formatter.
+
+Checkarenas first. venti/checkarenas operates on a _singular arena partition_ as follows:
+
+• initarenapart
+• for arena ∈ arenapart:
+ • if verbose, print arena info
+ • if scanning the entire arena:
+ • For clump ∈ arena:
+ • Verify the clump's magic
+ • If the clump's magic is wrong, and we're set to fix, assume that there was a failed write, and mark the clump as free?
+ • TODO(venti/check, correctness): what effect does this have if there's a corrupt clump in the middle of correct data in the active arena? Would we end up appending to it?
+ • TODO(venti/check, correctness): what happens if we run this on a sealed arena that has been corrupted in the middle?
+ • Load the clump's data from the log
+ • Verify the clump's score
+ • If the score is wrong _and the encoding is none_, assume that the block was only partially written, and ignore it.
+ • TODO(venti/check, correctness): check if this assumption holds - and, also, what to expect if we crash in various places.
+ • detect invalid types
+ • Mark them as corrupt, if set to fix
+ • Verify that the header is correct
+ • If the memstats and diskstats are out of sync, replace the diskstats.
+ • TODO: check why this is needed. Seems likely that this is to recover from when we think we wrote data but it didn't get synced properly? That shouldn't be possible, though, since we should not update stats before writing out the blocks they refer to, presumably? venti is crazy enough that it wouldn't surprise me if it violates that requirement though.
+
+So what I actually want for my replacement, then:
+
+• For arena ∈ arenapart:
+ • For clump ∈ arena:
+ • Read clump from log, decompressing if necessary
+ • Read clump info from arena directory trailer
+ • Cross-reference clump
+ • Validate score
+ • If compressed:
+ • Recompress, double check result
+ • Verify the header
+ • Verify the trailer
+ • If sealed, verify the arena's checksum
+
+TASK(neoventi, optimization, unimportant): optimize arena partition loading? Takes like 600ms right now, which is hardly terrible, but it can probably be better.
+
+neoventi/checkarenas is reading the wrong address. While investigating venti/checkarenas behavior, I noticed it was reading the same address repeatedly (and hitting the disk cache on most of them), and assumed it was doing something wrong.
+
+clumpmagic clump 0, aa 0
+getting dblock from c4000
+preading c4000
+loadclump
+getting dblock from c4000
+found c4000 in cache
+readclumpinfo
+preading 200be000
+
+clumpmagic clump 1, aa 4e
+getting dblock from c4000
+
+It is, but not as much so as I thought. The double-reads are avoidable, but it's actually caching entire 8KiB sections (?), so all the smaller blocks that fit in one get prefetched at once.
+
+Actually, it's doing something else _worse_ than I thought.
+
+
+clumpmagic clump 15, aa 6ca
+loadclump
+getting dblock from c4000
+found c4000 in cache
+getting dblock from c6000
+preading c6000
+
+clumpmagic clump 16, aa 26f0
+loadclump
+getting dblock from c6000
+found c6000 in cache
+getting dblock from c8000
+preading c8000
+
+clumpmagic clump 17, aa 4716
+loadclump
+getting dblock from c8000
+found c8000 in cache
+getting dblock from c8000
+found c8000 in cache
+getting dblock from ca000
+preading ca000
+readclumpinfo
+found 200be000 in cache
+clumpmagic clump 18, aa 673c
+loadclump
+getting dblock from ca000
+found ca000 in cache
+getting dblock from ca000
+found ca000 in cache
+getting dblock from cc000
+preading cc000
+readclumpinfo
+found 200be000 in cache
+clumpmagic clump 19, aa 8762
+loadclump
+getting dblock from cc000
+found cc000 in cache
+getting dblock from cc000
+found cc000 in cache
+getting dblock from ce000
+preading ce000
+readclumpinfo
+found 200be000 in cache
+clumpmagic clump 20, aa a788
+loadclump
+getting dblock from ce000
+found ce000 in cache
+getting dblock from ce000
+found ce000 in cache
+getting dblock from d0000
+preading d0000
+readclumpinfo
+found 200be000 in cache
+
+It is not maintaining alignment _at all_. This is terrible for performance, I think? A single block read should only need to read from one sector, not two? Should probably test this. Regardless, this is unimportant for the read path, and should only impact how I design the write path.
+
+The important question, though, is why we're failing to pread.
+
+...fd is bad. Jeez. The address might be actually right?? What the heck...
+
+... oh for fucks sake.
+
+for(int i = numarenas; i >= 0; i -=1)
+
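+EXCLUSIVE BOUND. NOT INCLUSIVE BOUND. numarenas is one past the last valid index, so the loop has to start at numarenas-1; starting at numarenas read an arena entry that doesn't exist, hence the bad fd.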
+
+Whoops.
--
⑨