shithub: vmxsmp

ref: e0e889fb69a6d4d3f2332244ef4f79f900a66d84
dir: /virtio.c/

View raw version
#include <u.h>
#include <libc.h>
#include <thread.h>
#include "dat.h"
#include "fns.h"

#include <ip.h>		/* parseether() */
#include <libsec.h>	/* genrandom() */

/*
 * Pipe pairs private to this process, one slot per virtio device; every
 * descriptor starts out as -1.  virtio_start_workers() fills the slots in.
 */
static int local_devpipes[8][2] = {{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1}};


 

/* Short names for the structures defined below. */
typedef struct VIODev VIODev;
typedef struct VIOQueue VIOQueue;
typedef struct VIOBuf VIOBuf;
typedef struct VIONetDev VIONetDev;
typedef struct VIOBlkDev VIOBlkDev;

enum {
	BUFCHAIN = 1, /* descriptor flag: chain continues at the descriptor's next index */
	BUFWR = 2, /* descriptor flag: buffer is filled by vioqwrite(), skipped by vioqread() */
	
	USEDNOIRQ = 1, /* avail ring flag: vioputbuf() raises no interrupt when set */
	
	DRIVEROK = 4, /* devstat */
	
	MAXVIODEV = 8, /* upper bound on devices created by mkviodev_alloc() */
};

/*
 * One descriptor of a chain handed out by viogetbuf().  The first element
 * of the chain also carries the read/write cursors for the whole chain.
 */
struct VIOBuf {
	u32int flags;	/* descriptor flags (BUFCHAIN, BUFWR) */
	VIOQueue *qu;	/* queue the descriptor was taken from */
	void *p;	/* pointer returned by gptr() for addr/len */
	u64int addr;	/* address field of the descriptor */
	u32int len;	/* length field of the descriptor */
	u32int idx;	/* index of the descriptor in the queue's table */
	VIOBuf *next, *head;	/* next element of the chain; first element of the chain */
	u32int rptr, wptr;	/* bytes consumed by vioqread() / produced by vioqwrite() so far */
};

/*
 * One virtqueue of a device.  desc/avail/used point into guest memory and
 * are nil until the guest programs a valid queue address.
 */
struct VIOQueue {
	Lock lk;	/* guards the ring pointers, indices and livebuf */
	VIODev *d;	/* owning device */
	u8int (*desc)[16], *avail, *used;	/* descriptor table, avail ring, used ring */
	u16int size;	/* number of descriptors */
	/*
	 * Guest address of the rings.  vioqaddrset() stores a 32-bit page
	 * number shifted left by 12, which needs up to 44 bits; a 32-bit
	 * field truncated it and made the readback in vioio() wrong for
	 * rings placed above 4 GB.
	 */
	u64int addr;
	u16int availidx, usedidx;	/* next avail entry to consume; used entries published */
	void (*notify)(VIOQueue*);	/* called when the guest notifies this queue */
	int livebuf;	/* chains handed out by viogetbuf() and not yet returned */
	int qidx;	/* index of this queue within the device */
};

/* State specific to a virtio network device. */
struct VIONetDev {
	int readfd, writefd;	/* descriptors packets are read from / written to */
	u8int mac[6];	/* MAC address reported to the guest and used for filtering */
	enum {
		/* receive filter bits, toggled by vionetcmd() and tested by viomacok() */
		VNETPROMISC = 1,
		VNETALLMULTI = 2,
		VNETALLUNI = 4,
		VNETNOMULTI = 8,
		VNETNOUNI = 16,
		VNETNOBCAST = 32,
		
		/* set by the "hdr!" prefix in mkvionet(); survives vionetreset() */
		VNETHEADER = 1<<31,
	} flags;
	u64int macbloom, multibloom;	/* 64-bit filters indexed by bloomhash(): unicast, multicast */
};

/* State specific to a virtio block device. */
struct VIOBlkDev {
	int fd;	/* backing file opened by mkvioblk() */
	uvlong size;	/* capacity in 512-byte sectors */
};

/* Common state of one virtio PCI device plus its device-specific part. */
struct VIODev {
	PCIDev *pci;	/* PCI function created by mkviodev_register() */
	int isrstat ;  /* pending interrupt status bits; accessed under isrlk */
	u32int devfeat, guestfeat;	/* features offered by the device / written back by the guest */
	u16int qsel;	/* queue currently selected by the guest */
	u8int devstat ;	/* device status written by the guest (see DRIVEROK) */
	VIOQueue *qu;	/* array of allocqu queues, nqu of them in use */
	int nqu, allocqu;
	u32int (*io)(int, u16int, u32int, int, VIODev *);	/* device-specific config access, ports >= 20 */
	void (*reset)(VIODev *);	/* optional hook run when the guest writes status 0 */
	int devidx;	/* slot in vioshared->devpipes */
	Lock isrlk;	/* protects isrstat */
	union {
		VIONetDev net;
		VIOBlkDev blk;
	};
};

/* Shared segment set up by viosharedinit(); vioalloc() carves objects out of it. */
VIOShared *vioshared;

/* Defined elsewhere; viosharedinit() points them into the shared segment. */
extern I8042 *i8042;
extern KbdShared *kbdshared;
extern UartShared *uartshared[2];

/* NOTE(review): the code in this file uses vioshared->devpipes; these statics look vestigial. */
static int devpipes[MAXVIODEV][2];
static int ndevpipes;

/* Forward declarations */
static u32int vioblkio(int, u16int, u32int, int, VIODev *);
static u32int vionetio(int, u16int, u32int, int, VIODev *);

/*
 * Carve sz bytes (rounded up to a multiple of 8) out of vioshared->data.
 * Memory is handed out sequentially and never returned; running out of
 * space is fatal.
 */
void *
vioalloc(ulong sz)
{
	void *chunk;

	sz = (sz + 7) & ~7;
	if(sizeof(vioshared->data) < vioshared->alloc + sz)
		sysfatal("vioalloc: out of space");
	chunk = &vioshared->data[vioshared->alloc];
	vioshared->alloc += sz;
	return chunk;
}



/*
 * Create the "vio" segment, clear it, and place the keyboard, UART and
 * i8042 state inside it through vioalloc().  The allocation order below
 * determines where each object lands in the segment.
 */
void
viosharedinit(void)
{
	extern UART *uart;
	int i;

	vioshared = mkseg("vio", 0x300004000, 0x10000);
	memset(vioshared, 0, sizeof(VIOShared));
	ndevpipes = 0;

	/* keyboard buffer */
	kbdshared = vioalloc(sizeof(KbdShared));
	memset(kbdshared, 0, sizeof(KbdShared));

	/* one buffer per UART */
	for(i = 0; i < 2; i++){
		uartshared[i] = vioalloc(sizeof(UartShared));
		memset(uartshared[i], 0, sizeof(UartShared));
	}

	/* i8042 controller and its initial register values */
	i8042 = vioalloc(sizeof(I8042));
	memset(i8042, 0, sizeof(I8042));
	i8042->cfg = 0x47;
	i8042->stat = 0x10;
	i8042->oport = 0x03;
	i8042->cmd = -1;

	/* both UART register sets, allocated as one array */
	uart = vioalloc(sizeof(UART) * 2);
	memset(uart, 0, sizeof(UART) * 2);
	for(i = 0; i < 2; i++)
		uart[i].lsr = 0x60;
}


/* Currently a no-op; kept so existing callers still link. */
void
vionotifyinit(void)
{
}


/*
 * Update the device's interrupt status and drive its PCI interrupt line.
 * A non-zero val is OR-ed into isrstat and asserts the line; zero clears
 * isrstat and deasserts it.
 */
static void
vioirq(VIODev *d, int val)
{
	int raise;

	raise = val != 0;

	lock(&d->isrlk);
	if(raise)
		d->isrstat |= val;
	else
		d->isrstat = 0;
	unlock(&d->isrlk);

	pciirq(d->pci, raise);
}

/*
 * Return a pointer to descriptor i of queue q, or nil (after logging)
 * when i is not below the queue size.
 */
static void *
checkdesc(VIOQueue *q, int i)
{
	if(i < q->size)
		return q->desc[i];
	vmerror("virtio device %#x: invalid next pointer %d in queue (size %d), ignoring descriptor", q->d->pci->bdf, i, q->size);
	return nil;
}

/* Free list of recycled VIOBuf structures, linked through ->next. */
static Lock bufpool_lock;
static VIOBuf *bufpool;

/*
 * Get a VIOBuf: reuse one from the free list (zeroed before return) when
 * available, otherwise fall back to emalloc().
 */
static VIOBuf *
viobuf_alloc(void)
{
	VIOBuf *b;

	lock(&bufpool_lock);
	b = bufpool;
	if(b != nil)
		bufpool = b->next;
	unlock(&bufpool_lock);

	if(b == nil)
		return emalloc(sizeof(VIOBuf));
	memset(b, 0, sizeof(VIOBuf));
	return b;
}

/* Push b onto the head of the free list for later reuse by viobuf_alloc(). */
static void
viobuf_free(VIOBuf *b)
{
	lock(&bufpool_lock);
	b->next = bufpool;
	bufpool = b;
	unlock(&bufpool_lock);
}

/*
 * Take the next available descriptor chain off queue q.
 *
 * Returns the first VIOBuf of the chain, or nil when the driver has not
 * set DRIVEROK, the queue has no valid address, nothing is pending, or
 * not even the first descriptor of the chain was usable.  wait is
 * ignored; blocking is done by viogetbuf_smp().
 *
 * A chain is cut short (and whatever was collected so far is returned)
 * when a descriptor index is out of range, a buffer address is not valid
 * guest memory, or the chain has more links than the queue has
 * descriptors, which can only happen if the guest built a cycle.
 */
static VIOBuf *
viogetbuf(VIOQueue *q, int wait)
{
	u16int gidx;
	VIOBuf *b, *rb, **bp;
	void *dp;
	int nlinks;

	USED(wait);

	lock(&q->lk);

	if((q->d->devstat & DRIVEROK) == 0 || q->desc == nil){
		unlock(&q->lk);
		return nil;
	}

	gidx = GET16(q->avail, 2);
	if(gidx == q->availidx){
		unlock(&q->lk);
		return nil;
	}

	rb = nil;
	bp = &rb;
	nlinks = 0;
	/* checkdesc() yields nil for a bad head index; never dereference that */
	dp = checkdesc(q, GET16(q->avail, 4 + 2 * (q->availidx % q->size)));
	while(dp != nil){
		if(++nlinks > q->size){
			vmerror("virtio device %#x: descriptor chain exceeds queue size %d, truncating", q->d->pci->bdf, q->size);
			break;
		}
		b = viobuf_alloc();
		b->qu = q;
		b->idx = (u8int(*)[16])dp - q->desc;
		b->addr = GET64(dp, 0);
		b->len = GET32(dp, 8);
		b->flags = GET16(dp, 12);
		b->p = gptr(b->addr, b->len);
		if(b->p == nil){
			vmerror("virtio device %#x: invalid buffer pointer %#p in queue", q->d->pci->bdf, (void*)b->addr);
			viobuf_free(b);
			break;
		}
		*bp = b;
		b->head = rb;
		bp = &b->next;
		if((b->flags & BUFCHAIN) == 0)
			break;
		dp = checkdesc(q, GET16(dp, 14));
	}
	/* the avail entry is consumed even when nothing usable came out of it */
	q->availidx++;
	if(rb == nil){
		unlock(&q->lk);
		return nil;
	}
	q->livebuf++;
	unlock(&q->lk);
	return rb;
}

/*
 * Return a chain obtained from viogetbuf() to the guest.
 *
 * While DRIVEROK is set and the used ring is valid, one used-ring entry
 * (head index, bytes written) is published, and an interrupt is raised
 * unless the guest set USEDNOIRQ in the avail ring.  livebuf is always
 * decremented, because viodevstatset() waits for it to drop to zero
 * during reset.  Every VIOBuf of the chain goes back to the pool.
 * A nil b is ignored.
 */
void
vioputbuf(VIOBuf *b)
{
	VIOBuf *bn;
	VIOQueue *q;
	u8int *p;
	u16int aflags;
	int need_irq;

	if(b == nil)
		return;
	q = b->qu;

	lock(&q->lk);

	q->livebuf--;

	if((q->d->devstat & DRIVEROK) == 0){
		unlock(&q->lk);
		goto end;
	}
	if(q->used == nil)
		vmerror("virtio device %#x: address was set to an invalid value while holding buffer", q->d->pci->bdf);
	else{
		p = q->used + 4 + 8 * (q->usedidx % q->size);
		PUT32(p, 4, b->wptr);
		PUT32(p, 0, b->idx);
		coherence();
		PUT16(q->used, 2, ++q->usedidx);
		coherence();
	}

	/*
	 * The avail ring can be nil here (queue address invalidated while
	 * the buffer was out), so read its flags only behind a nil check.
	 */
	aflags = 0;
	need_irq = 0;
	if(q->avail != nil){
		aflags = GET16(q->avail, 0);
		need_irq = (aflags & USEDNOIRQ) == 0;
	}

	dprint("VIOPUTBUF: usedidx=%d id=%d len=%d avail_flags=%#x\n",
		q->usedidx, b->idx, b->wptr, aflags);

	unlock(&q->lk);

	if(need_irq)
		vioirq(q->d, 1);

end:
	while(b != nil){
		bn = b->next;
		viobuf_free(b);
		b = bn;
	}
}

/*
 * Copy up to n bytes from the readable (non-BUFWR) buffers of chain b
 * into v, continuing from b->rptr and advancing it.  Returns the number
 * of bytes copied, which is less than n when the chain runs out.
 */
ulong
vioqread(VIOBuf *b, void *v, ulong n)
{
	VIOBuf *cur;
	u32int off;
	int done;
	ulong chunk;

	off = b->rptr;
	cur = b;
	done = 0;
	while(done < n){
		/* step over writable buffers and readable ones already consumed */
		while(cur != nil && ((cur->flags & BUFWR) != 0 || off >= cur->len)){
			if((cur->flags & BUFWR) == 0)
				off -= cur->len;
			cur = cur->next;
		}
		if(cur == nil)
			break;
		chunk = cur->len - off;
		if(chunk > n - done)
			chunk = n - done;
		memmove(v, (u8int*)cur->p + off, chunk);
		off += chunk;
		done += chunk;
		v = (u8int*)v + chunk;
		b->rptr += chunk;
	}
	return done;
}

/*
 * Copy up to n bytes from v into the writable (BUFWR) buffers of chain b,
 * continuing from b->wptr and advancing it.  Returns the number of bytes
 * copied, which is less than n when the chain runs out of space.
 */
ulong
vioqwrite(VIOBuf *b, void *v, ulong n)
{
	VIOBuf *cur;
	u32int off;
	int done;
	ulong chunk;

	off = b->wptr;
	cur = b;
	done = 0;
	while(done < n){
		/* step over readable buffers and writable ones already filled */
		while(cur != nil && ((cur->flags & BUFWR) == 0 || off >= cur->len)){
			if((cur->flags & BUFWR) != 0)
				off -= cur->len;
			cur = cur->next;
		}
		if(cur == nil)
			break;
		chunk = cur->len - off;
		if(chunk > n - done)
			chunk = n - done;
		memmove((u8int*)cur->p + off, v, chunk);
		off += chunk;
		done += chunk;
		v = (u8int*)v + chunk;
		b->wptr += chunk;
	}
	return done;
}

/*
 * Count the bytes still available in chain b: writable space past
 * b->wptr when wr is 1, unread data past b->rptr when wr is 0.
 * Returns 0 when the cursor is already at or beyond the end.
 */
ulong
vioqrem(VIOBuf *b, int wr)
{
	VIOBuf *c;
	u32int off;
	ulong rem;
	int found;

	off = wr ? b->wptr : b->rptr;
	rem = 0;
	found = 0;
	for(c = b; c != nil; c = c->next){
		if(((c->flags & BUFWR) != 0) != wr)
			continue;
		if(found)
			rem += c->len;		/* buffers after the cursor count in full */
		else if(off < c->len){
			rem = c->len - off;	/* buffer holding the cursor */
			found = 1;
		}else
			off -= c->len;		/* buffer entirely before the cursor */
	}
	return rem;
}


/* Pipe pairs created per block device by virtio_start_workers(); not otherwise used in this file. */
static int local_ackpipes[8][2];   /* worker completion */

/*
 * Queue notify handler: wake the worker that serves q's device by writing
 * one byte to the write end of that device's pipe.  Nothing happens when
 * the device index is out of range or the pipe descriptor is not positive.
 */
static void
viowakeup(VIOQueue *q)
{
	int idx, wfd;
	char token;

	idx = q->d->devidx;
	if(idx < 0 || idx >= 8)
		return;
	wfd = vioshared->devpipes[idx][1];
	if(wfd <= 0)
		return;
	token = 1;
	write(wfd, &token, 1);
}

/*
 * Install the ring addresses of q.  addr is a page number as written by
 * the guest; it is shifted left by 12 to form the guest address.  The
 * descriptor table and avail ring occupy the first sz1 bytes (rounded up
 * to 4096) and the used ring follows.  When the range is not valid guest
 * memory the three ring pointers are set to nil.  The device's worker is
 * woken afterwards.
 */
static void
vioqaddrset(VIOQueue *q, u64int addr)
{
	void *ring;
	int sz1, sz;

	addr <<= 12;
	sz1 = -(-(18 * q->size + 4) & -4096);
	sz = sz1 + (-(-(8 * q->size + 6) & -4096));
	ring = gptr(addr, sz);
	if(ring == nil)
		vmerror("virtio device %#x: attempt to set queue to invalid address %#p", q->d->pci->bdf, (void *) addr);

	lock(&q->lk);
	q->addr = addr;
	if(ring != nil){
		q->desc = ring;
		q->avail = (u8int*)ring + 16 * q->size;
		q->used = (u8int*)ring + sz1;
	}else{
		q->desc = nil;
		q->avail = nil;
		q->used = nil;
	}
	coherence();
	unlock(&q->lk);

	viowakeup(q);
}

/* Forget the ring location and restart both ring indices from zero. */
static void
vioqreset(VIOQueue *q)
{
	q->addr = 0;
	q->desc = nil;
	q->avail = nil;
	q->used = nil;
	q->availidx = 0;
	q->usedidx = 0;
}

/*
 * Handle a guest write to the device status register.
 *
 * Writing 0 resets the device: the optional reset hook runs, the guest
 * feature bits and the interrupt are cleared, and each queue is reset
 * once no buffers are outstanding on it.  The wait polls with sleep(1)
 * and gives up after more than 5000 rounds, resetting the queue anyway.
 * Any other value is stored and every queue's notify handler is invoked.
 */
static void
viodevstatset(VIODev *v, u32int val)
{
	VIOQueue *q;
	int i, waited;

	v->devstat = val;
	coherence();

	if(val != 0){
		for(i = 0; i < v->nqu; i++)
			v->qu[i].notify(&v->qu[i]);
		return;
	}

	if(v->reset != nil)
		v->reset(v);
	v->guestfeat = 0;
	vioirq(v, 0);
	for(i = 0; i < v->nqu; i++){
		q = &v->qu[i];
		lock(&q->lk);
		for(waited = 0; q->livebuf > 0; ){
			/* drop the lock so vioputbuf() can make progress */
			unlock(&q->lk);
			sleep(1);
			if(++waited > 5000){
				vmerror("virtio device %#x: timeout waiting for "
					"queue %d buffers (livebuf=%d)",
					v->pci->bdf, i, q->livebuf);
				lock(&q->lk);
				break;
			}
			lock(&q->lk);
		}
		vioqreset(q);
		unlock(&q->lk);
	}
}


/*
 * I/O port handler for the common virtio registers of one device (vp is
 * the VIODev).  The switch key is isin<<16 | port, so keys below 0x10000
 * are guest writes and keys from 0x10000 up are guest reads.  Ports from
 * 20 onward are passed to the device-specific handler with the port
 * rebased to 0; anything else is reported through iowhine().
 */
u32int
vioio(int isin, u16int port, u32int val, int sz, void *vp)
{
	static char whinebuf[32];
	VIODev *v;
	int isr;

	v = vp;
	coherence();

	switch(isin << 16 | port){
	/* writes */
	case 0x4:
		v->guestfeat = val;
		return 0;
	case 0x8:
		if(v->qsel < v->nqu)
			vioqaddrset(&v->qu[v->qsel], val);
		return 0;
	case 0xe:
		v->qsel = val;
		return 0;
	case 0x10:
		if(val < v->nqu)
			v->qu[val].notify(&v->qu[val]);
		return 0;
	case 0x12:
		viodevstatset(v, val);
		return 0;

	/* reads */
	case 0x10000:
		return v->devfeat;
	case 0x10004:
		return v->guestfeat;
	case 0x10008:
		return v->qsel >= v->nqu ? 0 : v->qu[v->qsel].addr >> 12;
	case 0x1000c:
		return v->qsel >= v->nqu ? 0 : v->qu[v->qsel].size;
	case 0x1000e:
		return v->qsel;
	case 0x10010:
		return 0;
	case 0x10012:
		return v->devstat;
	case 0x10013:
		/* reading the interrupt status also clears it and drops the line */
		lock(&v->isrlk);
		isr = v->isrstat;
		v->isrstat = 0;
		unlock(&v->isrlk);

		pciirq(v->pci, 0);
		dprint("VIRTIO: ISR read, was %#x\n", isr);
		return isr;
	}

	if(port >= 20 && v->io != nil)
		return v->io(isin, port - 20, val, sz, v);
	snprint(whinebuf, sizeof(whinebuf), "virtio device %6x", v->pci->bdf);
	return iowhine(isin, port, val, sz, whinebuf);
}

/*
 * mkviodev_alloc - allocate VIODev structure only
 */
static VIODev *
mkviodev_alloc(int nqu)
{
    VIODev *d;
    int i, idx;
    
    if(vioshared->ndevpipes >= MAXVIODEV)
        sysfatal("mkviodev: too many virtio devices");
    
    d = vioalloc(sizeof(VIODev));
    memset(d, 0, sizeof(VIODev));
    // Lock is already zeroed, which is fine for Plan 9 locks
    d->allocqu = nqu;
    d->qu = vioalloc(sizeof(VIOQueue) * nqu);
    memset(d->qu, 0, sizeof(VIOQueue) * nqu);
    for(i = 0; i < nqu; i++){
        d->qu[i].d = d;
        d->qu[i].qidx = i;
    }
    
	idx = vioshared->ndevpipes;
    d->devidx = idx;
    if(pipe(vioshared->devpipes[idx]) < 0)
        sysfatal("mkviodev pipe: %r");
    vioshared->ndevpipes++;
    
    return d;
}
/*
 * mkviodev_register - create the PCI function for d and attach vioio()
 * to a 256-byte I/O BAR.  Call it only after every other field of d has
 * been filled in.  Vendor and subsystem vendor are both 0x1af4.
 */
static void
mkviodev_register(VIODev *d, u16int devid, u32int subclass, u32int pciclass)
{
	PCIDev *p;

	coherence();

	p = mkpcidev(allocbdf(), (devid << 16) | 0x1af4, subclass << 8, 1);
	d->pci = p;
	d->pci->subid = (pciclass << 16) | 0x1af4;

	coherence();

	mkpcibar(d->pci, BARIO, 0, 256, vioio, d);
}

/*
 * Fetch the next chain from q.  With wait == 0 this is a single
 * viogetbuf() attempt.  With wait != 0 the call retries until a chain is
 * available, blocking between attempts on the read end of the device's
 * pipe (written by viowakeup()) when that pipe exists.
 */
static VIOBuf *
viogetbuf_smp(VIOQueue *q, int wait)
{
	VIOBuf *b;
	char token;
	int idx;

	idx = q->d->devidx;
	while((b = viogetbuf(q, 0)) == nil && wait){
		if(idx >= 0 && idx < 8 && vioshared->devpipes[idx][0] > 0)
			read(vioshared->devpipes[idx][0], &token, 1);
	}
	return b;
}

/*
 * Activate the next unused queue of d with sz descriptors and the given
 * notify handler, and return it.  Fatal when all allocated queues are
 * already in use.
 */
VIOQueue *
mkvioqueue(VIODev *d, int sz, void (*notify)(VIOQueue *))
{
	VIOQueue *q;

	if(d->nqu >= d->allocqu)
		sysfatal("mkvioqueue: too many queues");
	q = d->qu + d->nqu;
	d->nqu++;
	vioqreset(q);
	q->d = d;
	q->size = sz;
	q->notify = notify;
	return q;
}

/*
 * Fold a 6-byte MAC address into a 6-bit value (0..63) used as a bit
 * index into the 64-bit address filters.  Both 3-byte halves are mixed
 * with the same shift pattern and XOR-ed together.
 */
int
bloomhash(u8int *mac)
{
	int x, i;

	x = 0;
	for(i = 0; i < 6; i += 3){
		x ^= mac[i] ^ (mac[i] >> 6);
		x ^= (mac[i+1] << 2) ^ (mac[i+1] >> 4);
		x ^= (mac[i+2] << 4) ^ (mac[i+2] >> 2);
	}
	return x & 63;
}

/*
 * Receive filter: return non-zero when a packet whose destination
 * address is mac should be delivered to the guest of device d.
 * Promiscuous mode accepts everything.  Broadcast is accepted unless
 * VNETNOBCAST is set.  Unicast and multicast each honour their
 * "none"/"all" flags first and otherwise consult the matching filter
 * (unicast additionally accepts the device's own address).
 */
int
viomacok(VIODev *d, u8int *mac)
{
	static u8int bcast[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
	u64int bloom;

	if((d->net.flags & VNETPROMISC) != 0)
		return 1;

	if((mac[0] & 1) != 0){
		/* group address: broadcast or multicast */
		if(memcmp(mac, bcast, 6) == 0)
			return (d->net.flags & VNETNOBCAST) == 0;
		if((d->net.flags & (VNETNOMULTI|VNETALLMULTI)) != 0)
			return (d->net.flags & VNETNOMULTI) == 0;
		bloom = d->net.multibloom;
	}else{
		/* unicast */
		if((d->net.flags & (VNETNOUNI|VNETALLUNI)) != 0)
			return (d->net.flags & VNETNOUNI) == 0;
		if(memcmp(mac, d->net.mac, 6) == 0)
			return 1;
		bloom = d->net.macbloom;
	}

	if(bloom == 0)
		return 0;
	return bloom >> bloomhash(mac) & 1;
}

/*
 * Receive worker for a network device (vp is the VIODev).  Reads packets
 * from net.readfd, drops those shorter than 14 bytes or rejected by
 * viomacok(), and delivers the rest on queue 0 preceded by a 10-byte
 * all-zero header.  Exits the thread on read error or end of file.
 */
void
vionetrproc(void *vp)
{
	VIODev *dev;
	VIOQueue *rxq;
	VIOBuf *chain;
	uchar hdr[10];
	uchar pkt[1600];
	int n;

	threadsetname("vionetrproc");
	dev = vp;
	rxq = &dev->qu[0];
	memset(hdr, 0, sizeof(hdr));

	for(;;){
		n = read(dev->net.readfd, pkt, sizeof(pkt));
		if(n < 0){
			vmerror("read(vionetrproc): %r");
			threadexits("read: %r");
		}
		if(n == 0){
			vmerror("read(vionetrproc): eof");
			threadexits("read: eof");
		}
		if(n < 14){
			vmerror("vionetrproc: short packet received (len=%d)", n);
			continue;
		}
		if(!viomacok(dev, pkt))
			continue;

		chain = viogetbuf_smp(rxq, 1);
		if(chain == nil){
			vmerror("viogetbuf: %r");
			continue;
		}
		vioqwrite(chain, hdr, sizeof(hdr));
		vioqwrite(chain, pkt, n);
		vioputbuf(chain);
	}
}

/*
 * Transmit worker for a network device (vp is the VIODev).  Takes chains
 * from queue 1, strips the 10-byte header, and writes the packet to
 * net.writefd.  Packets that fill the whole staging buffer or are
 * shorter than 14 bytes are dropped; packets shorter than 60 bytes are
 * zero-padded to 60.  With VNETHEADER set, a 10-byte prefix holding the
 * length (2 bytes) and nanosec() (8 bytes), both most significant byte
 * first, is written in front of the packet.
 */
void
vionetwproc(void *vp)
{
	VIODev *dev;
	VIOQueue *txq;
	VIOBuf *chain;
	uchar hdr[10];
	uchar frame[1610];
	int i, written, len;
	uvlong ns;

	threadsetname("vionetwproc");
	dev = vp;
	txq = &dev->qu[1];

	for(;;){
		chain = viogetbuf_smp(txq, 1);
		if(chain == nil){
			vmerror("viogetbuf: %r");
			threadexits("viogetbuf: %r");
		}

		vioqread(chain, hdr, sizeof(hdr));
		/* the first 10 bytes of frame are reserved for the optional prefix */
		len = vioqread(chain, frame+10, sizeof(frame)-10);
		if(len == sizeof(frame)-10){
			vmerror("virtio net: ignoring excessively long packet");
			vioputbuf(chain);
			continue;
		}
		if(len < 14){
			if(len != 0)
				vmerror("virtio net: ignoring short packet (length=%d)", len);
			vioputbuf(chain);
			continue;
		}
		if(len < 60){
			memset(frame + 10 + len, 0, 60 - len);
			len = 60;
		}

		if((dev->net.flags & VNETHEADER) == 0)
			written = write(dev->net.writefd, frame + 10, len);
		else{
			frame[0] = len >> 8;
			frame[1] = len;
			ns = nanosec();
			for(i = 0; i < 8; i++)
				frame[2 + i] = ns >> (56 - 8 * i);
			written = write(dev->net.writefd, frame, len + 10);
		}
		vioputbuf(chain);

		if(written < 0)
			vmerror("write(vionetwproc): %r");
		else if(written < len)
			vmerror("write(vionetwproc): incomplete write (%d < %d)", written, len);
	}
}

/*
 * Device-specific configuration access for network devices.  Reads of
 * bytes 0-3 return the first four MAC bytes; reads of bytes 4-7 return
 * the last two MAC bytes with bit 16 set above them.  The value is
 * shifted right so the addressed byte comes first.  Everything else goes
 * to iowhine().
 */
static u32int
vionetio(int isin, u16int port, u32int val, int sz, VIODev *v)
{
	switch(isin << 16 | port){
	case 0x10000:
	case 0x10001:
	case 0x10002:
	case 0x10003:
		return GET32(v->net.mac, 0) >> ((port & 3) * 8);
	case 0x10004:
	case 0x10005:
	case 0x10006:
	case 0x10007:
		return (GET16(v->net.mac, 4) | (1 << 16)) >> ((port & 3) * 8);
	}
	return iowhine(isin, port, val, sz, "virtio net");
}

/*
 * Read two address tables from chain b, each a 32-bit count followed by
 * that many 6-byte addresses, and rebuild the unicast and multicast
 * filters of d from them (first table unicast, second multicast).
 * Returns 0 on success, 1 when the chain ends early; the filters are
 * only replaced on success.
 */
int
vionettables(VIODev *d, VIOBuf *b)
{
	u8int cnt[4];
	u8int mac[6];
	u64int filt[2];
	int t, left;

	filt[0] = 0;
	filt[1] = 0;
	for(t = 0; t < 2; t++){
		if(vioqread(b, cnt, 4) < 4)
			return 1;
		for(left = GET32(cnt, 0); left != 0; left--){
			if(vioqread(b, mac, 6) < 6)
				return 1;
			filt[t] |= 1ULL<<bloomhash(mac);
		}
	}
	d->net.macbloom = filt[0];
	d->net.multibloom = filt[1];
	return 0;
}

/*
 * Notify handler for the network control queue.  Processes every pending
 * command chain: the first two bytes select the command, the reply is a
 * single status byte (0 = ok, 1 = error).
 *   0x0000-0x0005  switch one receive-filter flag on (1) or off (0)
 *   0x0100         load the address tables (vionettables)
 *   0x0101         set the MAC address from the next 6 bytes
 */
void
vionetcmd(VIOQueue *q)
{
	static int rxflag[] = {
		VNETPROMISC, VNETALLMULTI, VNETALLUNI,
		VNETNOMULTI, VNETNOUNI, VNETNOBCAST,
	};
	VIODev *d;
	VIOBuf *b;
	u8int cmd[2], arg[6];
	u8int ack;
	int op, fl;

	d = q->d;
	while((b = viogetbuf_smp(q, 0)) != nil){
		if(vioqread(b, cmd, 2) < 2)
			ack = 1;
		else{
			ack = 0;
			op = cmd[0] << 8 | cmd[1];
			if(op >= 0x0000 && op <= 0x0005){
				fl = rxflag[op];
				if(vioqread(b, arg, 1) < 1)
					ack = 1;
				else if(arg[0] == 1)
					d->net.flags |= fl;
				else if(arg[0] == 0)
					d->net.flags &= ~fl;
				else
					ack = 1;
			}else if(op == 0x0100)
				ack = vionettables(d, b);
			else if(op == 0x0101){
				if(vioqread(b, arg, 6) < 6)
					ack = 1;
				else
					memmove(d->net.mac, arg, 6);
			}else
				ack = 1;
		}
		vioqwrite(b, &ack, 1);
		vioputbuf(b);
	}
}

/* Reset hook for network devices: empty both address filters and drop every flag except VNETHEADER. */
void
vionetreset(VIODev *d)
{
	d->net.macbloom = 0;
	d->net.multibloom = 0;
	d->net.flags = d->net.flags & VNETHEADER;
}

int
mkvionet(char *net)
{
	int fd, cfd;
	VIODev *d;
	char *ea;
	int flags;
	enum { VNETFILE = 1 };

	ea = nil;
	flags = 0;
	for(;;){
		if(strncmp(net, "hdr!", 4) == 0){
			net += 4;
			flags |= VNETHEADER;
		}else if(strncmp(net, "file!", 5) == 0){
			net += 5;
			flags |= VNETFILE;
		}else if(strncmp(net, "ea:", 3) == 0){
			net = strchr(ea = net+3, '!');
			if(net++ == nil){
				werrstr("missing: !");
				return -1;
			}
		}else
			break;
	}
	if((flags & VNETFILE) != 0){
		flags &= ~VNETFILE;
		fd = open(net, ORDWR);
		if(fd < 0) return -1;
	}else{
		fd = dial(netmkaddr("-1", net, nil), nil, nil, &cfd);
		if(fd < 0) return -1;
		if(cfd >= 0) {
			write(cfd, "promiscuous", 11);
			write(cfd, "bridge", 6);
		}
	}
	
	d = mkviodev_alloc(3);
	
	d->io = vionetio;
	d->reset = vionetreset;
	d->devfeat = 1<<5|1<<16|1<<17|1<<18|1<<20;
	d->net.readfd = fd;
	d->net.writefd = fd;
	d->net.flags = flags;
	
	if(ea == nil){
		genrandom(d->net.mac, 6);
		d->net.mac[0] = d->net.mac[0] & ~1 | 2;
	}else{
		if(parseether(d->net.mac, ea) != 0){
			fprint(2, "unparsable mac addr: %s\n", ea);
			return -1;
		}
		if((d->net.mac[0] & 1) != 0){
			werrstr("invalid mac addr %s: must be unicast", ea);
			return -1;
		}
		if((d->net.mac[0] & 2) == 0){
			fprint(2, "invalid mac addr %s: must not be local", ea);
			return -1;
		}
	}
	
	mkvioqueue(d, 1024, viowakeup);
	mkvioqueue(d, 1024, viowakeup);
	mkvioqueue(d, 1024, vionetcmd);
	
	mkviodev_register(d, 0x1000, 0x020000, 1);
	
	vioshared->netdevs[vioshared->nnetdevs++] = d;

//	proccreate(vionetrproc, d, 8192);
//	proccreate(vionetwproc, d, 8192);
	return 0;
}

/*
 * Device-specific configuration access for block devices.  Reads of
 * bytes 0-23 return the 32-bit field containing the addressed byte,
 * shifted right so that byte comes first:
 *   0-7    capacity in sectors (low word, then high word)
 *   8-11   maximum segment size (4 MB)
 *   12-15  maximum number of segments (128)
 *   16-19  geometry (always 0)
 *   20-23  block size (512)
 * Every other access returns 0.
 */
static u32int
vioblkio(int isin, u16int port, u32int val, int sz, VIODev *v)
{
	uvlong cap;
	u32int field[6];
	int key;

	coherence();
	cap = v->blk.size;

	key = isin << 16 | port;
	if(key < 0x10000 || key > 0x10017)
		return 0;

	field[0] = (u32int)cap;
	field[1] = (u32int)(cap >> 32);
	field[2] = 0x400000;
	field[3] = 128;
	field[4] = 0;
	field[5] = 512;
	return field[(key - 0x10000) >> 2] >> (port & 3) * 8;
}

void
vioblkproc(void *vp)
{
    VIODev *v;
    VIOQueue *q;
    VIOBuf *b;
    u8int cmd[16];
    u8int ack;
    char buf[65536];
    uvlong addr;
    int rc, m;
    ulong n;
    vlong offset;
    static Lock reqlock; 
    static uvlong writecount = 0;
	static uvlong reqcount = 0;
    static uvlong readcount = 0;
    uvlong myreq, mywrite;
    
    threadsetname("vioblkproc");
    v = vp;
    q = &v->qu[0];
    
    dprint( "VIOBLK: started fd=%d size=%llud sectors (%llud bytes)\n", 
           v->blk.fd, v->blk.size, v->blk.size * 512ULL);
    if(v->blk.size == 0)
        fprint(2, "VIOBLK: WARNING: disk size is 0!\n");
    
	int idx = v->devidx;
	char c = 'w';

    for(;;){
  
        b = viogetbuf_smp(q, 1);
        if(b == nil){
            vmerror("vioblkproc: viogetbuf: %r");
            threadexits("vioblkproc: viogetbuf: %r");
        }
        
        lock(&reqlock);
        myreq = ++reqcount;
        unlock(&reqlock);
        
        ack = 0;
        
        if(debug || myreq <= 5){
            VIOBuf *tb;
            int i = 0;
            ulong total_rd = 0, total_wr = 0;
            dprint("VIOBLK[%llud]: descriptor chain:\n", myreq);
            for(tb = b; tb != nil; tb = tb->next, i++){
                dprint("  desc[%d]: addr=%#llux len=%ud flags=%#ux", 
                           i, tb->addr, tb->len, tb->flags);
                if(tb->flags & BUFCHAIN) dprint(" NEXT");
                if(tb->flags & BUFWR) {
                    dprint(" WRITE");
                    total_wr += tb->len;
                } else {
                    total_rd += tb->len;
                }
                dprint("\n");
            }
            dprint("  totals: readable=%lud writable=%lud\n", total_rd, total_wr);
        }
        
        ulong hdr_read = vioqread(b, cmd, sizeof(cmd));
        if(hdr_read < sizeof(cmd)){
            dprint("VIOBLK[%llud]: ERROR: header read failed: got %lud, need 16\n", myreq, hdr_read);
            goto nope;
        }
        
        u32int type = GET32(cmd, 0);
        u32int reserved = GET32(cmd, 4);
        addr = GET64(cmd, 8);
        
        if(debug || myreq <= 5)
            dprint("VIOBLK[%llud]: type=%ud reserved=%#ux sector=%llud rptr_after_hdr=%ud\n", myreq, type, reserved, addr, b->rptr);
        
        switch(type){
        case 0:  /* READ */
            lock(&reqlock);
            readcount++;
            unlock(&reqlock);
            
            n = vioqrem(b, 1);
            dprint("VIOBLK[%llud]: READ vioqrem(wr=1)=%lud\n", myreq, n);
            
            if(n == 0){
                dprint("VIOBLK[%llud]: ERROR: READ no writable space\n", myreq);
                ack = 1;
                break;
            }
            n -= 1;
            
            dprint("VIOBLK[%llud]: READ %lud bytes from sector %llud\n", myreq, n, addr);
            
            if(addr * 512 + n > v->blk.size * 512){
                dprint("VIOBLK[%llud]: ERROR: READ bounds failed\n", myreq);
                ack = 1;
                break;
            }
            
            offset = addr << 9;
            for(; n > 0; n -= rc){
                rc = sizeof(buf);
                if(n < rc) rc = n;
                rc = pread(v->blk.fd, buf, rc, offset);
                if(rc < 0){
                    dprint("VIOBLK[%llud]: ERROR: pread failed: %r\n", myreq);
                    ack = 1;
                    break;
                }
                if(rc == 0){
                    dprint("VIOBLK[%llud]: ERROR: pread EOF\n", myreq);
                    ack = 1;
                    break;
                }
                vioqwrite(b, buf, rc);
                offset += rc;
            }
            break;
            
        case 1:  /* WRITE */
            lock(&reqlock);
            mywrite = ++writecount;
            unlock(&reqlock);
            
            n = vioqrem(b, 0);
            if(debug || myreq <= 5)
                dprint("VIOBLK[%llud]: WRITE #%llud vioqrem(wr=0)=%lud sector=%llud\n", myreq, mywrite, n, addr);
            
            if(n == 0){
                dprint("VIOBLK[%llud]: WARNING: WRITE with no data\n", myreq);
                break;
            }
            
            if(addr * 512 + n > v->blk.size * 512){
                dprint("VIOBLK[%llud]: ERROR: WRITE bounds failed\n", myreq);
                ack = 1;
                break;
            }
            
            offset = addr << 9;
            for(; n > 0; n -= m){
                m = vioqread(b, buf, sizeof(buf));
                if(m <= 0)
                    break;
                if(n < m) m = n;
                
                rc = pwrite(v->blk.fd, buf, m, offset);
                if(rc < 0){
                    dprint("VIOBLK[%llud]: ERROR: pwrite failed: %r\n", myreq);
                    ack = 1;
                    break;
                }
                if(rc < m){
                    dprint("VIOBLK[%llud]: ERROR: short write %d < %d\n", myreq, rc, m);
                    ack = 1;
                    break;
                }
                offset += m;
            }
            break;
            
        case 4:  /* FLUSH */
            dprint("VIOBLK[%llud]: FLUSH\n", myreq);
            ack = 0;
            break;
            
        case 8:  /* GET_ID */
            {
                char serial[20];
                memset(serial, 0, 20);
                snprint(serial, 20, "vmx-vioblk");
                vioqwrite(b, serial, 20);
                ack = 0;
            }
            break;
            
        default:
        nope:
            ack = 2;
        }
        
        vioqwrite(b, &ack, 1);
        
        if(debug || myreq <= 5)
            dprint("VIOBLK[%llud]: completing, ack=%d\n", myreq, ack);
        
		if(myreq % 1000 == 0)
    		dprint("REQ %llud\n", myreq);

        vioputbuf(b);
        
        if(myreq % 1000 == 0)
            dprint("VIOBLK: %llud requests (%llud reads, %llud writes)\n", myreq, readcount, writecount);

		extern int wakepipe[MAXVCPU][2];
		extern int hltpipe[MAXVCPU][2];

 //		vioirq(v, 1);  /* set ISR here after all work done */
 //       write(local_ackpipes[idx][1], &c, 1);  /* signal done */
//		for(int i = 0; i < nvcpu; i++) 
//	        	write(wakepipe[i][1], &c, 1);

    }
}

 

/*
 * Start the worker procs for every registered device: one vioblkproc per
 * block device, and a receive plus a transmit proc per network device.
 * A local pipe pair is created for each device as well (block devices
 * additionally get an ack pipe); network devices use the slots after the
 * block devices.  Pipe creation failure is fatal.
 */
void
virtio_start_workers(void)
{
	int i, slot;

	for(i = 0; i < vioshared->nblkdevs; i++){
		if(pipe(local_devpipes[i]) < 0)
			sysfatal("virtio pipe: %r");
		if(pipe(local_ackpipes[i]) < 0)
			sysfatal("virtio pipe: %r");
		proccreate(vioblkproc, vioshared->blkdevs[i], 131072);
	}

	for(i = 0; i < vioshared->nnetdevs; i++){
		slot = vioshared->nblkdevs + i;
		if(pipe(local_devpipes[slot]) < 0)
			sysfatal("virtio pipe: %r");
		proccreate(vionetrproc, vioshared->netdevs[i], 8192);
		proccreate(vionetwproc, vioshared->netdevs[i], 8192);
	}
}


/*
 * Create a virtio block device backed by file fn, opened read/write.
 * The capacity is the file length (found by seeking to the end) in whole
 * 512-byte sectors.  Returns 0 on success, -1 when the file cannot be
 * opened or sized.  The worker proc is started later by
 * virtio_start_workers().
 */
int
mkvioblk(char *fn)
{
	int fd;
	VIODev *d;
	uvlong size;
	vlong filesize;

	fd = open(fn, ORDWR);
	if(fd < 0){
		fprint(2, "VIOBLK: ERROR: cannot open '%s': %r\n", fn);
		return -1;
	}

	/* file length = offset reached by seeking to the end */
	filesize = seek(fd, 0, 2);
	if(filesize < 0){
		fprint(2, "VIOBLK: ERROR: seek to end failed: %r\n");
		close(fd);
		return -1;
	}

	size = filesize >> 9;

	dprint("VIOBLK: opened '%s' fd=%d filesize=%lld size=%llud sectors\n",
		fn, fd, filesize, size);

	if(size == 0){
		fprint(2, "VIOBLK: WARNING: disk size is 0 sectors!\n");
		fprint(2, "VIOBLK: All I/O will fail bounds check. Is the file empty?\n");
	}

	/* rewind; a failure is only reported, the worker uses pread/pwrite */
	if(seek(fd, 0, 0) < 0){
		fprint(2, "VIOBLK: ERROR: seek to 0 failed: %r\n");
	}

	d = mkviodev_alloc(1);

	d->io = vioblkio;
	d->devfeat = (1<<1) | (1<<2) | (1<<6);  /* SIZE_MAX, SEG_MAX, BLK_SIZE */
	d->blk.fd = fd;
	d->blk.size = size;

	mkvioqueue(d, 1024, viowakeup);

	mkviodev_register(d, 0x1001, 0x018000, 2);

	vioshared->blkdevs[vioshared->nblkdevs++] = d;

	return 0;
}