shithub: vmxsmp

ref: e0e889fb69a6d4d3f2332244ef4f79f900a66d84
dir: /exith.c/

/*
 * exith.c - VM exit handlers for vmx
 *
 * Refactored with:
 * - Fixed self-IPI delivery timing (deferred until after instruction completes)
 * - Cleaner LAPIC/IOAPIC MMIO handling using dispatch tables
 * - Better TPR/PPR handling for interrupt priority
 */

#include <u.h>
#include <libc.h>
#include <tos.h>
#include <thread.h>
#include "dat.h"
#include "fns.h"
#include "x86.h"

int persist = 1;

extern Lock statelock;

IpiQueue * ipiqueue; 
vlong cached_tscoff;
extern int wakepipe[MAXVCPU][2];
extern int hltpipe[MAXVCPU][2];

/* Timer source for HLT timeout */
enum { TIMER_HLT_TIMEOUT = 99 };  /* Special source for HLT */

typedef struct ExitInfo ExitInfo;
struct ExitInfo {
	char *raw;
	char *name;
	uvlong qual;
	uvlong pa, va;
	u32int ilen, iinfo;
};

char *x86reg[16] = {
	RAX, RCX, RDX, RBX,
	RSP, RBP, RSI, RDI,
	R8, R9, R10, R11,
	R12, R13, R14, R15
};

char *x86segreg[8] = {
	"cs", "ds", "es", "fs", "gs", "ss",
};

extern int debug;
extern LApic lapic;
extern IOApic *ioapic;

/*
 * In-Service Register bitmap - tracks ALL interrupts currently being serviced.
 * 256 vectors = 8 x 32-bit words.
 * Each forked CPU process has its own copy.
 */
u32int isr_bitmap[8] = {0, 0, 0, 0, 0, 0, 0, 0};

 
/*
 * LAPIC TPR (Task Priority Register) - per CPU
 * Used for interrupt priority filtering
 */
static u32int lapic_tpr[MAXVCPU];

/*
 * LAPIC Spurious Vector Register - per CPU
 * Bit 8 enables the LAPIC
 */
u32int lapic_svr[MAXVCPU];

/* Saved ICR high for IPI delivery */
static u32int icr_hi_saved;

/* Track INIT sent state for SIPI handling */
static u32int init_sent[MAXVCPU];

/*
 * Interrupt Request Register bitmap - tracks ALL pending interrupts.
 * 256 vectors = 8 x 32-bit words.
 * This is needed because VMX can only hold ONE pending interrupt at a time,
 * so we must track multiple pending interrupts in software.
 */
u32int irr_bitmap[8] = {0, 0, 0, 0, 0, 0, 0, 0};
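
/*
 * Illustrative example: vector 0x31 (49) lives in word 49>>5 = 1,
 * bit 49&31 = 17, so set_irr(0x31) does irr_bitmap[1] |= 1<<17 and
 * clear_irr(0x31) clears that same bit.
 */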

Lock irr_lock;

/* Set a pending interrupt in software IRR */
void
set_irr(int vector)
{
	if(vector >= 0 && vector < 256)
		irr_bitmap[vector >> 5] |= (1 << (vector & 31));
}

/* Clear a pending interrupt from software IRR */
static void
clear_irr(int vector)
{
	if(vector >= 0 && vector < 256)
		irr_bitmap[vector >> 5] &= ~(1 << (vector & 31));
}
 
/*
 * Pick a pending vector from the software IRR and hand it to the kernel
 * via the irq ctl; when nothing is pending, issue "irq" with no vector.
 */
void
inject_pending_irq(void)
{
	int i, j, vec = -1;
	u32int bits;

	for(i = 7; i >= 0; i--){
		bits = irr_bitmap[i];
		if(bits != 0){
			/* Find lowest set bit in the highest non-empty word */
			for(j = 0; j < 32; j++){
				if(bits & (1 << j)){
					vec = (i << 5) + j;
					goto found;
				}
			}
		}
	}
found:
	if(vec >= 0)
		ctl("irq %d", vec);
	else
		ctl("irq");
}
/* ============================================================
 * Utility Functions
 * ============================================================ */



/*
 * Helper: Skip current instruction
 */
static void
skipinstr(ExitInfo *ei)
{
	rset(RPC, rget(RPC) + ei->ilen);
}


/*
 * Direct preemption timer arm (for hlt timeout)
 * This bypasses the unified timer manager for the HLT case.
 */
void
preempt_arm(vlong deadline_ns)
{
	extern u32int preempt_shift;
	vlong now, delta_ns;
	uvlong delta_tsc, ticks, freq;
	
	freq = _tos->cyclefreq;
	if(freq == 0)
		return;
	
	now = nanosec();
	
	if(deadline_ns <= now){
		ctl("preempt 1");
		return;
	}
	
	delta_ns = deadline_ns - now;
	delta_tsc = (delta_ns * freq) / 1000000000ULL;
	ticks = delta_tsc >> preempt_shift;
	
	if(ticks > 0xFFFFFFFFULL)
		ticks = 0xFFFFFFFF;
	if(ticks == 0)
		ticks = 1;
	
	ctl("preempt %ud", (u32int)ticks);
}
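
/*
 * Worked example of the conversion above (assumed numbers): with
 * cyclefreq = 2 GHz and preempt_shift = 5, a deadline 1 ms away gives
 * delta_tsc = 1000000 ns * 2e9 / 1e9 = 2,000,000 cycles, so
 * ticks = 2,000,000 >> 5 = 62,500 preemption-timer ticks.
 */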

/*
 * Translate guest virtual address to physical.
 */
static uvlong
guest_vtop(uvlong va)
{
	uvlong cr0 = rget("cr0real");

	/* Paging disabled - virtual == physical */
	if((cr0 & 0x80000000ULL) == 0)
		return va;

	/* Low addresses - likely identity mapped in early boot */
	if(va < 0x100000000ULL)
		return va;

	/* For high addresses, we MUST walk the page tables.
	 * Different OSes use different mappings:
	 * - Plan 9: 0xffffffff80000000 + phys
	 * - Linux: varies with KASLR, direct map at 0xffff888000000000
	 * - OpenBSD: 0xffff800000000000 + phys
	 * Static mappings are unreliable, so always walk. */
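
	/*
	 * Index split for a sample VA (Plan 9 KZERO, 0xffffffff80000000):
	 * PML4 index (bits 39-47) = 511, PDPT index (bits 30-38) = 510,
	 * PD index (bits 21-29) = 0, PT index (bits 12-20) = 0, offset = 0.
	 */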

	uvlong cr3 = rget("cr3");
	uvlong efer = rget("efer");
	int lma = (efer >> 10) & 1;

	if(lma){
		uvlong pml4_base = cr3 & ~0xFFFULL;
		uvlong pte, *ptep;
		uvlong pa;
		int idx;

		/* PML4 */
		idx = (va >> 39) & 0x1FF;
		ptep = gptr(pml4_base + idx * 8, 8);
		if(ptep == nil || (*ptep & 1) == 0)
			goto fallback;
		pte = *ptep;

		/* PDPT */
		pa = pte & 0x000FFFFFFFFFF000ULL;
		idx = (va >> 30) & 0x1FF;
		ptep = gptr(pa + idx * 8, 8);
		if(ptep == nil || (*ptep & 1) == 0)
			goto fallback;
		pte = *ptep;
		if(pte & 0x80)  /* 1GB page */
			return (pte & 0x000FFFFFC0000000ULL) | (va & 0x3FFFFFFF);

		/* PD */
		pa = pte & 0x000FFFFFFFFFF000ULL;
		idx = (va >> 21) & 0x1FF;
		ptep = gptr(pa + idx * 8, 8);
		if(ptep == nil || (*ptep & 1) == 0)
			goto fallback;
		pte = *ptep;
		if(pte & 0x80)  /* 2MB page */
			return (pte & 0x000FFFFFFFE00000ULL) | (va & 0x1FFFFF);

		/* PT */
		pa = pte & 0x000FFFFFFFFFF000ULL;
		idx = (va >> 12) & 0x1FF;
		ptep = gptr(pa + idx * 8, 8);
		if(ptep == nil || (*ptep & 1) == 0)
			goto fallback;
		pte = *ptep;

		return (pte & 0x000FFFFFFFFFF000ULL) | (va & 0xFFF);
	}

fallback:
	/* Fallback: try common static mappings */
	
	/* Plan 9 / 9front kernel: KZERO = 0xffffffff80000000 */
	if(va >= 0xffffffff80000000ULL)
		return va - 0xffffffff80000000ULL;

	/* OpenBSD amd64 direct map */
	if(va >= 0xffff800000000000ULL && va < 0xffff880000000000ULL)
		return va - 0xffff800000000000ULL;

	/* Linux direct map */
	if(va >= 0xffff888000000000ULL && va < 0xffffc00000000000ULL)
		return va - 0xffff888000000000ULL;

	return va;
}

/* ============================================================
 * Instruction Decoding for MMIO
 * ============================================================ */

/*
 * Get value being written to MMIO address
 */
static u32int
getmovval(void)
{
	uvlong rip = rget(RPC);
	uvlong phys = guest_vtop(rip);
	u8int *ip = gptr(phys, 16);

	if(ip == nil)
		return 0;

	int i = 0;
	int rex = 0;

	/* Skip prefixes */
	while(i < 8){
		if((ip[i] & 0xF0) == 0x40)
			rex = ip[i++];
		else if(ip[i] == 0x66 || ip[i] == 0x67)
			i++;
		else
			break;
	}

	/* MOV r/m, r32/64: 0x89 */
	if(ip[i] == 0x89){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;
		return rget(x86reg[reg]);
	}

	/* MOV r/m, imm32: 0xC7 /0 */
	if(ip[i] == 0xC7){
		int modrm = ip[i+1];
		int mod = modrm >> 6;
		int rm = modrm & 7;
		int off = 2;

		if(rm == 4) off++;  /* SIB */
		if(mod == 1) off += 1;
		else if(mod == 2 || (mod == 0 && rm == 5)) off += 4;

		return ip[i+off] | (ip[i+off+1]<<8) | (ip[i+off+2]<<16) | (ip[i+off+3]<<24);
	}

	/* MOV r/m, r8: 0x88 */
	if(ip[i] == 0x88){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;
		return rget(x86reg[reg]) & 0xFF;
	}

	/* MOVNTI: 0x0F 0xC3 */
	if(ip[i] == 0x0F && ip[i+1] == 0xC3){
		int modrm = ip[i+2];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;
		return rget(x86reg[reg]);
	}

	return 0;
}
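
/*
 * Illustrative decode for getmovval(): "mov %ecx, 0xb0(%rax)" (e.g. an
 * EOI write with the LAPIC base in RAX) encodes as 89 88 b0 00 00 00;
 * opcode 0x89 with ModRM reg field 1 selects RCX, so the value written
 * is the low 32 bits of RCX.
 */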

/*
 * Set destination register for MMIO read
 */
static int
setmovdest(uvlong val)	/* uvlong so 64-bit destinations keep their full value */
{
	uvlong rip = rget(RPC);
	uvlong phys = guest_vtop(rip);
	u8int *ip = gptr(phys, 16);

	if(ip == nil){
		dprint("setmovdest: gptr failed rip=%#llx phys=%#llx\n", rip, phys);
		rset(RAX, val);
		return 0;
	}
	
	/* Debug: show what we're decoding */
	if(ip[0] == 0 && ip[1] == 0 && ip[2] == 0 && ip[3] == 0){
		dprint("setmovdest: zero opcodes! rip=%#llx phys=%#llx\n", rip, phys);
	}

	int i = 0;
	int rex = 0;
	int has66 = 0;

	/* Skip prefixes */
	while(i < 8){
		if((ip[i] & 0xF0) == 0x40)
			rex = ip[i++];
		else if(ip[i] == 0x66){
			has66 = 1;
			i++;
		}
		else if(ip[i] == 0x67 || ip[i] == 0xF2 || ip[i] == 0xF3 ||
		        ip[i] == 0x2E || ip[i] == 0x3E || ip[i] == 0x26 ||
		        ip[i] == 0x64 || ip[i] == 0x65 || ip[i] == 0x36)
			i++;
		else
			break;
	}

	/* MOV r32/64, r/m32/64: 0x8B */
	if(ip[i] == 0x8B){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;

		if(rex & 8){
			/* REX.W = 64-bit operation */
			rset(x86reg[reg], val);
		} else if(has66){
			/* 0x66 prefix = 16-bit operation */
			uvlong oldval = rget(x86reg[reg]);
			rset(x86reg[reg], (oldval & ~0xFFFFULL) | (val & 0xFFFF));
		} else {
			/* No REX.W, no 0x66 = 32-bit operation (zero-extends to 64-bit) */
			rset(x86reg[reg], (u32int)val);
		}

		return 1;
	}

	/* MOV r8, r/m8: 0x8A */
	if(ip[i] == 0x8A){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;

		if(rex){
			if(rex & 4) reg |= 8;
			uvlong oldval = rget(x86reg[reg]);
			rset(x86reg[reg], (oldval & ~0xFFULL) | (val & 0xFF));
		} else if(reg >= 4 && reg <= 7){
			char *basereg = x86reg[reg - 4];
			uvlong oldval = rget(basereg);
			rset(basereg, (oldval & ~0xFF00ULL) | ((val & 0xFF) << 8));
		} else {
			uvlong oldval = rget(x86reg[reg]);
			rset(x86reg[reg], (oldval & ~0xFFULL) | (val & 0xFF));
		}
		return 1;
	}

	/* MOVZX r32, r/m8: 0x0F 0xB6 */
	if(ip[i] == 0x0F && ip[i+1] == 0xB6){
		int modrm = ip[i+2];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;
		rset(x86reg[reg], val & 0xFF);
		return 1;
	}

	/* MOVZX r32, r/m16: 0x0F 0xB7 */
	if(ip[i] == 0x0F && ip[i+1] == 0xB7){
		int modrm = ip[i+2];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;
		rset(x86reg[reg], val & 0xFFFF);
		return 1;
	}

	/* TEST r/m32/64, imm32: 0xF7 /0 */
	if(ip[i] == 0xF7){
		int modrm = ip[i+1];
		int regop = (modrm >> 3) & 7;

		if(regop == 0){
			int mod = (modrm >> 6) & 3;
			int rm = modrm & 7;
			int immoff = i + 2;

			if(mod != 3 && rm == 4) immoff++;
			if(mod == 1) immoff += 1;
			else if(mod == 2) immoff += 4;
			else if(mod == 0 && rm == 5) immoff += 4;

			u32int imm = (u32int)ip[immoff] |
			            ((u32int)ip[immoff+1] << 8) |
			            ((u32int)ip[immoff+2] << 16) |
			            ((u32int)ip[immoff+3] << 24);

			u64int result;
			u64int signbit;
			if(rex & 8){
				result = val & (u64int)(s64int)(s32int)imm;
				signbit = 1ULL << 63;
			} else {
				result = (u32int)val & imm;
				signbit = 1ULL << 31;
			}

			uvlong flags = rget("flags");
			flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11));

			if(result == 0) flags |= (1 << 6);
			if(result & signbit) flags |= (1 << 7);

			u8int pb = result & 0xFF;
			pb ^= pb >> 4;
			pb ^= pb >> 2;
			pb ^= pb >> 1;
			if((pb & 1) == 0) flags |= (1 << 2);

			rset("flags", flags);
			return 1;
		}
	}

	/* CMP r/m32, imm32: 0x81 /7 or 0x83 /7 */
	if(ip[i] == 0x81 || ip[i] == 0x83){
		int modrm = ip[i+1];
		int regop = (modrm >> 3) & 7;

		if(regop == 7){
			int mod = (modrm >> 6) & 3;
			int rm = modrm & 7;
			int immoff = i + 2;

			if(mod != 3 && rm == 4) immoff++;
			if(mod == 1) immoff += 1;
			else if(mod == 2) immoff += 4;
			else if(mod == 0 && rm == 5) immoff += 4;

			s64int imm;
			if(ip[i] == 0x83)
				imm = (s64int)(s8int)ip[immoff];
			else
				imm = (s64int)(s32int)((u32int)ip[immoff] |
				            ((u32int)ip[immoff+1] << 8) |
				            ((u32int)ip[immoff+2] << 16) |
				            ((u32int)ip[immoff+3] << 24));

			u64int op1, op2, result;
			u64int signbit, mask;
			if(rex & 8){
				op1 = val;
				op2 = (u64int)imm;
				result = op1 - op2;
				signbit = 1ULL << 63;
				mask = ~0ULL;
			} else {
				op1 = (u32int)val;
				op2 = (u32int)imm;
				result = op1 - op2;
				signbit = 1ULL << 31;
				mask = 0xFFFFFFFFULL;
			}
			result &= mask;

			uvlong flags = rget("flags");
			flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11) | (1<<4));

			if(op1 < (op2 & mask)) flags |= (1 << 0);
			if(result == 0) flags |= (1 << 6);
			if(result & signbit) flags |= (1 << 7);
			if(((op1 ^ op2) & (op1 ^ result)) & signbit) flags |= (1 << 11);

			u8int pb = result & 0xFF;
			pb ^= pb >> 4;
			pb ^= pb >> 2;
			pb ^= pb >> 1;
			if((pb & 1) == 0) flags |= (1 << 2);

			rset("flags", flags);
			return 1;
		}
	}

	/* CMP r/m32, r32: 0x39 */
	if(ip[i] == 0x39){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;

		u64int regval = rget(x86reg[reg]);
		u64int op1, op2, result;
		u64int signbit, mask;

		if(rex & 8){
			op1 = val;
			op2 = regval;
			signbit = 1ULL << 63;
			mask = ~0ULL;
		} else {
			op1 = (u32int)val;
			op2 = (u32int)regval;
			signbit = 1ULL << 31;
			mask = 0xFFFFFFFFULL;
		}
		result = (op1 - op2) & mask;

		uvlong flags = rget("flags");
		flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11));

		if(op1 < op2) flags |= (1 << 0);
		if(result == 0) flags |= (1 << 6);
		if(result & signbit) flags |= (1 << 7);
		if(((op1 ^ op2) & (op1 ^ result)) & signbit) flags |= (1 << 11);

		u8int pb = result & 0xFF;
		pb ^= pb >> 4;
		pb ^= pb >> 2;
		pb ^= pb >> 1;
		if((pb & 1) == 0) flags |= (1 << 2);

		rset("flags", flags);
		return 1;
	}

	/* CMP r32, r/m32: 0x3B */
	if(ip[i] == 0x3B){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;

		u64int regval = rget(x86reg[reg]);
		u64int op1, op2, result;
		u64int signbit, mask;

		if(rex & 8){
			op1 = regval;
			op2 = val;
			signbit = 1ULL << 63;
			mask = ~0ULL;
		} else {
			op1 = (u32int)regval;
			op2 = (u32int)val;
			signbit = 1ULL << 31;
			mask = 0xFFFFFFFFULL;
		}
		result = (op1 - op2) & mask;

		uvlong flags = rget("flags");
		flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11));

		if(op1 < op2) flags |= (1 << 0);
		if(result == 0) flags |= (1 << 6);
		if(result & signbit) flags |= (1 << 7);
		if(((op1 ^ op2) & (op1 ^ result)) & signbit) flags |= (1 << 11);

		u8int pb = result & 0xFF;
		pb ^= pb >> 4;
		pb ^= pb >> 2;
		pb ^= pb >> 1;
		if((pb & 1) == 0) flags |= (1 << 2);

		rset("flags", flags);
		return 1;
	}

	/* Fallback - log and try to handle common patterns */
	dprint("setmovdest: unhandled opcode %02x %02x %02x %02x (rex=%02x has66=%d) val=%#llx\n", ip[i], ip[i+1], ip[i+2], ip[i+3], rex, has66, val);  /* Changed %#x to %#llx */
	
	/*
	 * For unhandled read instructions, try to determine destination from ModRM.
	 * Most memory reads use ModRM where bits 3-5 encode the destination register.
	 */
	if(i < 14 && (ip[i] == 0x8B || ip[i] == 0x8A || ip[i] == 0x03 || ip[i] == 0x0B ||
	              ip[i] == 0x13 || ip[i] == 0x1B || ip[i] == 0x23 || ip[i] == 0x2B ||
	              ip[i] == 0x33 || ip[i] == 0x3B)){
		int modrm = ip[i+1];
		int reg = (modrm >> 3) & 7;
		if(rex & 4) reg |= 8;
		dprint("setmovdest: fallback using reg %d (%s)\n", reg, x86reg[reg]);
		rset(x86reg[reg], val);
		return 1;
	}
	
	/* Last resort: set RAX */
	rset(RAX, val);
	return 0;
}

/* ============================================================
 * IPI Handling
 * ============================================================ */


void
ipiqueueinit(void)
{
	ipiqueue = mkseg("ipi", 0x300001000, 0x1000);
    memset(ipiqueue, 0, sizeof(IpiQueue));
	 
	ipiqueue->pids[0] = getpid();
	
	dprint("IPI queue initialized\n");
}

/*
 * ipi_queue(cpu, vec) - Queue interrupt for target CPU
 * Used by: ICR writes, IOAPIC delivery, cross-CPU timers
 * For remote: sends postnote to wake target
 * For local: just queues, caller must ipi_poll() later
 */
void
ipi_queue(int target, int vector)
{
    if(ipiqueue == nil || target < 0 || target >= MAXVCPU)
        return;

    lock(&ipiqueue->lock[target]);
    ipiqueue->cpu[target].pending[vector >> 5] |= (1 << (vector & 31)); 
    unlock(&ipiqueue->lock[target]);
    coherence();

    /* Always use pipe - caller might be I/O thread in different process */
    write(ipiqueue->wakefds[target], "w", 1);
}
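
/*
 * Usage sketch: ipi_queue(1, 0x40) marks vector 0x40 pending for vCPU 1
 * and writes a byte to its wake pipe; vCPU 1 folds the bit into its
 * irr_bitmap at its next ipi_poll(), and the run loop injects it later
 * (see inject_pending_irq).
 */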

/*
 * ipi_poll() - Drain ALL queued interrupts into IRR
 * Called at: start of processexit(), end of eptfault(), in hlt()
 */

void
ipi_poll(void)
{
    int i; 

    if(ipiqueue == nil)
        return;
  

    lock(&ipiqueue->lock[curcpuid]);
    for(i = 0; i < 8; i++){
		irr_bitmap[i] |= ipiqueue->cpu[curcpuid].pending[i]; 
        ipiqueue->cpu[curcpuid].pending[i] = 0;
    } 
    unlock(&ipiqueue->lock[curcpuid]);
 
}

int
ipi_pending(void)
{
	int i, has_pending = 0;

	if(ipiqueue == nil)
		return 0;

	lock(&ipiqueue->lock[curcpuid]);
	for(i = 0; i < 8; i++){

		if(ipiqueue->cpu[curcpuid].pending[i]){
			has_pending = 1;
			break;
		}
	}
	unlock(&ipiqueue->lock[curcpuid]);

	return has_pending;
}

 
  
static void
ioapic_eoi(int vector)
{
    int i;
    u64int redir;
    
    if(ioapic == nil)
        return;
    
    dprint("ioapic_eoi(%d): pending before=%#x\n", vector, ioapic->irq_pending);
    
    lock(&ioapic->lock);
    for(i = 0; i < 24; i++){
        redir = ioapic->redir[i];
       
        if((redir & 0xFF) == (u32int)vector){
            ioapic->irq_pending &= ~(1 << i);
            
            int level_triggered = (redir >> 15) & 1;
            int masked = (redir >> 16) & 1;
            
            dprint("ioapic_eoi: IRQ %d redir=%#llx level_trig=%d irq_level=%#x masked=%d\n", i, redir, level_triggered, ioapic->irq_level, masked);
            
            /* If level-triggered and line still high, handle re-delivery */
            if(level_triggered && (ioapic->irq_level & (1 << i))){
                if(masked){
                    /* Line high but masked - set pending for unmask to deliver */
                    ioapic->irq_pending |= (1 << i);
                } else {
                    /* Line high and not masked - re-deliver now */
                    unlock(&ioapic->lock);
					ipi_queue(curcpuid, vector);
                    
                    return;
                }
            }
            
            dprint("ioapic_eoi: cleared IRQ %d, pending now=%#x\n", i, ioapic->irq_pending);
            break;
        }
    }
    unlock(&ioapic->lock);
}
/* ============================================================
 * I/O APIC MMIO Handlers
 * ============================================================ */

void
ioapic_init(void)
{
    ioapic = mkseg("ioapic", 0x300000000, 0x1000);
    memset(ioapic, 0, sizeof(IOApic));

    ioapic->id = nvcpu << 24;  /* ID in bits 24-27 */
 
	
	for(int i = 0; i < 24; i++){
	    if(i >= 16 && i <= 19){
	        /* PCI interrupt pins: level-triggered, unmasked */
	        ioapic->redir[i] = (1 << 15) | (48 + (i - 16));
	    } else {
	        /* Other pins: masked until guest programs them */
	        ioapic->redir[i] = (1ULL << 16) | (32 + i);
	    }
	}

    dprint("IOAPIC initialized: id=%#ux (APIC ID %d)\n", ioapic->id, ioapic->id >> 24);

}

static void
ioapic_mmio_handler(ExitInfo *ei)
{
	u32int off = ei->pa & 0xFFF;
	int iswrite = (ei->qual & 2) != 0;
	u32int val = 0;
	int sel;

	if(iswrite){
		val = getmovval();

		switch(off){
		case 0x00: /* IOREGSEL */
			ioapic->reg_sel[curcpuid] = val;
			dprint("IOAPIC: IOREGSEL = %#x\n", val);
			break;

		case 0x10: /* IOWIN */
			sel = ioapic->reg_sel[curcpuid];
			dprint("IOAPIC: IOWIN write reg=%#x val=%#x\n", sel, val);

			switch(sel){
			case 0x00: /* ID */
				lock(&ioapic->lock);
				ioapic->id = val;
				unlock(&ioapic->lock);
				break;

			default:
				if(sel >= 0x10 && sel < 0x40){
					int idx = (sel - 0x10) / 2;
					lock(&ioapic->lock);
					if(sel & 1){
						/* High 32 bits - destination only, no pending check needed */
						ioapic->redir[idx] = (ioapic->redir[idx] & 0xFFFFFFFFULL) |
						                     ((u64int)val << 32);
						unlock(&ioapic->lock);
					} else {
						/* Low 32 bits - check for unmasking */
						u64int old = ioapic->redir[idx];
						ioapic->redir[idx] = (ioapic->redir[idx] & 0xFFFFFFFF00000000ULL) | val;

						/* Debug: track when IRQ 2 (timer) gets masked/unmasked */
						if(debug && idx == 2){
							if(val & 0x10000)
								dprint("IOAPIC: IRQ 2 MASKED! val=%#x\n", val);
							else
								dprint("IOAPIC: IRQ 2 UNMASKED val=%#x vector=%d\n", val, val & 0xFF);
						}

						/* Check if unmasking with a pending interrupt or the line held high */
						if((old & (1<<16)) && !(val & (1<<16))){
							/* Was masked, now unmasked */
							int level_trig = (val >> 15) & 1;
							int should_deliver = ioapic->irq_pending & (1 << idx);

							/* For level-triggered, also deliver if the line is currently high */
							if(level_trig && (ioapic->irq_level & (1 << idx)))
								should_deliver = 1;

							if(should_deliver){
								int vector = val & 0xFF;
								int dest = (ioapic->redir[idx] >> 56) & 0xFF;

								ioapic->irq_pending &= ~(1 << idx);
								unlock(&ioapic->lock);

								ipi_queue(dest, vector);
								goto done_ioapic_write;
							}
						}

						unlock(&ioapic->lock);
					}

					dprint("IOAPIC: redir[%d] %s = %#llx\n", idx, (sel & 1) ? "hi" : "lo", ioapic->redir[idx]);
				}
				break;
			}
		done_ioapic_write:
			break;
		default:
			dprint("IOAPIC: write to unknown offset %#x\n", off);
		}
	} else {
		switch(off){
		case 0x00: /* IOREGSEL */
			val = ioapic->reg_sel[curcpuid];
			break;

		case 0x10: /* IOWIN */
			sel = ioapic->reg_sel[curcpuid];

			switch(sel){
			case 0x00: /* ID */
				lock(&ioapic->lock);
				val = ioapic->id;
				unlock(&ioapic->lock);
				break;

			case 0x01: /* Version */
				val = 0x00170011;  /* 24 entries, version 0x11 */
				break;

			default:
				if(sel >= 0x10 && sel < 0x40){
        			int idx = (sel - 0x10) / 2;
        			lock(&ioapic->lock);
        			if(sel & 1)
            			val = (u32int)(ioapic->redir[idx] >> 32);
        			else
            			val = (u32int)ioapic->redir[idx];
        			unlock(&ioapic->lock);
    			} else
        			val = 0;
				break;
			}
		}

		dprint("IOAPIC: read off=%#x val=%#x\n", off, val);
		setmovdest(val);
	}
}

void
ioapic_set_irq(int irq, int level)
{
    u64int redir;
    int vector, dest, masked, destmode;
    int cpu;
    int need_delivery[MAXVCPU] = {0};
    int delivery_vector = -1;
    
    if(ioapic == nil || irq < 0 || irq >= 24)
        return;
    
    lock(&ioapic->lock);
    redir = ioapic->redir[irq];
    masked = (redir >> 16) & 1;
    vector = redir & 0xFF;
    destmode = (redir >> 11) & 1;
    dest = (redir >> 56) & 0xFF;
    
    if(level)
        ioapic->irq_level |= (1 << irq);
    else
        ioapic->irq_level &= ~(1 << irq);

    if(level && !masked){
        delivery_vector = vector;
        if(destmode == 1){
    		static int next_cpu = 0;
    		need_delivery[next_cpu] = 1;
    		next_cpu = (next_cpu + 1) % nvcpu;
		} else {
            if(dest < nvcpu)
                need_delivery[dest] = 1;
        }
    }
    /* Release the ioapic lock before delivery */
    unlock(&ioapic->lock);

    /* Always use IPI - caller might not be a vCPU process */
    for(cpu = 0; cpu < nvcpu; cpu++){
        if(need_delivery[cpu]){
               ipi_queue(cpu, delivery_vector); 
        }
    }
}


/* ============================================================
 * LAPIC Timer
 * ============================================================ */

#define LAPIC_BUS_FREQ_HZ  100000000ULL  /* 100 MHz */
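
/*
 * Example (assumed guest programming): with divide = 16 and an initial
 * count of 100000, one timer period is 100000 * 16 * 1e9 / 100 MHz =
 * 16,000,000 ns = 16 ms; lapic_read_timer_cur() below applies the
 * inverse of this calculation to the remaining time.
 */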

 
/* ============================================================
 * FORK CPU 
 * ============================================================ */

enum { RCENT = 256 };

static int nactivecpu = 0; 
extern u32int isr_bitmap[]; 
extern uvlong rcvalid[(RCENT + 63)/64];
extern uvlong rcdirty[(RCENT + 63)/64];
extern void vmxsetup(void);
extern int getexit;
extern void runloop(void);
extern void modregion(Region *);
 

extern Channel *waitch, *notifch; 

void
forkcpu(u32int sipi)
{
	cached_tscoff = 0;
	rset("tscoff", 0);

    int pid = fork();
 
	dprint("CPU0: tscoff = %llud\n", ioapic->tsc_base);

    if(pid < 0)
        sysfatal("fork failed: %r");

    if(pid > 0){
		vmx_register_child(pid);
        nactivecpu += 1;
        return;
    }     

//	vmx_cleanup_init();

	atnotify(vmx_notehandler, 1);

    /* Child process */
    curcpuid = nactivecpu + 1;
	dprint("CPU%d: child started (pid %d)\n", curcpuid, getpid());

    if(ipiqueue != nil)
        ipiqueue->pids[curcpuid] = getpid();

    /* Create new VMX context for this CPU */
    vmxsetup();

    /* Map memory regions into new VMX context */
    Region *r;
    for(r = mmap; r != nil; r = r->next){
        modregion(r);
	 
		dprint("CPU%d: mapped regions, first region v=%p segname=%s\n", curcpuid, r->v, r->segname);
	}

    /* Clear register cache - must reload for new context */
    memset(rcvalid, 0, sizeof(rcvalid));
    memset(rcdirty, 0, sizeof(rcdirty));

    /* 
     * Set up real-mode CPU state at SIPI vector address.
     * SIPI vector is the page number, so address = sipi << 12.
     * CS selector = sipi << 8, CS base = sipi << 12, IP = 0.
     */
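
    /*
     * Example: a SIPI with vector 0x09 starts this AP at CS = 0x0900,
     * CS base = 0x9000, IP = 0, i.e. linear address 0x9000, where the
     * guest placed its AP boot trampoline.
     */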
    
    /* Code segment - points to SIPI trampoline */
    rset("cs", sipi << 8);
    rset("csbase", sipi << 12);
    rset("cslimit", 0xFFFF);
    rset("csperm", 0x9b);  /* Present, DPL=0, code, readable */
    
    /* Instruction pointer starts at 0 (relative to CS base) */
    rset("pc", 0);
    
    /* Data segment */
    rset("ds", 0);
    rset("dsbase", 0);
    rset("dslimit", 0xFFFF);
    rset("dsperm", 0x93);  /* Present, DPL=0, data, writable */
    
    /* Extra segment */
    rset("es", 0);
    rset("esbase", 0);
    rset("eslimit", 0xFFFF);
    rset("esperm", 0x93);
    
    /* Stack segment */
    rset("ss", 0);
    rset("ssbase", 0);
    rset("sslimit", 0xFFFF);
    rset("ssperm", 0x93);
    
    /* FS segment */
    rset("fs", 0);
    rset("fsbase", 0);
    rset("fslimit", 0xFFFF);
    rset("fsperm", 0x93);
    
    /* GS segment */
    rset("gs", 0);
    rset("gsbase", 0);
    rset("gslimit", 0xFFFF);
    rset("gsperm", 0x93);
    
    /* General purpose registers - all zero */
    rset(RAX, 0);
    rset(RBX, 0);
    rset(RCX, 0);
    rset(RDX, 0);
    rset(RSI, 0);
    rset(RDI, 0);
    rset(RBP, 0);
    rset(RSP, 0);
    rset(R8, 0);
    rset(R9, 0);
    rset(R10, 0);
    rset(R11, 0);
    rset(R12, 0);
    rset(R13, 0);
    rset(R14, 0);
    rset(R15, 0);
    
    /* Flags - bit 1 is always set (reserved) */
    rset("flags", 0x2);
    
    /* Control registers - real mode state */
    rset("cr0real", 0x10);  /* ET bit set (x87 present) */
    rset("cr0fake", 0x10);
    rset("cr2", 0);
    rset("cr3", 0);
    rset("cr4real", 0);
    rset("cr4fake", 0);
    
    /* Extended feature enable register - no long mode yet */
    rset("efer", 0);
    
    /* Debug registers */
    rset("dr0", 0);
    rset("dr1", 0);
    rset("dr2", 0);
    rset("dr3", 0);
    rset("dr6", 0xFFFF0FF0);  /* Default value */
    rset("dr7", 0x400);       /* Default value */
    
    /* GDTR/IDTR - real mode defaults */
    rset("gdtrbase", 0);
    rset("gdtrlimit", 0xFFFF);
    rset("idtrbase", 0);
    rset("idtrlimit", 0xFFFF);
    
    /* LDTR - not used in real mode */
    rset("ldtr", 0);
    rset("ldtrbase", 0);
    rset("ldtrlimit", 0xFFFF);
    rset("ldtrperm", 0x82);
    
    /* Task register - not used in real mode */
    rset("tr", 0);
    rset("trbase", 0);
    rset("trlimit", 0xFFFF);
    rset("trperm", 0x8b);

    /* Initialize per-CPU LAPIC state */
	lapic_timer_init(); 
	irqactive = -1;
	state = VMRUNNING;
    
    /* Clear interrupt state for this CPU */
    memset(isr_bitmap, 0, sizeof(isr_bitmap)); 

	icr_hi_saved = 0;
 
	memset(irr_bitmap, 0, sizeof(irr_bitmap));
    
    /* Clear exit counter */
    getexit = 0;
    
    /* 
     * Create NEW channels for this process.
     * Parent's channels are not valid after fork() - they belong
     * to the parent's libthread state.
     */
    waitch = chancreate(sizeof(char *), 32);   
    notifch = chancreate(sizeof(VmxNotif), 16);
    
    if(waitch == nil ||   notifch == nil)
        sysfatal("CPU%d: chancreate failed: %r", curcpuid);

    dprint("CPU%d: starting at %#x:0000 (linear %#x)\n",  curcpuid, sipi << 8, sipi << 12);
 
    rset("tscoff", 0);
	cached_tscoff = 0;

	
    virtio_start_workers();  /* Start virtio workers for this CPU's process */
    

    runloop();
    
    /* Should never return */
    exits("CPU exited");
}


/* ============================================================
 * LAPIC MMIO Handlers (using dispatch table)
 * ============================================================ */

typedef u32int (*LapicReadFn)(void);
typedef void (*LapicWriteFn)(u32int val);

/* Forward declarations */
static u32int lapic_read_id(void);
static u32int lapic_read_version(void);
static u32int lapic_read_tpr(void);
static u32int lapic_read_ppr(void);
static u32int lapic_read_eoi(void);
static u32int lapic_read_ldr(void);
static u32int lapic_read_dfr(void);
static u32int lapic_read_svr(void);
static u32int lapic_read_isr(int reg);
static u32int lapic_read_icr_lo(void);
static u32int lapic_read_icr_hi(void);
 
static u32int lapic_read_timer_init(void);
static u32int lapic_read_timer_cur(void);
static u32int lapic_read_timer_div(void);

static void lapic_write_tpr(u32int val);
static void lapic_write_eoi(u32int val);
static void lapic_write_ldr(u32int val);
static void lapic_write_dfr(u32int val);
static void lapic_write_svr(u32int val);
static void lapic_write_icr_lo(u32int val);
static void lapic_write_icr_hi(u32int val); 
static void lapic_write_timer_init(u32int val);
static void lapic_write_timer_div(u32int val);

 
 



/* LAPIC Register offsets */
enum {
	LAPIC_ID        = 0x020,
	LAPIC_VERSION   = 0x030,
	LAPIC_TPR       = 0x080,
	LAPIC_APR       = 0x090,
	LAPIC_PPR       = 0x0A0,
	LAPIC_EOI       = 0x0B0,
	LAPIC_RRD       = 0x0C0,
	LAPIC_LDR       = 0x0D0,
	LAPIC_DFR       = 0x0E0,
	LAPIC_SVR       = 0x0F0,
	LAPIC_ISR_BASE  = 0x100,
	LAPIC_TMR_BASE  = 0x180,
	LAPIC_IRR_BASE  = 0x200,
	LAPIC_ESR       = 0x280,
	LAPIC_ICR_LO    = 0x300,
	LAPIC_ICR_HI    = 0x310,
	LAPIC_LVT_TIMER = 0x320,
	LAPIC_LVT_THERM = 0x330,
	LAPIC_LVT_PERF  = 0x340,
	LAPIC_LVT_LINT0 = 0x350,
	LAPIC_LVT_LINT1 = 0x360,
	LAPIC_LVT_ERROR = 0x370,
	LAPIC_TIMER_ICR = 0x380,
	LAPIC_TIMER_CCR = 0x390,
	LAPIC_TIMER_DCR = 0x3E0,
	LAPIC_SELF_IPI  = 0x3F0,
};

static u32int
lapic_read_id(void)
{
	return curcpuid << 24;
}

static u32int
lapic_read_version(void)
{
	/* Version 0x14, max LVT entry 5, no extended APIC space */
	return 0x50014;
}

static u32int
lapic_read_tpr(void)
{
	return lapic_tpr[curcpuid];
}

static u32int
lapic_read_ppr(void)
{
	/* PPR = max(TPR, highest ISR priority) */
	u32int tpr = lapic_tpr[curcpuid];
	u32int isr_prio = 0;
	int i, j;

	for(i = 7; i >= 0; i--){
		if(isr_bitmap[i]){
			for(j = 31; j >= 0; j--){
				if(isr_bitmap[i] & (1 << j)){
					isr_prio = (i * 32 + j) & 0xF0;
					goto found;
				}
			}
		}
	}
found:
	return (tpr & 0xF0) > isr_prio ? (tpr & 0xF0) : isr_prio;
}
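
/*
 * Example: with TPR = 0x20 and vector 0x31 in service, the ISR priority
 * class is 0x31 & 0xF0 = 0x30, so PPR reads back as 0x30.
 */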

static u32int
lapic_read_eoi(void)
{
	return 0;  /* Write-only register */
}

static u32int
lapic_read_ldr(void)
{
    if(ioapic == nil)
        return (1 << curcpuid) << 24;  /* Fallback default */
    if(ioapic->ldr[curcpuid] == 0)
        ioapic->ldr[curcpuid] = (1 << curcpuid) << 24;  /* Default: unique bit */
    return ioapic->ldr[curcpuid];
}

static u32int
lapic_read_dfr(void)
{
	return 0xFFFFFFFF;  /* Flat model */
}

static u32int
lapic_read_svr(void)
{
	return lapic_svr[curcpuid];
}

static u32int
lapic_read_isr(int reg)
{
	return isr_bitmap[reg];
}

static u32int
lapic_read_icr_lo(void)
{
	/*
	 * Return 0 for delivery status (bit 12) = idle.
	 * This is critical for OpenBSD which polls this bit.
	 */
	return 0;
}

static u32int
lapic_read_icr_hi(void)
{
	return icr_hi_saved;
}

u32int
lapic_read_lvt_timer(void)
{
	return lapic.lvt_timer;
}

static u32int
lapic_read_timer_init(void)
{
	return lapic.timer_initial;
}

static u32int
lapic_read_timer_cur(void)
{
	extern uvlong current_ns(void);

	LApic *la = &lapic;
	vlong remain, ticks;
	u32int div;

	if(!la->timer_active || la->timer_deadline <= 0)
		return 0;

	remain = la->timer_deadline - vmtime_ns();
	if(remain <= 0)
		return 0;

	div = lapic_divide_value(la->timer_divide);
	
	/*
	 * Calculate current count from remaining time.
	 * This MUST be the inverse of the period calculation in timer.c:
	 *   period_ns = initial * divide * 1e9 / bus_freq
	 *   current = remain_ns * bus_freq / (divide * 1e9)
	 *
	 * For 100MHz bus:
	 *   period_ns = initial * divide * 10
	 *   current = remain_ns / (divide * 10)
	 */
	ticks = remain * LAPIC_BUS_FREQ_HZ / ((vlong)div * 1000000000LL);
	
	if(ticks > 0xFFFFFFFFLL)
		ticks = 0xFFFFFFFF;
	
	return (u32int)ticks;
}

static u32int
lapic_read_timer_div(void)
{
	return lapic.timer_divide;
}

static void
lapic_write_tpr(u32int val)
{
	lapic_tpr[curcpuid] = val & 0xFF;
	dprint("CPU%d: TPR = %#x\n", curcpuid, val);
}

static void
lapic_write_eoi(u32int val)
{
	int vec = -1;
	int i, j;

	USED(val);

	/* Find highest priority in-service vector */
	for(i = 7; i >= 0; i--){
		if(isr_bitmap[i] != 0){
			for(j = 31; j >= 0; j--){
				if(isr_bitmap[i] & (1 << j)){
					vec = i * 32 + j;
					break;
				}
			}
			break;
		}
	}

	dprint("CPU%d: EOI vec=%d\n", curcpuid, vec);

	if(vec >= 0){
		isr_bitmap[vec >> 5] &= ~(1 << (vec & 31));
		/* Only EOI to the IOAPIC for device interrupts (vectors < 128), not IPIs */
		if(vec < 128)
			ioapic_eoi(vec);
	}
}

static void
lapic_write_ldr(u32int val)
{
    dprint("CPU%d: LDR = %#x\n", curcpuid, val);
    if(ioapic != nil)
        ioapic->ldr[curcpuid] = val & 0xFF000000;  /* Only bits 31:24 are writable */
}

static void
lapic_write_dfr(u32int val)
{
	dprint("CPU%d: DFR = %#x\n", curcpuid, val);
}

static void
lapic_write_svr(u32int val)
{
	lapic_svr[curcpuid] = val;
	dprint("CPU%d: SVR = %#x (LAPIC %s)\n", curcpuid, val, (val & 0x100) ? "enabled" : "disabled");
}

/*
 * ICR Low write - triggers IPI send
 * This is the key function for self-IPI handling
 *
 * CRITICAL: Self-IPIs must be deferred until after the instruction completes.
 * OpenBSD's i82489_icr_wait() polls the delivery status bit after writing ICR_LO.
 * If we inject the interrupt immediately, it can fire before the poll completes,
 * causing the kernel to think the IPI failed or causing other race conditions.
 */
static void
lapic_write_icr_lo(u32int val)
{
	int vec = val & 0xFF;
	int delmode = (val >> 8) & 7;
	int destmode = (val >> 11) & 1;
	int level = (val >> 14) & 1;
	int trigger = (val >> 15) & 1;
	int shorthand = (val >> 18) & 3;
	int dest = icr_hi_saved >> 24;
	int target, i;

	USED(destmode);
	USED(level);
	USED(trigger);

	/* Always log ICR writes - critical for debugging SMP issues */
	 dprint("CPU%d: ICR_LO = %#x (vec=%d del=%d dest=%d short=%d)\n", curcpuid, val, vec, delmode, dest, shorthand);

	switch(shorthand){
	case 0: /* No shorthand - use destination field */
		target = dest;

		switch(delmode){
		case 0: /* Fixed */
		case 1: /* Lowest priority */
			if(vec == 0){
				dprint("CPU%d: ignoring IPI with vector 0\n", curcpuid);
				break;
			}
			ipi_queue(target, vec);
			break;

		case 4: /* NMI */
			dprint("CPU%d: NMI to CPU%d\n", curcpuid, target);
			if(target < nvcpu)
				ipi_queue(target, 2);
			break;

		case 5: /* INIT */
			
			dprint("CPU%d: INIT to CPU%d (init_sent was %d)\n", curcpuid, target, target < MAXVCPU ? init_sent[target] : -1);
			if(target > 0 && target < nvcpu){
				init_sent[target] = 1;
				dprint("CPU%d: marked init_sent[%d] = 1\n", curcpuid, target);
			}
			break;

		case 6: /* Startup IPI */
			dprint("CPU%d: SIPI to CPU%d at %#x (init_sent=%d)\n",
			       curcpuid, target, vec << 12, 
			       target < MAXVCPU ? init_sent[target] : -1);

			if(target > 0 && target < nvcpu && init_sent[target] == 1){
				init_sent[target] = 2;
				dprint("CPU%d: forking CPU%d now\n", curcpuid, target);
				forkcpu(vec);
			 
			 
				coherence();
				
				dprint("CPU%d: forkcpu returned\n", curcpuid);
			} else {
				dprint("CPU%d: SIPI ignored (target=%d nvcpu=%d init_sent=%d)\n",
				       curcpuid, target, nvcpu, 
				       target < MAXVCPU ? init_sent[target] : -1);
			}
			break;
		}
		break;

	case 1: /* Self */
		if((delmode == 0 || delmode == 1) && vec != 0){
			dprint("CPU%d: self-IPI (shorthand=1) vector %d (DEFERRED)\n", curcpuid, vec);
			ipi_queue(curcpuid, vec);
		}
		break;

	case 2: /* All including self */
		dprint("CPU%d: IPI to ALL (including self) vec=%d del=%d\n", curcpuid, vec, delmode);
		if((delmode == 0 || delmode == 1) && vec != 0){
			for(i = 0; i < nvcpu; i++){
				ipi_queue(i, vec);
			}
		}
		if(delmode == 5){
			dprint("CPU%d: INIT broadcast\n", curcpuid);
		}
		break;

	case 3: /* All excluding self */
		dprint("CPU%d: IPI to ALL (excluding self) vec=%d del=%d\n", curcpuid, vec, delmode);
		if((delmode == 0 || delmode == 1) && vec != 0){
			for(i = 0; i < nvcpu; i++){
				if(i != curcpuid)
					ipi_queue(i, vec);
			}
		}
		break;
	}
}

static void
lapic_write_icr_hi(u32int val)
{
	icr_hi_saved = val;
	dprint("CPU%d: ICR_HI = %#x (dest=%d)\n", curcpuid, val, val >> 24);
}
 

static void
lapic_write_timer_init(u32int val)
{
	LApic *la = &lapic;
	la->timer_initial = val;
	lapic_timer_start();
}

static void
lapic_write_timer_div(u32int val)
{
	lapic.timer_divide = val & 0xB;
}

/*
 * LAPIC Self-IPI register (x2APIC compatible, offset 0x3F0)
 */
static void
lapic_write_self_ipi(u32int val)
{
	int vec = val & 0xFF;
	if(vec != 0){
		dprint("CPU%d: SELF_IPI register vec=%d\n", curcpuid, vec);
		ipi_queue(curcpuid, vec);
	}
}

/*
 * Main LAPIC MMIO handler
 */
static void
lapic_mmio_handler(ExitInfo *ei)
{
	u32int off = ei->pa & 0xFFF;
	int iswrite = (ei->qual & 2) != 0;
	u32int val;

	dprint("LAPIC: off=%#x %s\n", off, iswrite ? "write" : "read");

	if(iswrite){
		val = getmovval();

		switch(off){
		case LAPIC_TPR:       lapic_write_tpr(val); break;
		case LAPIC_EOI:       lapic_write_eoi(val); break;
		case LAPIC_LDR:       lapic_write_ldr(val); break;
		case LAPIC_DFR:       lapic_write_dfr(val); break;
		case LAPIC_SVR:       lapic_write_svr(val); break;
		case LAPIC_ICR_LO:    lapic_write_icr_lo(val); break;
		case LAPIC_ICR_HI:    lapic_write_icr_hi(val); break;
		case LAPIC_LVT_TIMER: lapic_write_lvt_timer(val); break;
		case LAPIC_TIMER_ICR: lapic_write_initial_count(val); break;
		case LAPIC_TIMER_DCR: lapic_write_divide_config(val); break;
		case LAPIC_SELF_IPI:  lapic_write_self_ipi(val); break;
		default:
			dprint("LAPIC: write to %#x ignored\n", off);
		}
	} else {
		switch(off){
		case LAPIC_ID:        val = lapic_read_id(); break;
		case LAPIC_VERSION:   val = lapic_read_version(); break;
		case LAPIC_TPR:       val = lapic_read_tpr(); break;
		case LAPIC_PPR:       val = lapic_read_ppr(); break;
		case LAPIC_EOI:       val = lapic_read_eoi(); break;
		case LAPIC_LDR:       val = lapic_read_ldr(); break;
		case LAPIC_DFR:       val = lapic_read_dfr(); break;
		case LAPIC_SVR:       val = lapic_read_svr(); break;
		case LAPIC_ICR_LO:    val = lapic_read_icr_lo(); break;
		case LAPIC_ICR_HI:    val = lapic_read_icr_hi(); break;
		case LAPIC_LVT_TIMER: val = lapic_read_lvt_timer(); break;
		case LAPIC_TIMER_ICR: val = lapic_read_timer_init(); break;
		case LAPIC_TIMER_CCR: val = lapic_read_current_count(); break;
		case LAPIC_TIMER_DCR: val = lapic_read_timer_div(); break;

		/* ISR registers (8 x 32-bit) */
		case 0x100: case 0x110: case 0x120: case 0x130:
		case 0x140: case 0x150: case 0x160: case 0x170:
			val = lapic_read_isr((off - 0x100) >> 4);
			break;

		/* TMR and IRR - return 0 for now */
		case 0x180: case 0x190: case 0x1A0: case 0x1B0:
		case 0x1C0: case 0x1D0: case 0x1E0: case 0x1F0:
		case 0x200: case 0x210: case 0x220: case 0x230:
		case 0x240: case 0x250: case 0x260: case 0x270:
			val = 0;
			break;

		default:
			val = 0;
			dprint("LAPIC: read from %#x returning 0\n", off);
		}

		setmovdest(val);
	}
}

/* ============================================================
 * HPET MMIO Handler
 * ============================================================ */

extern u64int hpet_read(u64int addr, int size);
extern void hpet_write(u64int addr, u32int val, int size);
 
static void
hpet_mmio_handler(ExitInfo *ei)
{
	int iswrite = (ei->qual & 2) != 0;
	int size = 1 << ((ei->qual >> 7) & 7);  // Extract access size from qualification
	
	if(iswrite){
		u32int val = getmovval();
		hpet_write(ei->pa, val, size);
	} else {
		u64int val = hpet_read(ei->pa, size);
		setmovdest(val);
	}
}

/* ============================================================
 * EPT Fault Handler
 * ============================================================ */

/* Counter for EPT faults per CPU - helps debug SMP issues */
static vlong eptfault_count = 0;

static void
eptfault(ExitInfo *ei)
{
	eptfault_count++;
	
	/* Always log EPT faults to help debug SMP issues */
	dprint("CPU%d EPT[%lld]: pa=%#llux va=%#llux qual=%#llux\n",
	       curcpuid, eptfault_count, ei->pa, ei->va, ei->qual);

	/* I/O APIC MMIO */
	if(ei->pa >= 0xFEC00000 && ei->pa < 0xFEC01000){
		ioapic_mmio_handler(ei);
		goto done;
	}

	/* HPET MMIO at 0xFED00000 */
	if(ei->pa >= 0xFED00000 && ei->pa < 0xFED01000){
		hpet_mmio_handler(ei);
		goto done;
	}

	/* Local APIC MMIO */
	if(ei->pa >= 0xFEE00000 && ei->pa < 0xFEE01000){
		lapic_mmio_handler(ei);
		goto done;
	}

	/* Other EPT faults - log and continue */
	if(ei->pa < 0x1000){
		u32int val = getmovval();
		dprint("CPU%d LOW MEM WRITE: pa=%#llx val=%#x\n", curcpuid, ei->pa, val);
	} else {
		/* Unexpected EPT fault - could indicate memory mapping issue */
		dprint("CPU%d UNEXPECTED EPT: pa=%#llx va=%#llx qual=%#llx\n",
		       curcpuid, ei->pa, ei->va, ei->qual);
	}

done:
	skipinstr(ei);
	coherence();  /* Ensure guest memory writes are visible to other CPUs */


	/*
	 * Deliver any pending self-IPI now that the instruction has completed.
	 * This is the key fix for OpenBSD SMP support.
	 */
	ipi_poll();
 
}

/* ============================================================
 * Other Exit Handlers
 * ============================================================ */

static void
iohandler(ExitInfo *ei)
{
	int port, len, inc, isin;
	int asz, seg;
	uintptr addr;
	u32int val;
	uvlong vval;
	uintptr cx;
	static int seglook[8] = {SEGES, SEGCS, SEGSS, SEGDS, SEGFS, SEGGS};
	TLB tlb;

	port = ei->qual >> 16 & 0xffff;
	len = (ei->qual & 7) + 1;
	isin = (ei->qual & 8) != 0;

	if((ei->qual & 1<<4) == 0){
		if(isin){
			val = io(1, port, 0, len);
			rsetsz(RAX, val, len);
		} else
			io(0, port, rget(RAX), len);
		skipinstr(ei);
		return;
	}

	if((rget("flags") & 0x400) != 0) inc = -len;
	else inc = len;

	switch(ei->iinfo >> 7 & 7){
	case 0: asz = 2; break;
	default: asz = 4; break;
	case 2: asz = 8; break;
	}

	if((ei->qual & 1<<5) != 0)
		cx = rgetsz(RCX, asz);
	else
		cx = 1;

	addr = isin ? rget(RDI) : rget(RSI);
	if(isin)
		seg = SEGES;
	else
		seg = seglook[ei->iinfo >> 15 & 7];

	memset(&tlb, 0, sizeof(TLB));
	for(; cx > 0; cx--){
		if(isin){
			vval = io(1, port, 0, len);
			if(x86access(seg, addr, asz, &vval, len, ACCW, &tlb) < 0)
				goto err;
		} else {
			if(x86access(seg, addr, asz, &vval, len, ACCR, &tlb) < 0)
				goto err;
			io(0, port, vval, len);
		}
		addr += inc;
	}
	skipinstr(ei);
err:
	if((ei->qual & 1<<5) != 0)
		rsetsz(RCX, cx, asz);
	if(isin)
		rsetsz(RDI, addr, asz);
	else
		rsetsz(RSI, addr, asz);
}
 

/*
 * preempt - Handle VMX preemption timer exit (reason 52)
 *
 */
static void
preempt(ExitInfo *)
{
	 
}

extern vlong timer_nearest(void);
extern int irr_pending(void);

/*
 * has_pending_work - Check if there's work requiring immediate attention
 */
static int
has_pending_work(void)
{
    extern IpiQueue *ipiqueue;
    extern u32int irr_bitmap[8];
    
    /* Check for pending interrupts */
    if(irr_bitmap[0] | irr_bitmap[1] | irr_bitmap[2] | irr_bitmap[3] |
       irr_bitmap[4] | irr_bitmap[5] | irr_bitmap[6] | irr_bitmap[7])
        return 1;

    
    return 0;
}

/*
 * hlt - Handle HLT instruction
 *
 * The CPU is idle and waiting for an interrupt.  Mark the vCPU halted
 * and block on this CPU's hlt pipe; whoever has work for us (an IPI,
 * a timer expiry, or a device interrupt) writes a byte to wake us.
 */
static void
hlt(ExitInfo *ei)
{
	char buf[1];

	skipinstr(ei);
	lock(&statelock);
	state = VMHALT;
	unlock(&statelock);

	/* Block until an interrupt source writes to this CPU's hlt pipe */
	read(hltpipe[curcpuid][0], buf, 1);
 
	lock(&statelock);
	state = VMRUNNING;
	unlock(&statelock);
}

static void
irqackhand(ExitInfo *ei)
{
	int vec = ei->qual;

	/* Clear from IRR only when the vector is actually acknowledged */
	irr_bitmap[vec >> 5] &= ~(1 << (vec & 31));

	/* Mark the vector in service */
	isr_bitmap[vec >> 5] |= (1 << (vec & 31));

	irqack(vec);
}

static void
dbgexc(ExitInfo *ei)
{
	rset("dr6", rget("dr6") | ei->qual);
	postexc("#db", NOERRC);
}

static void
movdr(ExitInfo *ei)
{
	static char *dr[8] = { "dr0", "dr1", "dr2", "dr3", nil, nil, "dr6", "dr7" };
	int q = ei->qual;

	if((q & 6) == 4){
		postexc("#gp", 0);
		return;
	}
	if((q & 16) != 0)
		rset(x86reg[q >> 8 & 15], rget(dr[q & 7]));
	else
		rset(dr[q & 7], rget(x86reg[q >> 8 & 15]));
	skipinstr(ei);
}

static void
movcr(ExitInfo *ei)
{
	u32int q = ei->qual;

	switch(q & 15){
	case 0:
		switch(q >> 4 & 3){
		case 0:
			vmdebug("illegal CR0 write, value %#ux", (u32int)rget(x86reg[q >> 8 & 15]));
			rset("cr0real", rget(x86reg[q >> 8 & 15]));
			skipinstr(ei);
			break;
		case 1:
			vmerror("shouldn't happen: trap on MOV from CR0");
			rset(x86reg[q >> 8 & 15], rget("cr0fake"));
			skipinstr(ei);
			break;
		case 2:
			vmerror("shouldn't happen: trap on CLTS");
			rset("cr0real", rget("cr0real") & ~8);
			skipinstr(ei);
			break;
		case 3:
			vmerror("LMSW handler unimplemented");
			postexc("#ud", NOERRC);
		}
		break;
	case 4:
		switch(q >> 4 & 3){
		case 0:
			vmdebug("illegal CR4 write, value %#ux", (u32int)rget(x86reg[q >> 8 & 15]));
			rset("cr4real", rget(x86reg[q >> 8 & 15]));
			skipinstr(ei);
			break;
		case 1:
			vmerror("shouldn't happen: trap on MOV from CR4");
			rset(x86reg[q >> 8 & 15], rget("cr4fake"));
			skipinstr(ei);
			break;
		default:
			vmerror("unknown CR4 operation %d", q);
			postexc("#ud", NOERRC);
		}
		break;
	default:
		vmerror("access to unknown control register CR%ud", q & 15);
		postexc("#ud", NOERRC);
	}
}

/* ============================================================
 * CPUID Handler
 * ============================================================ */

typedef struct CPUID CPUID;
struct CPUID {
	u32int ax, bx, cx, dx;
};

static u32int cpuidmax;
static u32int cpuidmaxext;
static CPUID leaf1;
static struct {
	uvlong miscen;
} msr;

static uchar _cpuid[] = {
	0x5E,           /* POP SI (PC) */
	0x5D,           /* POP BP (CPUID&) */
	0x58,           /* POP AX */
	0x59,           /* POP CX */
	0x51,           /* PUSH CX */
	0x50,           /* PUSH AX */
	0x55,           /* PUSH BP */
	0x56,           /* PUSH SI */
	0x31, 0xDB,     /* XOR BX, BX */
	0x31, 0xD2,     /* XOR DX, DX */
	0x0F, 0xA2,     /* CPUID */
	0x89, 0x45, 0x00,   /* MOV AX, 0(BP) */
	0x89, 0x5d, 0x04,   /* MOV BX, 4(BP) */
	0x89, 0x4d, 0x08,   /* MOV CX, 8(BP) */
	0x89, 0x55, 0x0C,   /* MOV DX, 12(BP) */
	0xC3,           /* RET */
};

static CPUID (*getcpuid)(ulong ax, ulong cx) = (CPUID(*)(ulong, ulong)) _cpuid;

void
cpuidinit(void)
{
	CPUID r;
	int f;

	if(sizeof(uintptr) == 8)
		_cpuid[1] = 0x58;
	segflush(_cpuid, sizeof(_cpuid));

	r = getcpuid(0, 0);
	cpuidmax = r.ax;
	r = getcpuid(0x80000000, 0);
	cpuidmaxext = r.ax;
	leaf1 = getcpuid(1, 0);

	memset(&msr, 0, sizeof(msr));
	if((f = open("/dev/msr", OREAD)) >= 0){
		pread(f, &msr.miscen, 8, 0x1a0);
		msr.miscen &= 1<<0;
		close(f);
	}
}

static int xsavesz[] = {
	[1] = 512+64,
	[3] = 512+64,
	[7] = 512+64+256,
};

static void
cpuid(ExitInfo *ei)
{
	u32int ax, bx, cx, dx;
	CPUID cp;

	ax = rget(RAX);
	cx = rget(RCX);
	dprint("CPU%d: CPUID eax=%#x ecx=%#x\n", curcpuid, ax, cx);
	bx = dx = 0;
	cp = getcpuid(ax, cx);

	switch(ax){
	case 0x00:
		ax = MIN(cpuidmax, 0x18);
		bx = cp.bx;
		dx = cp.dx;
		cx = cp.cx;
		break;
	case 0x01:
    	ax = cp.ax;
    	bx = (curcpuid << 24) | (cp.bx & 0x00ffff);
    	cx = cp.cx & 0x76de3217 | 0x80000000UL;  /* hypervisor-present bit */
    	cx |= (1 << 24);  /* TSC-deadline timer supported */
		dx = cp.dx & 0x0f8aa779;
		if(leaf1.cx & 1<<27){
			if(rget("cr4real") & Cr4Osxsave)
				cx |= 1<<27;
		} else
			cx &= ~0x1c000000;
		break;
	case 0x02: goto literal;
	case 0x03: goto zero;
	case 0x04: goto literal;
	case 0x05: goto zero;
	case 0x06: goto zero;
	case 0x07:
		if(cx == 0){
			ax = 0;
			bx = cp.bx & 0x2369;
			cx = 0;
			if((leaf1.cx & 1<<27) == 0)
				bx &= ~0xdc230020;
		} else
			goto zero;
		break;
	case 0x08: goto zero;
	case 0x09: goto literal;
	case 0x0a: goto zero;
	case 0x0b: goto zero;
	case 0x0c: goto zero;
	case 0x0d:
		if((leaf1.cx & 1<<27) == 0)
			goto zero;
		if(cx == 0){
			ax = cp.ax & 7;
			bx = xsavesz[rget("xcr0")];
			cx = xsavesz[ax];
		} else if(cx == 1){
			ax = cp.ax & 7;
			bx = xsavesz[rget("xcr0")];
			cx = 0;
		} else if(cx == 2){
			ax = xsavesz[7] - xsavesz[3];
			bx = xsavesz[3];
			cx = 0;
		} else
			goto zero;
		break;
	case 0x0f: goto zero;
	case 0x10: goto zero;
	case 0x12: goto zero;
	case 0x14: goto zero;
	case 0x15: /* TSC/Crystal ratio */
		{
			uvlong freq = _tos->cyclefreq;
			if(freq == 0)
				freq = 2900000000ULL;
			ax = 1;
			bx = 1;
			cx = (u32int)freq;
			dx = 0;
		}
		break;
	case 0x16: /* Processor Frequency MHz */
		{
			uvlong freq = _tos->cyclefreq;
			if(freq == 0)
				freq = 2900000000ULL;
			u32int mhz = (u32int)(freq / 1000000ULL);
			ax = mhz;
			bx = mhz;
			cx = 100;
			dx = 0;
		}
		break;
	case 0x17: goto zero;
	case 0x18: goto literal;
	case 0x40000000:
		ax = 0x40000001;  /* Max supported hypervisor leaf */
		bx = 0x4b4d564b;  /* "KVMK" */
		cx = 0x564b4d56;  /* "VMKV" */
		dx = 0x4d;        /* "M" */
		break;
	case 0x40000001:
		/* KVM features */
		ax = (1 << 3);    /* KVM_FEATURE_CLOCKSOURCE2 */
		bx = 0;
		cx = 0;
		dx = 0;
		break;
	case 0x80000000:
		ax = MIN(cpuidmaxext, 0x80000008);
		cx = 0;
		break;
	case 0x80000001:
		ax = cp.ax;
		cx = cp.cx & 0x121;
		if(sizeof(uintptr) == 8)
			dx = cp.dx & 0x24100800;
		else
			dx = cp.dx & 0x04100000;
		break;
	case 0x80000002: goto literal;
	case 0x80000003: goto literal;
	case 0x80000004: goto literal;
	case 0x80000005: goto zero;
	case 0x80000006: goto literal;
	case 0x80000007:
		ax = 0;
		bx = 0;
		cx = 0;
		dx = (1 << 8);  /* Invariant TSC */
		dprint("CPUID[80000007]: returning dx=%#x (InvariantTSC=%d)\n", dx, (dx >> 8) & 1);
		break;
	case 0x80000008: goto literal;
	literal:
		ax = cp.ax;
		bx = cp.bx;
		cx = cp.cx;
		dx = cp.dx;
		break;
	default:
		if((ax & 0xf0000000) != 0x40000000)
			vmdebug("unknown cpuid field eax=%#ux", ax);
	zero:
		ax = cx = 0;
		break;
	}

	rset(RAX, ax);
	rset(RBX, bx);
	rset(RCX, cx);
	rset(RDX, dx);
	skipinstr(ei);
}

static void
rdwrmsr(ExitInfo *ei)
{
	u32int cx;
	u64int val;
	int rd;

	rd = ei->name[1] == 'r';
	cx = rget(RCX);
	val = (uvlong)rget(RDX) << 32 | rget(RAX);

	dprint("CPU%d: MSR %s cx=%#x\n", curcpuid, rd?"read":"write", cx);

	switch(cx){
	case 0x10: /* IA32_TIME_STAMP_COUNTER */
		if(rd){
			/*
			 * Return actual hardware TSC via cycles().
			 * This gives the guest access to real CPU cycle count,
			 * allowing proper TSC calibration against HPET.
			 */
			uvlong tsc;
			cycles(&tsc);
			val = tsc + cached_tscoff;
		}
		break;
	case 0x1B: /* IA32_APIC_BASE */
		if(rd){
			/*
			 * APIC Base MSR format:
			 * Bit 8: BSP flag (1 = bootstrap processor)
			 * Bit 11: APIC global enable
			 * Bits 12-35: APIC base address (page number)
			 * 
			 * Default APIC base is 0xFEE00000
			 */
			val = 0xFEE00000ULL;  /* Base address */
			val |= (1 << 11);     /* APIC enabled */
			if(curcpuid == 0)
				val |= (1 << 8);  /* BSP flag for CPU 0 only */
		}
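		/* e.g. CPU0 reads back 0xFEE00900: base 0xFEE00000, enable bit 11, BSP bit 8 */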
		/* Writes are ignored - we don't support relocating the APIC */
		break;
	case 0x277:
		if(rd) val = rget("pat");
		else rset("pat", val);
		break;
	case 0x6E0: /* IA32_TSC_DEADLINE */
    	if(rd)
        	val = lapic_read_tsc_deadline();
    	else {
			dprint("CPU%d: TSC_DEADLINE=%llud\n", curcpuid, val);
        	lapic_write_tsc_deadline(val);
			 
			pvclock_update(curcpuid);  /* Keep kvmclock in sync */
    	}
    break;
	case 0x8B:
		val = 0;
		break;
	case 0x1A0:
		if(rd) val = msr.miscen;
		break;
	case 0x4b564d00: /* MSR_KVM_WALL_CLOCK_NEW */
	case 0x4b564d01: /* MSR_KVM_SYSTEM_TIME_NEW */
		if(pvclock_msr(curcpuid, !rd, cx, &val))
			break;
	default:
		if(rd){
			vmdebug("read from unknown MSR %#ux ignored", cx);
			val = 0;
		} else
			vmdebug("write to unknown MSR %#ux ignored (val=%#ullx)", cx, val);
		break;
	}

	if(rd){
		rset(RAX, (u32int)val);
		rset(RDX, (u32int)(val >> 32));
	}

	skipinstr(ei);
}

static void
xsetbv(ExitInfo *ei)
{
	uvlong v;

	v = rget(RAX)&0xffffffff | rget(RDX)<<32;
	if(rget(RCX) & 0xffffffff)
		postexc("#gp", 0);
	else if(v != 1 && v != 3 && v != 7)
		postexc("#gp", 0);
	else if((leaf1.cx & 1<<26) == 0 || (rget("cr4real") & Cr4Osxsave) == 0)
		postexc("#ud", NOERRC);
	else {
		rset("xcr0", v);
		skipinstr(ei);
	}
}

/* ============================================================
 * Exit Dispatch
 * ============================================================ */

typedef struct ExitType ExitType;
struct ExitType {
	char *name;
	void (*f)(ExitInfo *);
};


/*
 * Helper: Check if any IRQs are pending in the IRR bitmap
 */
int
irr_pending(void)
{
	extern u32int irr_bitmap[8];
	return irr_bitmap[0] | irr_bitmap[1] | irr_bitmap[2] | irr_bitmap[3] |
	       irr_bitmap[4] | irr_bitmap[5] | irr_bitmap[6] | irr_bitmap[7];
}
 
static ExitType etypes[] = {
	{"io", iohandler},
	{".cpuid", cpuid},
	{".hlt", hlt},
	{"eptfault", eptfault},
	{"*ack", irqackhand},
	{".rdmsr", rdwrmsr},
	{".wrmsr", rdwrmsr},
	{".movdr", movdr},
	{"#db", dbgexc},
	{"movcr", movcr},
	{".xsetbv", xsetbv},
	{"preempt", preempt},
};

 
void
processexit(char *msg)
{
    static char msgc[1024];
    char *f[32];
    int nf;
    ExitType *et;
    int i;
    ExitInfo ei;
    extern int getexit;
	static uvlong exit_count = 0;
	static uvlong last_report = 0;

	exit_count++;
	vlong now = nanosec();
	if(now - last_report > 1000000000LL) {  // Every 1 second
    	dprint("EXIT RATE: %llud/sec\n", exit_count);
    	exit_count = 0;
    	last_report = now;
	}

	static vlong lastpreempt;
      
	dprint("EXIT: %s\n", msg);
    
    if(strncmp(msg, "preempt", 7) == 0){
        if(lastpreempt != 0)
            dprint("PREEMPT CPU%d: delta=%lldms\n", curcpuid, (now - lastpreempt)/1000000);
        lastpreempt = now;
    }
	

     
    strcpy(msgc, msg);
    nf = tokenize(msgc, f, nelem(f));
    if(nf < 2) sysfatal("invalid wait message: %s", msg);

    memset(&ei, 0, sizeof(ei));
    ei.raw = msg;
    ei.name = f[0];
    ei.qual = strtoull(f[1], nil, 0);

    for(i = 2; i < nf; i += 2){
        if(strcmp(f[i], "pc") == 0)
            rpoke(RPC, strtoull(f[i+1], nil, 0), 1);
        else if(strcmp(f[i], "sp") == 0)
            rpoke(RSP, strtoull(f[i+1], nil, 0), 1);
        else if(strcmp(f[i], "ax") == 0)
            rpoke(RAX, strtoull(f[i+1], nil, 0), 1);
        else if(strcmp(f[i], "ilen") == 0)
            ei.ilen = strtoul(f[i+1], nil, 0);
        else if(strcmp(f[i], "iinfo") == 0)
            ei.iinfo = strtoul(f[i+1], nil, 0);
        else if(strcmp(f[i], "pa") == 0)
            ei.pa = strtoull(f[i+1], nil, 0);
        else if(strcmp(f[i], "va") == 0)
            ei.va = strtoull(f[i+1], nil, 0);
    }

    if(*f[0] == '*') getexit++;
  
    for(et = etypes; et < etypes + nelem(etypes); et++){
        if(strcmp(et->name, f[0]) == 0){
            et->f(&ei);
            return;
        }
    }

    if(*f[0] == '.'){
        vmerror("vmx: unknown instruction %s", f[0]+1);
        postexc("#ud", NOERRC);
        return;
    }
    if(*f[0] == '*'){
        vmerror("vmx: unknown notification %s", f[0]+1);
        return;
    }

    if(persist){
        vmerror("unknown exit: %s", msg);
        state = VMDEAD;
    } else
        sysfatal("unknown exit: %s", msg);
}