ref: e0e889fb69a6d4d3f2332244ef4f79f900a66d84
dir: /exith.c/
/*
* exith.c - VM exit handlers for vmx
*
* Refactored with:
* - Fixed self-IPI delivery timing (deferred until after instruction completes)
* - Cleaner LAPIC/IOAPIC MMIO handling using dispatch tables
* - Better TPR/PPR handling for interrupt priority
*/
#include <u.h>
#include <libc.h>
#include <tos.h>
#include <thread.h>
#include "dat.h"
#include "fns.h"
#include "x86.h"
int persist = 1;
extern Lock statelock;
IpiQueue * ipiqueue;
vlong cached_tscoff;
extern int wakepipe[MAXVCPU][2];
extern int hltpipe[MAXVCPU][2];
/* Timer source for HLT timeout */
enum { TIMER_HLT_TIMEOUT = 99 }; /* Special source for HLT */
typedef struct ExitInfo ExitInfo;
struct ExitInfo {
char *raw;
char *name;
uvlong qual;
uvlong pa, va;
u32int ilen, iinfo;
};
char *x86reg[16] = {
RAX, RCX, RDX, RBX,
RSP, RBP, RSI, RDI,
R8, R9, R10, R11,
R12, R13, R14, R15
};
char *x86segreg[8] = {
"cs", "ds", "es", "fs", "gs", "ss",
};
extern int debug;
extern LApic lapic;
extern IOApic *ioapic;
/*
* In-Service Register bitmap - tracks ALL interrupts currently being serviced.
* 256 vectors = 8 x 32-bit words.
* Each forked CPU process has its own copy.
*/
u32int isr_bitmap[8] = {0, 0, 0, 0, 0, 0, 0, 0};
/*
* LAPIC TPR (Task Priority Register) - per CPU
* Used for interrupt priority filtering
*/
static u32int lapic_tpr[MAXVCPU];
/*
* LAPIC Spurious Vector Register - per CPU
* Bit 8 enables the LAPIC
*/
u32int lapic_svr[MAXVCPU];
/* Saved ICR high for IPI delivery */
static u32int icr_hi_saved;
/* Track INIT sent state for SIPI handling */
static u32int init_sent[MAXVCPU];
/*
* Interrupt Request Register bitmap - tracks ALL pending interrupts.
* 256 vectors = 8 x 32-bit words.
* This is needed because VMX can only hold ONE pending interrupt at a time,
* so we must track multiple pending interrupts in software.
*/
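/* For example, vector 0x31 (49) is tracked in irr_bitmap[49 >> 5] = word 1, bit 49 & 31 = 17. */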
u32int irr_bitmap[8] = {0, 0, 0, 0, 0, 0, 0, 0};
Lock irr_lock;
/* Set a pending interrupt in software IRR */
void
set_irr(int vector)
{
if(vector >= 0 && vector < 256){
irr_bitmap[vector >> 5] |= (1 << (vector & 31));
}
}
/* Clear a pending interrupt from software IRR */
static void
clear_irr(int vector)
{
if(vector >= 0 && vector < 256){
irr_bitmap[vector >> 5] &= ~(1 << (vector & 31));
}
}
void
inject_pending_irq(void)
{
int i, j, vec = -1;
u32int bits;
for(i = 7; i >= 0; i--){
bits = irr_bitmap[i];
if(bits != 0){
/* Find lowest set bit */
for(j = 0; j < 32; j++){
if(bits & (1 << j)){
vec = (i << 5) + j;
goto found;
}
}
}
}
found:
if(vec >= 0){
ctl("irq %d", vec);
} else {
ctl("irq");
}
}
/* ============================================================
* Utility Functions
* ============================================================ */
/*
* Helper: Skip current instruction
*/
static void
skipinstr(ExitInfo *ei)
{
rset(RPC, rget(RPC) + ei->ilen);
}
/*
* Direct preemption timer arm (for hlt timeout)
* This bypasses the unified timer manager for the HLT case.
*/
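/*
 * Illustrative example (assuming cyclefreq = 2GHz and preempt_shift = 5):
 * a deadline 1ms in the future gives delta_tsc = 2000000 cycles and
 * ticks = 2000000 >> 5 = 62500.
 */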
void
preempt_arm(vlong deadline_ns)
{
extern u32int preempt_shift;
vlong now, delta_ns;
uvlong delta_tsc, ticks, freq;
freq = _tos->cyclefreq;
if(freq == 0)
return;
now = nanosec();
if(deadline_ns <= now){
ctl("preempt 1");
return;
}
delta_ns = deadline_ns - now;
delta_tsc = (delta_ns * freq) / 1000000000ULL;
ticks = delta_tsc >> preempt_shift;
if(ticks > 0xFFFFFFFFULL)
ticks = 0xFFFFFFFF;
if(ticks == 0)
ticks = 1;
ctl("preempt %ud", (u32int)ticks);
}
/*
* Translate guest virtual address to physical.
*/
static uvlong
guest_vtop(uvlong va)
{
uvlong cr0 = rget("cr0real");
/* Paging disabled - virtual == physical */
if((cr0 & 0x80000000ULL) == 0)
return va;
/* Low addresses - likely identity mapped in early boot */
if(va < 0x100000000ULL)
return va;
/* For high addresses, we MUST walk the page tables.
* Different OSes use different mappings:
* - Plan 9: 0xffffffff80000000 + phys
* - Linux: varies with KASLR, direct map at 0xffff888000000000
* - OpenBSD: 0xffff800000000000 + phys
* Static mappings are unreliable, so always walk. */
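/*
 * Illustrative example: for va = 0xffffffff80110000 the walk below uses
 * PML4 index 511, PDPT index 510, PD index 0 and PT index 0x110; the
 * Plan 9 KZERO fallback would yield pa = 0x110000.
 */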
uvlong cr3 = rget("cr3");
uvlong efer = rget("efer");
int lma = (efer >> 10) & 1;
if(lma){
uvlong pml4_base = cr3 & ~0xFFFULL;
uvlong pte, *ptep;
uvlong pa;
int idx;
/* PML4 */
idx = (va >> 39) & 0x1FF;
ptep = gptr(pml4_base + idx * 8, 8);
if(ptep == nil || (*ptep & 1) == 0)
goto fallback;
pte = *ptep;
/* PDPT */
pa = pte & 0x000FFFFFFFFFF000ULL;
idx = (va >> 30) & 0x1FF;
ptep = gptr(pa + idx * 8, 8);
if(ptep == nil || (*ptep & 1) == 0)
goto fallback;
pte = *ptep;
if(pte & 0x80) /* 1GB page */
return (pte & 0x000FFFFFC0000000ULL) | (va & 0x3FFFFFFF);
/* PD */
pa = pte & 0x000FFFFFFFFFF000ULL;
idx = (va >> 21) & 0x1FF;
ptep = gptr(pa + idx * 8, 8);
if(ptep == nil || (*ptep & 1) == 0)
goto fallback;
pte = *ptep;
if(pte & 0x80) /* 2MB page */
return (pte & 0x000FFFFFFFE00000ULL) | (va & 0x1FFFFF);
/* PT */
pa = pte & 0x000FFFFFFFFFF000ULL;
idx = (va >> 12) & 0x1FF;
ptep = gptr(pa + idx * 8, 8);
if(ptep == nil || (*ptep & 1) == 0)
goto fallback;
pte = *ptep;
return (pte & 0x000FFFFFFFFFF000ULL) | (va & 0xFFF);
}
fallback:
/* Fallback: try common static mappings */
/* Plan 9 / 9front kernel: KZERO = 0xffffffff80000000 */
if(va >= 0xffffffff80000000ULL)
return va - 0xffffffff80000000ULL;
/* OpenBSD amd64 direct map */
if(va >= 0xffff800000000000ULL && va < 0xffff880000000000ULL)
return va - 0xffff800000000000ULL;
/* Linux direct map */
if(va >= 0xffff888000000000ULL && va < 0xffffc00000000000ULL)
return va - 0xffff888000000000ULL;
return va;
}
/* ============================================================
* Instruction Decoding for MMIO
* ============================================================ */
/*
* Get value being written to MMIO address
*/
static u32int
getmovval(void)
{
uvlong rip = rget(RPC);
uvlong phys = guest_vtop(rip);
u8int *ip = gptr(phys, 16);
if(ip == nil)
return 0;
int i = 0;
int rex = 0;
/* Skip prefixes */
while(i < 8){
if((ip[i] & 0xF0) == 0x40)
rex = ip[i++];
else if(ip[i] == 0x66 || ip[i] == 0x67)
i++;
else
break;
}
/* MOV r/m, r32/64: 0x89 */
if(ip[i] == 0x89){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
return rget(x86reg[reg]);
}
/* MOV r/m, imm32: 0xC7 /0 */
if(ip[i] == 0xC7){
int modrm = ip[i+1];
int mod = modrm >> 6;
int rm = modrm & 7;
int off = 2;
if(rm == 4) off++; /* SIB */
if(mod == 1) off += 1;
else if(mod == 2 || (mod == 0 && rm == 5)) off += 4;
return ip[i+off] | (ip[i+off+1]<<8) | (ip[i+off+2]<<16) | (ip[i+off+3]<<24);
}
/* MOV r/m, r8: 0x88 */
if(ip[i] == 0x88){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
return rget(x86reg[reg]) & 0xFF;
}
/* MOVNTI: 0x0F 0xC3 */
if(ip[i] == 0x0F && ip[i+1] == 0xC3){
int modrm = ip[i+2];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
return rget(x86reg[reg]);
}
return 0;
}
/*
* Set destination register for MMIO read
*/
static int
setmovdest(uvlong val) /* Changed from u32int to uvlong to support 64-bit values */
{
uvlong rip = rget(RPC);
uvlong phys = guest_vtop(rip);
u8int *ip = gptr(phys, 16);
if(ip == nil){
dprint("setmovdest: gptr failed rip=%#llx phys=%#llx\n", rip, phys);
rset(RAX, val);
return 0;
}
/* Debug: show what we're decoding */
if(ip[0] == 0 && ip[1] == 0 && ip[2] == 0 && ip[3] == 0){
dprint("setmovdest: zero opcodes! rip=%#llx phys=%#llx\n", rip, phys);
}
int i = 0;
int rex = 0;
int has66 = 0;
/* Skip prefixes */
while(i < 8){
if((ip[i] & 0xF0) == 0x40)
rex = ip[i++];
else if(ip[i] == 0x66){
has66 = 1;
i++;
}
else if(ip[i] == 0x67 || ip[i] == 0xF2 || ip[i] == 0xF3 ||
ip[i] == 0x2E || ip[i] == 0x3E || ip[i] == 0x26 ||
ip[i] == 0x64 || ip[i] == 0x65 || ip[i] == 0x36)
i++;
else
break;
}
/* MOV r32/64, r/m32/64: 0x8B */
if(ip[i] == 0x8B){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
if(rex & 8){
/* REX.W = 64-bit operation */
rset(x86reg[reg], val);
} else if(has66){
/* 0x66 prefix = 16-bit operation */
uvlong oldval = rget(x86reg[reg]);
rset(x86reg[reg], (oldval & ~0xFFFFULL) | (val & 0xFFFF));
} else {
/* No REX.W, no 0x66 = 32-bit operation (zero-extends to 64-bit) */
rset(x86reg[reg], (u32int)val);
}
return 1;
}
/* MOV r8, r/m8: 0x8A */
if(ip[i] == 0x8A){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
if(rex){
if(rex & 4) reg |= 8;
uvlong oldval = rget(x86reg[reg]);
rset(x86reg[reg], (oldval & ~0xFFULL) | (val & 0xFF));
} else if(reg >= 4 && reg <= 7){
char *basereg = x86reg[reg - 4];
uvlong oldval = rget(basereg);
rset(basereg, (oldval & ~0xFF00ULL) | ((val & 0xFF) << 8));
} else {
uvlong oldval = rget(x86reg[reg]);
rset(x86reg[reg], (oldval & ~0xFFULL) | (val & 0xFF));
}
return 1;
}
/* MOVZX r32, r/m8: 0x0F 0xB6 */
if(ip[i] == 0x0F && ip[i+1] == 0xB6){
int modrm = ip[i+2];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
rset(x86reg[reg], val & 0xFF);
return 1;
}
/* MOVZX r32, r/m16: 0x0F 0xB7 */
if(ip[i] == 0x0F && ip[i+1] == 0xB7){
int modrm = ip[i+2];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
rset(x86reg[reg], val & 0xFFFF);
return 1;
}
/* TEST r/m32/64, imm32: 0xF7 /0 */
if(ip[i] == 0xF7){
int modrm = ip[i+1];
int regop = (modrm >> 3) & 7;
if(regop == 0){
int mod = (modrm >> 6) & 3;
int rm = modrm & 7;
int immoff = i + 2;
if(mod != 3 && rm == 4) immoff++;
if(mod == 1) immoff += 1;
else if(mod == 2) immoff += 4;
else if(mod == 0 && rm == 5) immoff += 4;
u32int imm = (u32int)ip[immoff] |
((u32int)ip[immoff+1] << 8) |
((u32int)ip[immoff+2] << 16) |
((u32int)ip[immoff+3] << 24);
u64int result;
u64int signbit;
if(rex & 8){
result = val & (u64int)(s64int)(s32int)imm;
signbit = 1ULL << 63;
} else {
result = (u32int)val & imm;
signbit = 1ULL << 31;
}
uvlong flags = rget("flags");
flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11));
if(result == 0) flags |= (1 << 6);
if(result & signbit) flags |= (1 << 7);
u8int pb = result & 0xFF;
pb ^= pb >> 4;
pb ^= pb >> 2;
pb ^= pb >> 1;
if((pb & 1) == 0) flags |= (1 << 2);
rset("flags", flags);
return 1;
}
}
/* CMP r/m32, imm32: 0x81 /7 or 0x83 /7 */
if(ip[i] == 0x81 || ip[i] == 0x83){
int modrm = ip[i+1];
int regop = (modrm >> 3) & 7;
if(regop == 7){
int mod = (modrm >> 6) & 3;
int rm = modrm & 7;
int immoff = i + 2;
if(mod != 3 && rm == 4) immoff++;
if(mod == 1) immoff += 1;
else if(mod == 2) immoff += 4;
else if(mod == 0 && rm == 5) immoff += 4;
s64int imm;
if(ip[i] == 0x83)
imm = (s64int)(s8int)ip[immoff];
else
imm = (s64int)(s32int)((u32int)ip[immoff] |
((u32int)ip[immoff+1] << 8) |
((u32int)ip[immoff+2] << 16) |
((u32int)ip[immoff+3] << 24));
u64int op1, op2, result;
u64int signbit, mask;
if(rex & 8){
op1 = val;
op2 = (u64int)imm;
result = op1 - op2;
signbit = 1ULL << 63;
mask = ~0ULL;
} else {
op1 = (u32int)val;
op2 = (u32int)imm;
result = op1 - op2;
signbit = 1ULL << 31;
mask = 0xFFFFFFFFULL;
}
result &= mask;
uvlong flags = rget("flags");
flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11) | (1<<4));
if(op1 < (op2 & mask)) flags |= (1 << 0);
if(result == 0) flags |= (1 << 6);
if(result & signbit) flags |= (1 << 7);
if(((op1 ^ op2) & (op1 ^ result)) & signbit) flags |= (1 << 11);
u8int pb = result & 0xFF;
pb ^= pb >> 4;
pb ^= pb >> 2;
pb ^= pb >> 1;
if((pb & 1) == 0) flags |= (1 << 2);
rset("flags", flags);
return 1;
}
}
/* CMP r/m32, r32: 0x39 */
if(ip[i] == 0x39){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
u64int regval = rget(x86reg[reg]);
u64int op1, op2, result;
u64int signbit, mask;
if(rex & 8){
op1 = val;
op2 = regval;
signbit = 1ULL << 63;
mask = ~0ULL;
} else {
op1 = (u32int)val;
op2 = (u32int)regval;
signbit = 1ULL << 31;
mask = 0xFFFFFFFFULL;
}
result = (op1 - op2) & mask;
uvlong flags = rget("flags");
flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11));
if(op1 < op2) flags |= (1 << 0);
if(result == 0) flags |= (1 << 6);
if(result & signbit) flags |= (1 << 7);
if(((op1 ^ op2) & (op1 ^ result)) & signbit) flags |= (1 << 11);
u8int pb = result & 0xFF;
pb ^= pb >> 4;
pb ^= pb >> 2;
pb ^= pb >> 1;
if((pb & 1) == 0) flags |= (1 << 2);
rset("flags", flags);
return 1;
}
/* CMP r32, r/m32: 0x3B */
if(ip[i] == 0x3B){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
u64int regval = rget(x86reg[reg]);
u64int op1, op2, result;
u64int signbit, mask;
if(rex & 8){
op1 = regval;
op2 = val;
signbit = 1ULL << 63;
mask = ~0ULL;
} else {
op1 = (u32int)regval;
op2 = (u32int)val;
signbit = 1ULL << 31;
mask = 0xFFFFFFFFULL;
}
result = (op1 - op2) & mask;
uvlong flags = rget("flags");
flags &= ~((1<<0) | (1<<2) | (1<<6) | (1<<7) | (1<<11));
if(op1 < op2) flags |= (1 << 0);
if(result == 0) flags |= (1 << 6);
if(result & signbit) flags |= (1 << 7);
if(((op1 ^ op2) & (op1 ^ result)) & signbit) flags |= (1 << 11);
u8int pb = result & 0xFF;
pb ^= pb >> 4;
pb ^= pb >> 2;
pb ^= pb >> 1;
if((pb & 1) == 0) flags |= (1 << 2);
rset("flags", flags);
return 1;
}
/* Fallback - log and try to handle common patterns */
dprint("setmovdest: unhandled opcode %02x %02x %02x %02x (rex=%02x has66=%d) val=%#llx\n", ip[i], ip[i+1], ip[i+2], ip[i+3], rex, has66, val); /* Changed %#x to %#llx */
/*
* For unhandled read instructions, try to determine destination from ModRM.
* Most memory reads use ModRM where bits 3-5 encode the destination register.
*/
if(i < 14 && (ip[i] == 0x8B || ip[i] == 0x8A || ip[i] == 0x03 || ip[i] == 0x0B ||
ip[i] == 0x13 || ip[i] == 0x1B || ip[i] == 0x23 || ip[i] == 0x2B ||
ip[i] == 0x33 || ip[i] == 0x3B)){
int modrm = ip[i+1];
int reg = (modrm >> 3) & 7;
if(rex & 4) reg |= 8;
dprint("setmovdest: fallback using reg %d (%s)\n", reg, x86reg[reg]);
rset(x86reg[reg], val);
return 1;
}
/* Last resort: set RAX */
rset(RAX, val);
return 0;
}
/* ============================================================
* IPI Handling
* ============================================================ */
void
ipiqueueinit(void)
{
ipiqueue = mkseg("ipi", 0x300001000, 0x1000);
memset(ipiqueue, 0, sizeof(IpiQueue));
ipiqueue->pids[0] = getpid();
dprint("IPI queue initialized\n");
}
/*
* ipi_queue(cpu, vec) - Queue interrupt for target CPU
* Used by: ICR writes, IOAPIC delivery, cross-CPU timers
* For remote: sends postnote to wake target
* For local: just queues, caller must ipi_poll() later
*/
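/* Illustrative usage: ipi_queue(1, 0xEC) marks vector 0xEC pending for CPU 1 and wakes it via its wake pipe. */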
void
ipi_queue(int target, int vector)
{
if(ipiqueue == nil || target < 0 || target >= MAXVCPU)
return;
lock(&ipiqueue->lock[target]);
ipiqueue->cpu[target].pending[vector >> 5] |= (1 << (vector & 31));
unlock(&ipiqueue->lock[target]);
coherence();
/* Always use pipe - caller might be I/O thread in different process */
write(ipiqueue->wakefds[target], "w", 1);
}
/*
* ipi_poll() - Drain ALL queued interrupts into IRR
* Called at: start of processexit(), end of eptfault(), in hlt()
*/
void
ipi_poll(void)
{
int i;
if(ipiqueue == nil)
return;
lock(&ipiqueue->lock[curcpuid]);
for(i = 0; i < 8; i++){
irr_bitmap[i] |= ipiqueue->cpu[curcpuid].pending[i];
ipiqueue->cpu[curcpuid].pending[i] = 0;
}
unlock(&ipiqueue->lock[curcpuid]);
}
int
ipi_pending(void)
{
int i, has_pending = 0;
if(ipiqueue == nil)
return 0;
lock(&ipiqueue->lock[curcpuid]);
for(i = 0; i < 8; i++){
if(ipiqueue->cpu[curcpuid].pending[i]){
has_pending = 1;
break;
}
}
unlock(&ipiqueue->lock[curcpuid]);
return has_pending;
}
static void
ioapic_eoi(int vector)
{
int i;
u64int redir;
if(ioapic == nil)
return;
dprint("ioapic_eoi(%d): pending before=%#x\n", vector, ioapic->irq_pending);
lock(&ioapic->lock);
for(i = 0; i < 24; i++){
redir = ioapic->redir[i];
if((redir & 0xFF) == (u32int)vector){
ioapic->irq_pending &= ~(1 << i);
int level_triggered = (redir >> 15) & 1;
int masked = (redir >> 16) & 1;
dprint("ioapic_eoi: IRQ %d redir=%#llx level_trig=%d irq_level=%#x masked=%d\n", i, redir, level_triggered, ioapic->irq_level, masked);
/* If level-triggered and line still high, handle re-delivery */
if(level_triggered && (ioapic->irq_level & (1 << i))){
if(masked){
/* Line high but masked - set pending for unmask to deliver */
ioapic->irq_pending |= (1 << i);
} else {
/* Line high and not masked - re-deliver now */
unlock(&ioapic->lock);
ipi_queue(curcpuid, vector);
return;
}
}
dprint("ioapic_eoi: cleared IRQ %d, pending now=%#x\n", i, ioapic->irq_pending);
break;
}
}
unlock(&ioapic->lock);
}
/* ============================================================
* I/O APIC MMIO Handlers
* ============================================================ */
void
ioapic_init(void)
{
ioapic = mkseg("ioapic", 0x300000000, 0x1000);
memset(ioapic, 0, sizeof(IOApic));
ioapic->id = nvcpu << 24; /* ID in bits 24-27 */
for(int i = 0; i < 24; i++){
if(i >= 16 && i <= 19){
/* PCI interrupt pins: level-triggered, unmasked */
ioapic->redir[i] = (1 << 15) | (48 + (i - 16));
} else {
/* Other pins: masked until guest programs them */
ioapic->redir[i] = (1ULL << 16) | (32 + i);
}
}
dprint("IOAPIC initialized: id=%#ux (APIC ID %d)\n", ioapic->id, ioapic->id >> 24);
}
static void
ioapic_mmio_handler(ExitInfo *ei)
{
u32int off = ei->pa & 0xFFF;
int iswrite = (ei->qual & 2) != 0;
u32int val = 0;
int sel;
if(iswrite){
val = getmovval();
switch(off){
case 0x00: /* IOREGSEL */
ioapic->reg_sel[curcpuid] = val;
dprint("IOAPIC: IOREGSEL = %#x\n", val);
break;
case 0x10: /* IOWIN */
sel = ioapic->reg_sel[curcpuid];
dprint("IOAPIC: IOWIN write reg=%#x val=%#x\n", sel, val);
switch(sel){
case 0x00: /* ID */
lock(&ioapic->lock);
ioapic->id = val;
unlock(&ioapic->lock);
break;
default:
if(sel >= 0x10 && sel < 0x40){
int idx = (sel - 0x10) / 2;
lock(&ioapic->lock);
if(sel & 1){
/* High 32 bits - just destination, no pending check needed */
ioapic->redir[idx] = (ioapic->redir[idx] & 0xFFFFFFFFULL) |
((u64int)val << 32);
unlock(&ioapic->lock);
} else {
/* Low 32 bits - check for unmasking */
u64int old = ioapic->redir[idx];
ioapic->redir[idx] = (ioapic->redir[idx] & 0xFFFFFFFF00000000ULL) | val;
/* Debug: track when IRQ 2 (timer) gets masked/unmasked */
if(debug && idx == 2){
if(val & 0x10000)
dprint("IOAPIC: IRQ 2 MASKED! val=%#x\n", val);
else
dprint("IOAPIC: IRQ 2 UNMASKED val=%#x vector=%d\n", val, val & 0xFF);
}
/* Check if unmasking with pending interrupt OR line high */
if((old & (1<<16)) && !(val & (1<<16))){
/* Was masked, now unmasked */
int level_trig = (val >> 15) & 1;
int should_deliver = ioapic->irq_pending & (1 << idx);
/* For level-triggered, also deliver if line is currently high */
if(level_trig && (ioapic->irq_level & (1 << idx)))
should_deliver = 1;
if(should_deliver){
int vector = val & 0xFF;
int dest = (ioapic->redir[idx] >> 56) & 0xFF;
int destmode = (val >> 11) & 1;
ioapic->irq_pending &= ~(1 << idx);
unlock(&ioapic->lock);
ipi_queue(dest, vector);
goto done_ioapic_write;
}
}
unlock(&ioapic->lock);
}
dprint("IOAPIC: redir[%d] %s = %#llx\n", idx, (sel & 1) ? "hi" : "lo", ioapic->redir[idx]);
}
break;
}
done_ioapic_write:
break;
default:
dprint("IOAPIC: write to unknown offset %#x\n", off);
}
} else {
switch(off){
case 0x00: /* IOREGSEL */
val = ioapic->reg_sel[curcpuid];
break;
case 0x10: /* IOWIN */
sel = ioapic->reg_sel[curcpuid];
switch(sel){
case 0x00: /* ID */
lock(&ioapic->lock);
val = ioapic->id;
unlock(&ioapic->lock);
break;
case 0x01: /* Version */
val = 0x00170011; /* 24 entries, version 0x11 */
break;
default:
if(sel >= 0x10 && sel < 0x40){
int idx = (sel - 0x10) / 2;
lock(&ioapic->lock);
if(sel & 1)
val = (u32int)(ioapic->redir[idx] >> 32);
else
val = (u32int)ioapic->redir[idx];
unlock(&ioapic->lock);
} else
val = 0;
break;
}
}
dprint("IOAPIC: read off=%#x val=%#x\n", off, val);
setmovdest(val);
}
}
void
ioapic_set_irq(int irq, int level)
{
u64int redir;
int vector, dest, masked, destmode;
int cpu;
int need_delivery[MAXVCPU] = {0};
int delivery_vector = -1;
if(ioapic == nil || irq < 0 || irq >= 24)
return;
lock(&ioapic->lock);
redir = ioapic->redir[irq];
masked = (redir >> 16) & 1;
vector = redir & 0xFF;
destmode = (redir >> 11) & 1;
dest = (redir >> 56) & 0xFF;
if(level)
ioapic->irq_level |= (1 << irq);
else
ioapic->irq_level &= ~(1 << irq);
if(level && !masked){
delivery_vector = vector;
if(destmode == 1){
static int next_cpu = 0;
need_delivery[next_cpu] = 1;
next_cpu = (next_cpu + 1) % nvcpu;
} else {
if(dest < nvcpu)
need_delivery[dest] = 1;
}
}
unlock(&ioapic->lock); // RELEASE BEFORE delivery
// Deliver AFTER releasing ioapic lock
/* Always use IPI - caller might not be a vCPU process */
for(cpu = 0; cpu < nvcpu; cpu++){
if(need_delivery[cpu]){
ipi_queue(cpu, delivery_vector);
}
}
}
/* ============================================================
* LAPIC Timer
* ============================================================ */
#define LAPIC_BUS_FREQ_HZ 100000000ULL /* 100 MHz */
/* ============================================================
* FORK CPU
* ============================================================ */
enum { RCENT = 256 };
static int nactivecpu = 0;
extern u32int isr_bitmap[];
extern uvlong rcvalid[(RCENT + 63)/64];
extern uvlong rcdirty[(RCENT + 63)/64];
extern void vmxsetup(void);
extern int getexit;
extern void runloop(void);
extern void modregion(Region *);
extern Channel *waitch, *notifch;
void
forkcpu(u32int sipi)
{
cached_tscoff = 0;
rset("tscoff", 0);
int pid = fork();
dprint("CPU0: tscoff = %llud\n", ioapic->tsc_base);
if(pid < 0)
sysfatal("fork failed: %r");
if(pid > 0){
vmx_register_child(pid);
nactivecpu += 1;
return;
}
// vmx_cleanup_init();
atnotify(vmx_notehandler, 1);
/* Child process */
curcpuid = nactivecpu + 1;
dprint("CPU%d: child started (pid %d)\n", curcpuid, getpid());
if(ipiqueue != nil)
ipiqueue->pids[curcpuid] = getpid();
/* Create new VMX context for this CPU */
vmxsetup();
/* Map memory regions into new VMX context */
Region *r;
for(r = mmap; r != nil; r = r->next){
modregion(r);
dprint("CPU%d: mapped regions, first region v=%p segname=%s\n", curcpuid, r->v, r->segname);
}
/* Clear register cache - must reload for new context */
memset(rcvalid, 0, sizeof(rcvalid));
memset(rcdirty, 0, sizeof(rcdirty));
/*
* Set up real-mode CPU state at SIPI vector address.
* SIPI vector is the page number, so address = sipi << 12.
* CS selector = sipi << 8, CS base = sipi << 12, IP = 0.
*/
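/*
 * Illustrative example: sipi = 0x08 gives CS = 0x0800 and CS base = 0x8000,
 * so this AP begins executing at linear address 0x8000.
 */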
/* Code segment - points to SIPI trampoline */
rset("cs", sipi << 8);
rset("csbase", sipi << 12);
rset("cslimit", 0xFFFF);
rset("csperm", 0x9b); /* Present, DPL=0, code, readable */
/* Instruction pointer starts at 0 (relative to CS base) */
rset("pc", 0);
/* Data segment */
rset("ds", 0);
rset("dsbase", 0);
rset("dslimit", 0xFFFF);
rset("dsperm", 0x93); /* Present, DPL=0, data, writable */
/* Extra segment */
rset("es", 0);
rset("esbase", 0);
rset("eslimit", 0xFFFF);
rset("esperm", 0x93);
/* Stack segment */
rset("ss", 0);
rset("ssbase", 0);
rset("sslimit", 0xFFFF);
rset("ssperm", 0x93);
/* FS segment */
rset("fs", 0);
rset("fsbase", 0);
rset("fslimit", 0xFFFF);
rset("fsperm", 0x93);
/* GS segment */
rset("gs", 0);
rset("gsbase", 0);
rset("gslimit", 0xFFFF);
rset("gsperm", 0x93);
/* General purpose registers - all zero */
rset(RAX, 0);
rset(RBX, 0);
rset(RCX, 0);
rset(RDX, 0);
rset(RSI, 0);
rset(RDI, 0);
rset(RBP, 0);
rset(RSP, 0);
rset(R8, 0);
rset(R9, 0);
rset(R10, 0);
rset(R11, 0);
rset(R12, 0);
rset(R13, 0);
rset(R14, 0);
rset(R15, 0);
/* Flags - bit 1 is always set (reserved) */
rset("flags", 0x2);
/* Control registers - real mode state */
rset("cr0real", 0x10); /* ET bit set (x87 present) */
rset("cr0fake", 0x10);
rset("cr2", 0);
rset("cr3", 0);
rset("cr4real", 0);
rset("cr4fake", 0);
/* Extended feature enable register - no long mode yet */
rset("efer", 0);
/* Debug registers */
rset("dr0", 0);
rset("dr1", 0);
rset("dr2", 0);
rset("dr3", 0);
rset("dr6", 0xFFFF0FF0); /* Default value */
rset("dr7", 0x400); /* Default value */
/* GDTR/IDTR - real mode defaults */
rset("gdtrbase", 0);
rset("gdtrlimit", 0xFFFF);
rset("idtrbase", 0);
rset("idtrlimit", 0xFFFF);
/* LDTR - not used in real mode */
rset("ldtr", 0);
rset("ldtrbase", 0);
rset("ldtrlimit", 0xFFFF);
rset("ldtrperm", 0x82);
/* Task register - not used in real mode */
rset("tr", 0);
rset("trbase", 0);
rset("trlimit", 0xFFFF);
rset("trperm", 0x8b);
/* Initialize per-CPU LAPIC state */
lapic_timer_init();
irqactive = -1;
state = VMRUNNING;
/* Clear interrupt state for this CPU */
memset(isr_bitmap, 0, sizeof(isr_bitmap));
icr_hi_saved = 0;
memset(irr_bitmap, 0, sizeof(irr_bitmap));
/* Clear exit counter */
getexit = 0;
/*
* Create NEW channels for this process.
* Parent's channels are not valid after fork() - they belong
* to the parent's libthread state.
*/
waitch = chancreate(sizeof(char *), 32);
notifch = chancreate(sizeof(VmxNotif), 16);
if(waitch == nil || notifch == nil)
sysfatal("CPU%d: chancreate failed: %r", curcpuid);
dprint("CPU%d: starting at %#x:0000 (linear %#x)\n", curcpuid, sipi << 8, sipi << 12);
rset("tscoff", 0);
cached_tscoff = 0;
virtio_start_workers(); /* Start virtio workers in this CPU's process */
runloop();
/* Should never return */
exits("CPU exited");
}
/* ============================================================
* LAPIC MMIO Handlers (using dispatch table)
* ============================================================ */
typedef u32int (*LapicReadFn)(void);
typedef void (*LapicWriteFn)(u32int val);
/* Forward declarations */
static u32int lapic_read_id(void);
static u32int lapic_read_version(void);
static u32int lapic_read_tpr(void);
static u32int lapic_read_ppr(void);
static u32int lapic_read_eoi(void);
static u32int lapic_read_ldr(void);
static u32int lapic_read_dfr(void);
static u32int lapic_read_svr(void);
static u32int lapic_read_isr(int reg);
static u32int lapic_read_icr_lo(void);
static u32int lapic_read_icr_hi(void);
static u32int lapic_read_timer_init(void);
static u32int lapic_read_timer_cur(void);
static u32int lapic_read_timer_div(void);
static void lapic_write_tpr(u32int val);
static void lapic_write_eoi(u32int val);
static void lapic_write_ldr(u32int val);
static void lapic_write_dfr(u32int val);
static void lapic_write_svr(u32int val);
static void lapic_write_icr_lo(u32int val);
static void lapic_write_icr_hi(u32int val);
static void lapic_write_timer_init(u32int val);
static void lapic_write_timer_div(u32int val);
/* LAPIC Register offsets */
enum {
LAPIC_ID = 0x020,
LAPIC_VERSION = 0x030,
LAPIC_TPR = 0x080,
LAPIC_APR = 0x090,
LAPIC_PPR = 0x0A0,
LAPIC_EOI = 0x0B0,
LAPIC_RRD = 0x0C0,
LAPIC_LDR = 0x0D0,
LAPIC_DFR = 0x0E0,
LAPIC_SVR = 0x0F0,
LAPIC_ISR_BASE = 0x100,
LAPIC_TMR_BASE = 0x180,
LAPIC_IRR_BASE = 0x200,
LAPIC_ESR = 0x280,
LAPIC_ICR_LO = 0x300,
LAPIC_ICR_HI = 0x310,
LAPIC_LVT_TIMER = 0x320,
LAPIC_LVT_THERM = 0x330,
LAPIC_LVT_PERF = 0x340,
LAPIC_LVT_LINT0 = 0x350,
LAPIC_LVT_LINT1 = 0x360,
LAPIC_LVT_ERROR = 0x370,
LAPIC_TIMER_ICR = 0x380,
LAPIC_TIMER_CCR = 0x390,
LAPIC_TIMER_DCR = 0x3E0,
LAPIC_SELF_IPI = 0x3F0,
};
static u32int
lapic_read_id(void)
{
return curcpuid << 24;
}
static u32int
lapic_read_version(void)
{
/* Version 0x14, max LVT entry 5, no extended APIC space */
return 0x50014;
}
static u32int
lapic_read_tpr(void)
{
return lapic_tpr[curcpuid];
}
static u32int
lapic_read_ppr(void)
{
/* PPR = max(TPR, highest ISR priority) */
u32int tpr = lapic_tpr[curcpuid];
u32int isr_prio = 0;
int i, j;
for(i = 7; i >= 0; i--){
if(isr_bitmap[i]){
for(j = 31; j >= 0; j--){
if(isr_bitmap[i] & (1 << j)){
isr_prio = (i * 32 + j) & 0xF0;
goto found;
}
}
}
}
found:
return (tpr & 0xF0) > isr_prio ? (tpr & 0xF0) : isr_prio;
}
static u32int
lapic_read_eoi(void)
{
return 0; /* Write-only register */
}
static u32int
lapic_read_ldr(void)
{
if(ioapic == nil)
return (1 << curcpuid) << 24; /* Fallback default */
if(ioapic->ldr[curcpuid] == 0)
ioapic->ldr[curcpuid] = (1 << curcpuid) << 24; /* Default: unique bit */
return ioapic->ldr[curcpuid];
}
static u32int
lapic_read_dfr(void)
{
return 0xFFFFFFFF; /* Flat model */
}
static u32int
lapic_read_svr(void)
{
return lapic_svr[curcpuid];
}
static u32int
lapic_read_isr(int reg)
{
return isr_bitmap[reg];
}
static u32int
lapic_read_icr_lo(void)
{
/*
* Return 0 for delivery status (bit 12) = idle.
* This is critical for OpenBSD which polls this bit.
*/
return 0;
}
static u32int
lapic_read_icr_hi(void)
{
return icr_hi_saved;
}
u32int
lapic_read_lvt_timer(void)
{
return lapic.lvt_timer;
}
static u32int
lapic_read_timer_init(void)
{
return lapic.timer_initial;
}
static u32int
lapic_read_timer_cur(void)
{
extern uvlong current_ns(void);
LApic *la = &lapic;
vlong remain, ticks;
u32int div;
if(!la->timer_active || la->timer_deadline <= 0)
return 0;
remain = la->timer_deadline - vmtime_ns();
if(remain <= 0)
return 0;
div = lapic_divide_value(la->timer_divide);
/*
* Calculate current count from remaining time.
* This MUST be the inverse of the period calculation in timer.c:
* period_ns = initial * divide * 1e9 / bus_freq
* current = remain_ns * bus_freq / (divide * 1e9)
*
* For 100MHz bus:
* period_ns = initial * divide * 10
* current = remain_ns / (divide * 10)
*/
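/*
 * Illustrative example (100MHz bus, assuming divide = 16, initial = 100000):
 * period_ns = 100000 * 16 * 10 = 16ms; with 8ms remaining,
 * current = 8000000 / (16 * 10) = 50000 ticks.
 */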
ticks = remain * LAPIC_BUS_FREQ_HZ / ((vlong)div * 1000000000LL);
if(ticks > 0xFFFFFFFFLL)
ticks = 0xFFFFFFFF;
return (u32int)ticks;
}
static u32int
lapic_read_timer_div(void)
{
return lapic.timer_divide;
}
static void
lapic_write_tpr(u32int val)
{
lapic_tpr[curcpuid] = val & 0xFF;
dprint("CPU%d: TPR = %#x\n", curcpuid, val);
}
static void
lapic_write_eoi(u32int val)
{
int vec = -1;
int i, j;
USED(val);
/* Find highest priority in-service vector */
for(i = 7; i >= 0; i--){
if(isr_bitmap[i] != 0){
for(j = 31; j >= 0; j--){
if(isr_bitmap[i] & (1 << j)){
vec = i * 32 + j;
break;
}
}
break;
}
}
dprint("CPU%d: EOI vec=%d\n", curcpuid, vec);
if(vec >= 0){
isr_bitmap[vec >> 5] &= ~(1 << (vec & 31));
// Only EOI to IOAPIC for device interrupts, not IPIs
if(vec < 128) // IOAPIC vectors
ioapic_eoi(vec);
}
}
static void
lapic_write_ldr(u32int val)
{
dprint("CPU%d: LDR = %#x\n", curcpuid, val);
if(ioapic != nil)
ioapic->ldr[curcpuid] = val & 0xFF000000; /* Only bits 31:24 are writable */
}
static void
lapic_write_dfr(u32int val)
{
dprint("CPU%d: DFR = %#x\n", curcpuid, val);
}
static void
lapic_write_svr(u32int val)
{
lapic_svr[curcpuid] = val;
dprint("CPU%d: SVR = %#x (LAPIC %s)\n", curcpuid, val, (val & 0x100) ? "enabled" : "disabled");
}
/*
* ICR Low write - triggers IPI send
* This is the key function for self-IPI handling
*
* CRITICAL: Self-IPIs must be deferred until after the instruction completes.
* OpenBSD's i82489_icr_wait() polls the delivery status bit after writing ICR_LO.
* If we inject the interrupt immediately, it can fire before the poll completes,
* causing the kernel to think the IPI failed or causing other race conditions.
*/
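/*
 * Illustrative example: a guest write of 0x000400EC is a fixed-delivery
 * self-IPI (shorthand bits 19:18 = 01, delivery mode 0, vector 0xEC); it is
 * queued here and only drained into the software IRR after the MMIO
 * instruction has been skipped in eptfault().
 */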
static void
lapic_write_icr_lo(u32int val)
{
int vec = val & 0xFF;
int delmode = (val >> 8) & 7;
int destmode = (val >> 11) & 1;
int level = (val >> 14) & 1;
int trigger = (val >> 15) & 1;
int shorthand = (val >> 18) & 3;
int dest = icr_hi_saved >> 24;
int target, i;
USED(destmode);
USED(level);
USED(trigger);
/* Always log ICR writes - critical for debugging SMP issues */
dprint("CPU%d: ICR_LO = %#x (vec=%d del=%d dest=%d short=%d)\n", curcpuid, val, vec, delmode, dest, shorthand);
switch(shorthand){
case 0: /* No shorthand - use destination field */
target = dest;
switch(delmode){
case 0: /* Fixed */
case 1: /* Lowest priority */
if(vec == 0){
dprint("CPU%d: ignoring IPI with vector 0\n", curcpuid);
break;
}
ipi_queue(target, vec);
break;
case 4: /* NMI */
dprint("CPU%d: NMI to CPU%d\n", curcpuid, target);
if(target < nvcpu)
ipi_queue(target, 2);
break;
case 5: /* INIT */
dprint("CPU%d: INIT to CPU%d (init_sent was %d)\n", curcpuid, target, target < MAXVCPU ? init_sent[target] : -1);
if(target > 0 && target < nvcpu){
init_sent[target] = 1;
dprint("CPU%d: marked init_sent[%d] = 1\n", curcpuid, target);
}
break;
case 6: /* Startup IPI */
dprint("CPU%d: SIPI to CPU%d at %#x (init_sent=%d)\n",
curcpuid, target, vec << 12,
target < MAXVCPU ? init_sent[target] : -1);
if(target > 0 && target < nvcpu && init_sent[target] == 1){
init_sent[target] = 2;
dprint("CPU%d: forking CPU%d now\n", curcpuid, target);
forkcpu(vec);
coherence();
dprint("CPU%d: forkcpu returned\n", curcpuid);
} else {
dprint("CPU%d: SIPI ignored (target=%d nvcpu=%d init_sent=%d)\n",
curcpuid, target, nvcpu,
target < MAXVCPU ? init_sent[target] : -1);
}
break;
}
break;
case 1: /* Self */
if((delmode == 0 || delmode == 1) && vec != 0){
dprint("CPU%d: self-IPI (shorthand=1) vector %d (DEFERRED)\n", curcpuid, vec);
ipi_queue(curcpuid, vec);
}
break;
case 2: /* All including self */
dprint("CPU%d: IPI to ALL (including self) vec=%d del=%d\n", curcpuid, vec, delmode);
if((delmode == 0 || delmode == 1) && vec != 0){
for(i = 0; i < nvcpu; i++){
ipi_queue(i, vec);
}
}
if(delmode == 5){
dprint("CPU%d: INIT broadcast\n", curcpuid);
}
break;
case 3: /* All excluding self */
dprint("CPU%d: IPI to ALL (excluding self) vec=%d del=%d\n", curcpuid, vec, delmode);
if((delmode == 0 || delmode == 1) && vec != 0){
for(i = 0; i < nvcpu; i++){
if(i != curcpuid)
ipi_queue(i, vec);
}
}
break;
}
}
static void
lapic_write_icr_hi(u32int val)
{
icr_hi_saved = val;
dprint("CPU%d: ICR_HI = %#x (dest=%d)\n", curcpuid, val, val >> 24);
}
static void
lapic_write_timer_init(u32int val)
{
LApic *la = &lapic;
la->timer_initial = val;
lapic_timer_start();
}
static void
lapic_write_timer_div(u32int val)
{
lapic.timer_divide = val & 0xB;
}
/*
* LAPIC Self-IPI register (x2APIC compatible, offset 0x3F0)
*/
static void
lapic_write_self_ipi(u32int val)
{
int vec = val & 0xFF;
if(vec != 0){
dprint("CPU%d: SELF_IPI register vec=%d\n", curcpuid, vec);
ipi_queue(curcpuid, vec);
}
}
/*
* Main LAPIC MMIO handler
*/
static void
lapic_mmio_handler(ExitInfo *ei)
{
u32int off = ei->pa & 0xFFF;
int iswrite = (ei->qual & 2) != 0;
u32int val;
dprint("LAPIC: off=%#x %s\n", off, iswrite ? "write" : "read");
if(iswrite){
val = getmovval();
switch(off){
case LAPIC_TPR: lapic_write_tpr(val); break;
case LAPIC_EOI: lapic_write_eoi(val); break;
case LAPIC_LDR: lapic_write_ldr(val); break;
case LAPIC_DFR: lapic_write_dfr(val); break;
case LAPIC_SVR: lapic_write_svr(val); break;
case LAPIC_ICR_LO: lapic_write_icr_lo(val); break;
case LAPIC_ICR_HI: lapic_write_icr_hi(val); break;
case LAPIC_LVT_TIMER: lapic_write_lvt_timer(val); break;
case LAPIC_TIMER_ICR: lapic_write_initial_count(val); break;
case LAPIC_TIMER_DCR: lapic_write_divide_config(val); break;
case LAPIC_SELF_IPI: lapic_write_self_ipi(val); break;
default:
dprint("LAPIC: write to %#x ignored\n", off);
}
} else {
switch(off){
case LAPIC_ID: val = lapic_read_id(); break;
case LAPIC_VERSION: val = lapic_read_version(); break;
case LAPIC_TPR: val = lapic_read_tpr(); break;
case LAPIC_PPR: val = lapic_read_ppr(); break;
case LAPIC_EOI: val = lapic_read_eoi(); break;
case LAPIC_LDR: val = lapic_read_ldr(); break;
case LAPIC_DFR: val = lapic_read_dfr(); break;
case LAPIC_SVR: val = lapic_read_svr(); break;
case LAPIC_ICR_LO: val = lapic_read_icr_lo(); break;
case LAPIC_ICR_HI: val = lapic_read_icr_hi(); break;
case LAPIC_LVT_TIMER: val = lapic_read_lvt_timer(); break;
case LAPIC_TIMER_ICR: val = lapic_read_timer_init(); break;
case LAPIC_TIMER_CCR: val = lapic_read_current_count(); break;
case LAPIC_TIMER_DCR: val = lapic_read_timer_div(); break;
/* ISR registers (8 x 32-bit) */
case 0x100: case 0x110: case 0x120: case 0x130:
case 0x140: case 0x150: case 0x160: case 0x170:
val = lapic_read_isr((off - 0x100) >> 4);
break;
/* TMR and IRR - return 0 for now */
case 0x180: case 0x190: case 0x1A0: case 0x1B0:
case 0x1C0: case 0x1D0: case 0x1E0: case 0x1F0:
case 0x200: case 0x210: case 0x220: case 0x230:
case 0x240: case 0x250: case 0x260: case 0x270:
val = 0;
break;
default:
val = 0;
dprint("LAPIC: read from %#x returning 0\n", off);
}
setmovdest(val);
}
}
/* ============================================================
* HPET MMIO Handler
* ============================================================ */
extern u64int hpet_read(u64int addr, int size);
extern void hpet_write(u64int addr, u32int val, int size);
static void
hpet_mmio_handler(ExitInfo *ei)
{
int iswrite = (ei->qual & 2) != 0;
int size = 1 << ((ei->qual >> 7) & 7); // Extract access size from qualification
if(iswrite){
u32int val = getmovval();
hpet_write(ei->pa, val, size);
} else {
u64int val = hpet_read(ei->pa, size);
setmovdest(val);
}
}
/* ============================================================
* EPT Fault Handler
* ============================================================ */
/* Counter for EPT faults per CPU - helps debug SMP issues */
static vlong eptfault_count = 0;
static void
eptfault(ExitInfo *ei)
{
eptfault_count++;
/* Always log EPT faults to help debug SMP issues */
dprint("CPU%d EPT[%lld]: pa=%#llux va=%#llux qual=%#llux\n",
curcpuid, eptfault_count, ei->pa, ei->va, ei->qual);
/* I/O APIC MMIO */
if(ei->pa >= 0xFEC00000 && ei->pa < 0xFEC01000){
ioapic_mmio_handler(ei);
goto done;
}
/* HPET MMIO at 0xFED00000 */
if(ei->pa >= 0xFED00000 && ei->pa < 0xFED01000){
hpet_mmio_handler(ei);
goto done;
}
/* Local APIC MMIO */
if(ei->pa >= 0xFEE00000 && ei->pa < 0xFEE01000){
lapic_mmio_handler(ei);
goto done;
}
/* Other EPT faults - log and continue */
if(ei->pa < 0x1000){
u32int val = getmovval();
dprint("CPU%d LOW MEM WRITE: pa=%#llx val=%#x\n", curcpuid, ei->pa, val);
} else {
/* Unexpected EPT fault - could indicate memory mapping issue */
dprint("CPU%d UNEXPECTED EPT: pa=%#llx va=%#llx qual=%#llx\n",
curcpuid, ei->pa, ei->va, ei->qual);
}
done:
skipinstr(ei);
coherence(); /* Ensure guest memory writes are visible to other CPUs */
/*
* Deliver any pending self-IPI now that the instruction has completed.
* This is the key fix for OpenBSD SMP support.
*/
ipi_poll();
}
/* ============================================================
* Other Exit Handlers
* ============================================================ */
static void
iohandler(ExitInfo *ei)
{
int port, len, inc, isin;
int asz, seg;
uintptr addr;
u32int val;
uvlong vval;
uintptr cx;
static int seglook[8] = {SEGES, SEGCS, SEGSS, SEGDS, SEGFS, SEGGS};
TLB tlb;
port = ei->qual >> 16 & 0xffff;
len = (ei->qual & 7) + 1;
isin = (ei->qual & 8) != 0;
if((ei->qual & 1<<4) == 0){
if(isin){
val = io(1, port, 0, len);
rsetsz(RAX, val, len);
} else
io(0, port, rget(RAX), len);
skipinstr(ei);
return;
}
if((rget("flags") & 0x400) != 0) inc = -len;
else inc = len;
switch(ei->iinfo >> 7 & 7){
case 0: asz = 2; break;
default: asz = 4; break;
case 2: asz = 8; break;
}
if((ei->qual & 1<<5) != 0)
cx = rgetsz(RCX, asz);
else
cx = 1;
addr = isin ? rget(RDI) : rget(RSI);
if(isin)
seg = SEGES;
else
seg = seglook[ei->iinfo >> 15 & 7];
memset(&tlb, 0, sizeof(TLB));
for(; cx > 0; cx--){
if(isin){
vval = io(1, port, 0, len);
if(x86access(seg, addr, asz, &vval, len, ACCW, &tlb) < 0)
goto err;
} else {
if(x86access(seg, addr, asz, &vval, len, ACCR, &tlb) < 0)
goto err;
io(0, port, vval, len);
}
addr += inc;
}
skipinstr(ei);
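/* fall through: the string registers below are updated on both success and fault */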
err:
if((ei->qual & 1<<5) != 0)
rsetsz(RCX, cx, asz);
if(isin)
rsetsz(RDI, addr, asz);
else
rsetsz(RSI, addr, asz);
}
/*
 * preempt - Handle VMX preemption timer exit (reason 52)
 */
static void
preempt(ExitInfo *)
{
}
extern vlong timer_nearest(void);
extern int irr_pending(void);
/*
* has_pending_work - Check if there's work requiring immediate attention
*/
static int
has_pending_work(void)
{
extern IpiQueue *ipiqueue;
extern u32int irr_bitmap[8];
/* Check for pending interrupts */
if(irr_bitmap[0] | irr_bitmap[1] | irr_bitmap[2] | irr_bitmap[3] |
irr_bitmap[4] | irr_bitmap[5] | irr_bitmap[6] | irr_bitmap[7])
return 1;
return 0;
}
/*
 * hlt - Handle HLT instruction
 *
 * The vCPU is idle and waiting for an interrupt.  Simplified
 * implementation: mark the CPU halted and block on this CPU's
 * hltpipe; whoever has work for us (an IPI, a timer expiry, a
 * device interrupt) writes a byte to the pipe to wake us up.
 */
static void
hlt(ExitInfo *ei)
{
char buf[1];
skipinstr(ei);
lock(&statelock);
state = VMHALT;
unlock(&statelock);
/* Block until someone writes a wake byte to this CPU's hlt pipe */
read(hltpipe[curcpuid][0], buf, 1);
lock(&statelock);
state = VMRUNNING;
unlock(&statelock);
}
static void
irqackhand(ExitInfo *ei)
{
int vec = ei->qual;
// Clear from IRR when actually acknowledged
irr_bitmap[vec >> 5] &= ~(1 << (vec & 31));
// Set ISR
isr_bitmap[vec >> 5] |= (1 << (vec & 31));
irqack(vec);
}
static void
dbgexc(ExitInfo *ei)
{
rset("dr6", rget("dr6") | ei->qual);
postexc("#db", NOERRC);
}
static void
movdr(ExitInfo *ei)
{
static char *dr[8] = { "dr0", "dr1", "dr2", "dr3", nil, nil, "dr6", "dr7" };
int q = ei->qual;
if((q & 6) == 4){
postexc("#gp", 0);
return;
}
if((q & 16) != 0)
rset(x86reg[q >> 8 & 15], rget(dr[q & 7]));
else
rset(dr[q & 7], rget(x86reg[q >> 8 & 15]));
skipinstr(ei);
}
static void
movcr(ExitInfo *ei)
{
u32int q = ei->qual;
switch(q & 15){
case 0:
switch(q >> 4 & 3){
case 0:
vmdebug("illegal CR0 write, value %#ux", (u32int)rget(x86reg[q >> 8 & 15]));
rset("cr0real", rget(x86reg[q >> 8 & 15]));
skipinstr(ei);
break;
case 1:
vmerror("shouldn't happen: trap on MOV from CR0");
rset(x86reg[q >> 8 & 15], rget("cr0fake"));
skipinstr(ei);
break;
case 2:
vmerror("shouldn't happen: trap on CLTS");
rset("cr0real", rget("cr0real") & ~8);
skipinstr(ei);
break;
case 3:
vmerror("LMSW handler unimplemented");
postexc("#ud", NOERRC);
}
break;
case 4:
switch(q >> 4 & 3){
case 0:
vmdebug("illegal CR4 write, value %#ux", (u32int)rget(x86reg[q >> 8 & 15]));
rset("cr4real", rget(x86reg[q >> 8 & 15]));
skipinstr(ei);
break;
case 1:
vmerror("shouldn't happen: trap on MOV from CR4");
rset(x86reg[q >> 8 & 15], rget("cr4fake"));
skipinstr(ei);
break;
default:
vmerror("unknown CR4 operation %d", q);
postexc("#ud", NOERRC);
}
break;
default:
vmerror("access to unknown control register CR%ud", q & 15);
postexc("#ud", NOERRC);
}
}
/* ============================================================
* CPUID Handler
* ============================================================ */
typedef struct CPUID CPUID;
struct CPUID {
u32int ax, bx, cx, dx;
};
static u32int cpuidmax;
static u32int cpuidmaxext;
static CPUID leaf1;
static struct {
uvlong miscen;
} msr;
static uchar _cpuid[] = {
0x5E, /* POP SI (PC) */
0x5D, /* POP BP (CPUID&) */
0x58, /* POP AX */
0x59, /* POP CX */
0x51, /* PUSH CX */
0x50, /* PUSH AX */
0x55, /* PUSH BP */
0x56, /* PUSH SI */
0x31, 0xDB, /* XOR BX, BX */
0x31, 0xD2, /* XOR DX, DX */
0x0F, 0xA2, /* CPUID */
0x89, 0x45, 0x00, /* MOV AX, 0(BP) */
0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */
0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */
0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */
0xC3, /* RET */
};
static CPUID (*getcpuid)(ulong ax, ulong cx) = (CPUID(*)(ulong, ulong)) _cpuid;
void
cpuidinit(void)
{
CPUID r;
int f;
if(sizeof(uintptr) == 8)
_cpuid[1] = 0x58;
segflush(_cpuid, sizeof(_cpuid));
r = getcpuid(0, 0);
cpuidmax = r.ax;
r = getcpuid(0x80000000, 0);
cpuidmaxext = r.ax;
leaf1 = getcpuid(1, 0);
memset(&msr, 0, sizeof(msr));
if((f = open("/dev/msr", OREAD)) >= 0){
pread(f, &msr.miscen, 8, 0x1a0);
msr.miscen &= 1<<0;
close(f);
}
}
static int xsavesz[] = {
[1] = 512+64,
[3] = 512+64,
[7] = 512+64+256,
};
static void
cpuid(ExitInfo *ei)
{
u32int ax, bx, cx, dx;
CPUID cp;
ax = rget(RAX);
cx = rget(RCX);
dprint("CPU%d: CPUID eax=%#x ecx=%#x\n", curcpuid, ax, cx);
bx = dx = 0;
cp = getcpuid(ax, cx);
switch(ax){
case 0x00:
ax = MIN(cpuidmax, 0x18);
bx = cp.bx;
dx = cp.dx;
cx = cp.cx;
break;
case 0x01:
ax = cp.ax;
bx = (curcpuid << 24) | (cp.bx & 0x00ffff);
cx = cp.cx & 0x76de3217 | 0x80000000UL; /* set hypervisor-present bit (CPUID.1 ECX bit 31) */
cx |= (1 << 24); /* TSC-Deadline mode supported */
dx = cp.dx & 0x0f8aa779;
if(leaf1.cx & 1<<27){
if(rget("cr4real") & Cr4Osxsave)
cx |= 1<<27;
} else
cx &= ~0x1c000000;
break;
case 0x02: goto literal;
case 0x03: goto zero;
case 0x04: goto literal;
case 0x05: goto zero;
case 0x06: goto zero;
case 0x07:
if(cx == 0){
ax = 0;
bx = cp.bx & 0x2369;
cx = 0;
if((leaf1.cx & 1<<27) == 0)
bx &= ~0xdc230020;
} else
goto zero;
break;
case 0x08: goto zero;
case 0x09: goto literal;
case 0x0a: goto zero;
case 0x0b: goto zero;
case 0x0c: goto zero;
case 0x0d:
if((leaf1.cx & 1<<27) == 0)
goto zero;
if(cx == 0){
ax = cp.ax & 7;
bx = xsavesz[rget("xcr0")];
cx = xsavesz[ax];
} else if(cx == 1){
ax = cp.ax & 7;
bx = xsavesz[rget("xcr0")];
cx = 0;
} else if(cx == 2){
ax = xsavesz[7] - xsavesz[3];
bx = xsavesz[3];
cx = 0;
} else
goto zero;
break;
case 0x0f: goto zero;
case 0x10: goto zero;
case 0x12: goto zero;
case 0x14: goto zero;
case 0x15: /* TSC/Crystal ratio */
{
uvlong freq = _tos->cyclefreq;
if(freq == 0)
freq = 2900000000ULL;
ax = 1;
bx = 1;
cx = (u32int)freq;
dx = 0;
}
break;
case 0x16: /* Processor Frequency MHz */
{
uvlong freq = _tos->cyclefreq;
if(freq == 0)
freq = 2900000000ULL;
u32int mhz = (u32int)(freq / 1000000ULL);
ax = mhz;
bx = mhz;
cx = 100;
dx = 0;
}
break;
case 0x17: goto zero;
case 0x18: goto literal;
case 0x40000000:
ax = 0x40000001; /* Max supported leaf - was 0, now 1 */
bx = 0x4b4d564b; /* "KVMK" */
cx = 0x564b4d56; /* "VMKV" */
dx = 0x4d; /* "M" */
break;
case 0x40000001:
/* KVM features */
ax = (1 << 3); /* KVM_FEATURE_CLOCKSOURCE2 */
bx = 0;
cx = 0;
dx = 0;
break;
case 0x80000000:
ax = MIN(cpuidmaxext, 0x80000008);
cx = 0;
break;
case 0x80000001:
ax = cp.ax;
cx = cp.cx & 0x121;
if(sizeof(uintptr) == 8)
dx = cp.dx & 0x24100800;
else
dx = cp.dx & 0x04100000;
break;
case 0x80000002: goto literal;
case 0x80000003: goto literal;
case 0x80000004: goto literal;
case 0x80000005: goto zero;
case 0x80000006: goto literal;
case 0x80000007:
ax = 0;
bx = 0;
cx = 0;
dx = (1 << 8); /* Invariant TSC */
dprint("CPUID[80000007]: returning dx=%#x (InvariantTSC=%d)\n", dx, (dx >> 8) & 1);
break;
case 0x80000008: goto literal;
literal:
ax = cp.ax;
bx = cp.bx;
cx = cp.cx;
dx = cp.dx;
break;
default:
if((ax & 0xf0000000) != 0x40000000)
vmdebug("unknown cpuid field eax=%#ux", ax);
zero:
ax = cx = 0;
break;
}
rset(RAX, ax);
rset(RBX, bx);
rset(RCX, cx);
rset(RDX, dx);
skipinstr(ei);
}
static void
rdwrmsr(ExitInfo *ei)
{
u32int cx;
u64int val;
int rd;
rd = ei->name[1] == 'r';
cx = rget(RCX);
val = (uvlong)rget(RDX) << 32 | rget(RAX);
dprint("CPU%d: MSR %s cx=%#x\n", curcpuid, rd?"read":"write", cx);
switch(cx){
case 0x10: /* IA32_TIME_STAMP_COUNTER */
if(rd){
/*
* Return actual hardware TSC via cycles().
* This gives the guest access to real CPU cycle count,
* allowing proper TSC calibration against HPET.
*/
uvlong tsc;
cycles(&tsc);
val = tsc + cached_tscoff;
}
break;
case 0x1B: /* IA32_APIC_BASE */
if(rd){
/*
* APIC Base MSR format:
* Bit 8: BSP flag (1 = bootstrap processor)
* Bit 11: APIC global enable
* Bits 12-35: APIC base address (page number)
*
* Default APIC base is 0xFEE00000
*/
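/* e.g. CPU 0 reads back 0xFEE00900 = base | enable (bit 11) | BSP (bit 8) */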
val = 0xFEE00000ULL; /* Base address */
val |= (1 << 11); /* APIC enabled */
if(curcpuid == 0)
val |= (1 << 8); /* BSP flag for CPU 0 only */
}
/* Writes are ignored - we don't support relocating the APIC */
break;
case 0x277:
if(rd) val = rget("pat");
else rset("pat", val);
break;
case 0x6E0: /* IA32_TSC_DEADLINE */
if(rd)
val = lapic_read_tsc_deadline();
else {
dprint("CPU%d: TSC_DEADLINE=%llud\n", curcpuid, val);
lapic_write_tsc_deadline(val);
pvclock_update(curcpuid); /* Keep kvmclock in sync */
}
break;
case 0x8B:
val = 0;
break;
case 0x1A0:
if(rd) val = msr.miscen;
break;
case 0x4b564d00: /* MSR_KVM_WALL_CLOCK_NEW */
case 0x4b564d01: /* MSR_KVM_SYSTEM_TIME_NEW */
if(pvclock_msr(curcpuid, !rd, cx, &val))
break;
default:
if(rd){
vmdebug("read from unknown MSR %#ux ignored", cx);
val = 0;
} else
vmdebug("write to unknown MSR %#ux ignored (val=%#llux)", cx, val);
break;
}
if(rd){
rset(RAX, (u32int)val);
rset(RDX, (u32int)(val >> 32));
}
skipinstr(ei);
}
static void
xsetbv(ExitInfo *ei)
{
uvlong v;
v = rget(RAX)&0xffffffff | rget(RDX)<<32;
if(rget(RCX) & 0xffffffff)
postexc("#gp", 0);
else if(v != 1 && v != 3 && v != 7)
postexc("#gp", 0);
else if((leaf1.cx & 1<<26) == 0 || (rget("cr4real") & Cr4Osxsave) == 0)
postexc("#ud", NOERRC);
else {
rset("xcr0", v);
skipinstr(ei);
}
}
/* ============================================================
* Exit Dispatch
* ============================================================ */
typedef struct ExitType ExitType;
struct ExitType {
char *name;
void (*f)(ExitInfo *);
};
/*
* Helper: Check if any IRQs are pending in the IRR bitmap
*/
int
irr_pending(void)
{
extern u32int irr_bitmap[8];
return irr_bitmap[0] | irr_bitmap[1] | irr_bitmap[2] | irr_bitmap[3] |
irr_bitmap[4] | irr_bitmap[5] | irr_bitmap[6] | irr_bitmap[7];
}
static ExitType etypes[] = {
{"io", iohandler},
{".cpuid", cpuid},
{".hlt", hlt},
{"eptfault", eptfault},
{"*ack", irqackhand},
{".rdmsr", rdwrmsr},
{".wrmsr", rdwrmsr},
{".movdr", movdr},
{"#db", dbgexc},
{"movcr", movcr},
{".xsetbv", xsetbv},
{"preempt", preempt},
};
void
processexit(char *msg)
{
static char msgc[1024];
char *f[32];
int nf;
ExitType *et;
int i;
ExitInfo ei;
extern int getexit;
static uvlong exit_count = 0;
static uvlong last_report = 0;
exit_count++;
vlong now = nanosec();
if(now - last_report > 1000000000LL) { // Every 1 second
dprint("EXIT RATE: %llud/sec\n", exit_count);
exit_count = 0;
last_report = now;
}
static vlong lastpreempt;
dprint("EXIT: %s\n", msg);
if(strncmp(msg, "preempt", 7) == 0){
if(lastpreempt != 0)
dprint("PREEMPT CPU%d: delta=%lldms\n", curcpuid, (now - lastpreempt)/1000000);
lastpreempt = now;
}
strcpy(msgc, msg);
nf = tokenize(msgc, f, nelem(f));
if(nf < 2) sysfatal("invalid wait message: %s", msg);
memset(&ei, 0, sizeof(ei));
ei.raw = msg;
ei.name = f[0];
ei.qual = strtoull(f[1], nil, 0);
for(i = 2; i < nf; i += 2){
if(strcmp(f[i], "pc") == 0)
rpoke(RPC, strtoull(f[i+1], nil, 0), 1);
else if(strcmp(f[i], "sp") == 0)
rpoke(RSP, strtoull(f[i+1], nil, 0), 1);
else if(strcmp(f[i], "ax") == 0)
rpoke(RAX, strtoull(f[i+1], nil, 0), 1);
else if(strcmp(f[i], "ilen") == 0)
ei.ilen = strtoul(f[i+1], nil, 0);
else if(strcmp(f[i], "iinfo") == 0)
ei.iinfo = strtoul(f[i+1], nil, 0);
else if(strcmp(f[i], "pa") == 0)
ei.pa = strtoull(f[i+1], nil, 0);
else if(strcmp(f[i], "va") == 0)
ei.va = strtoull(f[i+1], nil, 0);
}
if(*f[0] == '*') getexit++;
for(et = etypes; et < etypes + nelem(etypes); et++){
if(strcmp(et->name, f[0]) == 0){
et->f(&ei);
return;
}
}
if(*f[0] == '.'){
vmerror("vmx: unknown instruction %s", f[0]+1);
postexc("#ud", NOERRC);
return;
}
if(*f[0] == '*'){
vmerror("vmx: unknown notification %s", f[0]+1);
return;
}
if(persist){
vmerror("unknown exit: %s", msg);
state = VMDEAD;
} else
sysfatal("unknown exit: %s", msg);
}