shithub: vmxsmp

ref: e0e889fb69a6d4d3f2332244ef4f79f900a66d84
dir: /pvclock.c/

View raw version
/*
 * pvclock.c - KVM paravirtual clock
 * 
 * The guest computes time as:
 *   delta   = rdtsc - tsc_timestamp
 *   delta   = tsc_shift >= 0 ? delta << tsc_shift : delta >> -tsc_shift
 *   time_ns = system_time + ((delta * tsc_to_system_mul) >> 32)
 * 
 * CRITICAL: tsc_timestamp must be in GUEST TSC units (host + offset)
 * because that's what guest's rdtsc instruction returns.
 */

#include <u.h>
#include <libc.h>
#include <tos.h>
#include <thread.h>
#include "dat.h"
#include "fns.h"

/*
 * MSR numbers dispatched by pvclock_msr().
 * WALL_CLOCK_NEW carries the guest-physical address of a PVWallClock;
 * SYSTEM_TIME_NEW carries the address of a PVClock with bit 0 as the
 * enable flag.
 * NOTE(review): values presumably match KVM's paravirtual MSR numbering -- confirm.
 */
#define MSR_KVM_WALL_CLOCK_NEW   0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW  0x4b564d01

/*
 * Per-vCPU time record that pvclock_update() writes into guest memory.
 * The guest reads this structure directly, so field order and sizes
 * must not change.
 * NOTE(review): layout presumably mirrors KVM's pvclock_vcpu_time_info -- confirm.
 */
typedef struct PVClock PVClock;
struct PVClock {
	u32int version;			/* odd while an update is in progress, even once stable */
	u32int pad0;
	u64int tsc_timestamp;		/* guest TSC (host TSC + cached_tscoff) at the reference point */
	u64int system_time;		/* nanoseconds since pvclock_init() at the reference point */
	u32int tsc_to_system_mul;	/* multiplier cached by compute_tsc_params() */
	s8int  tsc_shift;		/* shift cached by compute_tsc_params() */
	u8int  flags;			/* pvclock_update() always stores 1 ("TSC stable") */
	u8int  pad[2];
};

/*
 * Wall-clock record that wallclock_update() writes into guest memory.
 * It holds the wall-clock time corresponding to the moment boot_ns was
 * sampled, i.e. the zero point of PVClock.system_time.
 */
typedef struct PVWallClock PVWallClock;
struct PVWallClock {
	u32int version;	/* written odd before the fields are stored, even afterwards */
	u32int sec;	/* wall-clock seconds at the pvclock zero point */
	u32int nsec;	/* always written as 0 */
};

/* Guest registrations, recorded by pvclock_msr() */
static uvlong pvclock_gpa[MAXVCPU];	/* guest-physical address of each vCPU's PVClock (bit 0 masked off) */
static uvlong wallclock_gpa;		/* guest-physical address of the PVWallClock; 0 = none registered */
static int pvclock_enabled[MAXVCPU];	/* bit 0 of the last MSR_KVM_SYSTEM_TIME_NEW write */

/* Cached conversion parameters, filled in by compute_tsc_params() */
static u32int tsc_mul;
static s8int tsc_shift;
static uvlong tsc_freq;			/* _tos->cyclefreq, or 2900000000 when that is 0 */

/* Boot time references - set once at init */
static uvlong boot_ns;			/* nanosec() sampled in pvclock_init(); zero point of system_time */

/* From vmx.c - the TSC offset applied to guest */
extern vlong cached_tscoff;

/*
 * Compute mul and shift for TSC -> nanosecond conversion.
 *
 * The guest evaluates
 *   delta = tsc_shift >= 0 ? delta << tsc_shift : delta >> -tsc_shift
 *   ns    = (delta * mul) >> 32
 * so mul must equal (10^9 << (32 - tsc_shift)) / tsc_freq.
 *
 * With tsc_shift = 0 the multiplier fits in 32 bits only when
 * tsc_freq exceeds 1 GHz.  For slower clocks the multiplier is halved
 * until it fits, and each halving is compensated by a POSITIVE step of
 * tsc_shift (the guest doubles delta).  A negative shift here would
 * halve delta as well and make guest time run 4x too slow per step.
 *
 * Results are stored in the file-level tsc_freq, tsc_mul and tsc_shift.
 */
static void
compute_tsc_params(void)
{
	uvlong mul64;
	int shift;

	tsc_freq = _tos->cyclefreq;
	if(tsc_freq == 0)
		tsc_freq = 2900000000ULL;	/* fallback when no cycle frequency is reported */

	shift = 0;
	mul64 = (1000000000ULL << 32) / tsc_freq;

	/* at most 10 steps, as before; beyond that the multiplier is truncated */
	while(mul64 > 0xFFFFFFFFULL && shift < 10){
		shift++;
		mul64 = (1000000000ULL << (32 - shift)) / tsc_freq;
	}

	tsc_shift = shift;
	tsc_mul = (u32int)mul64;

	dprint("pvclock: tsc_freq=%llud mul=%ud shift=%d\n",
	       tsc_freq, tsc_mul, tsc_shift);
}

/*
 * One-time initialisation: cache the TSC conversion parameters and
 * record the nanosec() value that serves as time zero for every
 * system_time published to the guest.
 */
void
pvclock_init(void)
{
	compute_tsc_params();

	boot_ns = nanosec();
	dprint("pvclock: boot_ns=%llud\n", boot_ns);
}

/*
 * Refresh the PVClock record registered by vCPU `cpu`.
 *
 * Does nothing when cpu is out of range, the vCPU has not enabled
 * pvclock, or the registered guest address cannot be mapped.
 *
 * The record is published as a (guest TSC, nanoseconds) reference
 * pair from which the guest extrapolates using its own rdtsc.  The
 * version field brackets the write: odd while fields are being
 * stored, even once they are consistent.
 */
void
pvclock_update(int cpu)
{
	PVClock *clk;
	uvlong tsc, ns;

	if(cpu < 0 || cpu >= MAXVCPU)
		return;
	if(!pvclock_enabled[cpu])
		return;

	clk = gptr(pvclock_gpa[cpu], sizeof(PVClock));
	if(clk == nil)
		return;

	/* mark the record as being updated before touching any field */
	clk->version |= 1;
	coherence();

	/*
	 * The guest's rdtsc returns host TSC + offset, so the timestamp
	 * has to be expressed in those same guest units.
	 */
	cycles(&tsc);
	tsc += cached_tscoff;

	/* elapsed nanoseconds since pvclock_init() */
	ns = nanosec() - boot_ns;

	clk->tsc_timestamp = tsc;
	clk->system_time = ns;
	clk->tsc_to_system_mul = tsc_mul;
	clk->tsc_shift = tsc_shift;
	clk->flags = 1;	/* TSC stable */

	/* fields are complete: bump to an even version */
	coherence();
	clk->version++;
}

static void
pvclock_setup(int cpu)
{
	dprint("CPU%d: pvclock setup gpa=%#llux tscoff=%lld\n",
	       cpu, pvclock_gpa[cpu], (vlong)cached_tscoff);
	pvclock_update(cpu);
}

/*
 * Publish the wall-clock time of the pvclock zero point (the moment
 * boot_ns was sampled) into the guest's PVWallClock record.
 *
 * Does nothing when no record is registered or its address cannot be
 * mapped.
 *
 * The version field follows the same protocol as pvclock_update():
 * it is forced odd before the fields are written and incremented to
 * the next even value afterwards.  Writing fixed values (1, then 2)
 * would leave the version identical before and after a repeated
 * update, so a guest reading across that update could accept a torn
 * sec/nsec pair.
 *
 * NOTE: resolution is whole seconds; nsec is always stored as 0.
 */
static void
wallclock_update(void)
{
	PVWallClock *wc;
	vlong wall_sec;
	uvlong ns_since_boot;

	if(wallclock_gpa == 0)
		return;

	wc = gptr(wallclock_gpa, sizeof(PVWallClock));
	if(wc == nil)
		return;

	/* odd version = update in progress */
	wc->version |= 1;
	coherence();

	/* current wall time minus time elapsed since pvclock_init() */
	wall_sec = time(nil);
	ns_since_boot = nanosec() - boot_ns;
	wall_sec -= ns_since_boot / 1000000000ULL;

	wc->sec = wall_sec;
	wc->nsec = 0;

	/* advance to the next even version = stable */
	coherence();
	wc->version++;

	dprint("wallclock: sec=%lld at boot\n", wall_sec);
}

/*
 * Handle a guest access to one of the pvclock MSRs.
 *
 *   cpu   - index of the accessing vCPU
 *   write - non-zero for a write, zero for a read
 *   msr   - MSR number
 *   val   - value being written, or where to store the value read
 *
 * Returns 1 when the MSR was handled here, 0 when cpu is out of range
 * or the MSR is not one of ours.
 */
int
pvclock_msr(int cpu, int write, u32int msr, u64int *val)
{
	if(cpu < 0 || cpu >= MAXVCPU)
		return 0;

	if(msr == MSR_KVM_SYSTEM_TIME_NEW){
		if(!write){
			/* read back address and enable bit as last written */
			*val = pvclock_gpa[cpu] | pvclock_enabled[cpu];
			return 1;
		}

		/* bit 0 is the enable flag, the remaining bits the address */
		pvclock_gpa[cpu] = *val & ~1ULL;
		pvclock_enabled[cpu] = *val & 1;

		dprint("CPU%d: pvclock MSR write gpa=%#llux enabled=%d\n",
		       cpu, pvclock_gpa[cpu], pvclock_enabled[cpu]);

		if(pvclock_enabled[cpu])
			pvclock_setup(cpu);
		return 1;
	}

	if(msr == MSR_KVM_WALL_CLOCK_NEW){
		if(!write){
			*val = wallclock_gpa;
			return 1;
		}

		/* registering the address triggers an immediate publish */
		wallclock_gpa = *val;
		dprint("wallclock: GPA set to %#llux\n", wallclock_gpa);
		wallclock_update();
		return 1;
	}

	return 0;
}