shithub: psxe

Download patch

ref: 9a76a94d5146753d5d98dd130939c6528fe0f257
parent: b06cc48f434d10fe1974dae898013fd978f01d84
author: allkern <lisandroaalarcon@gmail.com>
date: Thu Jul 18 08:25:52 EDT 2024

Insanely huge update

Implemented bilinear texture filtering
Fixed I_MASK IRQ triggering issue
Implemented 384 mode (need config)
Fixed CUE TOC values
Implemented initial GPU line generalization
Fixed SPU interpolation
Fixed CPU delayed LWL/LWR
Fixed CPU GTE instruction in delay slot

--- a/frontend/main.c
+++ b/frontend/main.c
@@ -12,7 +12,10 @@
     psx_cdrom_t* cdrom = ((psx_t*)ud)->cdrom;
     psx_spu_t* spu = ((psx_t*)ud)->spu;
 
-    psx_cdrom_get_audio_samples(cdrom, buf, size, spu);
+    memset(buf, 0, size);
+
+    psx_cdrom_get_audio_samples(cdrom, buf, size);
+    psx_spu_update_cdda_buffer(spu, cdrom->cdda_buf);
 
     for (int i = 0; i < (size >> 2); i++) {
         uint32_t sample = psx_spu_get_sample(spu);
--- a/frontend/screen.c
+++ b/frontend/screen.c
@@ -57,7 +57,13 @@
 int screen_get_base_width(psxe_screen_t* screen) {
     int width = psx_get_dmode_width(screen->psx);
 
-    return (width == 256) ? 256 : 320;
+    switch (width) {
+        case 256: return 256;
+        case 320: return 320;
+        case 368: return 384;
+    }
+
+    return 320;
 }
 
 psxe_screen_t* psxe_screen_create(void) {
--- a/psx/bus.c
+++ b/psx/bus.c
@@ -105,7 +105,7 @@
         return 0x05;
 
     if (addr == 0x1f400004)
-        return 0xc0;
+        return 0xc8;
 
     if (addr == 0x1f400006)
         return 0x1fe0;
--- a/psx/cpu.c
+++ b/psx/cpu.c
@@ -170,11 +170,11 @@
 #define R_A0 (cpu->r[4])
 #define R_RA (cpu->r[31])
 
-#define DO_PENDING_LOAD \
+#define DO_PENDING_LOAD { \
     cpu->r[cpu->load_d] = cpu->load_v; \
     R_R0 = 0; \
     cpu->load_v = 0xffffffff; \
-    cpu->load_d = 0;
+    cpu->load_d = 0; }
 
 #define SE8(v) ((int32_t)((int8_t)v))
 #define SE16(v) ((int32_t)((int16_t)v))
@@ -295,6 +295,15 @@
     cpu->next_pc += 4;
 
     if (psx_cpu_check_irq(cpu)) {
+        // GTE instructions "win" over interrupts
+        if ((cpu->opcode & 0xfe000000) == 0x4a000000)
+            g_psx_cpu_primary_table[OP](cpu);
+
+        cpu->r[0] = 0;
+
+        cpu->last_cycles = 2;
+        cpu->total_cycles += cpu->last_cycles;
+
         psx_cpu_exception(cpu, CAUSE_INT);
 
         return;
@@ -326,9 +335,6 @@
         cpu->cop0_r[COP0_CAUSE] |= 0x80000000;
     }
 
-    if ((cause == CAUSE_INT) && ((cpu->opcode & 0xfe000000) == 0x4a000000))
-        cpu->cop0_r[COP0_EPC] += 4;
-
     // Do exception stack push
     uint32_t mode = cpu->cop0_r[COP0_SR] & 0x3f;
 
@@ -603,7 +609,8 @@
 
     uint32_t s = cpu->r[S];
 
-    DO_PENDING_LOAD;
+    if (cpu->load_d != T)
+        DO_PENDING_LOAD;
 
     cpu->load_d = T;
     cpu->load_v = SE8(psx_bus_read8(cpu->bus, s + IMM16S));
@@ -614,7 +621,8 @@
 
     uint32_t s = cpu->r[S];
 
-    DO_PENDING_LOAD;
+    if (cpu->load_d != T)
+        DO_PENDING_LOAD;
 
     uint32_t addr = s + IMM16S;
 
@@ -636,11 +644,12 @@
     uint32_t addr = s + IMM16S;
     uint32_t load = psx_bus_read32(cpu->bus, addr & 0xfffffffc);
 
-    if (rt == cpu->load_d)
+    if (rt == cpu->load_d) {
         t = cpu->load_v;
+    } else {
+        DO_PENDING_LOAD;
+    }
 
-    DO_PENDING_LOAD;
-
     int shift = (int)((addr & 0x3) << 3);
     uint32_t mask = (uint32_t)0x00FFFFFF >> shift;
     uint32_t value = (t & mask) | (load << (24 - shift)); 
@@ -648,8 +657,8 @@
     cpu->load_d = rt;
     cpu->load_v = value;
 
-    // printf("lwl rt=%u s=%08x t=%08x addr=%08x load=%08x (%08x) tp=%08x shift=%u mask=%08x value=%08x\n",
-    //     rt, s, t, addr, load, addr & 0xfffffffc, tp, shift, mask, value
+    // printf("lwl rt=%u s=%08x t=%08x addr=%08x load=%08x (%08x) shift=%u mask=%08x value=%08x\n",
+    //     rt, s, t, addr, load, addr & 0xfffffffc, shift, mask, value
     // );
 }
 
@@ -659,7 +668,8 @@
     uint32_t s = cpu->r[S];
     uint32_t addr = s + IMM16S;
 
-    DO_PENDING_LOAD;
+    if (cpu->load_d != T)
+        DO_PENDING_LOAD;
 
     if (addr & 0x3) {
         psx_cpu_exception(cpu, CAUSE_ADEL);
@@ -674,7 +684,8 @@
 
     uint32_t s = cpu->r[S];
 
-    DO_PENDING_LOAD;
+    if (cpu->load_d != T)
+        DO_PENDING_LOAD;
 
     cpu->load_d = T;
     cpu->load_v = psx_bus_read8(cpu->bus, s + IMM16S);
@@ -686,7 +697,8 @@
     uint32_t s = cpu->r[S];
     uint32_t addr = s + IMM16S;
 
-    DO_PENDING_LOAD;
+    if (cpu->load_d != T)
+        DO_PENDING_LOAD;
 
     if (addr & 0x1) {
         psx_cpu_exception(cpu, CAUSE_ADEL);
@@ -706,11 +718,12 @@
     uint32_t addr = s + IMM16S;
     uint32_t load = psx_bus_read32(cpu->bus, addr & 0xfffffffc);
 
-    if (rt == cpu->load_d)
+    if (rt == cpu->load_d) {
         t = cpu->load_v;
+    } else {
+        DO_PENDING_LOAD;
+    }
 
-    DO_PENDING_LOAD;
-
     int shift = (int)((addr & 0x3) << 3);
     uint32_t mask = 0xFFFFFF00 << (24 - shift);
     uint32_t value = (t & mask) | (load >> shift); 
@@ -718,8 +731,8 @@
     cpu->load_d = rt;
     cpu->load_v = value;
 
-    // printf("lwl rt=%u s=%08x t=%08x addr=%08x load=%08x (%08x) tp=%08x shift=%u mask=%08x value=%08x\n",
-    //     rt, s, t, addr, load, addr & 0xfffffffc, tp, shift, mask, value
+    // printf("lwr rt=%u s=%08x t=%08x addr=%08x load=%08x (%08x) shift=%u mask=%08x value=%08x\n",
+    //     rt, s, t, addr, load, addr & 0xfffffffc, shift, mask, value
     // );
 }
 
@@ -1486,12 +1499,14 @@
     return (int32_t)(((value << 20) >> 20) >> cpu->gte_sf);
 }
 
-void gte_check_mac(psx_cpu_t* cpu, int i, int64_t value) {
+int64_t gte_check_mac(psx_cpu_t* cpu, int i, int64_t value) {
     if (value < -0x80000000000ll) {
         R_FLAG |= 0x8000000 >> (i - 1);
     } else if (value > 0x7ffffffffffll) {
         R_FLAG |= 0x40000000 >> (i - 1);
     }
+
+    return (int64_t)((uint64_t)value << 20) >> 20; // sign-extend from 44 bits; left shift done unsigned because shifting a negative signed value is UB
 }
 
 int32_t gte_clamp_ir0(psx_cpu_t* cpu, int32_t value) {
@@ -1550,7 +1565,7 @@
     return (uint8_t)value;
 }
 
-int32_t gte_clamp_ir(psx_cpu_t* cpu, int i, int value, int lm) {
+int32_t gte_clamp_ir(psx_cpu_t* cpu, int i, int64_t value, int lm) {
     if (lm && (value < 0)) {
         R_FLAG |= (uint32_t)(0x1000000 >> (i - 1));
 
@@ -1714,13 +1729,12 @@
 #define R_LB3 cpu->cop2_cr.lr.m33
 
 #define GTE_RTP_DQ(i) { \
-    R_FLAG = 0; \
     int64_t vx = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[0]); \
     int64_t vy = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[1]); \
     int64_t vz = (int64_t)cpu->cop2_dr.v[i].z; \
-    R_MAC1 = gte_clamp_mac(cpu, 1, (((int64_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * vx) + (I64((int16_t)R_RT12) * vy) + (I64((int16_t)R_RT13) * vz)); \
-    R_MAC2 = gte_clamp_mac(cpu, 2, (((int64_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * vx) + (I64((int16_t)R_RT22) * vy) + (I64((int16_t)R_RT23) * vz)); \
-    R_MAC3 = gte_clamp_mac(cpu, 3, (((int64_t)R_TRZ) << 12) + (I64((int16_t)R_RT31) * vx) + (I64((int16_t)R_RT32) * vy) + (I64((int16_t)R_RT33) * vz)); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (((int64_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * vx)) + (I64((int16_t)R_RT12) * vy)) + (I64((int16_t)R_RT13) * vz)); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (((int64_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * vx)) + (I64((int16_t)R_RT22) * vy)) + (I64((int16_t)R_RT23) * vz)); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (((int64_t)R_TRZ) << 12) + (I64((int16_t)R_RT31) * vx)) + (I64((int16_t)R_RT32) * vy)) + (I64((int16_t)R_RT33) * vz)); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
     R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); \
@@ -1737,13 +1751,12 @@
     R_IR0 = gte_clamp_ir0(cpu, cpu->s_mac0 >> 12); }
 
 #define GTE_RTP(i) { \
-    R_FLAG = 0; \
     int64_t vx = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[0]); \
     int64_t vy = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[1]); \
     int64_t vz = (int64_t)cpu->cop2_dr.v[i].z; \
-    R_MAC1 = gte_clamp_mac(cpu, 1, (((int64_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * vx) + (I64((int16_t)R_RT12) * vy) + (I64((int16_t)R_RT13) * vz)); \
-    R_MAC2 = gte_clamp_mac(cpu, 2, (((int64_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * vx) + (I64((int16_t)R_RT22) * vy) + (I64((int16_t)R_RT23) * vz)); \
-    R_MAC3 = gte_clamp_mac(cpu, 3, (((int64_t)R_TRZ) << 12) + (I64((int16_t)R_RT31) * vx) + (I64((int16_t)R_RT32) * vy) + (I64((int16_t)R_RT33) * vz)); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (((int64_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * vx)) + (I64((int16_t)R_RT12) * vy)) + (I64((int16_t)R_RT13) * vz)); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (((int64_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * vx)) + (I64((int16_t)R_RT22) * vy)) + (I64((int16_t)R_RT23) * vz)); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (((int64_t)R_TRZ) << 12) + (I64((int16_t)R_RT31) * vx)) + (I64((int16_t)R_RT32) * vy)) + (I64((int16_t)R_RT33) * vz)); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
     R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); \
@@ -1758,7 +1771,6 @@
     R_SY2 = gte_clamp_sxy(cpu, 2, (gte_clamp_mac0(cpu, (int64_t)((int32_t)R_OFY) + ((int64_t)R_IR2 * div)) >> 16)); }
 
 #define DPCT1 { \
-    R_FLAG = 0; \
     int64_t mac1 = gte_clamp_mac(cpu, 1, (((int64_t)R_RFC) << 12) - (((int64_t)cpu->cop2_dr.rgb[0].c[0]) << 16)); \
     int64_t mac2 = gte_clamp_mac(cpu, 2, (((int64_t)R_GFC) << 12) - (((int64_t)cpu->cop2_dr.rgb[0].c[1]) << 16)); \
     int64_t mac3 = gte_clamp_mac(cpu, 3, (((int64_t)R_BFC) << 12) - (((int64_t)cpu->cop2_dr.rgb[0].c[2]) << 16)); \
@@ -1779,28 +1791,24 @@
     R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4); }
 
 #define NCCS(i) { \
-    R_FLAG = 0; \
     int64_t vx = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[0]); \
     int64_t vy = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[1]); \
     int64_t vz = (int64_t)cpu->cop2_dr.v[i].z; \
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, (int64_t)R_L11 * vx + R_L12 * vy + R_L13 * vz)); \
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, (int64_t)R_L21 * vx + R_L22 * vy + R_L23 * vz)); \
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, (int64_t)R_L31 * vx + R_L32 * vy + R_L33 * vz)); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_L11) * vx) + (I64(R_L12) * vy) + (I64(R_L13) * vz)); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_L21) * vx) + (I64(R_L22) * vy) + (I64(R_L23) * vz)); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_L31) * vx) + (I64(R_L32) * vy) + (I64(R_L33) * vz)); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); \
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, gte_clamp_mac(cpu, 1, gte_clamp_mac(cpu, 1, (long)R_RBK * 0x1000 + R_LR1 * R_IR1) + (long)R_LG1 * R_IR2) + (long)R_LB1 * R_IR3)); \
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, gte_clamp_mac(cpu, 2, gte_clamp_mac(cpu, 2, (long)R_GBK * 0x1000 + R_LR2 * R_IR1) + (long)R_LG2 * R_IR2) + (long)R_LB2 * R_IR3)); \
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, gte_clamp_mac(cpu, 3, gte_clamp_mac(cpu, 3, (long)R_BBK * 0x1000 + R_LR3 * R_IR1) + (long)R_LG3 * R_IR2) + (long)R_LB3 * R_IR3)); \
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1))) + (I64(R_LR2) * I64(R_IR2))) + (I64(R_LR3) * I64(R_IR3))); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1))) + (I64(R_LG2) * I64(R_IR2))) + (I64(R_LG3) * I64(R_IR3))); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1))) + (I64(R_LB2) * I64(R_IR2))) + (I64(R_LB3) * I64(R_IR3))); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); \
-    R_MAC1 = (int)gte_clamp_mac(cpu, 1, (R_RGB0 * R_IR1) << 4); \
-    R_MAC2 = (int)gte_clamp_mac(cpu, 2, (R_RGB1 * R_IR2) << 4); \
-    R_MAC3 = (int)gte_clamp_mac(cpu, 3, (R_RGB2 * R_IR3) << 4); \
-    R_MAC1 = (int)gte_clamp_mac(cpu, 1, R_MAC1); \
-    R_MAC2 = (int)gte_clamp_mac(cpu, 2, R_MAC2); \
-    R_MAC3 = (int)gte_clamp_mac(cpu, 3, R_MAC3); \
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_RC) * I64(R_IR1)) << 4); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_GC) * I64(R_IR2)) << 4); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_BC) * I64(R_IR3)) << 4); \
     R_RGB0 = R_RGB1; \
     R_RGB1 = R_RGB2; \
     R_CD2 = R_CODE; \
@@ -1809,25 +1817,24 @@
     R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); }
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); }
 
 #define NCS(i) { \
-    R_FLAG = 0; \
     int64_t vx = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[0]); \
     int64_t vy = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[1]); \
     int64_t vz = (int64_t)cpu->cop2_dr.v[i].z; \
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, (int64_t)R_L11 * vx + R_L12 * vy + R_L13 * vz)); \
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, (int64_t)R_L21 * vx + R_L22 * vy + R_L23 * vz)); \
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, (int64_t)R_L31 * vx + R_L32 * vy + R_L33 * vz)); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_L11) * vx) + (I64(R_L12) * vy) + (I64(R_L13) * vz)); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_L21) * vx) + (I64(R_L22) * vy) + (I64(R_L23) * vz)); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_L31) * vx) + (I64(R_L32) * vy) + (I64(R_L33) * vz)); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); \
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, gte_clamp_mac(cpu, 1, gte_clamp_mac(cpu, 1, (long)R_RBK * 0x1000 + R_LR1 * R_IR1) + (long)R_LG1 * R_IR2) + (long)R_LB1 * R_IR3)); \
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, gte_clamp_mac(cpu, 2, gte_clamp_mac(cpu, 2, (long)R_GBK * 0x1000 + R_LR2 * R_IR1) + (long)R_LG2 * R_IR2) + (long)R_LB2 * R_IR3)); \
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, gte_clamp_mac(cpu, 3, gte_clamp_mac(cpu, 3, (long)R_BBK * 0x1000 + R_LR3 * R_IR1) + (long)R_LG3 * R_IR2) + (long)R_LB3 * R_IR3)); \
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); \
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1))) + (I64(R_LR2) * I64(R_IR2))) + (I64(R_LR3) * I64(R_IR3))); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1))) + (I64(R_LG2) * I64(R_IR2))) + (I64(R_LG3) * I64(R_IR3))); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1))) + (I64(R_LB2) * I64(R_IR2))) + (I64(R_LB3) * I64(R_IR3))); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); \
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); \
     R_RGB0 = R_RGB1; \
     R_RGB1 = R_RGB2; \
     R_CD2 = R_CODE; \
@@ -1836,10 +1843,9 @@
     R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm); }
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); }
 
 #define NCDS(i) { \
-    R_FLAG = 0; \
     int64_t vx = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[0]); \
     int64_t vy = (int64_t)((int16_t)cpu->cop2_dr.v[i].p[1]); \
     int64_t vz = (int64_t)cpu->cop2_dr.v[i].z; \
@@ -1849,21 +1855,9 @@
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
     R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); \
-    gte_check_mac(cpu, 1, ((I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1)))); \
-    gte_check_mac(cpu, 2, ((I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1)))); \
-    gte_check_mac(cpu, 3, ((I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1)))); \
-    gte_check_mac(cpu, 1, I64(R_LR2) * I64(R_IR2)); \
-    gte_check_mac(cpu, 2, I64(R_LG2) * I64(R_IR2)); \
-    gte_check_mac(cpu, 3, I64(R_LB2) * I64(R_IR2)); \
-    gte_check_mac(cpu, 1, I64(R_LR3) * I64(R_IR3)); \
-    gte_check_mac(cpu, 2, I64(R_LG3) * I64(R_IR3)); \
-    gte_check_mac(cpu, 3, I64(R_LB3) * I64(R_IR3)); \
-    R_MAC1 = gte_clamp_mac(cpu, 1, ((I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1))) + (I64(R_LR2) * I64(R_IR2)) + (I64(R_LR3) * I64(R_IR3))); \
-    R_MAC2 = gte_clamp_mac(cpu, 2, ((I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1))) + (I64(R_LG2) * I64(R_IR2)) + (I64(R_LG3) * I64(R_IR3))); \
-    R_MAC3 = gte_clamp_mac(cpu, 3, ((I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1))) + (I64(R_LB2) * I64(R_IR2)) + (I64(R_LB3) * I64(R_IR3))); \
-    /* R_MAC1 = gte_clamp_mac(cpu, 1, () + () + ()); */ \
-    /* R_MAC2 = gte_clamp_mac(cpu, 2, () + () + ()); */ \
-    /* R_MAC3 = gte_clamp_mac(cpu, 3, () + () + ()); */ \
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1))) + (I64(R_LR2) * I64(R_IR2))) + (I64(R_LR3) * I64(R_IR3))); \
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1))) + (I64(R_LG2) * I64(R_IR2))) + (I64(R_LG3) * I64(R_IR3))); \
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1))) + (I64(R_LB2) * I64(R_IR2))) + (I64(R_LB3) * I64(R_IR3))); \
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); \
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm); \
     R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm); \
@@ -1883,25 +1877,44 @@
     R_GC2 = gte_clamp_rgb(cpu, 2, R_MAC2 >> 4); \
     R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4); }
 
-void gte_interpolate_color(psx_cpu_t* cpu, uint64_t mac1, uint64_t mac2, uint64_t mac3) {
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, ((long)R_RFC << 12) - mac1));
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, ((long)R_GFC << 12) - mac2));
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, ((long)R_BFC << 12) - mac3));
+void gte_interpolate_color(psx_cpu_t* cpu, int64_t mac1, int64_t mac2, int64_t mac3) {
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_RFC) << 12) - mac1);
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_GFC) << 12) - mac2);
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_BFC) << 12) - mac3);
 
-    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
-    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm);
+    // printf("input=(%08x, %08x, %08x) (%d, %d, %d) mac=(%08x, %08x, %08x), (%d, %d, %d) fc=(%08x, %08x, %08x)\n",
+    //     mac1,
+    //     mac2,
+    //     mac3,
+    //     mac1,
+    //     mac2,
+    //     mac3,
+    //     R_MAC1,
+    //     R_MAC2,
+    //     R_MAC3,
+    //     R_MAC1,
+    //     R_MAC2,
+    //     R_MAC3,
+    //     I64(R_RFC) << 12,
+    //     I64(R_GFC) << 12,
+    //     I64(R_BFC) << 12
+    // );
 
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, ((long)R_IR1 * R_IR0) + mac1));
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, ((long)R_IR2 * R_IR0) + mac2));
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, ((long)R_IR3 * R_IR0) + mac3));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, 0);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, 0);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, 0);
 
-    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
-    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
-    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm);
+    R_MAC1 = gte_clamp_mac(cpu, 1, (R_IR1 * R_IR0) + mac1);
+    R_MAC2 = gte_clamp_mac(cpu, 2, (R_IR2 * R_IR0) + mac2);
+    R_MAC3 = gte_clamp_mac(cpu, 3, (R_IR3 * R_IR0) + mac3);
+
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm); // last argument is the lm (saturation) flag, not the sf shift flag
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
 }
 
 void psx_gte_i_rtps(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     GTE_RTP_DQ(0);
 }
 
@@ -1938,9 +1951,9 @@
     int64_t ir2 = gte_clamp_ir(cpu, 2, mac2, 0);
     int64_t ir3 = gte_clamp_ir(cpu, 3, mac3, 0);
 
-    R_MAC1 = gte_clamp_mac(cpu, 1, (((int64_t)R_RC) << 16) + (R_IR0 * ir1));
-    R_MAC2 = gte_clamp_mac(cpu, 2, (((int64_t)R_GC) << 16) + (R_IR0 * ir2));
-    R_MAC3 = gte_clamp_mac(cpu, 3, (((int64_t)R_BC) << 16) + (R_IR0 * ir3));
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_RC) << 16) + (R_IR0 * ir1));
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_GC) << 16) + (R_IR0 * ir2));
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_BC) << 16) + (R_IR0 * ir3));
 
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
     R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
@@ -2012,7 +2025,7 @@
         case 1: mx = cpu->cop2_cr.l; break;
         case 2: mx = cpu->cop2_cr.lr; break;
         case 3: {
-            R_MX11 = -R_RC << 4;
+            R_MX11 = -(R_RC << 4);
             R_MX12 = R_RC << 4;
             R_MX13 = R_IR0;
             R_MX21 = R_RT13;
@@ -2049,21 +2062,21 @@
 
     // Bugged case (CV=FC)
     if (cpu->gte_cv == 2) {
-        R_MAC1 = gte_clamp_mac(cpu, 1, (int64_t)(I64(R_MX12) * I64(R_VY)) + (I64(R_MX13) * I64(R_VZ)));
-        R_MAC2 = gte_clamp_mac(cpu, 2, (int64_t)(I64(R_MX22) * I64(R_VY)) + (I64(R_MX23) * I64(R_VZ)));
-        R_MAC3 = gte_clamp_mac(cpu, 3, (int64_t)(I64(R_MX32) * I64(R_VY)) + (I64(R_MX33) * I64(R_VZ)));
+        R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, I64(R_MX12) * I64(R_VY)) + (I64(R_MX13) * I64(R_VZ)));
+        R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, I64(R_MX22) * I64(R_VY)) + (I64(R_MX23) * I64(R_VZ)));
+        R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, I64(R_MX32) * I64(R_VY)) + (I64(R_MX33) * I64(R_VZ)));
 
-        int64_t mac1 = gte_clamp_mac(cpu, 1, (((int64_t)R_CV1) << 12) + (I64(R_MX11) * I64(R_VX))); 
-        int64_t mac2 = gte_clamp_mac(cpu, 2, (((int64_t)R_CV2) << 12) + (I64(R_MX21) * I64(R_VX))); 
-        int64_t mac3 = gte_clamp_mac(cpu, 3, (((int64_t)R_CV3) << 12) + (I64(R_MX31) * I64(R_VX))); 
+        int64_t mac1 = gte_clamp_mac(cpu, 1, (I64(R_CV1) << 12) + (I64(R_MX11) * I64(R_VX))); 
+        int64_t mac2 = gte_clamp_mac(cpu, 2, (I64(R_CV2) << 12) + (I64(R_MX21) * I64(R_VX))); 
+        int64_t mac3 = gte_clamp_mac(cpu, 3, (I64(R_CV3) << 12) + (I64(R_MX31) * I64(R_VX))); 
 
         gte_clamp_ir(cpu, 1, mac1, 0);
         gte_clamp_ir(cpu, 2, mac2, 0);
         gte_clamp_ir(cpu, 3, mac3, 0);
     } else {
-        R_MAC1 = gte_clamp_mac(cpu, 1, (((int64_t)R_CV1) << 12) + (I64(R_MX11) * I64(R_VX)) + (I64(R_MX12) * I64(R_VY)) + (I64(R_MX13) * I64(R_VZ)));
-        R_MAC2 = gte_clamp_mac(cpu, 2, (((int64_t)R_CV2) << 12) + (I64(R_MX21) * I64(R_VX)) + (I64(R_MX22) * I64(R_VY)) + (I64(R_MX23) * I64(R_VZ)));
-        R_MAC3 = gte_clamp_mac(cpu, 3, (((int64_t)R_CV3) << 12) + (I64(R_MX31) * I64(R_VX)) + (I64(R_MX32) * I64(R_VY)) + (I64(R_MX33) * I64(R_VZ)));
+        R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (I64(R_CV1) << 12) + (I64(R_MX11) * I64(R_VX))) + (I64(R_MX12) * I64(R_VY))) + (I64(R_MX13) * I64(R_VZ)));
+        R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (I64(R_CV2) << 12) + (I64(R_MX21) * I64(R_VX))) + (I64(R_MX22) * I64(R_VY))) + (I64(R_MX23) * I64(R_VZ)));
+        R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (I64(R_CV3) << 12) + (I64(R_MX31) * I64(R_VX))) + (I64(R_MX32) * I64(R_VY))) + (I64(R_MX33) * I64(R_VZ)));
     }
 
     R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
@@ -2090,14 +2103,38 @@
 
 // To-do: Fix flags
 void psx_gte_i_ncds(psx_cpu_t* cpu) {
+    R_FLAG = 0;
+
     NCDS(0);
 }
 
 void psx_gte_i_cdp(psx_cpu_t* cpu) {
-    printf("cdp: Unimplemented GTE instruction\n");
+    R_FLAG = 0;
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1))) + (I64(R_LR2) * I64(R_IR2))) + (I64(R_LR3) * I64(R_IR3)));
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1))) + (I64(R_LG2) * I64(R_IR2))) + (I64(R_LG3) * I64(R_IR3)));
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1))) + (I64(R_LB2) * I64(R_IR2))) + (I64(R_LB3) * I64(R_IR3)));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
+    int64_t ir1 = gte_clamp_ir(cpu, 1, gte_clamp_mac(cpu, 1, ((I64(R_RFC) << 12) - ((I64(R_RC << 4)) * I64(R_IR1)))), 0);
+    int64_t ir2 = gte_clamp_ir(cpu, 2, gte_clamp_mac(cpu, 2, ((I64(R_GFC) << 12) - ((I64(R_GC << 4)) * I64(R_IR2)))), 0);
+    int64_t ir3 = gte_clamp_ir(cpu, 3, gte_clamp_mac(cpu, 3, ((I64(R_BFC) << 12) - ((I64(R_BC << 4)) * I64(R_IR3)))), 0);
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_RC << 4) * I64(R_IR1)) + (I64(R_IR0) * ir1));
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_GC << 4) * I64(R_IR2)) + (I64(R_IR0) * ir2));
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_BC << 4) * I64(R_IR3)) + (I64(R_IR0) * ir3));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
+    R_RGB0 = R_RGB1;
+    R_RGB1 = R_RGB2;
+    R_RC2 = gte_clamp_rgb(cpu, 1, R_MAC1 >> 4);
+    R_GC2 = gte_clamp_rgb(cpu, 2, R_MAC2 >> 4);
+    R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4);
+    R_CD2 = R_CODE;
 }
 
 void psx_gte_i_ncdt(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     NCDS(0);
     NCDS(1);
     NCDS(2);
@@ -2104,18 +2141,39 @@
 }
 
 void psx_gte_i_nccs(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     NCCS(0);
 }
 
 void psx_gte_i_cc(psx_cpu_t* cpu) {
-    NCS(0); // Hack
+    R_FLAG = 0;
+    R_MAC1 = gte_clamp_mac(cpu, 1, gte_check_mac(cpu, 1, gte_check_mac(cpu, 1, (I64(R_RBK) << 12) + (I64(R_LR1) * I64(R_IR1))) + (I64(R_LR2) * I64(R_IR2))) + (I64(R_LR3) * I64(R_IR3)));
+    R_MAC2 = gte_clamp_mac(cpu, 2, gte_check_mac(cpu, 2, gte_check_mac(cpu, 2, (I64(R_GBK) << 12) + (I64(R_LG1) * I64(R_IR1))) + (I64(R_LG2) * I64(R_IR2))) + (I64(R_LG3) * I64(R_IR3)));
+    R_MAC3 = gte_clamp_mac(cpu, 3, gte_check_mac(cpu, 3, gte_check_mac(cpu, 3, (I64(R_BBK) << 12) + (I64(R_LB1) * I64(R_IR1))) + (I64(R_LB2) * I64(R_IR2))) + (I64(R_LB3) * I64(R_IR3)));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_RC) * I64(R_IR1)) << 4);
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_GC) * I64(R_IR2)) << 4);
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_BC) * I64(R_IR3)) << 4);
+    R_RGB0 = R_RGB1;
+    R_RGB1 = R_RGB2;
+    R_CD2 = R_CODE;
+    R_RC2 = gte_clamp_rgb(cpu, 1, R_MAC1 >> 4);
+    R_GC2 = gte_clamp_rgb(cpu, 2, R_MAC2 >> 4);
+    R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4);
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
 }
 
 void psx_gte_i_ncs(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     NCS(0);
 }
 
 void psx_gte_i_nct(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     NCS(0);
     NCS(1);
     NCS(2);
@@ -2134,12 +2192,20 @@
 }
 
 void psx_gte_i_dcpl(psx_cpu_t* cpu) {
-    R_MAC1 = (int)(gte_clamp_mac(cpu, 1, R_RC * R_IR1) << 4);
-    R_MAC2 = (int)(gte_clamp_mac(cpu, 2, R_GC * R_IR2) << 4);
-    R_MAC3 = (int)(gte_clamp_mac(cpu, 3, R_BC * R_IR3) << 4);
+    R_FLAG = 0;
 
-    gte_interpolate_color(cpu, R_MAC1, R_MAC2, R_MAC3);
-
+    R_MAC1 = gte_clamp_mac(cpu, 1, I64(R_RC) * I64(R_IR1)) << 4;
+    R_MAC2 = gte_clamp_mac(cpu, 2, I64(R_GC) * I64(R_IR2)) << 4;
+    R_MAC3 = gte_clamp_mac(cpu, 3, I64(R_BC) * I64(R_IR3)) << 4;
+    int64_t ir1 = gte_clamp_ir(cpu, 1, gte_clamp_mac(cpu, 1, ((I64(R_RFC) << 12) - ((I64(R_RC << 4)) * I64(R_IR1)))), 0);
+    int64_t ir2 = gte_clamp_ir(cpu, 2, gte_clamp_mac(cpu, 2, ((I64(R_GFC) << 12) - ((I64(R_GC << 4)) * I64(R_IR2)))), 0);
+    int64_t ir3 = gte_clamp_ir(cpu, 3, gte_clamp_mac(cpu, 3, ((I64(R_BFC) << 12) - ((I64(R_BC << 4)) * I64(R_IR3)))), 0);
+    R_MAC1 = gte_clamp_mac(cpu, 1, ((I64(R_RC << 4)) * I64(R_IR1)) + (I64(R_IR0) * ir1));
+    R_MAC2 = gte_clamp_mac(cpu, 2, ((I64(R_GC << 4)) * I64(R_IR2)) + (I64(R_IR0) * ir2));
+    R_MAC3 = gte_clamp_mac(cpu, 3, ((I64(R_BC << 4)) * I64(R_IR3)) + (I64(R_IR0) * ir3));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
     R_RGB0 = R_RGB1;
     R_RGB1 = R_RGB2;
     R_CD2 = R_CODE;
@@ -2149,6 +2215,7 @@
 }
 
 void psx_gte_i_dpct(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     DPCT1;
     DPCT1;
     DPCT1;
@@ -2173,6 +2240,7 @@
 }
 
 void psx_gte_i_rtpt(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     GTE_RTP(0);
     GTE_RTP(1);
     GTE_RTP_DQ(2);
@@ -2213,6 +2281,7 @@
 }
 
 void psx_gte_i_ncct(psx_cpu_t* cpu) {
+    R_FLAG = 0;
     NCCS(0);
     NCCS(1);
     NCCS(2);
--- a/psx/dev/cdrom/audio.c
+++ b/psx/dev/cdrom/audio.c
@@ -2,6 +2,7 @@
 #include <string.h>
 
 #include "cdrom.h"
+#include "../spu.h"
 
 #define ITOB(b) itob_table[b]
 
@@ -293,16 +294,16 @@
     }
 
     // We hit pregap (end of previous track)
-    if ((ts == TS_PREGAP) && (cdrom->mode & MODE_AUTOPAUSE)) {
-        cdrom->cdda_remaining_samples = 0;
-        cdrom->cdda_sample_index = 0;
+    // if ((ts == TS_PREGAP) && (cdrom->mode & MODE_AUTOPAUSE)) {
+    //     cdrom->cdda_remaining_samples = 0;
+    //     cdrom->cdda_sample_index = 0;
 
-        memset(buf, 0, size);
+    //     memset(buf, 0, size);
 
-        cdrom->state = CD_STATE_IDLE;
+    //     cdrom->state = CD_STATE_IDLE;
 
-        return;
-    }
+    //     return;
+    // }
 
     cdrom->cdda_remaining_samples = CD_SECTOR_SIZE >> 1;
     cdrom->cdda_sample_index = 0;
@@ -345,9 +346,7 @@
     psx_ic_irq(cdrom->ic, IC_CDROM);
 }
 
-void psx_cdrom_get_audio_samples(psx_cdrom_t* cdrom, void* buf, size_t size, psx_spu_t* spu) {
-    memset(buf, 0, size);
-
+void psx_cdrom_get_audio_samples(psx_cdrom_t* cdrom, void* buf, size_t size) {
     if (!cdrom->disc)
         return;
 
@@ -368,7 +367,7 @@
     float rr_vol = (((float)cdrom->vol[2]) / 255.0f);
     float rl_vol = (((float)cdrom->vol[3]) / 255.0f);
 
-    for (int i = 0; i < size >> 1;) {
+    for (int i = 0; i < (size >> 1);) {
         if (!cdrom->cdda_remaining_samples) {
             cdrom_reload_cdda_buffer(cdrom, buf, size);
 
@@ -391,17 +390,6 @@
             continue;
         }
 
-        // if (cdrom->cdda_sample_index >= (2352 >> 1)) {
-        //     printf("ERROR %08x %u rem=%08x %d\n",
-        //         cdrom->cdda_sample_index,
-        //         cdrom->cdda_sample_index,
-        //         cdrom->cdda_remaining_samples,
-        //         cdrom->cdda_remaining_samples
-        //     );
-
-        //     exit(1);
-        // }
-
         int16_t left = cdrom->cdda_buf[cdrom->cdda_sample_index++];
         int16_t right = cdrom->cdda_buf[cdrom->cdda_sample_index++];
 
@@ -410,11 +398,5 @@
         ptr[i++] = right * rr_vol + left  * lr_vol;
 
         cdrom->cdda_remaining_samples -= 2;
-
-        // if (cdrom->cdda_remaining_samples == 0) {
-        //     printf("ERROR rem=%08x %d", cdrom->cdda_remaining_samples, cdrom->cdda_remaining_samples);
-
-        //     exit(1);
-        // }
     }
 }
\ No newline at end of file
--- a/psx/dev/cdrom/cdrom.c
+++ b/psx/dev/cdrom/cdrom.c
@@ -153,6 +153,14 @@
     cdrom->fake_getlocl_data = 1;
 }
 
+void psx_cdrom_set_version(psx_cdrom_t* cdrom, int version) {
+    cdrom->version = version; // Stored verbatim; no range check is done here
+}
+
+void psx_cdrom_set_region(psx_cdrom_t* cdrom, int region) {
+    cdrom->region = region; // Stored verbatim; no range check is done here
+}
+
 void psx_cdrom_open(psx_cdrom_t* cdrom, const char* path) {
     if (!path)
         return;
@@ -426,6 +434,8 @@
 
     int ts = psx_disc_query(cdrom->disc, cdrom->lba);
 
+    // printf("ts=%u ", ts);
+
     if (ts == TS_FAR) {
         cdrom_error(cdrom,
             CD_STAT_SPINDLE | CD_STAT_SEEKERROR,
@@ -456,14 +466,19 @@
     cdrom->data->write_index = size_bit ? 0x924 : 0x800;
     cdrom->data->write_index += cdrom->data->read_index;
 
-    // printf("size=%x off=%u lba=%d\n",
+    cdrom->pending_lba = cdrom->lba + 1;
+    cdrom->delay = cdrom_get_read_delay(cdrom);
+
+    // printf("size=%x off=%u lba=%d: %02x:%02x:%02x delay=%u\n",
     //     cdrom->data->write_index,
     //     cdrom->data->read_index,
-    //     cdrom->lba
+    //     cdrom->lba,
+    //     cdrom->data->buf[0xc],
+    //     cdrom->data->buf[0xd],
+    //     cdrom->data->buf[0xe],
+    //     cdrom->data->buf[0xf],
+    //     cdrom->delay
     // );
-
-    cdrom->pending_lba = cdrom->lba + 1;
-    cdrom->delay = cdrom_get_read_delay(cdrom);
 }
 
 void psx_cdrom_update(psx_cdrom_t* cdrom, int cycles) {
@@ -602,13 +617,16 @@
     cdrom->state = CD_STATE_TX_RESP1;
 
     cdrom->pending_command = data;
-    cdrom->delay = CD_DELAY_FR;
 
-    // if (cdrom->pending_command == CDL_INIT)
-    //     cdrom->delay = CD_DELAY_INIT_FR;
+    switch (cdrom->pending_command) {
+        case CDL_INIT:
+            cdrom->delay = CD_DELAY_INIT_FR;
+        break;
 
-    if (cdrom->pending_command == CDL_GETLOCP)
-        cdrom->delay *= 4;
+        default:
+            cdrom->delay = CD_DELAY_FR;
+        break;
+    }
 
     if (cdrom->state == CD_STATE_READ)
         cdrom->busy = 1;
--- a/psx/dev/cdrom/cdrom.h
+++ b/psx/dev/cdrom/cdrom.h
@@ -6,7 +6,6 @@
 #include "queue.h"
 #include "disc.h"
 #include "../ic.h"
-#include "../spu.h"
 
 #define PSX_CDROM_BEGIN 0x1f801800
 #define PSX_CDROM_END   0x1f801803
@@ -35,15 +34,9 @@
 #define CD_DELAY_STOP_DS 25845878
 #define CD_DELAY_READ_SS (33868800 / 75)
 #define CD_DELAY_READ_DS (33868800 / (2*75))
-#define CD_DELAY_START_READ \
-    (cdrom_get_read_delay(cdrom) + \
-    cdrom_get_seek_delay(cdrom, ts))
+#define CD_DELAY_START_READ (cdrom_get_read_delay(cdrom))
+#define CD_DELAY_ONGOING_READ (cdrom_get_read_delay(cdrom) + (CD_DELAY_1MS * 4))
 
-#define CD_DELAY_ONGOING_READ \
-    (cdrom_get_read_delay(cdrom) + \
-    cdrom_get_seek_delay(cdrom, ts) + \
-    (CD_DELAY_1MS * 4))
-
 #define XA_STEREO_SAMPLES 2016 // Samples per sector
 #define XA_MONO_SAMPLES 4032 // Samples per sector
 #define XA_STEREO_RESAMPLE_SIZE 2352 // 2352 * 2
@@ -307,7 +300,7 @@
 void psx_cdrom_write16(psx_cdrom_t* cdrom, uint32_t addr, uint32_t value);
 void psx_cdrom_write8(psx_cdrom_t* cdrom, uint32_t addr, uint32_t value);
 void psx_cdrom_update(psx_cdrom_t* cdrom, int cycles);
-void psx_cdrom_get_audio_samples(psx_cdrom_t* cdrom, void* buf, size_t size, psx_spu_t* spu);
+void psx_cdrom_get_audio_samples(psx_cdrom_t* cdrom, void* buf, size_t size);
 void psx_cdrom_destroy(psx_cdrom_t* cdrom);
 
 #endif
\ No newline at end of file
--- a/psx/dev/cdrom/cue.c
+++ b/psx/dev/cdrom/cue.c
@@ -354,21 +354,12 @@
         if ((data->index[0] != -1) && (data->index[1] != -1))
             data->pregap = data->index[1];
 
-        int pregap = data->pregap;
-
-        if (data->pregap)
-            pregap -= prev_pregap;
-
-        *lba += pregap;
-
-        data->start = *lba;
+        data->start = *lba + data->pregap;
         data->end = data->start + (file->size / 0x930);
 
-        *lba += (file->size / 0x930);
+        *lba = data->end;
 
-        prev_pregap = data->pregap;
-
-        return file->size / 0x930;
+        return 0;
     }
 
     // Multiple tracks per file
@@ -571,9 +562,6 @@
 
 int cue_get_track_number(cue_t* cue, uint32_t lba) {
     cue_track_t* track = get_sector_track_in_pregap(cue, lba);
-
-    if (cue_query(cue, lba) == TS_PREGAP)
-        return track->number + 1;
 
     return track->number;
 }
--- a/psx/dev/cdrom/impl.c
+++ b/psx/dev/cdrom/impl.c
@@ -107,6 +107,16 @@
     'I', 'E', 'A'
 };
 
+// Return the drive to idle: clears the read/XA/CDDA activity flags, the busy flag and the pending command.
+void cdrom_pause(psx_cdrom_t* cdrom) {
+    cdrom->read_ongoing = 0;
+    cdrom->xa_playing = 0;
+    cdrom->cdda_playing = 0;
+    cdrom->busy = 0;
+    cdrom->pending_command = 0;
+    cdrom->state = cdrom->prev_state = CD_STATE_IDLE;
+}
+
 void cdrom_restore_state(psx_cdrom_t* cdrom) {
     cdrom->state = CD_STATE_IDLE;
 
@@ -143,6 +153,7 @@
     queue_push(cdrom->response, cdrom_get_stat(cdrom));
 
     cdrom->pending_lba = (BTOI(m) * 4500) + (BTOI(s) * 75) + BTOI(f);
+
     cdrom_restore_state(cdrom);
 }
 
@@ -220,7 +231,7 @@
 
     cdrom->state = CD_STATE_READ;
     cdrom->prev_state = CD_STATE_READ;
-    cdrom->delay = CD_DELAY_START_READ;
+    cdrom->delay = cdrom_get_read_delay(cdrom);
     cdrom->read_ongoing = 1;
 }
 
@@ -274,24 +285,18 @@
     if (cdrom->state == CD_STATE_TX_RESP1) {
         cdrom_set_int(cdrom, 3);
 
-        queue_push(cdrom->response, cdrom_get_stat(cdrom));
+        queue_push(cdrom->response, CD_STAT_READ | CD_STAT_SPINDLE);
 
         // Pausing at 1x takes 70ms, 2x takes 35ms
         // but setting delays that high breaks games
-        cdrom->delay = CD_DELAY_1MS;
+        cdrom->delay = CD_DELAY_1MS * ((cdrom->mode & MODE_SPEED) ? 35 : 70); // MODE_SPEED set = 2x, which pauses faster
         cdrom->state = CD_STATE_TX_RESP2;
     } else {
         cdrom_set_int(cdrom, 2);
 
-        queue_push(cdrom->response, cdrom_get_stat(cdrom));
+        queue_push(cdrom->response, CD_STAT_SPINDLE);
 
-        cdrom->prev_state = CD_STATE_IDLE;
-        cdrom->state = CD_STATE_IDLE;
-        cdrom->pending_command = 0;
-        cdrom->busy = 0;
-        cdrom->cdda_playing = 0;
-        cdrom->xa_playing = 0;
-        cdrom->read_ongoing = 0;
+        cdrom_pause(cdrom);
     }
 }
 
@@ -311,13 +316,7 @@
 
         queue_push(cdrom->response, cdrom_get_stat(cdrom));
 
-        cdrom->prev_state = CD_STATE_IDLE;
-        cdrom->state = CD_STATE_IDLE;
-        cdrom->pending_command = 0;
-        cdrom->busy = 0;
-        cdrom->cdda_playing = 0;
-        cdrom->xa_playing = 0;
-        cdrom->read_ongoing = 0;
+        cdrom_pause(cdrom);
     }
 }
 
@@ -368,7 +367,8 @@
     if (prev_speed != (cdrom->mode & MODE_SPEED))
         cdrom->pending_speed_switch_delay = CD_DELAY_1MS;
 
-    cdrom_restore_state(cdrom);
+    cdrom_pause(cdrom);
+    // cdrom_restore_state(cdrom);
 }
 
 void cdrom_cmd_getparam(psx_cdrom_t* cdrom) {
@@ -424,11 +424,11 @@
 }
 
 void cdrom_cmd_getlocp(psx_cdrom_t* cdrom) {
-    int track = psx_disc_get_track_number(cdrom->disc, cdrom->lba);
-    int track_lba = psx_disc_get_track_lba(cdrom->disc, track);
-
     int lba = cdrom->lba;
 
+    int track = psx_disc_get_track_number(cdrom->disc, lba);
+    int track_lba = psx_disc_get_track_lba(cdrom->disc, track);
+
     if (!cdrom->seek_precision)
         lba -= 25;
 
@@ -479,7 +479,7 @@
         cdrom_set_int(cdrom, 2);
         queue_push(cdrom->response, cdrom_get_stat(cdrom));
 
-        cdrom_restore_state(cdrom);
+        cdrom_pause(cdrom);
     }
 }
 
@@ -604,6 +604,27 @@
 
     // To-do: Handle other subfunctions (hard)
     // assert(subf == 32);
+    if (subf == 4) {
+        cdrom_set_int(cdrom, 3);
+
+        queue_push(cdrom->response, CD_STAT_SPINDLE);
+
+        cdrom_restore_state(cdrom);
+
+        return;
+    }
+
+    if (subf == 5) {
+        cdrom_set_int(cdrom, 3);
+
+        queue_push(cdrom->response, 0);
+        queue_push(cdrom->response, 0);
+
+        cdrom_restore_state(cdrom);
+
+        return;
+    }
+
     if (subf != 32) {
         cdrom_error(cdrom,
             CD_STAT_SPINDLE,
@@ -696,7 +717,7 @@
 }
 
 void cdrom_cmd_reset(psx_cdrom_t* cdrom) {
-
+    printf("reset\n");
 }
 
 void cdrom_cmd_getq(psx_cdrom_t* cdrom) {
@@ -708,9 +729,9 @@
         cdrom_set_int(cdrom, 3);
         queue_push(cdrom->response, cdrom_get_stat(cdrom));
 
-        cdrom->delay = CD_DELAY_1MS * 1000;
+        cdrom->delay = CD_DELAY_1MS * 10;
         cdrom->state = CD_STATE_TX_RESP2;
-        cdrom->busy = 1;
+        // cdrom->busy = 1;
     } else {
         cdrom_set_int(cdrom, 2);
         queue_push(cdrom->response, cdrom_get_stat(cdrom));
--- a/psx/dev/dma.c
+++ b/psx/dev/dma.c
@@ -402,7 +402,7 @@
         // exit(1);
     }
 
-    dma->spu_irq_delay = BCR_SIZE(spu) * BCR_BCNT(spu);
+    dma->spu_irq_delay = 32;
 
     if (CHCR_TDIR(spu)) {
         for (int j = 0; j < blocks; j++) {
@@ -482,7 +482,7 @@
     }
 
     if (dma->spu_irq_delay) {
-        dma->spu_irq_delay -= cyc;
+        dma->spu_irq_delay = 0;
 
         if (dma->spu_irq_delay <= 0)
             if (dma->dicr & DICR_DMA4EN)
--- a/psx/dev/gpu.c
+++ b/psx/dev/gpu.c
@@ -178,6 +178,47 @@
     }
 }
 
+// Bilinear texel fetch: blends the 2x2 texels surrounding (tx, ty).
+// Returns 0 when the top-left texel is 0 (callers skip such pixels). Any other
+// neighbour that is 0 is replaced by the top-left texel, so transparent texels
+// do not bleed black into opaque edges. Bit 15 is set if any sample has it set.
+uint16_t gpu_fetch_texel_bilinear(psx_gpu_t* gpu, float tx, float ty, uint32_t tpx, uint32_t tpy, uint16_t clutx, uint16_t cluty, int depth) {
+    float txf = floorf(tx);
+    float tyf = floorf(ty);
+    float txc = txf + 1.0f;
+    float tyc = tyf + 1.0f;
+
+    int s[4];
+
+    s[0] = gpu_fetch_texel(gpu, (int)txf, (int)tyf, tpx, tpy, clutx, cluty, depth);
+
+    if (!s[0])
+        return 0;
+
+    s[1] = gpu_fetch_texel(gpu, (int)txc, (int)tyf, tpx, tpy, clutx, cluty, depth);
+    s[2] = gpu_fetch_texel(gpu, (int)txf, (int)tyc, tpx, tpy, clutx, cluty, depth);
+    s[3] = gpu_fetch_texel(gpu, (int)txc, (int)tyc, tpx, tpy, clutx, cluty, depth);
+
+    for (int i = 1; i < 4; i++)
+        if (!s[i])
+            s[i] = s[0];
+
+    // Horizontal/vertical blend weights; each pair sums to 1
+    float wx1 = tx - txf, wx0 = txc - tx;
+    float wy1 = ty - tyf, wy0 = tyc - ty;
+    int out = 0;
+
+    // Blend each 5-bit channel (shift 0, 5, 10) separately
+    for (int sh = 0; sh <= 10; sh += 5) {
+        float top = ((s[0] >> sh) & 0x1f) * wx0 + ((s[1] >> sh) & 0x1f) * wx1;
+        float bot = ((s[2] >> sh) & 0x1f) * wx0 + ((s[3] >> sh) & 0x1f) * wx1;
+
+        out |= (int)(top * wy0 + bot * wy1) << sh;
+    }
+
+    return out | ((s[0] | s[1] | s[2] | s[3]) & 0x8000);
+}
+
 #define TL(z, a, b) \
     ((z < 0) || ((z == 0) && ((b.y > a.y) || ((b.y == a.y) && (b.x < a.x)))))
 
@@ -286,10 +327,10 @@
             }
 
             if (data.attrib & PA_TEXTURED) {
-                uint32_t tx = roundf(((z0 * a.tx) + (z1 * b.tx) + (z2 * c.tx)) / area);
-                uint32_t ty = roundf(((z0 * a.ty) + (z1 * b.ty) + (z2 * c.ty)) / area);
+                float tx = ((z0 * a.tx) + (z1 * b.tx) + (z2 * c.tx)) / area;
+                float ty = ((z0 * a.ty) + (z1 * b.ty) + (z2 * c.ty)) / area;
 
-                uint16_t texel = gpu_fetch_texel(gpu, tx, ty, tpx, tpy, clutx, cluty, depth);
+                uint16_t texel = gpu_fetch_texel_bilinear(gpu, tx, ty, tpx, tpy, clutx, cluty, depth);
 
                 if (!texel)
                     continue;
@@ -963,6 +1004,86 @@
     }
 }
 
+// GP0 line command handler (command type 2 in psx_gpu_update_cmd).
+// Command word: bit 28 = shaded, bit 27 = polyline. Single lines request
+// 2 (flat) or 3 (shaded) argument words and then draw one segment; shaded
+// lines are currently drawn flat, using only the colour in buf[0].
+void gpu_line(psx_gpu_t* gpu) {
+    switch (gpu->state) {
+        case GPU_STATE_RECV_CMD: {
+            gpu->state = GPU_STATE_RECV_ARGS;
+
+            int shaded   = (gpu->buf[0] & 0x10000000) != 0;
+            int polyline = (gpu->buf[0] & 0x08000000) != 0;
+
+            gpu->cmd_args_remaining = polyline ? -1 : (shaded ? 3 : 2); // -1: polylines have no fixed word count
+            gpu->line_done = 0; // Not read anywhere in this function
+        } break;
+
+        case GPU_STATE_RECV_ARGS: {
+            if (gpu->buf[0] & 0x08000000) {
+                if ((gpu->buf[gpu->buf_index - 1] & 0xf000f000) == 0x50005000) { // Last word matches the 0x5xxx5xxx terminator pattern
+                    gpu->state = GPU_STATE_RECV_CMD;
+
+                    return;
+                }
+
+                // Polyline segments are not rendered yet: words are only
+                // consumed until the terminator above is seen. Sketch of the
+                // intended per-segment rendering:
+
+                // if (gpu->buf_index > overflow) {
+                //     vertex_t v0, v1;
+
+                //     if (shaded) {
+                //         v0.c = gpu->buf[0] & 0xffffff;
+                //         v1.c = gpu->buf[4] & 0xffffff;
+                //         v0.x = gpu->buf[1] & 0xffff;
+                //         v0.y = gpu->buf[1] >> 16;
+                //         v1.x = gpu->buf[3] & 0xffff;
+                //         v1.y = gpu->buf[3] >> 16;
+                //     } else {
+                //         v0.c = gpu->buf[0] & 0xffffff;
+                //         v1.c = gpu->buf[0] & 0xffffff;
+                //         v0.x = gpu->buf[1] & 0xffff;
+                //         v0.y = gpu->buf[1] >> 16;
+                //         v1.x = gpu->buf[2] & 0xffff;
+                //         v1.y = gpu->buf[2] >> 16;
+                //     }
+
+                //     gpu->prev_line_vertex = v1;
+
+                //     gpu_render_flat_line(gpu, v0, v1, gpu->buf[0] & 0xffffff);
+
+                //     gpu->buf_index = 1;
+                // }
+            } else if (!gpu->cmd_args_remaining) {
+                vertex_t v0, v1;
+
+                if (gpu->buf[0] & 0x10000000) {
+                    v0.c = gpu->buf[0] & 0xffffff;
+                    v1.c = gpu->buf[2] & 0xffffff;
+                    v0.x = gpu->buf[1] & 0xffff;
+                    v0.y = gpu->buf[1] >> 16;
+                    v1.x = gpu->buf[3] & 0xffff;
+                    v1.y = gpu->buf[3] >> 16;
+                } else {
+                    v0.c = gpu->buf[0] & 0xffffff;
+                    v1.c = gpu->buf[0] & 0xffffff;
+                    v0.x = gpu->buf[1] & 0xffff;
+                    v0.y = gpu->buf[1] >> 16;
+                    v1.x = gpu->buf[2] & 0xffff;
+                    v1.y = gpu->buf[2] >> 16;
+                }
+
+                gpu_render_flat_line(gpu, v0, v1, BGR555(gpu->buf[0] & 0xffffff));
+
+                gpu->state = GPU_STATE_RECV_CMD;
+            }
+        } break;
+    }
+}
+
 void gpu_cmd_a0(psx_gpu_t* gpu) {
     switch (gpu->state) {
         case GPU_STATE_RECV_CMD: {
@@ -1601,18 +1722,12 @@
 void psx_gpu_update_cmd(psx_gpu_t* gpu) {
     int type = (gpu->buf[0] >> 29) & 7;
 
-    if (type == 3) {
-        gpu_rect(gpu);
-
-        return;
+    switch (type) {
+        case 1: gpu_poly(gpu); return;
+        case 2: gpu_line(gpu); return;
+        case 3: gpu_rect(gpu); return;
     }
 
-    if (type == 1) {
-        gpu_poly(gpu);
-
-        return;
-    }
-
     switch (gpu->buf[0] >> 24) {
         case 0x00: /* nop */ break;
         case 0x01: /* Cache clear */ break;
@@ -1820,7 +1935,7 @@
         // GetlocP commands, if the timer is too slow it will
         // break.
         // if (!(gpu->line & 7))
-            // psx_ic_irq(gpu->ic, IC_TIMER2);
+        //     psx_ic_irq(gpu->ic, IC_SPU);
             // psx_ic_irq(gpu->ic, IC_SPU);
     } else {
         gpu->gpustat &= ~(1 << 31);
--- a/psx/dev/gpu.h
+++ b/psx/dev/gpu.h
@@ -102,6 +102,8 @@
     int buf_index;
     int cmd_args_remaining;
     int cmd_data_remaining;
+    int line_done;
+    vertex_t prev_line_vertex;
 
     // Command counters
     uint32_t color;
--- a/psx/dev/ic.c
+++ b/psx/dev/ic.c
@@ -70,8 +70,11 @@
     }
 
     // Emulate acknowledge
-    if (!(ic->stat & ic->mask))
+    if (!(ic->stat & ic->mask)) {
         ic->cpu->cop0_r[COP0_CAUSE] &= ~SR_IM2;
+    } else {
+        psx_cpu_set_irq_pending(ic->cpu);
+    }
 }
 
 void psx_ic_write16(psx_ic_t* ic, uint32_t offset, uint16_t value) {
@@ -83,8 +86,11 @@
     }
 
     // Emulate acknowledge
-    if (!(ic->stat & ic->mask))
+    if (!(ic->stat & ic->mask)) {
         ic->cpu->cop0_r[COP0_CAUSE] &= ~SR_IM2;
+    } else {
+        psx_cpu_set_irq_pending(ic->cpu);
+    }
 }
 
 void psx_ic_write8(psx_ic_t* ic, uint32_t offset, uint8_t value) {
@@ -100,15 +106,15 @@
     }
 
     // Emulate acknowledge
-    if (!(ic->stat & ic->mask))
+    if (!(ic->stat & ic->mask)) {
         ic->cpu->cop0_r[COP0_CAUSE] &= ~SR_IM2;
+    } else {
+        psx_cpu_set_irq_pending(ic->cpu);
+    }
 }
 
 void psx_ic_irq(psx_ic_t* ic, int id) {
     ic->stat |= id;
-
-    // if (ic->mask & (1 << id))
-    //     printf("%u IRQ gone through\n");
 
     if (ic->mask & ic->stat)
         psx_cpu_set_irq_pending(ic->cpu);
--- a/psx/dev/mc2.c
+++ b/psx/dev/mc2.c
@@ -45,7 +45,7 @@
 
 void psx_mc2_write32(psx_mc2_t* mc2, uint32_t offset, uint32_t value) {
     switch (offset) {
-        case 0x00: mc2->ram_size = value; break;
+        case 0x00: printf("ram_size write %08x\n", value); mc2->ram_size = value; break;
 
         default: {
             log_warn("Unhandled 32-bit MC2 write at offset %08x (%08x)", offset, value);
--- a/psx/dev/mcd.c
+++ b/psx/dev/mcd.c
@@ -157,7 +157,7 @@
     // log_fatal("mcd write %02x", data);
     // log_set_quiet(1);
 
-    // printf("mcd write %02x\n", data);
+    printf("mcd write %02x\n", data);
 
     switch (mcd->state) {
         case MCD_STATE_TX_FLG: mcd->mode = data; break;
--- a/psx/dev/pad.c
+++ b/psx/dev/pad.c
@@ -100,6 +100,13 @@
 
                 psx_mcd_write(mcd, data);
 
+                if (pad->ctrl & CTRL_ACIE) {
+                    pad->irq_bit = 1;
+                    pad->cycles_until_irq = 1024;
+
+                    return;
+                }
+
                 if (!psx_mcd_query(mcd))
                     pad->dest[slot] = 0;
             } break;
@@ -107,7 +114,7 @@
 
         if (pad->ctrl & CTRL_ACIE) {
             pad->irq_bit = 1;
-            pad->cycles_until_irq = JOY_IRQ_DELAY;
+            pad->cycles_until_irq = (pad->dest[slot] == DEST_MCD) ? 2048 : JOY_IRQ_DELAY;
         }
     }
 }
--- a/psx/dev/ram.c
+++ b/psx/dev/ram.c
@@ -16,8 +16,10 @@
     ram->io_size = PSX_RAM_SIZE;
 
     ram->mc2 = mc2;
-    ram->buf = (uint8_t*)malloc(RAM_SIZE);
+    ram->buf = (uint8_t*)malloc(size);
+    ram->size = size;
 
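+    // Size should be a multiple of 2MB, otherwise default to 2MB. NOTE(review): the 0x1ffff mask below only tests 128KB alignment, and buf/size were already set from the unchecked size above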
     if (size & 0x1ffff)
         size = RAM_SIZE_2MB;
 
@@ -25,37 +27,49 @@
 }
 
 uint32_t psx_ram_read32(psx_ram_t* ram, uint32_t offset) {
-    offset &= RAM_SIZE - 1;
+    if (((ram->mc2->ram_size >> 9) & 7) == 3)
+        if (offset >= 0x400000)
+            return 0xffffffff;
 
+    offset &= ram->size - 1;
+
     return *((uint32_t*)(ram->buf + offset));
 }
 
 uint16_t psx_ram_read16(psx_ram_t* ram, uint32_t offset) {
-    offset &= RAM_SIZE - 1;
+    if (((ram->mc2->ram_size >> 9) & 7) == 3)
+        if (offset >= 0x400000)
+            return 0xffff;
 
+    offset &= ram->size - 1;
+
     return *((uint16_t*)(ram->buf + offset));
 }
 
 uint8_t psx_ram_read8(psx_ram_t* ram, uint32_t offset) {
-    offset &= RAM_SIZE - 1;
+    if (((ram->mc2->ram_size >> 9) & 7) == 3)
+        if (offset >= 0x400000)
+            return 0xff;
 
+    offset &= ram->size - 1;
+
     return ram->buf[offset];
 }
 
 void psx_ram_write32(psx_ram_t* ram, uint32_t offset, uint32_t value) {
-    offset &= RAM_SIZE - 1;
+    offset &= ram->size - 1;
 
     *((uint32_t*)(ram->buf + offset)) = value;
 }
 
 void psx_ram_write16(psx_ram_t* ram, uint32_t offset, uint16_t value) {
-    offset &= RAM_SIZE - 1;
+    offset &= ram->size - 1;
 
     *((uint16_t*)(ram->buf + offset)) = value;
 }
 
 void psx_ram_write8(psx_ram_t* ram, uint32_t offset, uint8_t value) {
-    offset &= RAM_SIZE - 1;
+    offset &= ram->size - 1;
 
     ram->buf[offset] = value;
 }
--- a/psx/dev/ram.h
+++ b/psx/dev/ram.h
@@ -6,8 +6,7 @@
 #include "../log.h"
 #include "mc2.h"
 
-#define RAM_SIZE        0x200000
-#define PSX_RAM_SIZE    0x1f000000
+#define PSX_RAM_SIZE    0x800000 // 8MB window
 #define PSX_RAM_BEGIN   0x00000000
 //#define PSX_RAM_END     0x001fffff
 #define PSX_RAM_END     0x1effffff
@@ -20,6 +19,8 @@
 typedef struct {
     uint32_t bus_delay;
     uint32_t io_base, io_size;
+
+    size_t size;
 
     psx_mc2_t* mc2;
 
--- a/psx/dev/spu.c
+++ b/psx/dev/spu.c
@@ -5,11 +5,20 @@
 #include "spu.h"
 #include "../log.h"
 
-#define CLAMP(v, l, h) ((v <= l) ? l : ((v >= h) ? h : v))
+#define CLAMP(v, l, h) (((v) <= (l)) ? (l) : (((v) >= (h)) ? (h) : (v)))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 #define VOICE_COUNT 24
 
+// Cubic (Catmull-Rom form) interpolation: yields b at t=0 and c at t=1; a and d are the outer neighbours.
+static float interpolate_hermite(float a, float b, float c, float d, float t) {
+    const float k3 = -a/2.0f + (3.0f*b)/2.0f - (3.0f*c)/2.0f + d/2.0f; // t^3 coefficient
+    const float k2 = a - (5.0f*b)/2.0f + 2.0f*c - d / 2.0f;             // t^2 coefficient
+    const float k1 = -a/2.0f + c/2.0f;                                  // t^1 coefficient
+
+    return (k3*t*t*t) + (k2*t*t) + (k1*t) + b;
+}
+
 static const int g_spu_pos_adpcm_table[] = {
     0, +60, +115, +98, +122
 };
@@ -136,15 +145,17 @@
 
     spu->data[v].block_flags = spu->ram[addr + 1];
 
-    unsigned shift  = 12 - (hdr & 0x0f);
+    unsigned hdr_shift = hdr & 0x0f;
+
+    if (hdr_shift > 12)
+        hdr_shift = 9;
+
+    unsigned shift  = 12 - hdr_shift;
     unsigned filter = (hdr >> 4) & 7;
 
     int32_t f0 = g_spu_pos_adpcm_table[filter];
     int32_t f1 = g_spu_neg_adpcm_table[filter];
 
-    if ((spu->irq9addr << 3) == addr)
-        psx_ic_irq(spu->ic, IC_SPU);
-
     for (int j = 0; j < 28; j++) {
         uint16_t n = (spu->ram[addr + 2 + (j >> 1)] >> ((j & 1) * 4)) & 0xf;
 
@@ -338,7 +349,6 @@
         }
     }
 
-    spu->kon |= value & 0x00ffffff;
     spu->endx &= ~(value & 0x00ffffff);
 }
 
@@ -346,8 +356,6 @@
     for (int i = 0; i < VOICE_COUNT; i++)
         if (value & (1 << i))
             adsr_load_release(spu, i);
-
-    spu->koff |= value & 0x00ffffff;
 }
 
 int spu_handle_write(psx_spu_t* spu, uint32_t offset, uint32_t value) {
@@ -472,6 +480,8 @@
 #define R16(addr) (spu_read_reverb(spu, addr))
 #define W16(addr, value) spu_write_reverb(spu, addr, value)
 
+#define SAT(v) CLAMP(v, INT16_MIN, INT16_MAX)
+
 void spu_get_reverb_sample(psx_spu_t* spu, int inl, int inr, int* outl, int* outr) {
     uint32_t mbase = spu->mbase << 3;
     uint32_t dapf1 = spu->dapf1 << 3;
@@ -497,46 +507,62 @@
     uint32_t mrapf1 = spu->mrapf1 << 3;
     uint32_t mrapf2 = spu->mrapf2 << 3;
 
-    float vlin = (float)spu->vlin / 32767.0f;
-    float vrin = (float)spu->vrin / 32767.0f;
-    float viir = (float)spu->viir / 32767.0f;
-    float vwall = (float)spu->vwall / 32767.0f;
-    float vcomb1 = (float)spu->vcomb1 / 32767.0f;
-    float vcomb2 = (float)spu->vcomb2 / 32767.0f;
-    float vcomb3 = (float)spu->vcomb3 / 32767.0f;
-    float vcomb4 = (float)spu->vcomb4 / 32767.0f;
-    float vapf1 = (float)spu->vapf1 / 32767.0f;
-    float vapf2 = (float)spu->vapf2 / 32767.0f;
-    float vlout = (float)spu->vlout / 32767.0f;
-    float vrout = (float)spu->vrout / 32767.0f;
+    float vlin = (float)spu->vlin;
+    float vrin = (float)spu->vrin;
+    float viir = (float)spu->viir;
+    float vwall = (float)spu->vwall;
+    float vcomb1 = (float)spu->vcomb1;
+    float vcomb2 = (float)spu->vcomb2;
+    float vcomb3 = (float)spu->vcomb3;
+    float vcomb4 = (float)spu->vcomb4;
+    float vapf1 = (float)spu->vapf1;
+    float vapf2 = (float)spu->vapf2;
+    float vlout = (float)spu->vlout;
+    float vrout = (float)spu->vrout;
 
-    int lin = ((float)inl * 0.5f) * vlin;
-    int rin = ((float)inr * 0.5f) * vrin;
+    int lin = (vlin * inl) / 32768.0f;
+    int rin = (vrin * inr) / 32768.0f;
 
-    int mlsamev = (lin + R16(dlsame)*vwall - R16(mlsame-2))*viir + R16(mlsame-2);
-    int mrsamev = (rin + R16(drsame)*vwall - R16(mrsame-2))*viir + R16(mrsame-2);
-    int mldiffv = (lin + R16(drdiff)*vwall - R16(mldiff-2))*viir + R16(mldiff-2);
-    int mrdiffv = (rin + R16(dldiff)*vwall - R16(mrdiff-2))*viir + R16(mrdiff-2);
+    // same side reflection ltol and rtor
+    int16_t mlsamev = SAT(lin + ((R16(dlsame) * vwall) / 32768.0f) - ((R16(mlsame - 2) * viir) / 32768.0f) + R16(mlsame - 2));
+    int16_t mrsamev = SAT(rin + ((R16(drsame) * vwall) / 32768.0f) - ((R16(mrsame - 2) * viir) / 32768.0f) + R16(mrsame - 2));
+    W16(mlsame, mlsamev);
+    W16(mrsame, mrsamev);
 
-    W16(mlsame, CLAMP(mlsamev, -0x8000, 0x7fff));
-    W16(mrsame, CLAMP(mrsamev, -0x8000, 0x7fff));
-    W16(mldiff, CLAMP(mldiffv, -0x8000, 0x7fff));
-    W16(mrdiff, CLAMP(mrdiffv, -0x8000, 0x7fff));
+    // different side reflection ltor and rtol
+    int16_t mldiffv = SAT(lin + ((R16(drdiff) * vwall) / 32768.0f) - ((R16(mldiff - 2) * viir) / 32768.0f) + R16(mldiff - 2));
+    int16_t mrdiffv = SAT(rin + ((R16(dldiff) * vwall) / 32768.0f) - ((R16(mrdiff - 2) * viir) / 32768.0f) + R16(mrdiff - 2));
+    W16(mldiff, mldiffv);
+    W16(mrdiff, mrdiffv);
 
-    int lout=vcomb1*R16(mlcomb1)+vcomb2*R16(mlcomb2)+vcomb3*R16(mlcomb3)+vcomb4*R16(mlcomb4);
-    int rout=vcomb1*R16(mrcomb1)+vcomb2*R16(mrcomb2)+vcomb3*R16(mrcomb3)+vcomb4*R16(mrcomb4);
+    // early echo (comb filter with input from buffer)
+    int16_t l = SAT((vcomb1 * R16(mlcomb1) / 32768.0f) + (vcomb2 * R16(mlcomb2) / 32768.0f) + (vcomb3 * R16(mlcomb3) / 32768.0f) + (vcomb4 * R16(mlcomb4) / 32768.0f));
+    int16_t r = SAT((vcomb1 * R16(mrcomb1) / 32768.0f) + (vcomb2 * R16(mrcomb2) / 32768.0f) + (vcomb3 * R16(mrcomb3) / 32768.0f) + (vcomb4 * R16(mrcomb4) / 32768.0f));
 
-    lout = CLAMP(lout, -0x8000, 0x7fff);
-    rout = CLAMP(rout, -0x8000, 0x7fff);
+    // late reverb apf1 (all pass filter 1 with input from comb)
+    l = SAT(l - SAT((vapf1 * R16(mlapf1 - dapf1)) / 32768.0f));
+    r = SAT(r - SAT((vapf1 * R16(mrapf1 - dapf1)) / 32768.0f));
 
-    lout-=CLAMP(vapf1*R16(mlapf1 - dapf1), -0x8000, 0x7fff); W16(mlapf1, lout); lout*=vapf1+((float)R16(mlapf1 - dapf1) / 32767.0f);
-    rout-=CLAMP(vapf1*R16(mrapf1 - dapf1), -0x8000, 0x7fff); W16(mrapf1, rout); rout*=vapf1+((float)R16(mrapf1 - dapf1) / 32767.0f);
-    lout-=CLAMP(vapf2*R16(mlapf2 - dapf2), -0x8000, 0x7fff); W16(mlapf2, lout); lout*=vapf2+((float)R16(mlapf2 - dapf2) / 32767.0f);
-    rout-=CLAMP(vapf2*R16(mrapf2 - dapf2), -0x8000, 0x7fff); W16(mrapf2, rout); rout*=vapf2+((float)R16(mrapf2 - dapf2) / 32767.0f);
+    W16(mlapf1, l);
+    W16(mrapf1, r);
+    
+    l = SAT((l * vapf1 / 32768.0f) + R16(mlapf1 - dapf1));
+    r = SAT((r * vapf1 / 32768.0f) + R16(mrapf1 - dapf1));
 
-    *outl = lout * vlout;
-    *outr = rout * vrout;
+    // late reverb apf2 (all pass filter 2 with input from apf1)
+    l = SAT(l - SAT((vapf2 * R16(mlapf2 - dapf2)) / 32768.0f));
+    r = SAT(r - SAT((vapf2 * R16(mrapf2 - dapf2)) / 32768.0f));
+    
+    W16(mlapf2, l);
+    W16(mrapf2, r);
 
+    l = SAT((l * vapf2 / 32768.0f) + R16(mlapf2 - dapf2));
+    r = SAT((r * vapf2 / 32768.0f) + R16(mrapf2 - dapf2));
+
+    // output to mixer (output volume multiplied with input from apf2)
+    *outl = SAT(l * vlout / 32768.0f);
+    *outr = SAT(r * vrout / 32768.0f);
+
     spu->revbaddr = MAX(mbase, (spu->revbaddr + 2) & 0x7fffe);
 }
 
@@ -563,11 +589,6 @@
 
         ++active_voice_count;
 
-        // Shift 3 older samples around
-        spu->data[v].s[3] = spu->data[v].s[2];
-        spu->data[v].s[2] = spu->data[v].s[1];
-        spu->data[v].s[1] = spu->data[v].s[0];
-
         uint32_t sample_index = spu->data[v].counter >> 12;
 
         if (sample_index > 27) {
@@ -581,7 +602,15 @@
 
             switch (spu->data[v].block_flags & 3) {
                 case 0: case 2: {
+                    if (((spu->irq9addr << 3) == spu->data[v].current_addr) && (spu->spucnt & 0x40)) {
+                        psx_ic_irq(spu->ic, IC_SPU);
+                    }
+
                     spu->data[v].current_addr += 0x10;
+
+                    if (((spu->irq9addr << 3) == spu->data[v].current_addr) && (spu->spucnt & 0x40)) {
+                        psx_ic_irq(spu->ic, IC_SPU);
+                    }
                 } break;
 
                 case 1: {
@@ -601,7 +630,13 @@
             spu_read_block(spu, v);
         }
 
-        // Fetch ADPCM sample
+        // Shift the three older samples only when the sample index has changed, then fetch the ADPCM sample
+        if (spu->data[v].prev_sample_index != sample_index) {
+            spu->data[v].s[3] = spu->data[v].s[2];
+            spu->data[v].s[2] = spu->data[v].s[1];
+            spu->data[v].s[1] = spu->data[v].s[0];
+        }
+
         spu->data[v].s[0] = spu->data[v].buf[sample_index];
 
         // Apply 4-point Gaussian interpolation
@@ -610,8 +645,16 @@
         int16_t g1 = g_spu_gauss_table[0x1ff - gauss_index];
         int16_t g2 = g_spu_gauss_table[0x100 + gauss_index];
         int16_t g3 = g_spu_gauss_table[0x000 + gauss_index];
-        int16_t out;
+        int16_t out = spu->data[v].s[0];
 
+        // out = interpolate_hermite(
+        //     spu->data[v].s[3],
+        //     spu->data[v].s[2],
+        //     spu->data[v].s[1],
+        //     spu->data[v].s[0],
+        //     (spu->data[v].counter & 0xfff) / 4096.0f
+        // );
+
         out  = (g0 * spu->data[v].s[3]) >> 15;
         out += (g1 * spu->data[v].s[2]) >> 15;
         out += (g2 * spu->data[v].s[1]) >> 15;
@@ -619,8 +662,8 @@
 
         float adsr_vol = (float)spu->voice[v].envcvol / 32767.0f;
 
-        float samplel = (out * spu->data[v].lvol) * adsr_vol * (float)spu->mainlvol / 32767.0f; 
-        float sampler = (out * spu->data[v].rvol) * adsr_vol * (float)spu->mainrvol / 32767.0f; 
+        float samplel = (out * spu->data[v].lvol) * adsr_vol; 
+        float sampler = (out * spu->data[v].rvol) * adsr_vol; 
 
         left += samplel;
         right += sampler;
@@ -634,23 +677,35 @@
 
         /* To-do: Do pitch modulation here */
 
+        spu->data[v].prev_sample_index = spu->data[v].counter >> 12;
         spu->data[v].counter += step;
     }
 
-    if (!active_voice_count)
-        return 0x00000000;
+    // if (!active_voice_count)
+    //     return 0x00000000;
     
     int16_t clamprl = CLAMP(revl, INT16_MIN, INT16_MAX);
     int16_t clamprr = CLAMP(revr, INT16_MIN, INT16_MAX);
     int16_t clampsl = CLAMP(left, INT16_MIN, INT16_MAX);
     int16_t clampsr = CLAMP(right, INT16_MIN, INT16_MAX);
-    
-    if ((spu->spucnt & 0x0080) && spu->even_cycle)
-        spu_get_reverb_sample(spu, clamprl, clamprr, &spu->lrsl, &spu->lrsr);
 
-    uint16_t clampl = CLAMP(clampsl + spu->lrsl, INT16_MIN, INT16_MAX);
-    uint16_t clampr = CLAMP(clampsr + spu->lrsr, INT16_MIN, INT16_MAX);
+    if ((spu->spucnt & 0x4000) == 0)
+        return 0;
 
+    // Scale via int: converting a negative float straight to uint16_t is undefined behaviour
+    int mixl = clampsl, mixr = clampsr;
+
+    if (spu->spucnt & 0x0080) {
+        if (spu->even_cycle)
+            spu_get_reverb_sample(spu, clamprl, clamprr, &spu->lrsl, &spu->lrsr);
+
+        mixl = CLAMP((clampsl + spu->lrsl), INT16_MIN, INT16_MAX);
+        mixr = CLAMP((clampsr + spu->lrsr), INT16_MIN, INT16_MAX);
+    }
+
+    uint16_t clampl = (uint16_t)(int)(mixl * (float)spu->mainlvol / 32767.0f);
+    uint16_t clampr = (uint16_t)(int)(mixr * (float)spu->mainrvol / 32767.0f);
+
     return clampl | (((uint32_t)clampr) << 16);
 }
 
@@ -658,14 +713,22 @@
     int16_t* ptr = buf;
     int16_t* ram = (int16_t*)spu->ram;
 
-    for (int i = 0; i < 0x400;) {
-        ram[i] = ptr[i];
+    for (int i = 0; i < 0x400; i++) {
+        ram[i + 0x000] = *ptr++;
+        ram[i + 0x400] = *ptr++;
+    }
 
-        ++i;
+    // Little bit of lowpass/smoothing
+    for (int i = 0; i < 0x400; i += 8) {
+        int l = 0, r = 0;
 
-        ram[i + 0x400] = ptr[i];
+        for (int j = 0; j < 8; j++) {
+            l += ram[i + j];
+            r += ram[i + j + 0x400];
+        }
 
-        ++i;
+        ram[i + 0x000] = l / 8;
+        ram[i + 0x400] = r / 8;
     }
 }
 
--- a/psx/dev/spu.h
+++ b/psx/dev/spu.h
@@ -124,6 +124,7 @@
         uint32_t counter;
         uint32_t current_addr;
         uint32_t repeat_addr;
+        uint32_t prev_sample_index;
         int16_t s[4];
         int block_flags;
         int16_t buf[28];
@@ -132,6 +133,8 @@
         float rvol;
         int cvol;
         int eon;
+        int reverbl;
+        int reverbr;
 
         /*
         ____lower 16bit (at 1F801C08h+N*10h)___________________________________
--- a/psx/dev/timer.c
+++ b/psx/dev/timer.c
@@ -239,7 +239,7 @@
     if (target_reached) {
         timer->timer[i].target_reached = 1;
 
-        // if ((i == 2) && (T2_CLKSRC == 2))
+        // if ((i == 1) && (T1_CLKSRC == 1))
         //     printf("target %04x (%f) reached\n", timer->timer[i].target, timer->timer[i].counter);
 
         if (timer->timer[i].reset_target)
@@ -250,7 +250,7 @@
     }
 
     if (max_reached) {
-        timer->timer[i].counter -= 65536.0f;
+        timer->timer[i].counter = 0;
         timer->timer[i].max_reached = 1;
 
         if (timer->timer[i].irq_max)
@@ -279,8 +279,8 @@
     timer->timer[i].irq = 1;
 
     if (trigger) {
-        // if ((i == 1))
-        //     printf("timer 1 irq fire\n");
+        if ((i == 1))
+            printf("timer 1 irq fire\n");
 
         psx_ic_irq(timer->ic, 16 << i);
     }
--- a/psx/psx.c
+++ b/psx/psx.c
@@ -30,7 +30,7 @@
     psx->cpu->last_cycles = 2;
 
     psx_cdrom_update(psx->cdrom, 2);
-    psx_gpu_update(psx->gpu, psx->cpu->last_cycles);
+    psx_gpu_update(psx->gpu, 2);
     psx_pad_update(psx->pad, psx->cpu->last_cycles);
     psx_timer_update(psx->timer, psx->cpu->last_cycles);
     psx_dma_update(psx->dma, psx->cpu->last_cycles);
@@ -61,7 +61,12 @@
 
     // return (draw > dmode) ? dmode : draw;
 
-    return psx_get_dmode_width(psx);
+    int width = psx_get_dmode_width(psx);
+
+    if (width == 368)
+        width = 384;
+
+    return width;
 }
 
 uint32_t psx_get_display_height(psx_t* psx) {
--