shithub: psxe

Download patch

ref: 09422a6b4ad4b99c1980f69e782b2e4666c069ff
parent: 15d0a415dbe35dc7f83cf3b7371868038c400064
author: allkern <lisandroaalarcon@gmail.com>
date: Sun Oct 22 17:11:48 EDT 2023

Implement a bunch of GTE instructions

Correctness verified

--- a/psx/cpu.c
+++ b/psx/cpu.c
@@ -1786,17 +1786,17 @@
 void gte_write_register(psx_cpu_t* cpu, uint32_t r, uint32_t value) {
     switch (r) {
         case 0 : cpu->cop2_dr.v[0].xy = value; break;
-        case 1 : cpu->cop2_dr.v[0].z = value & 0xffff; break;
+        case 1 : cpu->cop2_dr.v[0].z = value; break;
         case 2 : cpu->cop2_dr.v[1].xy = value; break;
-        case 3 : cpu->cop2_dr.v[1].z = value & 0xffff; break;
+        case 3 : cpu->cop2_dr.v[1].z = value; break;
         case 4 : cpu->cop2_dr.v[2].xy = value; break;
-        case 5 : cpu->cop2_dr.v[2].z = value & 0xffff; break;
+        case 5 : cpu->cop2_dr.v[2].z = value; break;
         case 6 : cpu->cop2_dr.rgbc.rgbc = value; break;
         case 7 : cpu->cop2_dr.otz = value; break;
-        case 8 : cpu->cop2_dr.ir[0] = value & 0xffff; break;
-        case 9 : cpu->cop2_dr.ir[1] = value & 0xffff; break;
-        case 10: cpu->cop2_dr.ir[2] = value & 0xffff; break;
-        case 11: cpu->cop2_dr.ir[3] = value & 0xffff; break;
+        case 8 : cpu->cop2_dr.ir[0] = value; break;
+        case 9 : cpu->cop2_dr.ir[1] = value; break;
+        case 10: cpu->cop2_dr.ir[2] = value; break;
+        case 11: cpu->cop2_dr.ir[3] = value; break;
         case 12: cpu->cop2_dr.sxy[0].xy = value; break;
         case 13: cpu->cop2_dr.sxy[1].xy = value; break;
         case 14: cpu->cop2_dr.sxy[2].xy = value; break;
@@ -1925,10 +1925,12 @@
 
 #define R_FLAG cpu->cop2_cr.flag
 
-uint64_t gte_clamp_mac0(psx_cpu_t* cpu, int64_t value) {
-    if (value < -((int64_t)0x80000000)) {
+int64_t gte_clamp_mac0(psx_cpu_t* cpu, int64_t value) {
+    cpu->s_mac0 = value;
+
+    if (value < (-0x80000000ll)) {
         R_FLAG |= 0x8000;
-    } else if (value > ((int64_t)0x7fffffff)) {
+    } else if (value > (0x7fffffffll)) {
         R_FLAG |= 0x10000;
     }
 
@@ -1935,17 +1937,20 @@
     return value;
 }
 
-int32_t gte_clamp_mac(psx_cpu_t* cpu, int i, int64_t value) {
+int64_t gte_clamp_mac(psx_cpu_t* cpu, int i, int64_t value) {
+    if (i == 3)
+        cpu->s_mac3 = value;
+
     if (value < -0x80000000000) {
-        R_FLAG |= (uint32_t)(0x8000000 >> (i - 1));
+        R_FLAG |= 0x8000000 >> (i - 1);
     } else if (value > 0x7ffffffffff) {
-        R_FLAG |= (uint32_t)(0x40000000 >> (i - 1));
+        R_FLAG |= 0x40000000 >> (i - 1);
     }
 
     return ((value << 20) >> 20) >> cpu->gte_sf;
 }
 
-int16_t gte_clamp_ir0(psx_cpu_t* cpu, int64_t value) {
+int64_t gte_clamp_ir0(psx_cpu_t* cpu, int64_t value) {
     if (value < 0) {
         R_FLAG |= 0x1000;
 
@@ -1956,10 +1961,10 @@
         return 0x1000;
     }
 
-    return (int16_t)value;
+    return value;
 }
 
-int16_t gte_clamp_sxy(psx_cpu_t* cpu, int i, int32_t value) {
+int64_t gte_clamp_sxy(psx_cpu_t* cpu, int i, int32_t value) {
     if (value < -0x400) {
         R_FLAG |= (uint32_t)(0x4000 >> (i - 1));
 
@@ -1970,10 +1975,10 @@
         return 0x3ff;
     }
 
-    return (int16_t)value;
+    return value;
 }
 
-uint16_t gte_clamp_sz3(psx_cpu_t* cpu, int64_t value) {
+int64_t gte_clamp_sz3(psx_cpu_t* cpu, int64_t value) {
     if (value < 0) {
         R_FLAG |= 0x40000;
 
@@ -1984,7 +1989,7 @@
         return 0xffff;
     }
 
-    return (uint16_t)value;
+    return value;
 }
 
 uint8_t gte_clamp_rgb(psx_cpu_t* cpu, int i, int value) {
@@ -2001,7 +2006,7 @@
     return (uint8_t)value;
 }
 
-int16_t gte_clamp_ir(psx_cpu_t* cpu, int i, int value, int lm) {
+int64_t gte_clamp_ir(psx_cpu_t* cpu, int i, int value, int lm) {
     if (lm && (value < 0)) {
         R_FLAG |= (uint32_t)(0x1000000 >> (i - 1));
 
@@ -2016,25 +2021,21 @@
         return 0x7fff;
     }
 
-    return (int16_t)value;
+    return value;
 }
 
-int32_t gte_clamp_ir_z(psx_cpu_t* cpu, int64_t value, int64_t old, int lm) {
-    if (old < -0x8000)
-        R_FLAG |= 0x400000;
+int64_t gte_clamp_ir_z(psx_cpu_t* cpu, int64_t value, int sf, int lm) {
+    int32_t value_sf = value >> sf;
+    int32_t value_12 = value >> 12;
+    int32_t min = 0;
 
-    if (old > 0x7fff)
-        R_FLAG |= 0x400000;
+    if (lm == 0)
+        min = -0x8000;
 
-    if (lm && (value < 0)) {
-        return 0;
-    } else if ((value < -0x8000) && !lm) {
-        return -0x8000;
-    } else if (value > 0x7fff) {
-        return 0x7fff;
-    }
+    if (value_12 < -0x8000 || value_12 > 0x7fff)
+        R_FLAG |= (1 << 22);
 
-    return (int16_t)value;
+    return CLAMP(value_sf, min, 0x7fff);
 }
 
 int clz(uint32_t value) {
@@ -2197,13 +2198,97 @@
 #define R_ZSF4 cpu->cop2_cr.zsf4
 #define R_OTZ cpu->cop2_dr.otz
 #define R_H cpu->cop2_cr.h
+#define R_RC cpu->cop2_dr.rgbc.c[0]
+#define R_GC cpu->cop2_dr.rgbc.c[1]
+#define R_BC cpu->cop2_dr.rgbc.c[2]
+#define R_CODE cpu->cop2_dr.rgbc.c[3]
+#define R_RGBC cpu->cop2_dr.rgbc.rgbc
+#define R_RFC cpu->cop2_cr.fc.x
+#define R_GFC cpu->cop2_cr.fc.y
+#define R_BFC cpu->cop2_cr.fc.z
+#define R_RGB0 cpu->cop2_dr.rgb[0].rgbc
+#define R_RGB1 cpu->cop2_dr.rgb[1].rgbc
+#define R_RGB2 cpu->cop2_dr.rgb[2].rgbc
+#define R_RC0 cpu->cop2_dr.rgb[0].c[0]
+#define R_GC0 cpu->cop2_dr.rgb[0].c[1]
+#define R_BC0 cpu->cop2_dr.rgb[0].c[2]
+#define R_CD0 cpu->cop2_dr.rgb[0].c[3]
+#define R_RC1 cpu->cop2_dr.rgb[1].c[0]
+#define R_GC1 cpu->cop2_dr.rgb[1].c[1]
+#define R_BC1 cpu->cop2_dr.rgb[1].c[2]
+#define R_CD1 cpu->cop2_dr.rgb[1].c[3]
+#define R_RC2 cpu->cop2_dr.rgb[2].c[0]
+#define R_GC2 cpu->cop2_dr.rgb[2].c[1]
+#define R_BC2 cpu->cop2_dr.rgb[2].c[2]
+#define R_CD2 cpu->cop2_dr.rgb[2].c[3]
 
+// void gte_rtp(psx_cpu_t* cpu, int i, int dq) {
+//     R_FLAG = 0;
+
+//     int64_t vx = (int64_t)(int16_t)cpu->cop2_dr.v[i].p[0];
+//     int64_t vy = (int64_t)(int16_t)cpu->cop2_dr.v[i].p[1];
+//     int64_t vz = (((int64_t)((int32_t)cpu->cop2_dr.v[i].z)) << 32) >> 32;
+
+//     int64_t mac1 = (I64((int32_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * vx) + (I64((int16_t)R_RT12) * vy) + (I64((int16_t)R_RT13) * vz);
+//     int64_t mac2 = (I64((int32_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * vx) + (I64((int16_t)R_RT22) * vy) + (I64((int16_t)R_RT23) * vz);
+//     int64_t mac3 = (I64((int32_t)R_TRZ) << 12) + (I64((int16_t)R_RT31) * vx) + (I64((int16_t)R_RT32) * vy) + (I64((int16_t)R_RT33) * vz);
+
+//     R_MAC1 = gte_clamp_mac(cpu, 1, mac1);
+//     R_MAC2 = gte_clamp_mac(cpu, 2, mac2);
+//     R_MAC3 = gte_clamp_mac(cpu, 3, mac3);
+
+//     // log_set_quiet(0);
+//     // log_fatal("mac1=%016x (%08x), vx=%016x, vy=%016x, vz=%016x (%08x), trx=%016x, res=%016x",
+//     //     mac1, R_MAC1,
+//     //     vx, vy, vz, cpu->cop2_dr.v[i].z,
+//     //     I64((int32_t)R_TRX) << 12,
+//     //     (I64((int32_t)R_TRX) << 12) + I64((int16_t)R_RT11) * vx + I64((int16_t)R_RT12) * vy + I64((int16_t)R_RT13) * vz
+//     // );
+//     // log_set_quiet(1);
+
+//     R_IR1 = gte_clamp_ir(cpu, 1, I64((int32_t)R_MAC1), cpu->gte_lm);
+//     R_IR2 = gte_clamp_ir(cpu, 2, I64((int32_t)R_MAC2), cpu->gte_lm);
+//     R_IR3 = gte_clamp_ir_z(cpu, I64((int32_t)R_MAC3), mac3 >> 12, cpu->gte_lm);
+
+//     R_SZ0 = R_SZ1;
+//     R_SZ1 = R_SZ2;
+//     R_SZ2 = R_SZ3;
+//     R_SZ3 = gte_clamp_sz3(cpu, I64(mac3) >> 12);
+
+//     int32_t h_div_sz = gte_divide(cpu, R_H, R_SZ3);
+
+//     int64_t x = gte_clamp_mac0(cpu, I64((int32_t)R_OFX) + (I64((int16_t)R_IR1) * h_div_sz)) >> 16;
+//     int64_t y = gte_clamp_mac0(cpu, I64((int32_t)R_OFY) + (I64((int16_t)R_IR2) * h_div_sz)) >> 16;
+
+//     R_SXY0 = R_SXY1;
+//     R_SXY1 = R_SXY2;
+//     R_SX2 = gte_clamp_sxy(cpu, 1, x);
+//     R_SY2 = gte_clamp_sxy(cpu, 2, y);
+
+//     if (dq) {
+//         int64_t mac0 = I64(R_DQB) + (h_div_sz * I64(R_DQA));
+
+//         R_MAC0 = gte_clamp_mac0(cpu, mac0);
+//         R_IR0 = gte_clamp_ir0(cpu, mac0 >> 12);
+
+//         log_set_quiet(0);
+//         log_fatal("mac0=%016x (%08x), dqa=%08x, dqb=%08x, h=%08x, sz=%08x, h/sz=%08x, ir0=%016x (%08x)",
+//             mac0, R_MAC0,
+//             R_DQA, R_DQB,
+//             R_H, R_SZ3,
+//             h_div_sz,
+//             mac0 >> 12, R_IR0
+//         );
+//         log_set_quiet(1);
+//     }
+// }
+
 void gte_rtp(psx_cpu_t* cpu, int i, int dq) {
     R_FLAG = 0;
 
     int64_t vx = (int64_t)(int16_t)cpu->cop2_dr.v[i].p[0];
     int64_t vy = (int64_t)(int16_t)cpu->cop2_dr.v[i].p[1];
-    int64_t vz = (((int64_t)((int32_t)cpu->cop2_dr.v[i].z)) << 32) >> 32;
+    int64_t vz = (int64_t)(int16_t)cpu->cop2_dr.v[i].z;
 
     int64_t mac1 = (I64((int32_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * vx) + (I64((int16_t)R_RT12) * vy) + (I64((int16_t)R_RT13) * vz);
     int64_t mac2 = (I64((int32_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * vx) + (I64((int16_t)R_RT22) * vy) + (I64((int16_t)R_RT23) * vz);
@@ -2213,56 +2298,98 @@
     R_MAC2 = gte_clamp_mac(cpu, 2, mac2);
     R_MAC3 = gte_clamp_mac(cpu, 3, mac3);
 
-    // log_set_quiet(0);
-    // log_fatal("mac1=%016x (%08x), vx=%016x, vy=%016x, vz=%016x (%08x), trx=%016x, res=%016x",
-    //     mac1, R_MAC1,
-    //     vx, vy, vz, cpu->cop2_dr.v[i].z,
-    //     I64((int32_t)R_TRX) << 12,
-    //     (I64((int32_t)R_TRX) << 12) + I64((int16_t)R_RT11) * vx + I64((int16_t)R_RT12) * vy + I64((int16_t)R_RT13) * vz
-    // );
-    // log_set_quiet(1);
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm);
 
-    R_IR1 = gte_clamp_ir(cpu, 1, I64((int32_t)R_MAC1), cpu->gte_lm);
-    R_IR2 = gte_clamp_ir(cpu, 2, I64((int32_t)R_MAC2), cpu->gte_lm);
-    R_IR3 = gte_clamp_ir_z(cpu, I64((int32_t)R_MAC3), mac3 >> 12, cpu->gte_lm);
-
     R_SZ0 = R_SZ1;
     R_SZ1 = R_SZ2;
     R_SZ2 = R_SZ3;
-    R_SZ3 = gte_clamp_sz3(cpu, I64(mac3) >> 12);
+    R_SZ3 = gte_clamp_sz3(cpu, cpu->s_mac3 >> 12);
 
-    int32_t h_div_sz = gte_divide(cpu, R_H, R_SZ3);
+    int64_t div = (int32_t)gte_divide(cpu, R_H, R_SZ3);
 
-    int64_t x = gte_clamp_mac0(cpu, I64((int32_t)R_OFX) + (I64((int16_t)R_IR1) * h_div_sz)) >> 16;
-    int64_t y = gte_clamp_mac0(cpu, I64((int32_t)R_OFY) + (I64((int16_t)R_IR2) * h_div_sz)) >> 16;
-
     R_SXY0 = R_SXY1;
     R_SXY1 = R_SXY2;
-    R_SX2 = gte_clamp_sxy(cpu, 1, x);
-    R_SY2 = gte_clamp_sxy(cpu, 2, y);
+    R_SX2 = gte_clamp_sxy(cpu, 1, (gte_clamp_mac0(cpu, (int64_t)((int32_t)R_OFX) + ((int64_t)R_IR1 * div)) >> 16));
+    R_SY2 = gte_clamp_sxy(cpu, 2, (gte_clamp_mac0(cpu, (int64_t)((int32_t)R_OFY) + ((int64_t)R_IR2 * div)) >> 16));
 
     if (dq) {
-        int64_t mac0 = I64(R_DQB) + (h_div_sz * I64(R_DQA));
-
-        R_MAC0 = gte_clamp_mac0(cpu, mac0);
-        R_IR0 = gte_clamp_ir0(cpu, mac0 >> 12);
+        R_MAC0 = gte_clamp_mac0(cpu, (int64_t)R_DQB + ((int64_t)R_DQA * div));
+        R_IR0 = gte_clamp_ir0(cpu, cpu->s_mac0 >> 12);
     }
 }
 
 void psx_gte_i_rtps(psx_cpu_t* cpu) {
-    gte_rtp(cpu, 0, 1);
+    // Inlined equivalent of the former gte_rtp(cpu, 0, 1) call
+
+    R_FLAG = 0;
+
+    // Fetch V0 values, sign-extending each component from 16 bits
+    int64_t R_VX0 = (int64_t)((int16_t)cpu->cop2_dr.v[0].p[0]);
+    int64_t R_VY0 = (int64_t)((int16_t)cpu->cop2_dr.v[0].p[1]);
+    int64_t R_VZ0 = (int64_t)((int16_t)cpu->cop2_dr.v[0].z);
+
+    // Calculate matrix product, checking for 44-bit overflow on the final
+    // result:
+    // if (value < -0x80000000000) {
+    //     R_FLAG |= 0x8000000 >> (i - 1);
+    // } else if (value > 0x7ffffffffff) {
+    //     R_FLAG |= 0x40000000 >> (i - 1);
+    // }
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64((int32_t)R_TRX) << 12) + (I64((int16_t)R_RT11) * R_VX0) + (I64((int16_t)R_RT12) * R_VY0) + (I64((int16_t)R_RT13) * R_VZ0));
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64((int32_t)R_TRY) << 12) + (I64((int16_t)R_RT21) * R_VX0) + (I64((int16_t)R_RT22) * R_VY0) + (I64((int16_t)R_RT23) * R_VZ0));
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64((int32_t)R_TRZ) << 12) + (I64((int16_t)R_RT31) * R_VX0) + (I64((int16_t)R_RT32) * R_VY0) + (I64((int16_t)R_RT33) * R_VZ0));
+
+    // Store on IR1-3, clamping to the range -0x8000 (0 when lm is set) to 0x7fff
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir_z(cpu, cpu->s_mac3, cpu->gte_sf, cpu->gte_lm);
+
+    // Push screen Z: The unclamped value of MAC3 shifted right by 12,
+    // clamping to 0x0000-0xffff
+    R_SZ0 = R_SZ1;
+    R_SZ1 = R_SZ2;
+    R_SZ2 = R_SZ3;
+    R_SZ3 = gte_clamp_sz3(cpu, cpu->s_mac3 >> 12);
+
+    // Divide H by the pushed SZ3 value
+    int32_t div = gte_divide(cpu, R_H, R_SZ3);
+
+    // Push screen XY: X = OFX + (IR1 * (H/SZ3))
+    //                 Y = OFY + (IR2 * (H/SZ3))
+    // Clamping to (-0x400, 0x3ff)
+    R_SXY0 = R_SXY1;
+    R_SXY1 = R_SXY2;
+    R_SX2 = gte_clamp_sxy(cpu, 1, (gte_clamp_mac0(cpu, (int64_t)((int32_t)R_OFX) + ((int64_t)R_IR1 * div)) >> 16));
+    R_SY2 = gte_clamp_sxy(cpu, 2, (gte_clamp_mac0(cpu, (int64_t)((int32_t)R_OFY) + ((int64_t)R_IR2 * div)) >> 16));
+
+    // Depth cueing: MAC0 = DQB + DQA * (H/SZ3); gte_clamp_mac0 only flags overflow past
+    // (-0x80000000, 0x7fffffff). IR0 = unclamped MAC0 >> 12, clamped to (0x0000, 0x1000)
+    R_MAC0 = gte_clamp_mac0(cpu, ((int64_t)R_DQB) + (((int64_t)R_DQA) * div));
+    R_IR0 = gte_clamp_ir0(cpu, cpu->s_mac0 >> 12);
 }
 
 void psx_gte_i_nclip(psx_cpu_t* cpu) {
-    uint64_t value = R_SX0 * (R_SY1 - R_SY2);
-    value += R_SX1 * (R_SY2 - R_SY0);
-    value += R_SX2 * (R_SY0 - R_SY1);
+    R_FLAG = 0;
 
-    R_MAC0 = gte_clamp_mac0(cpu, value);
+    int64_t value = I64((int16_t)R_SX0) * (I64((int16_t)R_SY1) - I64((int16_t)R_SY2));
+    value += I64((int16_t)R_SX1) * (I64((int16_t)R_SY2) - I64((int16_t)R_SY0));
+    value += I64((int16_t)R_SX2) * (I64((int16_t)R_SY0) - I64((int16_t)R_SY1));
+
+    R_MAC0 = (int)gte_clamp_mac0(cpu, value);
 }
 
 void psx_gte_i_op(psx_cpu_t* cpu) {
-    log_fatal("op: Unimplemented GTE instruction");
+    R_FLAG = 0;
+
+    R_MAC1 = gte_clamp_mac(cpu, 1, I64(I64((int16_t)R_RT22) * I64(R_IR3)) - I64((I64((int16_t)R_RT33) * I64(R_IR2))));
+    R_MAC2 = gte_clamp_mac(cpu, 2, I64(I64((int16_t)R_RT33) * I64(R_IR1)) - I64((I64((int16_t)R_RT11) * I64(R_IR3))));
+    R_MAC3 = gte_clamp_mac(cpu, 3, I64(I64((int16_t)R_RT11) * I64(R_IR2)) - I64((I64((int16_t)R_RT22) * I64(R_IR1))));
+
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
 }
 
 void psx_gte_i_dpcs(psx_cpu_t* cpu) {
@@ -2306,11 +2433,19 @@
 }
 
 void psx_gte_i_sqr(psx_cpu_t* cpu) {
-    log_fatal("sqr: Unimplemented GTE instruction");
+    R_FLAG = 0;
+
+    R_MAC1 = gte_clamp_mac(cpu, 1, I64(R_IR1) * I64(R_IR1));
+    R_MAC2 = gte_clamp_mac(cpu, 2, I64(R_IR2) * I64(R_IR2));
+    R_MAC3 = gte_clamp_mac(cpu, 3, I64(R_IR3) * I64(R_IR3));
+
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
 }
 
 void psx_gte_i_dcpl(psx_cpu_t* cpu) {
     log_fatal("dcpl: Unimplemented GTE instruction");
 }
 
 void psx_gte_i_dpct(psx_cpu_t* cpu) {
@@ -2318,7 +2453,8 @@
 }
 
 void psx_gte_i_avsz3(psx_cpu_t* cpu) {
-    log_fatal("AVSZ3");
+    R_FLAG = 0;
+
     int64_t avg = I64(R_ZSF3) * (R_SZ1 + R_SZ2 + R_SZ3);
 
     R_MAC0 = (int)gte_clamp_mac0(cpu, avg);
@@ -2326,6 +2462,8 @@
 }
 
 void psx_gte_i_avsz4(psx_cpu_t* cpu) {
+    R_FLAG = 0;
+
     int64_t avg = I64(R_ZSF4) * (R_SZ0 + R_SZ1 + R_SZ2 + R_SZ3);
 
     R_MAC0 = (int)gte_clamp_mac0(cpu, avg);
@@ -2339,11 +2477,37 @@
 }
 
 void psx_gte_i_gpf(psx_cpu_t* cpu) {
-    log_fatal("gpf: Unimplemented GTE instruction");
+    R_FLAG = 0;
+    // MAC1-3 = IR0 * IR1-3, widened to 64 bits before multiplying (as SQR and OP do)
+    R_MAC1 = gte_clamp_mac(cpu, 1, I64(R_IR0) * I64(R_IR1));
+    R_MAC2 = gte_clamp_mac(cpu, 2, I64(R_IR0) * I64(R_IR2));
+    R_MAC3 = gte_clamp_mac(cpu, 3, I64(R_IR0) * I64(R_IR3));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
+    R_RGB0 = R_RGB1;
+    R_RGB1 = R_RGB2;
+    R_CD2 = R_CODE;
+    R_RC2 = gte_clamp_rgb(cpu, 1, R_MAC1 >> 4);
+    R_GC2 = gte_clamp_rgb(cpu, 2, R_MAC2 >> 4);
+    R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4);
 }
 
 void psx_gte_i_gpl(psx_cpu_t* cpu) {
-    log_fatal("gpl: Unimplemented GTE instruction");
+    R_FLAG = 0;
+    // MAC1-3 = (MAC1-3 << sf) + IR0 * IR1-3, with the product computed in 64 bits
+    R_MAC1 = gte_clamp_mac(cpu, 1, (I64(R_MAC1) << cpu->gte_sf) + (I64(R_IR0) * I64(R_IR1)));
+    R_MAC2 = gte_clamp_mac(cpu, 2, (I64(R_MAC2) << cpu->gte_sf) + (I64(R_IR0) * I64(R_IR2)));
+    R_MAC3 = gte_clamp_mac(cpu, 3, (I64(R_MAC3) << cpu->gte_sf) + (I64(R_IR0) * I64(R_IR3)));
+    R_IR1 = gte_clamp_ir(cpu, 1, R_MAC1, cpu->gte_lm);
+    R_IR2 = gte_clamp_ir(cpu, 2, R_MAC2, cpu->gte_lm);
+    R_IR3 = gte_clamp_ir(cpu, 3, R_MAC3, cpu->gte_lm);
+    R_RGB0 = R_RGB1;
+    R_RGB1 = R_RGB2;
+    R_CD2 = R_CODE;
+    R_RC2 = gte_clamp_rgb(cpu, 1, R_MAC1 >> 4);
+    R_GC2 = gte_clamp_rgb(cpu, 2, R_MAC2 >> 4);
+    R_BC2 = gte_clamp_rgb(cpu, 3, R_MAC3 >> 4);
 }
 
 void psx_gte_i_ncct(psx_cpu_t* cpu) {
--- a/psx/cpu.h
+++ b/psx/cpu.h
@@ -134,7 +134,7 @@
         gte_matrix_t lr;
         gte_vec3_t fc;
         uint32_t ofx, ofy;
-        uint16_t h;
+        uint32_t h;
         int16_t dqa;
         int32_t dqb;
         int16_t zsf3, zsf4;
@@ -146,6 +146,8 @@
     int gte_mmat;
     int gte_mvec;
     int gte_tvec;
+    int64_t s_mac0;
+    int64_t s_mac3;
 
     psx_bus_t* bus;
 
@@ -236,9 +238,9 @@
 
 /*
     00h INT     Interrupt
-    01h MOD     Tlb modification (none such in PSX)
-    02h TLBL    Tlb load         (none such in PSX)
-    03h TLBS    Tlb store        (none such in PSX)
+    01h MOD     TLB modification (none such in PSX)
+    02h TLBL    TLB load         (none such in PSX)
+    03h TLBS    TLB store        (none such in PSX)
     04h AdEL    Address error, Data load or Instruction fetch
     05h AdES    Address error, Data store
                 The address errors occur when attempting to read
--