From e2cb7d78332703ef4e4e119c9ed586334e8e4f30 Mon Sep 17 00:00:00 2001
From: Jannik Vogel <email@jannikvogel.de>
Date: Wed, 30 Nov 2016 15:40:42 +0100
Subject: [PATCH] shader_jit: Load LOOPCOUNT_REG and LOOPINC 4 bit left-shifted

---
 src/video_core/shader/shader_jit_x64.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 211c703ab..9a3d6ca8f 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -102,11 +102,11 @@ static const X64Reg SETUP = R9;
 /// The two 32-bit VS address offset registers set by the MOVA instruction
 static const X64Reg ADDROFFS_REG_0 = R10;
 static const X64Reg ADDROFFS_REG_1 = R11;
-/// VS loop count register
+/// VS loop count register (Multiplied by 16)
 static const X64Reg LOOPCOUNT_REG = R12;
 /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker)
 static const X64Reg LOOPCOUNT = RSI;
-/// Number to increment LOOPCOUNT_REG by on each loop iteration
+/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
 static const X64Reg LOOPINC = RDI;
 /// Result of the previous CMP instruction for the X-component comparison
 static const X64Reg COND0 = R13;
@@ -718,15 +718,18 @@ void JitShader::Compile_LOOP(Instruction instr) {
 
     looping = true;
 
+    // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
+    // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
+    // 4 bits) to be used as an offset into the 16-byte vector registers later
     int offset =
         ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id);
     MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset));
     MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
-    SHR(32, R(LOOPCOUNT_REG), Imm8(8));
-    AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
+    SHR(32, R(LOOPCOUNT_REG), Imm8(4));
+    AND(32, R(LOOPCOUNT_REG), Imm32(0xFF0)); // Y-component is the start
     MOV(32, R(LOOPINC), R(LOOPCOUNT));
-    SHR(32, R(LOOPINC), Imm8(16));
-    MOVZX(32, 8, LOOPINC, R(LOOPINC));     // Z-component is the incrementer
+    SHR(32, R(LOOPINC), Imm8(12));
+    AND(32, R(LOOPINC), Imm32(0xFF0));     // Z-component is the incrementer
     MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count
     ADD(32, R(LOOPCOUNT), Imm8(1));        // Iteration count is X-component + 1