diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 50714fb8..6a2423fb 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -253,6 +253,14 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 Helpers::panic("[ShaderJIT]: Unimplemented source index type %d", index); } + // Load the register at [state pointer + rcx + offset] into dest, applying the swizzle only if it is not the identity + auto swizzleAndLoadReg = [this, &dest, &compSwizzle, &convertedSwizzle](size_t offset) { + if (compSwizzle == noSwizzle) // Avoid emitting swizzle if not necessary + movaps(dest, xword[statePointer + rcx + offset]); + else // Swizzle is not trivial so we need to emit a shuffle instruction + pshufd(dest, xword[statePointer + rcx + offset], convertedSwizzle); + }; + // Here we handle what happens when using indexed addressing & we can't predict what register will be read at compile time // The index of the access is assumed to be in rax // Add source register (src) and index (rax) to form the final register @@ -268,7 +276,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 jae(maybeTemp); mov(rcx, rax); shl(rcx, 4); // rcx = rax * sizeof(vec4 of floats) = rax * 16 - movaps(dest, xword[statePointer + rcx + inputOffset]); + swizzleAndLoadReg(inputOffset); jmp(end); // If (reg < 0x1F) return tempRegisters[reg - 0x10] @@ -277,7 +285,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 jae(maybeUniform); lea(rcx, qword[rax - 0x10]); shl(rcx, 4); - movaps(dest, xword[statePointer + rcx + tempOffset]); + swizzleAndLoadReg(tempOffset); jmp(end); // If (reg < 0x80) return floatUniforms[reg - 0x20] @@ -286,7 +294,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 jae(unknownReg); lea(rcx, qword[rax - 0x20]); shl(rcx, 4); - movaps(dest, 
xword[statePointer + rcx + uniformOffset]); + swizzleAndLoadReg(uniformOffset); jmp(end); L(unknownReg); @@ -844,7 +852,24 @@ void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) { add(eax, 1); // The iteration count is actually uniform.x + 1 mov(dword[statePointer + loopRegOffset], ecx); // Set loop counter - Helpers::panic("Unimplemented LOOP instruction"); + // TODO: This might break if an instruction in a loop decides to yield... + push(rax); // Push loop iteration counter + push(rdx); // Push loop increment + + Label loopStart; + L(loopStart); + compileUntil(shader, dest + 1); + + const size_t stackOffsetOfLoopIncrement = 0; + const size_t stackOffsetOfIterationCounter = stackOffsetOfLoopIncrement + 8; + + mov(ecx, dword[rsp + stackOffsetOfLoopIncrement]); // ecx = Loop increment + add(dword[statePointer + loopRegOffset], ecx); // Increment loop counter + sub(dword[rsp + stackOffsetOfIterationCounter], 1); // Subtract 1 from loop iteration counter + + jnz(loopStart); // Back to loop start if not over + add(rsp, 16); + loopLevel--; } void ShaderEmitter::printLog(const PICAShader& shaderUnit) { @@ -852,12 +877,12 @@ void ShaderEmitter::printLog(const PICAShader& shaderUnit) { for (int i = 0; i < shaderUnit.tempRegisters.size(); i++) { const auto& r = shaderUnit.tempRegisters[i]; - printf("t%d: (%f, %f, %f, %f)\n", i, r[0].toFloat64(), r[1].toFloat64(), r[2].toFloat64(), r[3].toFloat64()); + printf("t%d: (%.2f, %.2f, %.2f, %.2f)\n", i, r[0].toFloat64(), r[1].toFloat64(), r[2].toFloat64(), r[3].toFloat64()); } for (int i = 0; i < shaderUnit.outputs.size(); i++) { const auto& r = shaderUnit.outputs[i]; - printf("o%d: (%f, %f, %f, %f)\n", i, r[0].toFloat64(), r[1].toFloat64(), r[2].toFloat64(), r[3].toFloat64()); + printf("o%d: (%.2f, %.2f, %.2f, %.2f)\n", i, r[0].toFloat64(), r[1].toFloat64(), r[2].toFloat64(), r[3].toFloat64()); } printf("addr: (%d, %d)\n", shaderUnit.addrRegister[0], shaderUnit.addrRegister[1]);