mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-06 06:05:40 +12:00
WIP arm64 shader recompiler
This commit is contained in:
parent
281a7eefbf
commit
c0621d0760
7 changed files with 383 additions and 4 deletions
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -46,3 +46,6 @@
|
|||
[submodule "third_party/zep"]
|
||||
path = third_party/zep
|
||||
url = https://github.com/Panda3DS-emu/zep
|
||||
[submodule "third_party/oaknut"]
|
||||
path = third_party/oaknut
|
||||
url = https://github.com/merryhime/oaknut
|
||||
|
|
|
@ -128,6 +128,9 @@ endif()
|
|||
# Check for arm64
|
||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
||||
set(HOST_ARM64 TRUE)
|
||||
add_subdirectory(third_party/oaknut) # Add Oaknut submodule for arm64 JITs
|
||||
include_directories(third_party/oaknut/include)
|
||||
add_compile_definitions(PANDA3DS_DYNAPICA_SUPPORTED)
|
||||
add_compile_definitions(PANDA3DS_ARM64_HOST)
|
||||
else()
|
||||
set(HOST_ARM64 FALSE)
|
||||
|
@ -172,6 +175,7 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services
|
|||
set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp
|
||||
src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
|
||||
src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
|
||||
src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
|
||||
)
|
||||
|
||||
set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
|
||||
|
@ -236,6 +240,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
|
|||
include/services/news_u.hpp include/applets/software_keyboard.hpp include/applets/applet_manager.hpp include/fs/archive_user_save_data.hpp
|
||||
include/services/amiibo_device.hpp include/services/nfc_types.hpp include/swap.hpp include/services/csnd.hpp include/services/nwm_uds.hpp
|
||||
include/fs/archive_system_save_data.hpp include/lua_manager.hpp include/memory_mapped_file.hpp include/hydra_icon.hpp
|
||||
include/PICA/dynapica/shader_rec_emitter_arm64.hpp
|
||||
)
|
||||
|
||||
cmrc_add_resource_library(
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
#pragma once
|
||||
#include "PICA/shader.hpp"
|
||||
|
||||
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST)
|
||||
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && (defined(PANDA3DS_X64_HOST) || defined(PANDA3DS_ARM64_HOST))
|
||||
#define PANDA3DS_SHADER_JIT_SUPPORTED
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
#ifdef PANDA3DS_X64_HOST
|
||||
#include "shader_rec_emitter_x64.hpp"
|
||||
#elif defined(PANDA3DS_ARM64_HOST)
|
||||
#include "shader_rec_emitter_arm64.hpp"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
129
include/PICA/dynapica/shader_rec_emitter_arm64.hpp
Normal file
129
include/PICA/dynapica/shader_rec_emitter_arm64.hpp
Normal file
|
@ -0,0 +1,129 @@
|
|||
#pragma once
|
||||
|
||||
// Only do anything if we're on an arm64 target with JIT support enabled
|
||||
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST)
|
||||
#include <array>
|
||||
#include <oaknut/code_block.hpp>
|
||||
#include <oaknut/oaknut.hpp>
|
||||
|
||||
#include "PICA/shader.hpp"
|
||||
#include "helpers.hpp"
|
||||
#include "logger.hpp"
|
||||
|
||||
// arm64 backend of the PICA shader recompiler.
// Owns an executable oaknut code block and emits host code for each PICA instruction into it.
class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
	// Executable memory reserved per shader: 96 bytes for every PICA instruction slot
	static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96;
	// Total allocation: the budget above plus 0x1000 bytes of slack in case emitted code ever overruns it
	static constexpr size_t allocSize = executableMemorySize + 0x1000;

	// Swizzle field value meaning .xyzw, i.e. the lanes are already in order and need no shuffle
	static constexpr uint noSwizzle = 0x1B;

	using f24 = Floats::f24;
	using vec4f = std::array<f24, 4>;

	// One label per PICA instruction slot, bound to the arm64 code emitted for that instruction
	std::array<oaknut::Label, PICAShader::maxInstructionCount> instructionLabels;
	// PCs at which a call may have to return, depending on the state of the PICA callstack.
	// Collected by scanCode() before compilation starts
	std::vector<u32> returnPCs;

	// Label for a (1.0, 1.0, 1.0, 1.0) vector constant, intended for SLT(i)/SGE(i)
	oaknut::Label onesVector;

	// PC of the instruction the recompiler is currently working on
	u32 recompilerPC = 0;
	// Current loop nesting depth (0 = not inside a loop)
	u32 loopLevel = 0;

	// Set by scanCode() when the loaded shader contains LG2 / EX2 instructions
	bool codeHasLog2 = false;
	bool codeHasExp2 = false;

	// Labels and emitters for the log2 / exp2 helper routines
	oaknut::Label log2Func, exp2Func;
	oaknut::Label emitLog2Func();
	oaknut::Label emitExp2Func();

	// Compile every instruction in [current recompiler PC, endPC)
	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
	// Compile the single instruction at the current recompiler PC and advance the PC
	void compileInstruction(const PICAShader& shaderUnit);

	// True if the instruction word encodes one of the call opcodes (CALL, CALLC, CALLU)
	bool isCall(u32 instruction) {
		switch (instruction >> 26) {
			case ShaderOpcodes::CALL:
			case ShaderOpcodes::CALLC:
			case ShaderOpcodes::CALLU: return true;
			default: return false;
		}
	}

	// Pre-pass over the shader code: fills returnPCs from the call instructions
	// and records whether log2/exp2 instructions are present
	void scanCode(const PICAShader& shaderUnit);

	// Load PICA source register "src", indexed by "idx", into the arm64 register "dest".
	// sourceIndex selects which operand (SRC1/SRC2/SRC3) of the operand descriptor supplies the swizzle and negate bits
	template <int sourceIndex>
	void loadRegister(oaknut::QReg dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor);
	// Store the arm64 register "source" into PICA destination register "dest"
	void storeRegister(oaknut::QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor);

	// Map a PICA source/destination register number to the backing vector inside the shader state
	const vec4f& getSourceRef(const PICAShader& shader, u32 src);
	const vec4f& getDestRef(const PICAShader& shader, u32 dest);

	// Check the value of the cmp register for instructions like ifc and callc
	// Documented result: returned in the zero flag, zero == 1 if the comparison is true, else zero == 0
	// NOTE(review): no arm64 implementation is visible here; confirm the flag convention once it exists
	void checkCmpRegister(const PICAShader& shader, u32 instruction);

	// Check the value of the bool uniform for instructions like ifu and callu
	// Documented result: returned in the zero flag, zero == 0 if the comparison is true, else zero == 1
	// (the opposite of checkCmpRegister)
	// NOTE(review): no arm64 implementation is visible here; confirm the flag convention once it exists
	void checkBoolUniform(const PICAShader& shader, u32 instruction);

	// Per-opcode recompilation routines
	void recADD(const PICAShader& shader, u32 instruction);
	void recCALL(const PICAShader& shader, u32 instruction);
	void recCALLC(const PICAShader& shader, u32 instruction);
	void recCALLU(const PICAShader& shader, u32 instruction);
	void recCMP(const PICAShader& shader, u32 instruction);
	void recDP3(const PICAShader& shader, u32 instruction);
	void recDP4(const PICAShader& shader, u32 instruction);
	void recDPH(const PICAShader& shader, u32 instruction);
	void recEMIT(const PICAShader& shader, u32 instruction);
	void recEND(const PICAShader& shader, u32 instruction);
	void recEX2(const PICAShader& shader, u32 instruction);
	void recFLR(const PICAShader& shader, u32 instruction);
	void recIFC(const PICAShader& shader, u32 instruction);
	void recIFU(const PICAShader& shader, u32 instruction);
	void recJMPC(const PICAShader& shader, u32 instruction);
	void recJMPU(const PICAShader& shader, u32 instruction);
	void recLG2(const PICAShader& shader, u32 instruction);
	void recLOOP(const PICAShader& shader, u32 instruction);
	void recMAD(const PICAShader& shader, u32 instruction);
	void recMAX(const PICAShader& shader, u32 instruction);
	void recMIN(const PICAShader& shader, u32 instruction);
	void recMOVA(const PICAShader& shader, u32 instruction);
	void recMOV(const PICAShader& shader, u32 instruction);
	void recMUL(const PICAShader& shader, u32 instruction);
	void recRCP(const PICAShader& shader, u32 instruction);
	void recRSQ(const PICAShader& shader, u32 instruction);
	void recSETEMIT(const PICAShader& shader, u32 instruction);
	void recSGE(const PICAShader& shader, u32 instruction);
	void recSLT(const PICAShader& shader, u32 instruction);

	MAKE_LOG_FUNCTION(log, shaderJITLogger)

  public:
	// Signature of the emitted code for a single instruction entrypoint
	using InstructionCallback = const void (*)(PICAShader& shaderUnit);
	// Signature of the JIT prologue, which is what callers actually invoke
	using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb);

	PrologueCallback prologueCb = nullptr;

	// Allocate "allocSize" bytes for the code buffer and point the code generator at it
	ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {}

	// Return the emitted code for the instruction at "pc" as a callable pointer.
	// pc must be a valid entrypoint; std::array<>::at() is used so an out-of-range pc is caught
	InstructionCallback getInstructionCallback(u32 pc) {
		// Fetch a mutable byte pointer first: converting straight to a function pointer is awkward otherwise.
		// This is legal as long as nothing is written through it
		uint8_t* const entry = instructionLabels.at(pc).ptr<u8*>();
		return reinterpret_cast<InstructionCallback>(entry);
	}

	PrologueCallback getPrologueCallback() { return prologueCb; }
	// Recompile the shader currently loaded in shaderUnit
	void compile(const PICAShader& shaderUnit);
};
|
||||
|
||||
#endif // arm64 recompiler check
|
239
src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
Normal file
239
src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
Normal file
|
@ -0,0 +1,239 @@
|
|||
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST)
|
||||
#include "PICA/dynapica/shader_rec_emitter_arm64.hpp"
|
||||
|
||||
#include <bit>
|
||||
|
||||
using namespace Helpers;
|
||||
using namespace oaknut;
|
||||
using namespace oaknut::util;
|
||||
|
||||
// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
|
||||
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
|
||||
// Host vector registers reserved by the recompiler.
// scratch1 is used by storeRegister as a temporary for the current value of the destination register
static constexpr QReg scratch1 = Q0;
|
||||
// Second scratch vector register (not referenced by the code visible in this file yet)
static constexpr QReg scratch2 = Q1;
|
||||
// Holds the first source operand of an instruction (recMOV loads SRC1 into it and stores from it)
static constexpr QReg src1_vec = Q2;
|
||||
// Reserved for the second and third source operands (not referenced by the code visible in this file yet)
static constexpr QReg src2_vec = Q3;
|
||||
static constexpr QReg src3_vec = Q4;
|
||||

||||
// Base register for all shader state accesses: loadRegister/storeRegister address registers as
// statePointer + (offset of the register from the start of the PICAShader object)
static constexpr XReg statePointer = X9;
|
||||
|
||||
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||
// Scan the code for call, exp2, log2, etc instructions which need some special care
|
||||
// After that, emit exp2 and log2 functions if the corresponding instructions are present
|
||||
scanCode(shaderUnit);
|
||||
if (codeHasExp2) Helpers::panic("arm64 shader JIT: Code has exp2");
|
||||
if (codeHasLog2) Helpers::panic("arm64 shader JIT: Code has log2");
|
||||
|
||||
align(16);
|
||||
// Compile every instruction in the shader
|
||||
// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded wtih nops that compile to nothing
|
||||
recompilerPC = 0;
|
||||
loopLevel = 0;
|
||||
compileUntil(shaderUnit, PICAShader::maxInstructionCount);
|
||||
}
|
||||
|
||||
void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
|
||||
returnPCs.clear();
|
||||
|
||||
for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) {
|
||||
const u32 instruction = shaderUnit.loadedShader[i];
|
||||
const u32 opcode = instruction >> 26;
|
||||
|
||||
if (isCall(instruction)) {
|
||||
const u32 num = instruction & 0xff;
|
||||
const u32 dest = getBits<10, 12>(instruction);
|
||||
const u32 returnPC = num + dest; // Add them to get the return PC
|
||||
|
||||
returnPCs.push_back(returnPC);
|
||||
} else if (opcode == ShaderOpcodes::EX2) {
|
||||
codeHasExp2 = true;
|
||||
} else if (opcode == ShaderOpcodes::LG2) {
|
||||
codeHasLog2 = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort return PCs so they can be binary searched
|
||||
std::sort(returnPCs.begin(), returnPCs.end());
|
||||
}
|
||||
|
||||
void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) {
|
||||
while (recompilerPC < end) {
|
||||
compileInstruction(shaderUnit);
|
||||
}
|
||||
}
|
||||
|
||||
void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
||||
// Write current location to label for this instruction
|
||||
l(instructionLabels[recompilerPC]);
|
||||
|
||||
// See if PC is a possible return PC and emit the proper code if so
|
||||
if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) {
|
||||
Helpers::panic("Unimplemented return address for call instruction");
|
||||
}
|
||||
|
||||
// Fetch instruction and inc PC
|
||||
const u32 instruction = shaderUnit.loadedShader[recompilerPC++];
|
||||
const u32 opcode = instruction >> 26;
|
||||
|
||||
switch (opcode) {
|
||||
// case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CMP1:
|
||||
// case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::DPH:
|
||||
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::NOP:
|
||||
break;
|
||||
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
|
||||
|
||||
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
|
||||
// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
|
||||
// case ShaderOpcodes::EMIT:
|
||||
// case ShaderOpcodes::SETEMIT:
|
||||
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
|
||||
// emitPrintLog(shaderUnit);
|
||||
// break;
|
||||
|
||||
// case ShaderOpcodes::BREAK:
|
||||
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
|
||||
|
||||
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
|
||||
// case 0x30:
|
||||
// case 0x31:
|
||||
// case 0x32:
|
||||
// case 0x33:
|
||||
// case 0x34:
|
||||
// case 0x35:
|
||||
// case 0x36:
|
||||
// case 0x37:
|
||||
// case 0x38:
|
||||
// case 0x39:
|
||||
// case 0x3A:
|
||||
// case 0x3B:
|
||||
// case 0x3C:
|
||||
// case 0x3D:
|
||||
// case 0x3E:
|
||||
// case 0x3F: recMAD(shaderUnit, instruction); break;
|
||||
|
||||
// case ShaderOpcodes::SLT:
|
||||
// case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break;
|
||||
|
||||
// case ShaderOpcodes::SGE:
|
||||
// case ShaderOpcodes::SGEI: recSGE(shaderUnit, instruction); break;
|
||||
|
||||
default: Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
|
||||
}
|
||||
}
|
||||
|
||||
// Map a PICA source register number to the vector backing it inside the shader state.
//   0x00-0x0F: input registers
//   0x10-0x1F: temporary registers
//   0x20-0x7F: float uniforms
// Anything above 0x7F is not handled: a warning is printed and the shader's dummy vector is returned.
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
	if (src > 0x7f) {
		Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
		return shader.dummy;
	}

	if (src >= 0x20) {
		return shader.floatUniforms[src - 0x20];
	}

	if (src >= 0x10) {
		return shader.tempRegisters[src - 0x10];
	}

	return shader.inputs[src];
}
|
||||
|
||||
// Map a PICA destination register number to the vector backing it inside the shader state.
//   0x00-0x0F: output registers
//   0x10-0x1F: temporary registers
// Any other value is not handled and panics.
const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) {
	const bool isOutput = dest < 0x10;
	const bool isTemp = !isOutput && dest < 0x20;

	if (isOutput) {
		return shader.outputs[dest];
	}
	if (isTemp) {
		return shader.tempRegisters[dest - 0x10];
	}

	Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest);
}
|
||||
|
||||
// See shader.hpp header for docs on how the swizzle and negate works
|
||||
template <int sourceIndex>
|
||||
void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) {
|
||||
u32 compSwizzle; // Component swizzle pattern for the register
|
||||
bool negate; // If true, negate all lanes of the register
|
||||
|
||||
if constexpr (sourceIndex == 1) { // SRC1
|
||||
negate = (getBit<4>(operandDescriptor)) != 0;
|
||||
compSwizzle = getBits<5, 8>(operandDescriptor);
|
||||
} else if constexpr (sourceIndex == 2) { // SRC2
|
||||
negate = (getBit<13>(operandDescriptor)) != 0;
|
||||
compSwizzle = getBits<14, 8>(operandDescriptor);
|
||||
} else if constexpr (sourceIndex == 3) { // SRC3
|
||||
negate = (getBit<22>(operandDescriptor)) != 0;
|
||||
compSwizzle = getBits<23, 8>(operandDescriptor);
|
||||
}
|
||||
|
||||
switch (index) {
|
||||
case 0:
|
||||
[[likely]] { // Keep src as is, no need to offset it
|
||||
const vec4f& srcRef = getSourceRef(shader, src);
|
||||
const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
|
||||
|
||||
LDR(dest, statePointer, offset);
|
||||
switch (compSwizzle) {
|
||||
case noSwizzle: break; // .xyzw
|
||||
case 0x0: DUP(dest.S4(), dest.Selem()[0]); break; // .xxxx
|
||||
case 0x55: DUP(dest.S4(), dest.Selem()[1]); break; // .yyyy
|
||||
case 0xAA: DUP(dest.S4(), dest.Selem()[2]); break; // .zzzz
|
||||
case 0xFF: DUP(dest.S4(), dest.Selem()[3]); break; // .wwww
|
||||
default: Helpers::panic("Unimplemented swizzle pattern for loading");
|
||||
}
|
||||
|
||||
// Negate the register if necessary
|
||||
if (negate) {
|
||||
FNEG(dest.S4(), dest.S4());
|
||||
}
|
||||
return; // Return. Rest of the function handles indexing which is not used if index == 0
|
||||
}
|
||||
|
||||
default: Helpers::panic("[ShaderJIT]: Unimplemented source index type %d", index);
|
||||
}
|
||||
|
||||
Helpers::panic("Unimplemented indexed register load");
|
||||
}
|
||||
|
||||
// Emit code that writes the arm64 vector register "source" to PICA destination register "dest".
// The low 4 bits of the operand descriptor select which lanes are written.
// Only the all-lanes case is implemented; a partial write mask panics.
void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
	// Locate the destination inside the shader state, as a byte offset from the start of the state struct
	const vec4f& target = getDestRef(shader, dest);
	const uintptr_t stateOffset = uintptr_t(&target) - uintptr_t(&shader);

	const u32 laneMask = operandDescriptor & 0xf;
	if (laneMask != 0xf) {
		// Some lanes are masked off: fetch the current destination value so it could be merged with source
		LDR(scratch1, statePointer, stateOffset);
		Helpers::panic("Unimplemented: Storing to register with blending");
	} else {
		// Every lane is written, so a plain full-width store is enough
		STR(source, statePointer, stateOffset);
	}
}
|
||||
|
||||
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src = getBits<12, 7>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -342,10 +342,10 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
|
|||
} else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss
|
||||
int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc)
|
||||
size_t index = 3 - bit;
|
||||
const uintptr_t lane_offset = offset + index * sizeof(float);
|
||||
const uintptr_t laneOffset = offset + index * sizeof(float);
|
||||
|
||||
if (index == 0) { // Bottom lane, no need to shift
|
||||
movss(dword[statePointer + lane_offset], source);
|
||||
movss(dword[statePointer + laneOffset], source);
|
||||
} else { // Shift right by 32 * index, then write bottom lane
|
||||
if (haveAVX) {
|
||||
vpsrldq(scratch1, source, index * sizeof(float));
|
||||
|
@ -353,7 +353,7 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
|
|||
movaps(scratch1, source);
|
||||
psrldq(scratch1, index * sizeof(float));
|
||||
}
|
||||
movss(dword[statePointer + lane_offset], scratch1);
|
||||
movss(dword[statePointer + laneOffset], scratch1);
|
||||
}
|
||||
} else if (haveSSE4_1) {
|
||||
// Bit reverse the write mask because that is what blendps expects
|
||||
|
|
1
third_party/oaknut
vendored
Submodule
1
third_party/oaknut
vendored
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 1d51f551294897ab4c8001c5259c8c5dee7e2a85
|
Loading…
Add table
Reference in a new issue