WIP arm64 shader recompiler

This commit is contained in:
wheremyfoodat 2024-01-03 00:39:36 +02:00
parent 281a7eefbf
commit c0621d0760
7 changed files with 383 additions and 4 deletions

3
.gitmodules vendored
View file

@ -46,3 +46,6 @@
[submodule "third_party/zep"]
path = third_party/zep
url = https://github.com/Panda3DS-emu/zep
[submodule "third_party/oaknut"]
path = third_party/oaknut
url = https://github.com/merryhime/oaknut

View file

@ -128,6 +128,9 @@ endif()
# Check for arm64
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
set(HOST_ARM64 TRUE)
add_subdirectory(third_party/oaknut) # Add Oaknut submodule for arm64 JITs
include_directories(third_party/oaknut/include)
add_compile_definitions(PANDA3DS_DYNAPICA_SUPPORTED)
add_compile_definitions(PANDA3DS_ARM64_HOST)
else()
set(HOST_ARM64 FALSE)
@ -172,6 +175,7 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services
set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp
src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
)
set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
@ -236,6 +240,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
include/services/news_u.hpp include/applets/software_keyboard.hpp include/applets/applet_manager.hpp include/fs/archive_user_save_data.hpp
include/services/amiibo_device.hpp include/services/nfc_types.hpp include/swap.hpp include/services/csnd.hpp include/services/nwm_uds.hpp
include/fs/archive_system_save_data.hpp include/lua_manager.hpp include/memory_mapped_file.hpp include/hydra_icon.hpp
include/PICA/dynapica/shader_rec_emitter_arm64.hpp
)
cmrc_add_resource_library(

View file

@ -1,13 +1,15 @@
#pragma once
#include "PICA/shader.hpp"
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST)
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && (defined(PANDA3DS_X64_HOST) || defined(PANDA3DS_ARM64_HOST))
#define PANDA3DS_SHADER_JIT_SUPPORTED
#include <memory>
#include <unordered_map>
#ifdef PANDA3DS_X64_HOST
#include "shader_rec_emitter_x64.hpp"
#elif defined(PANDA3DS_ARM64_HOST)
#include "shader_rec_emitter_arm64.hpp"
#endif
#endif

View file

@ -0,0 +1,129 @@
#pragma once
// Only do anything if we're on an arm64 target with JIT support enabled
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST)
#include <array>
#include <oaknut/code_block.hpp>
#include <oaknut/oaknut.hpp>
#include "PICA/shader.hpp"
#include "helpers.hpp"
#include "logger.hpp"
// JIT emitter that recompiles a PICA shader program into arm64 machine code using oaknut.
// It privately owns an oaknut::CodeBlock of allocSize bytes (see the constructor) and exposes the
// oaknut::CodeGenerator interface, which is pointed at that block's memory, to emit instructions into it.
class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader
// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
static constexpr size_t allocSize = executableMemorySize + 0x1000;
// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
static constexpr uint noSwizzle = 0x1B;
using f24 = Floats::f24;
using vec4f = std::array<f24, 4>;
// An array of labels (incl pointers) to each compiled (to arm64) PICA instruction
// compileInstruction binds instructionLabels[pc] right before emitting the code for the instruction at "pc"
std::array<oaknut::Label, PICAShader::maxInstructionCount> instructionLabels;
// A vector of PCs that can potentially return based on the state of the PICA callstack.
// Filled before compiling a shader by scanning the code for call instructions
// Kept sorted by scanCode so that it can be binary searched
std::vector<u32> returnPCs;
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
oaknut::Label onesVector;
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop)
// Shows whether the loaded shader has any log2 and exp2 instructions
bool codeHasLog2 = false;
bool codeHasExp2 = false;
oaknut::Label log2Func, exp2Func;
oaknut::Label emitLog2Func();
oaknut::Label emitExp2Func();
// Compile all instructions from [current recompiler PC, end)
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
// Compile the instruction at the current recompiler PC, then advance the recompiler PC past it
void compileInstruction(const PICAShader& shaderUnit);
// Returns true if the opcode of "instruction" (its top 6 bits) is CALL, CALLC or CALLU
bool isCall(u32 instruction) {
const u32 opcode = instruction >> 26;
return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU);
}
// Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation
// We also scan for log2/exp2 instructions to see whether to emit the relevant functions
void scanCode(const PICAShader& shaderUnit);
// Emit code that loads the PICA source register number "src", indexed by index "idx", into the arm64 vector register "dest"
// The "sourceIndex" template parameter (1, 2 or 3) picks which negate/swizzle fields of "operandDescriptor" apply to the load
template <int sourceIndex>
void loadRegister(oaknut::QReg dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor);
// Emit code that stores the arm64 vector register "source" to the PICA destination register number "dest"
// The low 4 bits of "operandDescriptor" are the lane write mask
void storeRegister(oaknut::QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor);
// Map a PICA source/destination register number to the corresponding vec4f inside "shader"
// These are only used at compile time to compute offsets from the start of the shader state
const vec4f& getSourceRef(const PICAShader& shader, u32 src);
const vec4f& getDestRef(const PICAShader& shader, u32 dest);
// Check the value of the cmp register for instructions like ifc and callc
// Result is returned in the zero flag. If the comparison is true then zero == 1, else zero == 0
void checkCmpRegister(const PICAShader& shader, u32 instruction);
// Check the value of the bool uniform for instructions like ifu and callu
// Result is returned in the zero flag. If the comparison is true then zero == 0, else zero == 1 (Opposite of checkCmpRegister)
void checkBoolUniform(const PICAShader& shader, u32 instruction);
// Instruction recompilation functions
// Each one takes the shader state and the raw 32-bit instruction word to recompile
void recADD(const PICAShader& shader, u32 instruction);
void recCALL(const PICAShader& shader, u32 instruction);
void recCALLC(const PICAShader& shader, u32 instruction);
void recCALLU(const PICAShader& shader, u32 instruction);
void recCMP(const PICAShader& shader, u32 instruction);
void recDP3(const PICAShader& shader, u32 instruction);
void recDP4(const PICAShader& shader, u32 instruction);
void recDPH(const PICAShader& shader, u32 instruction);
void recEMIT(const PICAShader& shader, u32 instruction);
void recEND(const PICAShader& shader, u32 instruction);
void recEX2(const PICAShader& shader, u32 instruction);
void recFLR(const PICAShader& shader, u32 instruction);
void recIFC(const PICAShader& shader, u32 instruction);
void recIFU(const PICAShader& shader, u32 instruction);
void recJMPC(const PICAShader& shader, u32 instruction);
void recJMPU(const PICAShader& shader, u32 instruction);
void recLG2(const PICAShader& shader, u32 instruction);
void recLOOP(const PICAShader& shader, u32 instruction);
void recMAD(const PICAShader& shader, u32 instruction);
void recMAX(const PICAShader& shader, u32 instruction);
void recMIN(const PICAShader& shader, u32 instruction);
void recMOVA(const PICAShader& shader, u32 instruction);
void recMOV(const PICAShader& shader, u32 instruction);
void recMUL(const PICAShader& shader, u32 instruction);
void recRCP(const PICAShader& shader, u32 instruction);
void recRSQ(const PICAShader& shader, u32 instruction);
void recSETEMIT(const PICAShader& shader, u32 instruction);
void recSGE(const PICAShader& shader, u32 instruction);
void recSLT(const PICAShader& shader, u32 instruction);
MAKE_LOG_FUNCTION(log, shaderJITLogger)
public:
// Callback type used for instructions
using InstructionCallback = const void (*)(PICAShader& shaderUnit);
// Callback type used for the JIT prologue. This is what the caller will call
using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb);
// NOTE(review): nothing in this header assigns prologueCb, so it stays nullptr unless the .cpp sets it - confirm before calling it
PrologueCallback prologueCb = nullptr;
// Initialize our emitter with "allocSize" bytes of memory allocated for the code buffer
ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {}
// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
InstructionCallback getInstructionCallback(u32 pc) {
// Fetch the address the label for this PC points to as a byte pointer, then reinterpret it as a callable function pointer
uint8_t* ptr = instructionLabels.at(pc).ptr<u8*>();
return reinterpret_cast<InstructionCallback>(ptr);
}
// Returns the current value of prologueCb
PrologueCallback getPrologueCallback() { return prologueCb; }
// Recompile the program loaded in "shaderUnit", starting from PC 0
void compile(const PICAShader& shaderUnit);
};
#endif // arm64 recompiler check

View file

@ -0,0 +1,239 @@
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST)
#include "PICA/dynapica/shader_rec_emitter_arm64.hpp"
#include <bit>
using namespace Helpers;
using namespace oaknut;
using namespace oaknut::util;
// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
// Scratch vector registers. scratch1 is what storeRegister loads the old destination value into on its masked-store path
static constexpr QReg scratch1 = Q0;
static constexpr QReg scratch2 = Q1;
// Vector registers meant to hold the (up to 3) source operands of an instruction. recMOV loads its operand into src1_vec
static constexpr QReg src1_vec = Q2;
static constexpr QReg src2_vec = Q3;
static constexpr QReg src3_vec = Q4;
// Base register for every LDR/STR emitted by loadRegister/storeRegister; offsets are relative to the start of the PICAShader object
// NOTE(review): presumably loaded with the PICAShader address by a prologue that is not part of this file yet - confirm
static constexpr XReg statePointer = X9;
// Recompile the whole program loaded in shaderUnit, starting from PC 0
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
	// Pre-pass over the code: collects the possible return PCs of call instructions
	// and records whether any exp2/log2 instructions are present
	scanCode(shaderUnit);

	// The exp2/log2 helper functions are not emitted by the arm64 backend yet, so bail out if the shader needs them
	if (codeHasExp2) {
		Helpers::panic("arm64 shader JIT: Code has exp2");
	}
	if (codeHasLog2) {
		Helpers::panic("arm64 shader JIT: Code has log2");
	}

	align(16);

	// Start from a clean recompiler state, then compile every single instruction slot in the shader
	// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded with nops that compile to nothing
	loopLevel = 0;
	recompilerPC = 0;
	compileUntil(shaderUnit, PICAShader::maxInstructionCount);
}
// Pre-compilation pass over the whole shader program.
// - Fills returnPCs with the PC following the target range of every CALL/CALLC/CALLU (sorted, so it can be binary searched)
// - Sets codeHasExp2 / codeHasLog2 if the program contains any EX2 / LG2 instruction
// All of the above state is reset first, so results from a previous scan never leak into this one
void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
	returnPCs.clear();
	// Reset the flags alongside returnPCs. Otherwise a second scan on the same emitter would inherit
	// stale "has exp2/log2" results from the previously scanned program
	codeHasExp2 = false;
	codeHasLog2 = false;

	for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) {
		const u32 instruction = shaderUnit.loadedShader[i];
		const u32 opcode = instruction >> 26;

		if (isCall(instruction)) {
			const u32 num = instruction & 0xff;             // Low 8 bits of the call instruction
			const u32 dest = getBits<10, 12>(instruction);  // 12-bit field starting at bit 10
			const u32 returnPC = num + dest;                // Add them to get the return PC

			returnPCs.push_back(returnPC);
		} else if (opcode == ShaderOpcodes::EX2) {
			codeHasExp2 = true;
		} else if (opcode == ShaderOpcodes::LG2) {
			codeHasLog2 = true;
		}
	}

	// Sort return PCs so they can be binary searched
	std::sort(returnPCs.begin(), returnPCs.end());
}
// Compile instructions one by one until the recompiler PC reaches "end" (exclusive)
void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) {
	// compileInstruction advances recompilerPC itself, so the loop has no increment step of its own
	for (; recompilerPC < end;) {
		compileInstruction(shaderUnit);
	}
}
void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
// Write current location to label for this instruction
l(instructionLabels[recompilerPC]);
// See if PC is a possible return PC and emit the proper code if so
if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) {
Helpers::panic("Unimplemented return address for call instruction");
}
// Fetch instruction and inc PC
const u32 instruction = shaderUnit.loadedShader[recompilerPC++];
const u32 opcode = instruction >> 26;
switch (opcode) {
// case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
// case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break;
// case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break;
// case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break;
// case ShaderOpcodes::CMP1:
// case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break;
// case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
// case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
// case ShaderOpcodes::DPH:
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
// case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
// case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
// case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
// case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break;
// case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break;
// case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break;
// case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break;
// case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
// case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
// case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
// case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
// case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
case ShaderOpcodes::NOP:
break;
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
// case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
// case ShaderOpcodes::EMIT:
// case ShaderOpcodes::SETEMIT:
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
// emitPrintLog(shaderUnit);
// break;
// case ShaderOpcodes::BREAK:
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
// case 0x30:
// case 0x31:
// case 0x32:
// case 0x33:
// case 0x34:
// case 0x35:
// case 0x36:
// case 0x37:
// case 0x38:
// case 0x39:
// case 0x3A:
// case 0x3B:
// case 0x3C:
// case 0x3D:
// case 0x3E:
// case 0x3F: recMAD(shaderUnit, instruction); break;
// case ShaderOpcodes::SLT:
// case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break;
// case ShaderOpcodes::SGE:
// case ShaderOpcodes::SGEI: recSGE(shaderUnit, instruction); break;
default: Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
}
}
// Map a PICA source register number to the vec4f it refers to inside the shader state:
//   0x00-0x0F -> input registers
//   0x10-0x1F -> temporary registers
//   0x20-0x7F -> float uniforms
// Anything above that is unimplemented: we warn and hand back the dummy register
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
	if (src > 0x7f) {
		Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
		return shader.dummy;
	}

	if (src >= 0x20) {
		return shader.floatUniforms[src - 0x20];
	}

	if (src >= 0x10) {
		return shader.tempRegisters[src - 0x10];
	}

	return shader.inputs[src];
}
// Map a PICA destination register number to the vec4f it refers to inside the shader state:
//   0x00-0x0F -> output registers
//   0x10-0x1F -> temporary registers
// Any other destination is unimplemented and panics
const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) {
	if (dest >= 0x20) {
		Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest);
	}

	if (dest >= 0x10) {
		return shader.tempRegisters[dest - 0x10];
	}

	return shader.outputs[dest];
}
// See shader.hpp header for docs on how the swizzle and negate works
// Emit code that loads a PICA source register into the arm64 vector register "dest".
//   sourceIndex:       Which source operand slot this is (1, 2 or 3). Picks the negate bit and swizzle field of the descriptor
//   dest:              arm64 vector register to load into
//   shader:            Shader state; only used at compile time to compute the register's offset from statePointer
//   src:               PICA source register number (see getSourceRef)
//   index:             Index mode. Only 0 (no indexing) is implemented, anything else panics
//   operandDescriptor: Operand descriptor holding the negate bits and component swizzles
// Only the identity swizzle and the 4 broadcast swizzles (.xxxx/.yyyy/.zzzz/.wwww) are implemented, other patterns panic
template <int sourceIndex>
void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) {
	// Without this, an out of range sourceIndex would silently skip every branch below and leave both locals unset
	static_assert(sourceIndex >= 1 && sourceIndex <= 3, "loadRegister: sourceIndex must be 1, 2 or 3");

	u32 compSwizzle = noSwizzle;  // Component swizzle pattern for the register
	bool negate = false;          // If true, negate all lanes of the register

	if constexpr (sourceIndex == 1) {  // SRC1
		negate = (getBit<4>(operandDescriptor)) != 0;
		compSwizzle = getBits<5, 8>(operandDescriptor);
	} else if constexpr (sourceIndex == 2) {  // SRC2
		negate = (getBit<13>(operandDescriptor)) != 0;
		compSwizzle = getBits<14, 8>(operandDescriptor);
	} else if constexpr (sourceIndex == 3) {  // SRC3
		negate = (getBit<22>(operandDescriptor)) != 0;
		compSwizzle = getBits<23, 8>(operandDescriptor);
	}

	switch (index) {
		case 0:
			[[likely]] {  // Keep src as is, no need to offset it
				const vec4f& srcRef = getSourceRef(shader, src);
				const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader);  // Calculate offset of register from start of the state struct

				LDR(dest, statePointer, offset);

				// Apply the swizzle. Broadcast patterns are done by duplicating a single lane across the whole vector
				switch (compSwizzle) {
					case noSwizzle: break;                                // .xyzw
					case 0x0: DUP(dest.S4(), dest.Selem()[0]); break;   // .xxxx
					case 0x55: DUP(dest.S4(), dest.Selem()[1]); break;  // .yyyy
					case 0xAA: DUP(dest.S4(), dest.Selem()[2]); break;  // .zzzz
					case 0xFF: DUP(dest.S4(), dest.Selem()[3]); break;  // .wwww
					default: Helpers::panic("Unimplemented swizzle pattern for loading");
				}

				// Negate the register if necessary
				if (negate) {
					FNEG(dest.S4(), dest.S4());
				}

				return;  // Return. Rest of the function handles indexing which is not used if index == 0
			}

		default: Helpers::panic("[ShaderJIT]: Unimplemented source index type %d", index);
	}

	Helpers::panic("Unimplemented indexed register load");
}
// Emit code that stores the arm64 vector register "source" to the PICA destination register "dest".
// The low 4 bits of the operand descriptor are the lane write mask. Only full (all 4 lanes) writes are implemented
void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
	// Byte offset of the destination register from the start of the state struct
	const vec4f& target = getDestRef(shader, dest);
	const uintptr_t offset = uintptr_t(&target) - uintptr_t(&shader);

	// Mask of which lanes to write
	const u32 writeMask = operandDescriptor & 0xf;
	const bool writesAllLanes = (writeMask == 0xf);

	if (!writesAllLanes) {
		// Partial write: fetch the current contents of the destination so the new lanes can be blended in
		LDR(scratch1, statePointer, offset);
		Helpers::panic("Unimplemented: Storing to register with blending");
	} else {
		// No lanes are masked, a plain STR of the whole vector does the job
		STR(source, statePointer, offset);
	}
}
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src = getBits<12, 7>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
#endif

View file

@ -342,10 +342,10 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
} else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss
int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc)
size_t index = 3 - bit;
const uintptr_t lane_offset = offset + index * sizeof(float);
const uintptr_t laneOffset = offset + index * sizeof(float);
if (index == 0) { // Bottom lane, no need to shift
movss(dword[statePointer + lane_offset], source);
movss(dword[statePointer + laneOffset], source);
} else { // Shift right by 32 * index, then write bottom lane
if (haveAVX) {
vpsrldq(scratch1, source, index * sizeof(float));
@ -353,7 +353,7 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
movaps(scratch1, source);
psrldq(scratch1, index * sizeof(float));
}
movss(dword[statePointer + lane_offset], scratch1);
movss(dword[statePointer + laneOffset], scratch1);
}
} else if (haveSSE4_1) {
// Bit reverse the write mask because that is what blendps expects

1
third_party/oaknut vendored Submodule

@ -0,0 +1 @@
Subproject commit 1d51f551294897ab4c8001c5259c8c5dee7e2a85