diff --git a/.github/mac-bundle.sh b/.github/mac-bundle.sh new file mode 100755 index 00000000..83947a24 --- /dev/null +++ b/.github/mac-bundle.sh @@ -0,0 +1,43 @@ +# Taken from pcsx-redux create-app-bundle.sh +# For Plist buddy +PATH="$PATH:/usr/libexec" + + +# Construct the app iconset. +mkdir alber.iconset +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 16x16 alber.iconset/icon_16x16.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 32x32 alber.iconset/icon_16x16@2x.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 32x32 alber.iconset/icon_32x32.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 64x64 alber.iconset/icon_32x32@2x.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 128x128 alber.iconset/icon_128x128.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 256x256 alber.iconset/icon_128x128@2x.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 256x256 alber.iconset/icon_256x256.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 512x512 alber.iconset/icon_256x256@2x.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 512x512 alber.iconset/icon_512x512.png +convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 1024x1024 alber.iconset/icon_512x512@2x.png +iconutil --convert icns alber.iconset + +# Set up the .app directory +mkdir -p Alber.app/Contents/MacOS/Libraries +mkdir Alber.app/Contents/Resources + + +# Copy binary into App +cp ./build/Alber Alber.app/Contents/MacOS/Alber +chmod a+x Alber.app/Contents/Macos/Alber + +# Copy 
icons into App +cp alber.icns Alber.app/Contents/Resources/AppIcon.icns + +# Fix up Plist stuff +PlistBuddy Alber.app/Contents/Info.plist -c "add CFBundleDisplayName string Alber" +PlistBuddy Alber.app/Contents/Info.plist -c "add CFBundleIconName string AppIcon" +PlistBuddy Alber.app/Contents/Info.plist -c "add CFBundleIconFile string AppIcon" +PlistBuddy Alber.app/Contents/Info.plist -c "add NSHighResolutionCapable bool true" +PlistBuddy Alber.app/Contents/version.plist -c "add ProjectName string Alber" + +# Bundle dylibs +dylibbundler -od -b -x Alber.app/Contents/MacOS/Alber -d Alber.app/Contents/Frameworks/ -p @rpath + +# relative rpath +install_name_tool -add_rpath @loader_path/../Frameworks Alber.app/Contents/MacOS/Alber \ No newline at end of file diff --git a/.github/workflows/MacOS_Build.yml b/.github/workflows/MacOS_Build.yml index 5e0de4bc..d3443faf 100644 --- a/.github/workflows/MacOS_Build.yml +++ b/.github/workflows/MacOS_Build.yml @@ -32,8 +32,20 @@ jobs: # Build your program with the given configuration run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - - name: Upload executable + - name: Install bundle dependencies + run: brew install dylibbundler imagemagick + + - name: Run bundle script + run: ./.github/mac-bundle.sh + + - name: Sign the App + run: codesign --force -s - -vvvv Alber.app + + - name: Zip it up + run: zip -r Alber Alber.app + + - name: Upload MacOS App uses: actions/upload-artifact@v2 with: - name: MacOS executable - path: './build/Alber' + name: MacOS Alber App Bundle + path: 'Alber.zip' diff --git a/.gitmodules b/.gitmodules index a2cac3f2..1b629d30 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,3 +25,6 @@ [submodule "stb"] path = third_party/stb url = https://github.com/nothings/stb +[submodule "third_party/cmrc"] + path = third_party/cmrc + url = https://github.com/vector-of-bool/cmrc diff --git a/CMakeLists.txt b/CMakeLists.txt index d276af52..1d9c5b07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -19,6 +19,7 @@ endif() option(DISABLE_PANIC_DEV "Make a build with fewer and less intrusive asserts" OFF) option(GPU_DEBUG_INFO "Enable additional GPU debugging info" OFF) +option(ENABLE_OPENGL "Enable OpenGL rendering backend" ON) option(ENABLE_LTO "Enable link-time optimization" OFF) option(ENABLE_USER_BUILD "Make a user-facing build. These builds have various assertions disabled, LTO, and more" OFF) option(ENABLE_HTTP_SERVER "Enable HTTP server. Used for Discord bot support" OFF) @@ -45,11 +46,13 @@ set(SDL_STATIC ON CACHE BOOL "" FORCE) set(SDL_SHARED OFF CACHE BOOL "" FORCE) set(SDL_TEST OFF CACHE BOOL "" FORCE) add_subdirectory(third_party/SDL2) -add_subdirectory(third_party/glad) + add_subdirectory(third_party/toml11) include_directories(${SDL2_INCLUDE_DIR}) include_directories(third_party/toml11) +add_subdirectory(third_party/cmrc) + set(BOOST_ROOT "${CMAKE_SOURCE_DIR}/third_party/boost") set(Boost_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/third_party/boost") set(Boost_NO_SYSTEM_PATHS ON) @@ -90,9 +93,10 @@ else() message(FATAL_ERROR "Currently unsupported CPU architecture") endif() -set(SOURCE_FILES src/main.cpp src/emulator.cpp src/io_file.cpp src/gl_state.cpp src/config.cpp - src/core/CPU/cpu_dynarmic.cpp src/core/CPU/dynarmic_cycles.cpp src/core/memory.cpp - src/httpserver.cpp src/stb_image_write.c +set(SOURCE_FILES src/main.cpp src/emulator.cpp src/io_file.cpp src/config.cpp + src/core/CPU/cpu_dynarmic.cpp src/core/CPU/dynarmic_cycles.cpp + src/core/memory.cpp src/renderer.cpp src/core/renderer_null/renderer_null.cpp + src/httpserver.cpp src/stb_image_write.c src/core/cheats.cpp src/core/action_replay.cpp ) set(CRYPTO_SOURCE_FILES src/core/crypto/aes_engine.cpp) set(KERNEL_SOURCE_FILES src/core/kernel/kernel.cpp src/core/kernel/resource_limits.cpp @@ -117,38 +121,36 @@ set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp ) -set(RENDERER_GL_SOURCE_FILES 
src/core/renderer_gl/renderer_gl.cpp src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp) - set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/lz77.cpp) set(FS_SOURCE_FILES src/core/fs/archive_self_ncch.cpp src/core/fs/archive_save_data.cpp src/core/fs/archive_sdmc.cpp src/core/fs/archive_ext_save_data.cpp src/core/fs/archive_ncch.cpp ) -set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/opengl.hpp include/termcolor.hpp - include/cpu.hpp include/cpu_dynarmic.hpp include/memory.hpp include/kernel/kernel.hpp +set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp + include/cpu.hpp include/cpu_dynarmic.hpp include/memory.hpp include/renderer.hpp include/kernel/kernel.hpp include/dynarmic_cp15.hpp include/kernel/resource_limits.hpp include/kernel/kernel_types.hpp include/kernel/config_mem.hpp include/services/service_manager.hpp include/services/apt.hpp include/kernel/handles.hpp include/services/hid.hpp include/services/fs.hpp - include/services/gsp_gpu.hpp include/services/gsp_lcd.hpp include/arm_defs.hpp + include/services/gsp_gpu.hpp include/services/gsp_lcd.hpp include/arm_defs.hpp include/renderer_null/renderer_null.hpp include/PICA/gpu.hpp include/PICA/regs.hpp include/services/ndm.hpp include/PICA/shader.hpp include/PICA/shader_unit.hpp include/PICA/float_types.hpp include/logger.hpp include/loader/ncch.hpp include/loader/ncsd.hpp include/io_file.hpp include/loader/lz77.hpp include/fs/archive_base.hpp include/fs/archive_self_ncch.hpp include/services/dsp.hpp include/services/cfg.hpp include/services/region_codes.hpp include/fs/archive_save_data.hpp include/fs/archive_sdmc.hpp include/services/ptm.hpp - include/services/mic.hpp include/services/cecd.hpp include/renderer_gl/renderer_gl.hpp - include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp include/services/ac.hpp + include/services/mic.hpp include/services/cecd.hpp 
include/services/ac.hpp include/services/am.hpp include/services/boss.hpp include/services/frd.hpp include/services/nim.hpp include/fs/archive_ext_save_data.hpp include/services/shared_font.hpp include/fs/archive_ncch.hpp - include/renderer_gl/textures.hpp include/colour.hpp include/services/y2r.hpp include/services/cam.hpp + include/colour.hpp include/services/y2r.hpp include/services/cam.hpp include/services/ldr_ro.hpp include/ipc.hpp include/services/act.hpp include/services/nfc.hpp include/system_models.hpp include/services/dlp_srvr.hpp include/PICA/dynapica/pica_recs.hpp include/PICA/dynapica/x64_regs.hpp include/PICA/dynapica/vertex_loader_rec.hpp include/PICA/dynapica/shader_rec.hpp include/PICA/dynapica/shader_rec_emitter_x64.hpp include/PICA/pica_hash.hpp include/result/result.hpp include/result/result_common.hpp include/result/result_fs.hpp include/result/result_fnd.hpp include/result/result_gsp.hpp include/result/result_kernel.hpp include/result/result_os.hpp - include/crypto/aes_engine.hpp include/metaprogramming.hpp include/PICA/pica_vertex.hpp include/gl_state.hpp - include/config.hpp include/services/ir_user.hpp include/httpserver.hpp + include/crypto/aes_engine.hpp include/metaprogramming.hpp include/PICA/pica_vertex.hpp + include/config.hpp include/services/ir_user.hpp include/httpserver.hpp include/cheats.hpp + include/action_replay.hpp ) set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp @@ -160,8 +162,6 @@ set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp third_party/cityhash/cityhash.cpp third_party/xxhash/xxhash.c ) - -source_group("Header Files\\Core" FILES ${HEADER_FILES}) source_group("Source Files\\Core" FILES ${SOURCE_FILES}) source_group("Source Files\\Core\\Crypto" FILES ${CRYPTO_SOURCE_FILES}) source_group("Source Files\\Core\\Filesystem" FILES ${FS_SOURCE_FILES}) @@ -169,20 +169,64 @@ source_group("Source Files\\Core\\Kernel" FILES ${KERNEL_SOURCE_FILES}) source_group("Source Files\\Core\\Loader" FILES 
${LOADER_SOURCE_FILES}) source_group("Source Files\\Core\\Services" FILES ${SERVICE_SOURCE_FILES}) source_group("Source Files\\Core\\PICA" FILES ${PICA_SOURCE_FILES}) -source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES}) source_group("Source Files\\Third Party" FILES ${THIRD_PARTY_SOURCE_FILES}) -add_executable(Alber ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES} ${LOADER_SOURCE_FILES} ${SERVICE_SOURCE_FILES} -${PICA_SOURCE_FILES} ${RENDERER_GL_SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES} ${HEADER_FILES}) +set(RENDERER_GL_SOURCE_FILES "") # Empty by default unless we are compiling with the GL renderer + +if(ENABLE_OPENGL) + add_subdirectory(third_party/glad) + + set(RENDERER_GL_INCLUDE_FILES include/renderer_gl/opengl.hpp + include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp + include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp + include/renderer_gl/gl_state.hpp + ) + + set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp + src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp + src/core/renderer_gl/gl_state.cpp src/host_shaders/opengl_display.frag + src/host_shaders/opengl_display.vert src/host_shaders/opengl_vertex_shader.vert + src/host_shaders/opengl_fragment_shader.frag + ) + + set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES}) + source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES}) + + cmrc_add_resource_library( + resources_renderer_gl + NAMESPACE RendererGL + WHENCE "src/host_shaders/" + "src/host_shaders/opengl_display.frag" + "src/host_shaders/opengl_display.vert" + "src/host_shaders/opengl_vertex_shader.vert" + "src/host_shaders/opengl_fragment_shader.frag" + ) +endif() + +source_group("Header Files\\Core" FILES ${HEADER_FILES}) +set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES} ${LOADER_SOURCE_FILES} ${SERVICE_SOURCE_FILES} + 
${PICA_SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES} ${HEADER_FILES}) + +if(ENABLE_OPENGL) + # Add the OpenGL source files to ALL_SOURCES + set(ALL_SOURCES ${ALL_SOURCES} ${RENDERER_GL_SOURCE_FILES}) +endif() + +add_executable(Alber ${ALL_SOURCES}) if(ENABLE_LTO OR ENABLE_USER_BUILD) set_target_properties(Alber PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -target_link_libraries(Alber PRIVATE dynarmic SDL2-static glad cryptopp) +target_link_libraries(Alber PRIVATE dynarmic SDL2-static cryptopp) + +if(ENABLE_OPENGL) + target_compile_definitions(Alber PUBLIC "PANDA3DS_ENABLE_OPENGL=1") + target_link_libraries(Alber PRIVATE glad resources_renderer_gl) +endif() if(GPU_DEBUG_INFO) - target_compile_definitions(Alber PRIVATE GPU_DEBUG_INFO=1) + target_compile_definitions(Alber PRIVATE GPU_DEBUG_INFO=1) endif() if(ENABLE_USER_BUILD) diff --git a/docs/img/alber-icon.ico b/docs/img/alber-icon.ico new file mode 100644 index 00000000..b6251a0e Binary files /dev/null and b/docs/img/alber-icon.ico differ diff --git a/include/PICA/dynapica/shader_rec.hpp b/include/PICA/dynapica/shader_rec.hpp index b7d37b02..e8b6afed 100644 --- a/include/PICA/dynapica/shader_rec.hpp +++ b/include/PICA/dynapica/shader_rec.hpp @@ -21,7 +21,7 @@ class ShaderJIT { ShaderCache cache; #endif -public: + public: #ifdef PANDA3DS_SHADER_JIT_SUPPORTED // Call this before starting to process a batch of vertices // This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader @@ -29,9 +29,7 @@ public: // The caller must make sure the entrypoint has been properly set beforehand void prepare(PICAShader& shaderUnit); void reset(); - void run(PICAShader& shaderUnit) { - prologueCallback(shaderUnit, entrypointCallback); - } + void run(PICAShader& shaderUnit) { prologueCallback(shaderUnit, entrypointCallback); } static constexpr bool isAvailable() { return true; } #else @@ -44,7 +42,7 @@ public: } // Define dummy callback. 
This should never be called if the shader JIT is not supported - using Callback = void(*)(PICAShader& shaderUnit); + using Callback = void (*)(PICAShader& shaderUnit); Callback activeShaderCallback = nullptr; void reset() {} diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index ba37595a..d22ed371 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -2,17 +2,17 @@ // Only do anything if we're on an x64 target with JIT support enabled #if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST) -#include "helpers.hpp" -#include "logger.hpp" -#include "PICA/shader.hpp" -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" -#include "x64_regs.hpp" - #include +#include "PICA/shader.hpp" +#include "helpers.hpp" +#include "logger.hpp" +#include "x64_regs.hpp" +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" + class ShaderEmitter : public Xbyak::CodeGenerator { - static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader + static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader // Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size static constexpr size_t allocSize = executableMemorySize + 0x1000; @@ -20,7 +20,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator { static constexpr uint noSwizzle = 0x1B; using f24 = Floats::f24; - using vec4f = OpenGL::Vector; + using vec4f = std::array; // An array of labels (incl pointers) to each compiled (to x64) PICA instruction std::array instructionLabels; @@ -33,13 +33,22 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i) Label onesVector; - u32 recompilerPC = 0; // PC the 
recompiler is currently recompiling @ - u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) + u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ + u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1 bool haveAVX = false; // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX) bool haveFMA3 = false; // Shows if the CPU supports FMA3 + // Shows whether the loaded shader has any log2 and exp2 instructions + bool codeHasLog2 = false; + bool codeHasExp2 = false; + + Xbyak::Label log2Func, exp2Func; + Xbyak::Label emitLog2Func(); + Xbyak::Label emitExp2Func(); + Xbyak::util::Cpu cpuCaps; + // Compile all instructions from [current recompiler PC, end) void compileUntil(const PICAShader& shaderUnit, u32 endPC); // Compile instruction "instr" @@ -49,8 +58,10 @@ class ShaderEmitter : public Xbyak::CodeGenerator { const u32 opcode = instruction >> 26; return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU); } + // Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation - void scanForCalls(const PICAShader& shaderUnit); + // We also scan for log2/exp2 instructions to see whether to emit the relevant functions + void scanCode(const PICAShader& shaderUnit); // Load register with number "srcReg" indexed by index "idx" into the xmm register "reg" template @@ -105,25 +116,27 @@ class ShaderEmitter : public Xbyak::CodeGenerator { MAKE_LOG_FUNCTION(log, shaderJITLogger) -public: - using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions + public: + // Callback type used for instructions + using InstructionCallback = const void (*)(PICAShader& shaderUnit); // Callback type used for the JIT prologue. 
This is what the caller will call - using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb); + using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb); + PrologueCallback prologueCb = nullptr; // Initialize our emitter with "allocSize" bytes of RWX memory ShaderEmitter() : Xbyak::CodeGenerator(allocSize) { - const auto cpu = Xbyak::util::Cpu(); + cpuCaps = Xbyak::util::Cpu(); - haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41); - haveAVX = cpu.has(Xbyak::util::Cpu::tAVX); - haveFMA3 = cpu.has(Xbyak::util::Cpu::tFMA); + haveSSE4_1 = cpuCaps.has(Xbyak::util::Cpu::tSSE41); + haveAVX = cpuCaps.has(Xbyak::util::Cpu::tAVX); + haveFMA3 = cpuCaps.has(Xbyak::util::Cpu::tFMA); - if (!cpu.has(Xbyak::util::Cpu::tSSE3)) { + if (!cpuCaps.has(Xbyak::util::Cpu::tSSE3)) { Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead"); } } - + void compile(const PICAShader& shaderUnit); // PC must be a valid entrypoint here. 
It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does @@ -133,9 +146,7 @@ public: return reinterpret_cast(ptr); } - PrologueCallback getPrologueCallback() { - return prologueCb; - } + PrologueCallback getPrologueCallback() { return prologueCb; } }; -#endif // x64 recompiler check \ No newline at end of file +#endif // x64 recompiler check \ No newline at end of file diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp index a4adc816..4304a2de 100644 --- a/include/PICA/gpu.hpp +++ b/include/PICA/gpu.hpp @@ -1,39 +1,39 @@ #pragma once #include +#include "PICA/dynapica/shader_rec.hpp" +#include "PICA/float_types.hpp" +#include "PICA/pica_vertex.hpp" +#include "PICA/regs.hpp" +#include "PICA/shader_unit.hpp" #include "config.hpp" #include "helpers.hpp" #include "logger.hpp" #include "memory.hpp" -#include "PICA/float_types.hpp" -#include "PICA/regs.hpp" -#include "PICA/shader_unit.hpp" -#include "PICA/dynapica/shader_rec.hpp" -#include "renderer_gl/renderer_gl.hpp" -#include "PICA/pica_vertex.hpp" +#include "renderer.hpp" class GPU { static constexpr u32 regNum = 0x300; - using vec4f = OpenGL::Vector; + using vec4f = std::array; using Registers = std::array; Memory& mem; EmulatorConfig& config; ShaderUnit shaderUnit; - ShaderJIT shaderJIT; // Doesn't do anything if JIT is disabled or not supported + ShaderJIT shaderJIT; // Doesn't do anything if JIT is disabled or not supported u8* vram = nullptr; MAKE_LOG_FUNCTION(log, gpuLogger) - static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes + static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes static constexpr u32 vramSize = u32(6_MB); - Registers regs; // GPU internal registers - std::array currentAttributes; // Vertex attributes before being passed to the shader + Registers regs; // GPU internal registers + std::array currentAttributes; // Vertex attributes before being passed to the shader - std::array immediateModeAttributes; // Vertex 
attributes uploaded via immediate mode submission + std::array immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission std::array immediateModeVertices; uint immediateModeVertIndex; - uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading + uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading template void drawArrays(); @@ -42,35 +42,33 @@ class GPU { void drawArrays(bool indexed); struct AttribInfo { - u32 offset = 0; // Offset from base vertex array - int size = 0; // Bytes per vertex + u32 offset = 0; // Offset from base vertex array + int size = 0; // Bytes per vertex u32 config1 = 0; u32 config2 = 0; - u32 componentCount = 0; // Number of components for the attribute + u32 componentCount = 0; // Number of components for the attribute - u64 getConfigFull() { - return u64(config1) | (u64(config2) << 32); - } + u64 getConfigFull() { return u64(config1) | (u64(config2) << 32); } }; u64 getVertexShaderInputConfig() { return u64(regs[PICA::InternalRegs::VertexShaderInputCfgLow]) | (u64(regs[PICA::InternalRegs::VertexShaderInputCfgHigh]) << 32); } - std::array attributeInfo; // Info for each of the 12 attributes - u32 totalAttribCount = 0; // Number of vertex attributes to send to VS - u32 fixedAttribMask = 0; // Which attributes are fixed? - - u32 fixedAttribIndex = 0; // Which fixed attribute are we writing to ([0, 11] range) - u32 fixedAttribCount = 0; // How many attribute components have we written? When we get to 4 the attr will actually get submitted - std::array fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted + std::array attributeInfo; // Info for each of the 12 attributes + u32 totalAttribCount = 0; // Number of vertex attributes to send to VS + u32 fixedAttribMask = 0; // Which attributes are fixed? 
+ + u32 fixedAttribIndex = 0; // Which fixed attribute are we writing to ([0, 11] range) + u32 fixedAttribCount = 0; // How many attribute components have we written? When we get to 4 the attr will actually get submitted + std::array fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted // Command processor pointers for GPU command lists u32* cmdBuffStart = nullptr; u32* cmdBuffEnd = nullptr; u32* cmdBuffCurr = nullptr; - Renderer renderer; + std::unique_ptr renderer; PICA::Vertex getImmediateModeVertex(); public: @@ -84,11 +82,10 @@ class GPU { // Set to false by the renderer when the lighting_lut is uploaded ot the GPU bool lightingLUTDirty = false; - GPU(Memory& mem, GLStateManager& gl, EmulatorConfig& config); - void initGraphicsContext() { renderer.initGraphicsContext(); } - void getGraphicsContext() { renderer.getGraphicsContext(); } - void display() { renderer.display(); } - void screenshot(const std::string& name) { renderer.screenshot(name); } + GPU(Memory& mem, EmulatorConfig& config); + void initGraphicsContext() { renderer->initGraphicsContext(); } + void display() { renderer->display(); } + void screenshot(const std::string& name) { renderer->screenshot(name); } void fireDMA(u32 dest, u32 source, u32 size); void reset(); @@ -107,13 +104,13 @@ class GPU { // TODO: Emulate the transfer engine & its registers // Then this can be emulated by just writing the appropriate values there void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) { - renderer.clearBuffer(startAddress, endAddress, value, control); + renderer->clearBuffer(startAddress, endAddress, value, control); } // TODO: Emulate the transfer engine & its registers // Then this can be emulated by just writing the appropriate values there void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) { - renderer.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags); + renderer->displayTransfer(inputAddr, 
outputAddr, inputSize, outputSize, flags); } // Read a value of type T from physical address paddr @@ -132,17 +129,17 @@ class GPU { // Get a pointer of type T* to the data starting from physical address paddr template - T* getPointerPhys(u32 paddr) { - if (paddr >= PhysicalAddrs::FCRAM && paddr <= PhysicalAddrs::FCRAMEnd) { + T* getPointerPhys(u32 paddr, u32 size = 0) { + if (paddr >= PhysicalAddrs::FCRAM && paddr + size <= PhysicalAddrs::FCRAMEnd) { u8* fcram = mem.getFCRAM(); u32 index = paddr - PhysicalAddrs::FCRAM; return (T*)&fcram[index]; - } else if (paddr >= PhysicalAddrs::VRAM && paddr <= PhysicalAddrs::VRAMEnd) { + } else if (paddr >= PhysicalAddrs::VRAM && paddr + size <= PhysicalAddrs::VRAMEnd) { u32 index = paddr - PhysicalAddrs::VRAM; return (T*)&vram[index]; } else [[unlikely]] { Helpers::panic("[GPU] Tried to access unknown physical address: %08X", paddr); } } -}; \ No newline at end of file +}; diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index ad1e0e46..0f3154f1 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -2,13 +2,14 @@ #include #include #include -#include "helpers.hpp" -#include "opengl.hpp" + #include "PICA/float_types.hpp" #include "PICA/pica_hash.hpp" +#include "helpers.hpp" enum class ShaderType { - Vertex, Geometry + Vertex, + Geometry, }; namespace ShaderOpcodes { @@ -46,66 +47,66 @@ namespace ShaderOpcodes { SETEMIT = 0x2B, JMPC = 0x2C, JMPU = 0x2D, - CMP1 = 0x2E, // Both of these instructions are CMP + CMP1 = 0x2E, // Both of these instructions are CMP CMP2 = 0x2F, - MAD = 0x38 // Everything between 0x38-0x3F is a MAD but fuck it + MAD = 0x38 // Everything between 0x38-0x3F is a MAD but fuck it }; } // Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT class PICAShader { using f24 = Floats::f24; - using vec4f = OpenGL::Vector; + using vec4f = std::array; struct Loop { - u32 startingPC; // PC at the start of the loop - 
u32 endingPC; // PC at the end of the loop - u32 iterations; // How many iterations of the loop to run - u32 increment; // How much to increment the loop counter after each iteration + u32 startingPC; // PC at the start of the loop + u32 endingPC; // PC at the end of the loop + u32 iterations; // How many iterations of the loop to run + u32 increment; // How much to increment the loop counter after each iteration }; // Info for ifc/ifu stack struct ConditionalInfo { - u32 endingPC; // PC at the end of the if block (= DST) - u32 newPC; // PC after the if block is done executing (= DST + NUM) + u32 endingPC; // PC at the end of the if block (= DST) + u32 newPC; // PC after the if block is done executing (= DST + NUM) }; struct CallInfo { - u32 endingPC; // PC at the end of the function - u32 returnPC; // PC to return to after the function ends + u32 endingPC; // PC at the end of the function + u32 returnPC; // PC to return to after the function ends }; - int bufferIndex; // Index of the next instruction to overwrite for shader uploads - int opDescriptorIndex; // Index of the next operand descriptor we'll overwrite - u32 floatUniformIndex = 0; // Which float uniform are we writing to? ([0, 95] range) - u32 floatUniformWordCount = 0; // How many words have we buffered for the current uniform transfer? - bool f32UniformTransfer = false; // Are we transferring an f32 uniform or an f24 uniform? + int bufferIndex; // Index of the next instruction to overwrite for shader uploads + int opDescriptorIndex; // Index of the next operand descriptor we'll overwrite + u32 floatUniformIndex = 0; // Which float uniform are we writing to? ([0, 95] range) + u32 floatUniformWordCount = 0; // How many words have we buffered for the current uniform transfer? + bool f32UniformTransfer = false; // Are we transferring an f32 uniform or an f24 uniform? 
- std::array floatUniformBuffer; // Buffer for temporarily caching float uniform data + std::array floatUniformBuffer; // Buffer for temporarily caching float uniform data -public: + public: // These are placed close to the temp registers and co because it helps the JIT generate better code - u32 entrypoint = 0; // Initial shader PC + u32 entrypoint = 0; // Initial shader PC u32 boolUniform; - std::array, 4> intUniforms; + std::array, 4> intUniforms; alignas(16) std::array floatUniforms; - alignas(16) std::array fixedAttributes; // Fixed vertex attributes - alignas(16) std::array inputs; // Attributes passed to the shader + alignas(16) std::array fixedAttributes; // Fixed vertex attributes + alignas(16) std::array inputs; // Attributes passed to the shader alignas(16) std::array outputs; - alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT + alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT -protected: + protected: std::array operandDescriptors; - alignas(16) std::array tempRegisters; // General purpose registers the shader can use for temp values - OpenGL::Vector addrRegister; // Address register - bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in + alignas(16) std::array tempRegisters; // General purpose registers the shader can use for temp values + std::array addrRegister; // Address register + bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in u32 loopCounter; - u32 pc = 0; // Program counter: Index of the next instruction we're going to execute - u32 loopIndex = 0; // The index of our loop stack (0 = empty, 4 = full) - u32 ifIndex = 0; // The index of our IF stack - u32 callIndex = 0; // The index of our CALL stack + u32 pc = 0; // Program counter: Index of the next instruction we're going to execute + u32 loopIndex = 0; // The index of our loop stack (0 = 
empty, 4 = full) + u32 ifIndex = 0; // The index of our IF stack + u32 callIndex = 0; // The index of our CALL stack std::array loopInfo; std::array conditionalInfo; @@ -117,7 +118,7 @@ protected: // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first using Hash = PICAHash::HashType; - Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) + Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT) bool codeHashDirty = false; @@ -130,7 +131,7 @@ protected: vec4f getSource(u32 source); vec4f& getDest(u32 dest); -private: + private: // Interpreter functions for the various shader functions void add(u32 instruction); void call(u32 instruction); @@ -171,13 +172,13 @@ private: bool negate; using namespace Helpers; - if constexpr (sourceIndex == 1) { // SRC1 + if constexpr (sourceIndex == 1) { // SRC1 negate = (getBit<4>(opDescriptor)) != 0; compSwizzle = getBits<5, 8>(opDescriptor); - } else if constexpr (sourceIndex == 2) { // SRC2 + } else if constexpr (sourceIndex == 2) { // SRC2 negate = (getBit<13>(opDescriptor)) != 0; compSwizzle = getBits<14, 8>(opDescriptor); - } else if constexpr (sourceIndex == 3) { // SRC3 + } else if constexpr (sourceIndex == 3) { // SRC3 negate = (getBit<22>(opDescriptor)) != 0; compSwizzle = getBits<23, 8>(opDescriptor); } @@ -185,8 +186,8 @@ private: // Iterate through every component of the swizzled vector in reverse order // And get which source component's index to match it with for (int comp = 0; comp < 4; comp++) { - int index = compSwizzle & 3; // Get index for this component - compSwizzle >>= 2; // Move to next component index + int index = compSwizzle & 3; // Get index for this component + compSwizzle >>= 2; // Move to next component index ret[3 - comp] = 
source[index]; } @@ -212,39 +213,35 @@ private: u8 getIndexedSource(u32 source, u32 index); bool isCondTrue(u32 instruction); -public: + public: static constexpr size_t maxInstructionCount = 4096; - std::array loadedShader; // Currently loaded & active shader - std::array bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to + std::array loadedShader; // Currently loaded & active shader + std::array bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to PICAShader(ShaderType type) : type(type) {} // Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them - void finalize() { - std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); - } + void finalize() { std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); } - void setBufferIndex(u32 index) { - bufferIndex = index & 0xfff; - } - - void setOpDescriptorIndex(u32 index) { - opDescriptorIndex = index & 0x7f; - } + void setBufferIndex(u32 index) { bufferIndex = index & 0xfff; } + void setOpDescriptorIndex(u32 index) { opDescriptorIndex = index & 0x7f; } void uploadWord(u32 word) { - if (bufferIndex >= 4095) Helpers::panic("o no, shader upload overflew"); + if (bufferIndex >= 4095) { + Helpers::panic("o no, shader upload overflew"); + } + bufferedShader[bufferIndex++] = word; bufferIndex &= 0xfff; - codeHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed + codeHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed } void uploadDescriptor(u32 word) { operandDescriptors[opDescriptorIndex++] = word; opDescriptorIndex &= 0x7f; - opdescHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed + opdescHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed } void setFloatUniformIndex(u32 word) { @@ -255,23 
+252,24 @@ public: void uploadFloatUniform(u32 word) { floatUniformBuffer[floatUniformWordCount++] = word; - if (floatUniformIndex >= 96) + if (floatUniformIndex >= 96) { Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex); + } if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) { vec4f& uniform = floatUniforms[floatUniformIndex++]; floatUniformWordCount = 0; if (f32UniformTransfer) { - uniform.x() = f24::fromFloat32(*(float*)&floatUniformBuffer[3]); - uniform.y() = f24::fromFloat32(*(float*)&floatUniformBuffer[2]); - uniform.z() = f24::fromFloat32(*(float*)&floatUniformBuffer[1]); - uniform.w() = f24::fromFloat32(*(float*)&floatUniformBuffer[0]); + uniform[0] = f24::fromFloat32(*(float*)&floatUniformBuffer[3]); + uniform[1] = f24::fromFloat32(*(float*)&floatUniformBuffer[2]); + uniform[2] = f24::fromFloat32(*(float*)&floatUniformBuffer[1]); + uniform[3] = f24::fromFloat32(*(float*)&floatUniformBuffer[0]); } else { - uniform.x() = f24::fromRaw(floatUniformBuffer[2] & 0xffffff); - uniform.y() = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24)); - uniform.z() = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16)); - uniform.w() = f24::fromRaw(floatUniformBuffer[0] >> 8); + uniform[0] = f24::fromRaw(floatUniformBuffer[2] & 0xffffff); + uniform[1] = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24)); + uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16)); + uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8); } } } @@ -280,10 +278,10 @@ public: using namespace Helpers; auto& u = intUniforms[index]; - u.x() = word & 0xff; - u.y() = getBits<8, 8>(word); - u.z() = getBits<16, 8>(word); - u.w() = getBits<24, 8>(word); + u[0] = word & 0xff; + u[1] = getBits<8, 8>(word); + u[2] = getBits<16, 8>(word); + u[3] = getBits<24, 8>(word); } void run(); 
diff --git a/include/action_replay.hpp b/include/action_replay.hpp new file mode 100644 index 00000000..a6b97df9 --- /dev/null +++ b/include/action_replay.hpp @@ -0,0 +1,52 @@ +#pragma once +#include +#include +#include + +#include "helpers.hpp" +#include "memory.hpp" +#include "services/hid.hpp" + +class ActionReplay { + using Cheat = std::vector; // A cheat is really just a bunch of 64-bit opcodes neatly encoded into 32-bit chunks + static constexpr size_t ifStackSize = 32; // TODO: How big is this, really? + + u32 offset1, offset2; // Memory offset registers. Non-persistent. + u32 data1, data2; // Data offset registers. Non-persistent. + u32 storage1, storage2; // Storage registers. Persistent. + + // When an instruction does not specify which offset or data register to use, we use the "active" one + // Which is by default #1 and may be changed by certain AR operations + u32 *activeOffset, *activeData, *activeStorage; + u32 ifStackIndex; // Our index in the if stack. Shows how many entries we have at the moment. + u32 loopStackIndex; // Same but for loops + std::bitset<32> ifStack; + + // Program counter + u32 pc = 0; + Memory& mem; + HIDService& hid; + + // Has the cheat ended? 
+ bool running = false; + // Run 1 AR instruction + void runInstruction(const Cheat& cheat, u32 instruction); + + // Action Replay has a billion D-type opcodes so this handles all of them + void executeDType(const Cheat& cheat, u32 instruction); + + u8 read8(u32 addr); + u16 read16(u32 addr); + u32 read32(u32 addr); + + void write8(u32 addr, u8 value); + void write16(u32 addr, u16 value); + void write32(u32 addr, u32 value); + + void pushConditionBlock(bool condition); + + public: + ActionReplay(Memory& mem, HIDService& hid); + void runCheat(const Cheat& cheat); + void reset(); +}; \ No newline at end of file diff --git a/include/cheats.hpp b/include/cheats.hpp new file mode 100644 index 00000000..6ada7d20 --- /dev/null +++ b/include/cheats.hpp @@ -0,0 +1,32 @@ +#pragma once +#include +#include + +#include "action_replay.hpp" +#include "helpers.hpp" +#include "services/hid.hpp" + +// Forward-declare this since it's just passed and we don't want to include memory.hpp and increase compile time +class Memory; + +class Cheats { + public: + enum class CheatType { + ActionReplay, // CTRPF cheats + Gateway, + }; + + struct Cheat { + CheatType type; + std::vector instructions; + }; + + Cheats(Memory& mem, HIDService& hid); + void addCheat(const Cheat& cheat); + void reset(); + void run(); + + private: + ActionReplay ar; // An ActionReplay cheat machine for executing CTRPF codes + std::vector cheats; +}; \ No newline at end of file diff --git a/include/config.hpp b/include/config.hpp index bdb697bf..6bccdad6 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -1,10 +1,14 @@ #pragma once #include +#include "renderer.hpp" + // Remember to initialize every field here to its default value otherwise bad things will happen struct EmulatorConfig { bool shaderJitEnabled = false; + RendererType rendererType = RendererType::OpenGL; + EmulatorConfig(const std::filesystem::path& path); void load(const std::filesystem::path& path); void save(const std::filesystem::path& path); 
}; \ No newline at end of file diff --git a/include/emulator.hpp b/include/emulator.hpp index 83b832f6..040b93b2 100644 --- a/include/emulator.hpp +++ b/include/emulator.hpp @@ -1,39 +1,51 @@ #pragma once #include -#include #include #include #include #include "PICA/gpu.hpp" -#include "cpu.hpp" +#include "cheats.hpp" #include "config.hpp" +#include "cpu.hpp" #include "crypto/aes_engine.hpp" #include "io_file.hpp" #include "memory.hpp" -#include "gl_state.hpp" + #ifdef PANDA3DS_ENABLE_HTTP_SERVER #include "httpserver.hpp" #endif -enum class ROMType { None, ELF, NCSD, CXI }; +enum class ROMType { + None, + ELF, + NCSD, + CXI, +}; class Emulator { + EmulatorConfig config; CPU cpu; GPU gpu; Memory memory; Kernel kernel; Crypto::AESEngine aesEngine; + Cheats cheats; - GLStateManager gl; - EmulatorConfig config; SDL_Window* window; + +#ifdef PANDA3DS_ENABLE_OPENGL SDL_GLContext glContext; +#endif + SDL_GameController* gameController = nullptr; int gameControllerID; + // Shows whether we've loaded any action replay codes + bool haveCheats = false; + // Variables to keep track of whether the user is controlling the 3DS analog stick with their keyboard // This is done so when a gamepad is connected, we won't automatically override the 3DS analog stick settings with the gamepad's state // And so the user can still use the keyboard to control the analog @@ -63,8 +75,8 @@ class Emulator { public: // Decides whether to reload or not reload the ROM when resetting. We use enum class over a plain bool for clarity. // If NoReload is selected, the emulator will not reload its selected ROM. This is useful for things like booting up the emulator, or resetting to - // change ROMs. If Reload is selected, the emulator will reload its selected ROM. This is useful for eg a "reset" button that keeps the current ROM - // and just resets the emu + // change ROMs. If Reload is selected, the emulator will reload its selected ROM. 
This is useful for eg a "reset" button that keeps the current + // ROM and just resets the emu enum class ReloadOption { NoReload, Reload }; Emulator(); diff --git a/include/fs/archive_base.hpp b/include/fs/archive_base.hpp index e1b4caa0..0b0f65a1 100644 --- a/include/fs/archive_base.hpp +++ b/include/fs/archive_base.hpp @@ -116,15 +116,34 @@ struct ArchiveSession { ArchiveSession(ArchiveBase* archive, const FSPath& filePath, bool isOpen = true) : archive(archive), path(filePath), isOpen(isOpen) {} }; -struct DirectorySession { - ArchiveBase* archive = nullptr; - // For directories which are mirrored to a specific path on the disk, this contains that path - // Otherwise this is a nullopt - std::optional pathOnDisk; - bool isOpen; +struct DirectoryEntry { + std::filesystem::path path; + bool isDirectory; +}; - DirectorySession(ArchiveBase* archive, std::filesystem::path path, bool isOpen = true) : archive(archive), pathOnDisk(path), - isOpen(isOpen) {} +struct DirectorySession { + ArchiveBase* archive = nullptr; + // For directories which are mirrored to a specific path on the disk, this contains that path + // Otherwise this is a nullopt + std::optional pathOnDisk; + + // The list of directory entries + the index of the entry we're currently inspecting + std::vector entries; + size_t currentEntry; + + bool isOpen; + + DirectorySession(ArchiveBase* archive, std::filesystem::path path, bool isOpen = true) : archive(archive), pathOnDisk(path), isOpen(isOpen) { + currentEntry = 0; // Start from entry 0 + + // Read all directory entries, cache them + for (auto& e : std::filesystem::directory_iterator(path)) { + DirectoryEntry entry; + entry.path = e.path(); + entry.isDirectory = std::filesystem::is_directory(e); + entries.push_back(entry); + } + } }; // Represents a file descriptor obtained from OpenFile. If the optional is nullopt, opening the file failed. 
diff --git a/include/renderer.hpp b/include/renderer.hpp new file mode 100644 index 00000000..e14afcea --- /dev/null +++ b/include/renderer.hpp @@ -0,0 +1,66 @@ +#pragma once +#include +#include +#include + +#include "PICA/pica_vertex.hpp" +#include "PICA/regs.hpp" +#include "helpers.hpp" + +enum class RendererType : s8 { + // Todo: Auto = -1, + Null = 0, + OpenGL = 1, + Vulkan = 2, +}; + +class GPU; + +class Renderer { + protected: + GPU& gpu; + static constexpr u32 regNum = 0x300; // Number of internal PICA registers + const std::array& regs; + + std::array fbSize; // The size of the framebuffer (ie both the colour and depth buffer)' + + u32 colourBufferLoc; // Location in 3DS VRAM for the colour buffer + PICA::ColorFmt colourBufferFormat; // Format of the colours stored in the colour buffer + + // Same for the depth/stencil buffer + u32 depthBufferLoc; + PICA::DepthFmt depthBufferFormat; + + public: + Renderer(GPU& gpu, const std::array& internalRegs); + virtual ~Renderer(); + + static constexpr u32 vertexBufferSize = 0x10000; + static std::optional typeFromString(std::string inString); + static const char* typeToString(RendererType rendererType); + + virtual void reset() = 0; + virtual void display() = 0; // Display the 3DS screen contents to the window + virtual void initGraphicsContext() = 0; // Initialize graphics context + virtual void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) = 0; // Clear a GPU buffer in VRAM + virtual void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) = 0; // Perform display transfer + virtual void drawVertices(PICA::PrimType primType, std::span vertices) = 0; // Draw the given vertices + + virtual void screenshot(const std::string& name) = 0; + + void setFBSize(u32 width, u32 height) { + fbSize[0] = width; + fbSize[1] = height; + } + + void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; } + void setDepthFormat(PICA::DepthFmt format) { + if 
(format == PICA::DepthFmt::Unknown1) { + Helpers::panic("[PICA] Undocumented depth-stencil mode!"); + } + depthBufferFormat = format; + } + + void setColourBufferLoc(u32 loc) { colourBufferLoc = loc; } + void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; } +}; \ No newline at end of file diff --git a/include/gl_state.hpp b/include/renderer_gl/gl_state.hpp similarity index 100% rename from include/gl_state.hpp rename to include/renderer_gl/gl_state.hpp diff --git a/include/opengl.hpp b/include/renderer_gl/opengl.hpp similarity index 100% rename from include/opengl.hpp rename to include/renderer_gl/opengl.hpp diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 07f8a63c..15d12ade 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -1,23 +1,23 @@ #pragma once + #include #include -#include #include "PICA/float_types.hpp" +#include "PICA/pica_vertex.hpp" +#include "PICA/regs.hpp" #include "gl_state.hpp" #include "helpers.hpp" #include "logger.hpp" +#include "renderer.hpp" #include "surface_cache.hpp" #include "textures.hpp" -#include "PICA/regs.hpp" -#include "PICA/pica_vertex.hpp" // More circular dependencies! 
class GPU; -class Renderer { - GPU& gpu; - GLStateManager& gl; +class RendererGL final : public Renderer { + GLStateManager gl = {}; OpenGL::Program triangleProgram; OpenGL::Program displayProgram; @@ -31,7 +31,7 @@ class Renderer { GLint textureEnvCombinerLoc = -1; GLint textureEnvColorLoc = -1; GLint textureEnvScaleLoc = -1; - + // Uniform of PICA registers GLint picaRegLoc = -1; @@ -48,22 +48,10 @@ class Renderer { SurfaceCache colourBufferCache; SurfaceCache textureCache; - OpenGL::uvec2 fbSize; // The size of the framebuffer (ie both the colour and depth buffer)' - - u32 colourBufferLoc; // Location in 3DS VRAM for the colour buffer - PICA::ColorFmt colourBufferFormat; // Format of the colours stored in the colour buffer - - // Same for the depth/stencil buffer - u32 depthBufferLoc; - PICA::DepthFmt depthBufferFormat; - // Dummy VAO/VBO for blitting the final output OpenGL::VertexArray dummyVAO; OpenGL::VertexBuffer dummyVBO; - static constexpr u32 regNum = 0x300; // Number of internal PICA registers - const std::array& regs; - OpenGL::Texture screenTexture; GLuint lightLUTTextureArray; OpenGL::Framebuffer screenFramebuffer; @@ -79,34 +67,16 @@ class Renderer { void updateLightingLUT(); public: - Renderer(GPU& gpu, GLStateManager& gl, const std::array& internalRegs) : gpu(gpu), gl(gl), regs(internalRegs) {} + RendererGL(GPU& gpu, const std::array& internalRegs) : Renderer(gpu, internalRegs) {} + ~RendererGL() override; - void reset(); - void display(); // Display the 3DS screen contents to the window - void initGraphicsContext(); // Initialize graphics context - void getGraphicsContext(); // Set up graphics context for rendering - void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control); // Clear a GPU buffer in VRAM - void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags); // Perform display transfer - void drawVertices(PICA::PrimType primType, std::span vertices); // Draw the given vertices + void 
reset() override; + void display() override; // Display the 3DS screen contents to the window + void initGraphicsContext() override; // Initialize graphics context + void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override; // Clear a GPU buffer in VRAM + void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override; // Perform display transfer + void drawVertices(PICA::PrimType primType, std::span vertices) override; // Draw the given vertices // Take a screenshot of the screen and store it in a file - void screenshot(const std::string& name); - - void setFBSize(u32 width, u32 height) { - fbSize.x() = width; - fbSize.y() = height; - } - - void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; } - void setDepthFormat(PICA::DepthFmt format) { - if (format == PICA::DepthFmt::Unknown1) { - Helpers::panic("[PICA] Undocumented depth-stencil mode!"); - } - depthBufferFormat = format; - } - - void setColourBufferLoc(u32 loc) { colourBufferLoc = loc; } - void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; } - - static constexpr u32 vertexBufferSize = 0x10000; + void screenshot(const std::string& name) override; }; \ No newline at end of file diff --git a/include/renderer_gl/textures.hpp b/include/renderer_gl/textures.hpp index 5469a59f..a2b6c09d 100644 --- a/include/renderer_gl/textures.hpp +++ b/include/renderer_gl/textures.hpp @@ -40,11 +40,11 @@ struct Texture { void allocate(); void setNewConfig(u32 newConfig); - void decodeTexture(const void* data); + void decodeTexture(std::span data); void free(); u64 sizeInBytes(); - u32 decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data); + u32 decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, std::span data); // Get the morton interleave offset of a texel based on its U and V values static u32 mortonInterleave(u32 u, u32 v); @@ -59,6 +59,6 @@ struct Texture { // Returns the texel at coordinates (u, v) of an ETC1(A4) texture // 
TODO: Make hasAlpha a template parameter - u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data); + u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span data); u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData); -}; \ No newline at end of file +}; diff --git a/include/renderer_null/renderer_null.hpp b/include/renderer_null/renderer_null.hpp new file mode 100644 index 00000000..29080786 --- /dev/null +++ b/include/renderer_null/renderer_null.hpp @@ -0,0 +1,17 @@ +#include "renderer.hpp" + +class GPU; + +class RendererNull final : public Renderer { + public: + RendererNull(GPU& gpu, const std::array& internalRegs); + ~RendererNull() override; + + void reset() override; + void display() override; + void initGraphicsContext() override; + void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override; + void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override; + void drawVertices(PICA::PrimType primType, std::span vertices) override; + void screenshot(const std::string& name) override; +}; \ No newline at end of file diff --git a/include/services/hid.hpp b/include/services/hid.hpp index 6a3aab95..23a36ec6 100644 --- a/include/services/hid.hpp +++ b/include/services/hid.hpp @@ -91,6 +91,7 @@ class HIDService { void pressKey(u32 mask) { newButtons |= mask; } void releaseKey(u32 mask) { newButtons &= ~mask; } + u32 getOldButtons() { return oldButtons; } s16 getCirclepadX() { return circlePadX; } s16 getCirclepadY() { return circlePadY; } diff --git a/include/services/service_manager.hpp b/include/services/service_manager.hpp index 1d93641c..51d6d554 100644 --- a/include/services/service_manager.hpp +++ b/include/services/service_manager.hpp @@ -90,17 +90,5 @@ class ServiceManager { void signalDSPEvents() { dsp.signalEvents(); } // Input function wrappers - void pressKey(u32 key) { hid.pressKey(key); } - void releaseKey(u32 key) { hid.releaseKey(key); } - s16 
getCirclepadX() { return hid.getCirclepadX(); } - s16 getCirclepadY() { return hid.getCirclepadY(); } - void setCirclepadX(s16 x) { hid.setCirclepadX(x); } - void setCirclepadY(s16 y) { hid.setCirclepadY(y); } - void updateInputs(u64 currentTimestamp) { hid.updateInputs(currentTimestamp); } - void setTouchScreenPress(u16 x, u16 y) { hid.setTouchScreenPress(x, y); } - void releaseTouchScreen() { hid.releaseTouchScreen(); } - - void setRoll(s16 roll) { hid.setRoll(roll); } - void setPitch(s16 pitch) { hid.setPitch(pitch); } - void setYaw(s16 yaw) { hid.setYaw(yaw); } + HIDService& getHID() { return hid; } }; diff --git a/readme.md b/readme.md index 9c98178d..854267b6 100644 --- a/readme.md +++ b/readme.md @@ -83,6 +83,7 @@ Panda3DS also supports controller input using the SDL2 GameController API. - [Corgi3DS](https://github.com/PSI-Rockin/Corgi3DS), an LLE 3DS emulator which both served as an inspiration, as well as a nice source of documentation for some PICA200-related things # Sister Projects +- [Dynarmic](https://github.com/merryhime/dynarmic): An arm32/arm64 to x86-64/ARMv8 recompiler - [PCSX-Redux](https://github.com/grumpycoders/pcsx-redux): A PlayStation 1 emulator targetting developers, reverse engineers and regular PS1 fans alike - [SkyEmu](https://github.com/skylersaleh/SkyEmu): A seagull-themed low-level GameBoy, GameBoy Color, GameBoy Advance and Nintendo DS emulator that is designed to be easy to use, cross platform and accurate. 
- [NanoBoyAdvance](https://github.com/nba-emu/NanoBoyAdvance): A Game Boy Advance emulator focusing on hardware research and cycle-accurate emulation diff --git a/src/config.cpp b/src/config.cpp index 6c9a8450..a5e9330c 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -1,6 +1,7 @@ #include "config.hpp" #include +#include #include "helpers.hpp" #include "toml.hpp" @@ -9,6 +10,8 @@ // We are legally allowed, as per the author's wish, to use the above code without any licensing restrictions // However we still want to follow the license as closely as possible and offer the proper attributions. +EmulatorConfig::EmulatorConfig(const std::filesystem::path& path) { load(path); } + void EmulatorConfig::load(const std::filesystem::path& path) { // If the configuration file does not exist, create it and return std::error_code error; @@ -31,6 +34,17 @@ void EmulatorConfig::load(const std::filesystem::path& path) { if (gpuResult.is_ok()) { auto gpu = gpuResult.unwrap(); + // Get renderer + auto rendererName = toml::find_or(gpu, "Renderer", "OpenGL"); + auto configRendererType = Renderer::typeFromString(rendererName); + + if (configRendererType.has_value()) { + rendererType = configRendererType.value(); + } else { + Helpers::warn("Invalid renderer specified: %s\n", rendererName.c_str()); + rendererType = RendererType::OpenGL; + } + shaderJitEnabled = toml::find_or(gpu, "EnableShaderJIT", false); } } @@ -43,7 +57,7 @@ void EmulatorConfig::save(const std::filesystem::path& path) { if (std::filesystem::exists(path, error)) { try { data = toml::parse(path); - } catch (std::exception& ex) { + } catch (const std::exception& ex) { Helpers::warn("Exception trying to parse config file. 
Exception: %s\n", ex.what()); return; } @@ -55,6 +69,7 @@ void EmulatorConfig::save(const std::filesystem::path& path) { } data["GPU"]["EnableShaderJIT"] = shaderJitEnabled; + data["GPU"]["Renderer"] = std::string(Renderer::typeToString(rendererType)); std::ofstream file(path, std::ios::out); file << data; diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 06247950..13eb630e 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -61,11 +61,14 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) { // Tail call to shader code entrypoint jmp(arg2); - align(16); - // Scan the shader code for call instructions and add them to the list of possible return PCs. We need to do this because the PICA callstack works - // Pretty weirdly - scanForCalls(shaderUnit); + // Scan the code for call, exp2, log2, etc instructions which need some special care + // After that, emit exp2 and log2 functions if the corresponding instructions are present + scanCode(shaderUnit); + if (codeHasExp2) exp2Func = emitExp2Func(); + if (codeHasLog2) log2Func = emitLog2Func(); + + align(16); // Compile every instruction in the shader // This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded wtih nops that compile to nothing recompilerPC = 0; @@ -73,17 +76,23 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) { compileUntil(shaderUnit, PICAShader::maxInstructionCount); } -void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) { +void ShaderEmitter::scanCode(const PICAShader& shaderUnit) { returnPCs.clear(); for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) { const u32 instruction = shaderUnit.loadedShader[i]; + const u32 opcode = instruction >> 26; + if (isCall(instruction)) { const u32 num = instruction & 0xff; const u32 dest = getBits<10, 12>(instruction); const u32 returnPC = num + dest; // 
Add them to get the return PC returnPCs.push_back(returnPC); + } else if (opcode == ShaderOpcodes::EX2) { + codeHasExp2 = true; + } else if (opcode == ShaderOpcodes::LG2) { + codeHasLog2 = true; } } @@ -877,7 +886,6 @@ void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) { loopLevel--; } -// SSE does not have a log2 instruction so we temporarily emulate this using x87 FPU void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; const u32 src = getBits<12, 7>(instruction); @@ -885,30 +893,16 @@ void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) { const u32 dest = getBits<21, 5>(instruction); const u32 writeMask = getBits<0, 4>(operandDescriptor); - // Load swizzled source, push 1.0 to the x87 stack loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); - fld1(); - - // Push source to the x87 stack - movd(eax, src1_xmm); - push(rax); - fld(dword[rsp]); - - // Perform log2, load result to src1_xmm, write it back and undo the previous push rax - fyl2x(); - fstp(dword[rsp]); - movss(src1_xmm, dword[rsp]); - add(rsp, 8); - - // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx - // Otherwise we do + call(log2Func); // Result is output in src1_xmm + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx } + storeRegister(src1_xmm, shader, dest, operandDescriptor); } -// SSE does not have an exp2 instruction so we temporarily emulate this using x87 FPU void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; const u32 src = getBits<12, 7>(instruction); @@ -917,31 +911,12 @@ void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) { const u32 writeMask = getBits<0, 4>(operandDescriptor); 
loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); + call(exp2Func); // Result is output in src1_xmm - // Push source to the x87 stack, then do some insane compiler-generated x87 math - movd(eax, src1_xmm); - push(rax); - fld(dword[rsp]); - - fld(st0); - frndint(); - fsub(st1, st0); - fxch(st1); - f2xm1(); - fadd(dword[rip + onesVector]); - fscale(); - - // Load result to src1_xmm, write it back and undo the previous push rax - fstp(st1); - fstp(dword[rsp]); - movss(src1_xmm, dword[rsp]); - add(rsp, 8); - - // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx - // Otherwise we do if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx } + storeRegister(src1_xmm, shader, dest, operandDescriptor); } @@ -962,6 +937,228 @@ void ShaderEmitter::printLog(const PICAShader& shaderUnit) { printf("cmp: (%d, %d)\n", shaderUnit.cmpRegister[0], shaderUnit.cmpRegister[1]); } +// For EXP2/LOG2, we have permission to adjust and relicense the SSE implementation from Citra for this project from the original authors +// So we do it since EXP2/LOG2 are pretty terrible to implement. +// ABI: Input is in the bottom bits of src1_xmm, same for output. If the result needs swizzling, the caller must handle it +// Assume src1, src2, scratch1, scratch2, eax, edx all thrashed + +Xbyak::Label ShaderEmitter::emitLog2Func() { + Xbyak::Label subroutine; + + // This code uses the fact that log2(float) = log2(2^exponent * mantissa) + // = log2(2^exponent) + log2(mantissa) = exponent + log2(mantissa) where mantissa has a limited range of values + // https://stackoverflow.com/a/45787548 + + // SSE does not have a log instruction, thus we must approximate. + // We perform this approximation first performing a range reduction into the range [1.0, 2.0). + // A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated. 
+ // We multiply the result by (x - 1) then restore the result into the appropriate range. + + // Coefficients for the minimax polynomial. + // f(x) computes approximately log2(x) / (x - 1). + // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))). + // We align the table of coefficients to 64 bytes, so that the whole thing will fit in 1 cache line + align(64); + const void* c0 = getCurr(); + dd(0x3d74552f); + const void* c1 = getCurr(); + dd(0xbeee7397); + const void* c2 = getCurr(); + dd(0x3fbd96dd); + const void* c3 = getCurr(); + dd(0xc02153f6); + const void* c4 = getCurr(); + dd(0x4038d96c); + + align(16); + const void* negative_infinity_vector = getCurr(); + dd(0xff800000); + dd(0xff800000); + dd(0xff800000); + dd(0xff800000); + const void* default_qnan_vector = getCurr(); + dd(0x7fc00000); + dd(0x7fc00000); + dd(0x7fc00000); + dd(0x7fc00000); + + Xbyak::Label inputIsNan, inputIsZero, inputOutOfRange; + + align(16); + L(inputOutOfRange); + je(inputIsZero); + movaps(src1_xmm, xword[rip + default_qnan_vector]); + ret(); + L(inputIsZero); + movaps(src1_xmm, xword[rip + negative_infinity_vector]); + ret(); + + align(16); + L(subroutine); + + // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}. + xorps(scratch1, scratch1); + ucomiss(scratch1, src1_xmm); + jp(inputIsNan); + jae(inputOutOfRange); + + // Split input: SRC1=MANT[1,2) SCRATCH2=Exponent + if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { + vgetexpss(scratch2, src1_xmm, src1_xmm); + vgetmantss(src1_xmm, src1_xmm, src1_xmm, 0); + } else { + movd(eax, src1_xmm); + mov(edx, eax); + and_(eax, 0x7f800000); + and_(edx, 0x007fffff); + or_(edx, 0x3f800000); + movd(src1_xmm, edx); + // SRC1 now contains the mantissa of the input. + shr(eax, 23); + sub(eax, 0x7f); + cvtsi2ss(scratch2, eax); + // scratch2 now contains the exponent of the input. 
+ } + + movss(scratch1, xword[rip + c0]); + + // Complete computation of polynomial + if (haveFMA3) { + vfmadd213ss(scratch1, src1_xmm, xword[rip + c1]); + vfmadd213ss(scratch1, src1_xmm, xword[rip + c2]); + vfmadd213ss(scratch1, src1_xmm, xword[rip + c3]); + vfmadd213ss(scratch1, src1_xmm, xword[rip + c4]); + subss(src1_xmm, dword[rip + onesVector]); + vfmadd231ss(scratch2, scratch1, src1_xmm); + } else { + mulss(scratch1, src1_xmm); + addss(scratch1, xword[rip + c1]); + mulss(scratch1, src1_xmm); + addss(scratch1, xword[rip + c2]); + mulss(scratch1, src1_xmm); + addss(scratch1, xword[rip + c3]); + mulss(scratch1, src1_xmm); + subss(src1_xmm, dword[rip + onesVector]); + addss(scratch1, xword[rip + c4]); + mulss(scratch1, src1_xmm); + addss(scratch2, scratch1); + } + + xorps(src1_xmm, src1_xmm); // break dependency chain + movss(src1_xmm, scratch2); + L(inputIsNan); + + ret(); + return subroutine; +} + +Xbyak::Label ShaderEmitter::emitExp2Func() { + Xbyak::Label subroutine; + + // SSE does not have an exp instruction, thus we must approximate. + // We perform this approximation first performing a range reduction into the range [-0.5, 0.5). + // A minimax polynomial which was fit for the function exp2(x) is then evaluated. + // We then restore the result into the appropriate range. 
+ + // Similarly to log2, we align our literal pool to 64 bytes to make sure the whole thing fits in 1 cache line + align(64); + const void* input_max = getCurr(); + dd(0x43010000); + const void* input_min = getCurr(); + dd(0xc2fdffff); + const void* c0 = getCurr(); + dd(0x3c5dbe69); + const void* half = getCurr(); + dd(0x3f000000); + const void* c1 = getCurr(); + dd(0x3d5509f9); + const void* c2 = getCurr(); + dd(0x3e773cc5); + const void* c3 = getCurr(); + dd(0x3f3168b3); + const void* c4 = getCurr(); + dd(0x3f800016); + + Xbyak::Label retLabel; + + align(16); + L(subroutine); + + // Handle edge cases + ucomiss(src1_xmm, src1_xmm); + jp(retLabel); + + // Decompose input: + // SCRATCH=2^round(input) + // SRC1=input-round(input) [-0.5, 0.5) + if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { + // Cheat a bit and store ones in src2 since the register is unused + vmovaps(src2_xmm, xword[rip + onesVector]); + // input - 0.5 + vsubss(scratch1, src1_xmm, xword[rip + half]); + + // trunc(input - 0.5) + vrndscaless(scratch2, scratch1, scratch1, _MM_FROUND_TRUNC); + + // SCRATCH = 1 * 2^(trunc(input - 0.5)) + vscalefss(scratch1, src2_xmm, scratch2); + + // SRC1 = input-trunc(input - 0.5) + vsubss(src1_xmm, src1_xmm, scratch2); + } else { + // Clamp to maximum range since we shift the value directly into the exponent. + minss(src1_xmm, xword[rip + input_max]); + maxss(src1_xmm, xword[rip + input_min]); + + if (cpuCaps.has(Cpu::tAVX)) { + vsubss(scratch1, src1_xmm, xword[rip + half]); + } else { + movss(scratch1, src1_xmm); + subss(scratch1, xword[rip + half]); + } + + if (cpuCaps.has(Cpu::tSSE41)) { + roundss(scratch1, scratch1, _MM_FROUND_TRUNC); + cvtss2si(eax, scratch1); + } else { + cvtss2si(eax, scratch1); + cvtsi2ss(scratch1, eax); + } + // SCRATCH now contains input rounded to the nearest integer. + add(eax, 0x7f); + subss(src1_xmm, scratch1); + // SRC1 contains input - round(input), which is in [-0.5, 0.5). 
+ shl(eax, 23); + movd(scratch1, eax); + // SCRATCH contains 2^(round(input)). + } + + // Complete computation of polynomial. + movss(scratch2, xword[rip + c0]); + + if (haveFMA3) { + vfmadd213ss(scratch2, src1_xmm, xword[rip + c1]); + vfmadd213ss(scratch2, src1_xmm, xword[rip + c2]); + vfmadd213ss(scratch2, src1_xmm, xword[rip + c3]); + vfmadd213ss(src1_xmm, scratch2, xword[rip + c4]); + } else { + mulss(scratch2, src1_xmm); + addss(scratch2, xword[rip + c1]); + mulss(scratch2, src1_xmm); + addss(scratch2, xword[rip + c2]); + mulss(scratch2, src1_xmm); + addss(scratch2, xword[rip + c3]); + mulss(src1_xmm, scratch2); + addss(src1_xmm, xword[rip + c4]); + } + + mulss(src1_xmm, scratch1); + L(retLabel); + + ret(); + return subroutine; +} + // As we mentioned above, this function is uber slow because we don't expect the shader JIT to call HLL functions in real scenarios // Aside from debugging code. So we don't care for this function to be performant or anything of the like. It is quick and dirty // And mostly meant to be used for generating logs to diff the JIT and interpreter diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 37b67a50..d75b0ae5 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -2,19 +2,45 @@ #include #include -#include #include +#include #include "PICA/float_types.hpp" #include "PICA/regs.hpp" +#include "renderer_null/renderer_null.hpp" +#ifdef PANDA3DS_ENABLE_OPENGL +#include "renderer_gl/renderer_gl.hpp" +#endif using namespace Floats; // Note: For when we have multiple backends, the GL state manager can stay here and have the constructor for the Vulkan-or-whatever renderer ignore it // Thus, our GLStateManager being here does not negatively impact renderer-agnosticness -GPU::GPU(Memory& mem, GLStateManager& gl, EmulatorConfig& config) : mem(mem), renderer(*this, gl, regs), config(config) { +GPU::GPU(Memory& mem, EmulatorConfig& config) : mem(mem), config(config) { vram = new u8[vramSize]; - mem.setVRAM(vram); // Give 
the bus a pointer to our VRAM + mem.setVRAM(vram); // Give the bus a pointer to our VRAM + + // Instantiate the rendering backend selected by config.rendererType + switch (config.rendererType) { + case RendererType::Null: { + renderer.reset(new RendererNull(*this, regs)); + break; + } +#ifdef PANDA3DS_ENABLE_OPENGL + case RendererType::OpenGL: { + renderer.reset(new RendererGL(*this, regs)); + break; + } +#endif + + case RendererType::Vulkan: { + Helpers::panic("Vulkan is not supported yet, please pick another renderer"); + break; // Do not fall through into the default case + } + + default: { + Helpers::panic("Rendering backend not supported: %s", Renderer::typeToString(config.rendererType)); + break; + } + } } void GPU::reset() { @@ -41,7 +67,7 @@ void GPU::reset() { e.config2 = 0; } - renderer.reset(); + renderer->reset(); } // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) @@ -73,15 +99,14 @@ void GPU::drawArrays() { // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; - const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer + const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer // Configures the type of primitive and the number of vertex shader outputs const u32 primConfig = regs[PICA::InternalRegs::PrimitiveConfig]; const PICA::PrimType primType = static_cast(Helpers::getBits<8, 2>(primConfig)); if (vertexCount > Renderer::vertexBufferSize) Helpers::panic("[PICA] vertexCount > vertexBufferSize"); - if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) || - (primType == PICA::PrimType::TriangleStrip && vertexCount < 3) || + if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) || (primType == PICA::PrimType::TriangleStrip && vertexCount < 3) || (primType == PICA::PrimType::TriangleFan && vertexCount < 3)) { 
Helpers::panic("Invalid vertex count for primitive. Type: %d, vert count: %d\n", primType, vertexCount); } @@ -89,10 +114,10 @@ void GPU::drawArrays() { // Get the configuration for the index buffer, used only for indexed drawing u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig]; u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff); - bool shortIndex = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit + bool shortIndex = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit // Stuff the global attribute config registers in one u64 to make attr parsing easier - // TODO: Cache this when the vertex attribute format registers are written to + // TODO: Cache this when the vertex attribute format registers are written to u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); if constexpr (!indexed) { @@ -111,24 +136,24 @@ void GPU::drawArrays() { constexpr size_t vertexCacheSize = 64; struct { - std::bitset validBits{0}; // Shows which tags are valid. If the corresponding bit is 1, then there's an entry - std::array ids; // IDs (ie indices of the cached vertices in the 3DS vertex buffer) - std::array bufferPositions; // Positions of the cached vertices in our own vertex buffer + std::bitset validBits{0}; // Shows which tags are valid. 
If the corresponding bit is 1, then there's an entry + std::array ids; // IDs (ie indices of the cached vertices in the 3DS vertex buffer) + std::array bufferPositions; // Positions of the cached vertices in our own vertex buffer } vertexCache; - + for (u32 i = 0; i < vertexCount; i++) { - u32 vertexIndex; // Index of the vertex in the VBO for indexed rendering + u32 vertexIndex; // Index of the vertex in the VBO for indexed rendering if constexpr (!indexed) { vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg]; } else { if (shortIndex) { auto ptr = getPointerPhys(indexBufferPointer); - vertexIndex = *ptr; // TODO: This is very unsafe + vertexIndex = *ptr; // TODO: This is very unsafe indexBufferPointer += 2; } else { auto ptr = getPointerPhys(indexBufferPointer); - vertexIndex = *ptr; // TODO: This is also very unsafe + vertexIndex = *ptr; // TODO: This is also very unsafe indexBufferPointer += 1; } } @@ -152,22 +177,22 @@ void GPU::drawArrays() { } int attrCount = 0; - int buffer = 0; // Vertex buffer index for non-fixed attributes + int buffer = 0; // Vertex buffer index for non-fixed attributes while (attrCount < totalAttribCount) { // Check if attribute is fixed or not - if (fixedAttribMask & (1 << attrCount)) { // Fixed attribute - vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; // TODO: Is this how it works? + if (fixedAttribMask & (1 << attrCount)) { // Fixed attribute + vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; // TODO: Is this how it works? 
vec4f& inputAttr = currentAttributes[attrCount]; - std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f)); // Copy fixed attr to input attr + std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f)); // Copy fixed attr to input attr attrCount++; - } else { // Non-fixed attribute - auto& attr = attributeInfo[buffer]; // Get information for this attribute - u64 attrCfg = attr.getConfigFull(); // Get config1 | (config2 << 32) + } else { // Non-fixed attribute + auto& attr = attributeInfo[buffer]; // Get information for this attribute + u64 attrCfg = attr.getConfigFull(); // Get config1 | (config2 << 32) u32 attrAddress = vertexBase + attr.offset + (vertexIndex * attr.size); for (int j = 0; j < attr.componentCount; j++) { - uint index = (attrCfg >> (j * 4)) & 0xf; // Get index of attribute in vertexCfg + uint index = (attrCfg >> (j * 4)) & 0xf; // Get index of attribute in vertexCfg // Vertex attributes used as padding // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively @@ -179,15 +204,15 @@ void GPU::drawArrays() { } u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; - u32 attribType = attribInfo & 0x3; // Type of attribute(sbyte/ubyte/short/float) - u32 size = (attribInfo >> 2) + 1; // Total number of components + u32 attribType = attribInfo & 0x3; // Type of attribute(sbyte/ubyte/short/float) + u32 size = (attribInfo >> 2) + 1; // Total number of components - //printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size); + // printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size); vec4f& attribute = currentAttributes[attrCount]; - uint component; // Current component + uint component; // Current component switch (attribType) { - case 0: { // Signed byte + case 0: { // Signed byte s8* ptr = getPointerPhys(attrAddress); for (component = 0; component < size; component++) { float val = static_cast(*ptr++); @@ -197,7 +222,7 @@ void GPU::drawArrays() { break; } - case 1: { // Unsigned byte + case 1: { // Unsigned byte u8* ptr = 
getPointerPhys(attrAddress); for (component = 0; component < size; component++) { float val = static_cast(*ptr++); @@ -207,7 +232,7 @@ void GPU::drawArrays() { break; } - case 2: { // Short + case 2: { // Short s16* ptr = getPointerPhys(attrAddress); for (component = 0; component < size; component++) { float val = static_cast(*ptr++); @@ -217,7 +242,7 @@ void GPU::drawArrays() { break; } - case 3: { // Float + case 3: { // Float float* ptr = getPointerPhys(attrAddress); for (component = 0; component < size; component++) { float val = *ptr++; @@ -251,8 +276,8 @@ void GPU::drawArrays() { const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); } - - if constexpr (useShaderJIT) { + + if constexpr (useShaderJIT) { shaderJIT.run(shaderUnit.vs); } else { shaderUnit.vs.run(); @@ -264,14 +289,14 @@ void GPU::drawArrays() { for (int i = 0; i < totalShaderOutputs; i++) { const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; - for (int j = 0; j < 4; j++) { // pls unroll + for (int j = 0; j < 4; j++) { // pls unroll const u32 mapping = (config >> (j * 8)) & 0x1F; out.raw[mapping] = shaderUnit.vs.outputs[i][j]; } } } - renderer.drawVertices(primType, std::span(vertices).first(vertexCount)); + renderer->drawVertices(primType, std::span(vertices).first(vertexCount)); } PICA::Vertex GPU::getImmediateModeVertex() { @@ -289,7 +314,9 @@ PICA::Vertex GPU::getImmediateModeVertex() { std::memcpy(&v.s.colour, &shaderUnit.vs.outputs[1], sizeof(vec4f)); std::memcpy(&v.s.texcoord0, &shaderUnit.vs.outputs[2], 2 * sizeof(f24)); - printf("(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3]); + printf( + "(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3] + ); printf("(r, g, b, a) = (%f, %f, %f, %f)\n", (double)v.s.colour[0], 
(double)v.s.colour[1], (double)v.s.colour[2], (double)v.s.colour[3]); printf("(u, v ) = (%f, %f)\n", (double)v.s.texcoord0[0], (double)v.s.texcoord0[1]); diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index f62040dd..d245f8af 100644 --- a/src/core/PICA/regs.cpp +++ b/src/core/PICA/regs.cpp @@ -1,11 +1,12 @@ -#include "PICA/gpu.hpp" #include "PICA/regs.hpp" +#include "PICA/gpu.hpp" + using namespace Floats; using namespace Helpers; u32 GPU::readReg(u32 address) { - if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers + if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers const u32 index = (address - 0x1EF01000) / sizeof(u32); return readInternalReg(index); } else { @@ -15,7 +16,7 @@ u32 GPU::readReg(u32 address) { } void GPU::writeReg(u32 address, u32 value) { - if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers + if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers const u32 index = (address - 0x1EF01000) / sizeof(u32); writeInternalReg(index, value, 0xffffffff); } else { @@ -59,7 +60,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } u32 currentValue = regs[index]; - u32 newValue = (currentValue & ~mask) | (value & mask); // Only overwrite the bits specified by "mask" + u32 newValue = (currentValue & ~mask) | (value & mask); // Only overwrite the bits specified by "mask" regs[index] = newValue; // TODO: Figure out if things like the shader index use the unmasked value or the masked one @@ -74,38 +75,38 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { break; case AttribFormatHigh: - totalAttribCount = (value >> 28) + 1; // Total number of vertex attributes - fixedAttribMask = getBits<16, 12>(value); // Determines which vertex attributes are fixed for all vertices + totalAttribCount = (value >> 28) + 1; // Total number of vertex attributes + fixedAttribMask = getBits<16, 12>(value); // Determines which vertex attributes are fixed 
for all vertices break; case ColourBufferLoc: { u32 loc = (value & 0x0fffffff) << 3; - renderer.setColourBufferLoc(loc); + renderer->setColourBufferLoc(loc); break; }; case ColourBufferFormat: { u32 format = getBits<16, 3>(value); - renderer.setColourFormat(static_cast(format)); + renderer->setColourFormat(static_cast(format)); break; } case DepthBufferLoc: { u32 loc = (value & 0x0fffffff) << 3; - renderer.setDepthBufferLoc(loc); + renderer->setDepthBufferLoc(loc); break; } case DepthBufferFormat: { u32 format = value & 0x3; - renderer.setDepthFormat(static_cast(format)); + renderer->setDepthFormat(static_cast(format)); break; } case FramebufferSize: { const u32 width = value & 0x7ff; const u32 height = getBits<12, 10>(value) + 1; - renderer.setFBSize(width, height); + renderer->setFBSize(width, height); break; } @@ -116,7 +117,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { case LightingLUTData4: case LightingLUTData5: case LightingLUTData6: - case LightingLUTData7:{ + case LightingLUTData7: { const uint32_t index = regs[LightingLUTIndex]; // Get full LUT index register const uint32_t lutID = getBits<8, 5>(index); // Get which LUT we're actually writing to uint32_t lutIndex = getBits<0, 8>(index); // And get the index inside the LUT we're writing to @@ -133,15 +134,22 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { break; } - case VertexFloatUniformIndex: + case VertexFloatUniformIndex: { shaderUnit.vs.setFloatUniformIndex(value); break; + } - case VertexFloatUniformData0: case VertexFloatUniformData1: case VertexFloatUniformData2: - case VertexFloatUniformData3: case VertexFloatUniformData4: case VertexFloatUniformData5: - case VertexFloatUniformData6: case VertexFloatUniformData7: + case VertexFloatUniformData0: + case VertexFloatUniformData1: + case VertexFloatUniformData2: + case VertexFloatUniformData3: + case VertexFloatUniformData4: + case VertexFloatUniformData5: + case VertexFloatUniformData6: + case 
VertexFloatUniformData7: { shaderUnit.vs.uploadFloatUniform(value); break; + } case FixedAttribIndex: fixedAttribCount = 0; @@ -162,7 +170,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } break; - case FixedAttribData0: case FixedAttribData1: case FixedAttribData2: + case FixedAttribData0: + case FixedAttribData1: + case FixedAttribData2: fixedAttrBuff[fixedAttribCount++] = value; if (fixedAttribCount == 3) { @@ -170,15 +180,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { vec4f attr; // These are stored in the reverse order anyone would expect them to be in - attr.x() = f24::fromRaw(fixedAttrBuff[2] & 0xffffff); - attr.y() = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24)); - attr.z() = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16)); - attr.w() = f24::fromRaw(fixedAttrBuff[0] >> 8); + attr[0] = f24::fromRaw(fixedAttrBuff[2] & 0xffffff); + attr[1] = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24)); + attr[2] = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16)); + attr[3] = f24::fromRaw(fixedAttrBuff[0] >> 8); // If the fixed attribute index is < 12, we're just writing to one of the fixed attributes if (fixedAttribIndex < 12) [[likely]] { shaderUnit.vs.fixedAttributes[fixedAttribIndex++] = attr; - } else if (fixedAttribIndex == 15) { // Otherwise if it's 15, we're submitting an immediate mode vertex + } else if (fixedAttribIndex == 15) { // Otherwise if it's 15, we're submitting an immediate mode vertex const uint totalAttrCount = (regs[PICA::InternalRegs::VertexShaderAttrNum] & 0xf) + 1; if (totalAttrCount <= immediateModeAttrIndex) { printf("Broken state in the immediate mode vertex submission pipeline. 
Failing silently\n"); @@ -199,13 +209,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { // If we've reached 3 verts, issue a draw call // Handle rendering depending on the primitive type if (immediateModeVertIndex == 3) { - renderer.drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); + renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); switch (primType) { // Triangle or geometry primitive. Draw a triangle and discard all vertices - case 0: case 3: + case 0: + case 3: { immediateModeVertIndex = 0; break; + } // Triangle strip. Draw triangle, discard first vertex and keep the last 2 case 1: @@ -223,54 +235,72 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } } } - } else { // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see + } else { // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see log("Wrote to invalid fixed vertex attribute %d\n", fixedAttribIndex); } } break; - case VertexShaderOpDescriptorIndex: + case VertexShaderOpDescriptorIndex: { shaderUnit.vs.setOpDescriptorIndex(value); break; + } - case VertexShaderOpDescriptorData0: case VertexShaderOpDescriptorData1: case VertexShaderOpDescriptorData2: - case VertexShaderOpDescriptorData3: case VertexShaderOpDescriptorData4: case VertexShaderOpDescriptorData5: - case VertexShaderOpDescriptorData6: case VertexShaderOpDescriptorData7: + case VertexShaderOpDescriptorData0: + case VertexShaderOpDescriptorData1: + case VertexShaderOpDescriptorData2: + case VertexShaderOpDescriptorData3: + case VertexShaderOpDescriptorData4: + case VertexShaderOpDescriptorData5: + case VertexShaderOpDescriptorData6: + case VertexShaderOpDescriptorData7: { shaderUnit.vs.uploadDescriptor(value); break; + } - case VertexBoolUniform: + case VertexBoolUniform: { shaderUnit.vs.boolUniform = value & 0xffff; break; + } - case VertexIntUniform0: case VertexIntUniform1: case VertexIntUniform2: case VertexIntUniform3: + case 
VertexIntUniform0: + case VertexIntUniform1: + case VertexIntUniform2: + case VertexIntUniform3: { shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value); break; + } - case VertexShaderData0: case VertexShaderData1: case VertexShaderData2: case VertexShaderData3: - case VertexShaderData4: case VertexShaderData5: case VertexShaderData6: case VertexShaderData7: + case VertexShaderData0: + case VertexShaderData1: + case VertexShaderData2: + case VertexShaderData3: + case VertexShaderData4: + case VertexShaderData5: + case VertexShaderData6: + case VertexShaderData7: { shaderUnit.vs.uploadWord(value); break; + } - case VertexShaderEntrypoint: + case VertexShaderEntrypoint: { shaderUnit.vs.entrypoint = value & 0xffff; break; + } case VertexShaderTransferEnd: if (value != 0) shaderUnit.vs.finalize(); break; - case VertexShaderTransferIndex: - shaderUnit.vs.setBufferIndex(value); - break; + case VertexShaderTransferIndex: shaderUnit.vs.setBufferIndex(value); break; // Command lists can write to the command processor registers and change the command list stream // Several games are known to do this, including New Super Mario Bros 2 and Super Mario 3D Land case CmdBufTrigger0: case CmdBufTrigger1: { - if (value != 0) { // A non-zero value triggers command list processing - int bufferIndex = index - CmdBufTrigger0; // Index of the command buffer to execute (0 or 1) + if (value != 0) { // A non-zero value triggers command list processing + int bufferIndex = index - CmdBufTrigger0; // Index of the command buffer to execute (0 or 1) u32 addr = (regs[CmdBufAddr0 + bufferIndex] & 0xfffffff) << 3; u32 size = (regs[CmdBufSize0 + bufferIndex] & 0xfffff) << 3; @@ -285,15 +315,13 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { default: // Vertex attribute registers if (index >= AttribInfoStart && index <= AttribInfoEnd) { - uint attributeIndex = (index - AttribInfoStart) / 3; // Which attribute are we writing to - uint reg = (index - AttribInfoStart) % 3; // 
Which of this attribute's registers are we writing to? + uint attributeIndex = (index - AttribInfoStart) / 3; // Which attribute are we writing to + uint reg = (index - AttribInfoStart) % 3; // Which of this attribute's registers are we writing to? auto& attr = attributeInfo[attributeIndex]; switch (reg) { - case 0: attr.offset = value & 0xfffffff; break; // Attribute offset - case 1: - attr.config1 = value; - break; + case 0: attr.offset = value & 0xfffffff; break; // Attribute offset + case 1: attr.config1 = value; break; case 2: attr.config2 = value; attr.size = getBits<16, 8>(value); @@ -339,13 +367,13 @@ void GPU::startCommandList(u32 addr, u32 size) { u32 id = header & 0xffff; u32 paramMaskIndex = getBits<16, 4>(header); - u32 paramCount = getBits<20, 8>(header); // Number of additional parameters + u32 paramCount = getBits<20, 8>(header); // Number of additional parameters // Bit 31 tells us whether this command is going to write to multiple sequential registers (if the bit is 1) // Or if all written values will go to the same register (If the bit is 0). It's essentially the value that // gets added to the "id" field after each register write bool consecutiveWritingMode = (header >> 31) != 0; - u32 mask = maskLUT[paramMaskIndex]; // Actual parameter mask + u32 mask = maskLUT[paramMaskIndex]; // Actual parameter mask // Increment the ID by 1 after each write if we're in consecutive mode, or 0 otherwise u32 idIncrement = (consecutiveWritingMode) ? 
1 : 0; diff --git a/src/core/PICA/shader_interpreter.cpp b/src/core/PICA/shader_interpreter.cpp index 7af284e3..9fed6bba 100644 --- a/src/core/PICA/shader_interpreter.cpp +++ b/src/core/PICA/shader_interpreter.cpp @@ -1,6 +1,7 @@ -#include "PICA/shader.hpp" #include +#include "PICA/shader.hpp" + using namespace Helpers; void PICAShader::run() { @@ -11,20 +12,23 @@ void PICAShader::run() { while (true) { const u32 instruction = loadedShader[pc++]; - const u32 opcode = instruction >> 26; // Top 6 bits are the opcode + const u32 opcode = instruction >> 26; // Top 6 bits are the opcode switch (opcode) { case ShaderOpcodes::ADD: add(instruction); break; case ShaderOpcodes::CALL: call(instruction); break; case ShaderOpcodes::CALLC: callc(instruction); break; case ShaderOpcodes::CALLU: callu(instruction); break; - case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: + case ShaderOpcodes::CMP1: + case ShaderOpcodes::CMP2: { cmp(instruction); break; + } + case ShaderOpcodes::DP3: dp3(instruction); break; case ShaderOpcodes::DP4: dp4(instruction); break; case ShaderOpcodes::DPHI: dphi(instruction); break; - case ShaderOpcodes::END: return; // Stop running shader + case ShaderOpcodes::END: return; // Stop running shader case ShaderOpcodes::EX2: ex2(instruction); break; case ShaderOpcodes::FLR: flr(instruction); break; case ShaderOpcodes::IFC: ifc(instruction); break; @@ -38,31 +42,47 @@ void PICAShader::run() { case ShaderOpcodes::MOV: mov(instruction); break; case ShaderOpcodes::MOVA: mova(instruction); break; case ShaderOpcodes::MUL: mul(instruction); break; - case ShaderOpcodes::NOP: break; // Do nothing + case ShaderOpcodes::NOP: break; // Do nothing case ShaderOpcodes::RCP: rcp(instruction); break; case ShaderOpcodes::RSQ: rsq(instruction); break; case ShaderOpcodes::SGEI: sgei(instruction); break; case ShaderOpcodes::SLT: slt(instruction); break; case ShaderOpcodes::SLTI: slti(instruction); break; - case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 
0x36: case 0x37: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: { madi(instruction); break; + } - case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E: case 0x3F: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: { mad(instruction); break; + } - default:Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode); + default: Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode); } // Handle control flow statements. The ordering is important as the priority goes: LOOP > IF > CALL // Handle loop if (loopIndex != 0) { auto& loop = loopInfo[loopIndex - 1]; - if (pc == loop.endingPC) { // Check if the loop needs to start over + if (pc == loop.endingPC) { // Check if the loop needs to start over loop.iterations -= 1; - if (loop.iterations == 0) // If the loop ended, go one level down on the loop stack + if (loop.iterations == 0) // If the loop ended, go one level down on the loop stack loopIndex -= 1; loopCounter += loop.increment; @@ -73,7 +93,7 @@ void PICAShader::run() { // Handle ifs if (ifIndex != 0) { auto& info = conditionalInfo[ifIndex - 1]; - if (pc == info.endingPC) { // Check if the IF block ended + if (pc == info.endingPC) { // Check if the IF block ended pc = info.newPC; ifIndex -= 1; } @@ -82,7 +102,7 @@ void PICAShader::run() { // Handle calls if (callIndex != 0) { auto& info = callInfo[callIndex - 1]; - if (pc == info.endingPC) { // Check if the CALL block ended + if (pc == info.endingPC) { // Check if the CALL block ended pc = info.returnPC; callIndex -= 1; } @@ -92,15 +112,15 @@ void PICAShader::run() { // Calculate the actual source value using an instruction's source field and it's respective index value // The index value is used to apply relative addressing when index != 0 by adding one of the 3 addr registers to the -// source field, but only with the 
original source field is pointing at a vector uniform register +// source field, but only with the original source field is pointing at a vector uniform register u8 PICAShader::getIndexedSource(u32 source, u32 index) { - if (source < 0x20) // No offset is applied if the source isn't pointing to a vector uniform reg + if (source < 0x20) // No offset is applied if the source isn't pointing to a vector uniform reg return source; switch (index) { - case 0: [[likely]] return u8(source); // No offset applied - case 1: return u8(source + addrRegister.x()); - case 2: return u8(source + addrRegister.y()); + case 0: [[likely]] return u8(source); // No offset applied + case 1: return u8(source + addrRegister[0]); + case 2: return u8(source + addrRegister[1]); case 3: return u8(source + loopCounter); } @@ -117,7 +137,7 @@ PICAShader::vec4f PICAShader::getSource(u32 source) { return floatUniforms[source - 0x20]; else { Helpers::warn("[PICA] Unimplemented source value: %X\n", source); - return vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); + return vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); } } @@ -136,13 +156,13 @@ bool PICAShader::isCondTrue(u32 instruction) { bool refX = (getBit<25>(instruction)) != 0; switch (condition) { - case 0: // Either cmp register matches + case 0: // Either cmp register matches return cmpRegister[0] == refX || cmpRegister[1] == refY; - case 1: // Both cmp registers match + case 1: // Both cmp registers match return cmpRegister[0] == refX && cmpRegister[1] == refY; - case 2: // At least cmp.x matches + case 2: // At least cmp.x matches return cmpRegister[0] == refX; - default: // At least cmp.y matches + default: // At least cmp.y matches return cmpRegister[1] == refY; } } @@ -150,7 +170,7 @@ bool PICAShader::isCondTrue(u32 instruction) { void PICAShader::add(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 
5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -171,7 +191,7 @@ void PICAShader::add(u32 instruction) { void PICAShader::mul(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -210,7 +230,7 @@ void PICAShader::flr(u32 instruction) { void PICAShader::max(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -232,7 +252,7 @@ void PICAShader::max(u32 instruction) { void PICAShader::min(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -278,16 +298,16 @@ void PICAShader::mova(u32 instruction) { vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor); u32 componentMask = operandDescriptor & 0xf; - if (componentMask & 0b1000) // x component - addrRegister.x() = static_cast(srcVector.x().toFloat32()); - if (componentMask & 0b0100) // y 
component - addrRegister.y() = static_cast(srcVector.y().toFloat32()); + if (componentMask & 0b1000) // x component + addrRegister[0] = static_cast(srcVector[0].toFloat32()); + if (componentMask & 0b0100) // y component + addrRegister[1] = static_cast(srcVector[1].toFloat32()); } void PICAShader::dp3(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -309,7 +329,7 @@ void PICAShader::dp3(u32 instruction) { void PICAShader::dp4(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -480,7 +500,7 @@ void PICAShader::madi(u32 instruction) { void PICAShader::slt(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -542,11 +562,11 @@ void PICAShader::slti(u32 instruction) { void PICAShader::cmp(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 
5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 cmpY = getBits<21, 3>(instruction); const u32 cmpX = getBits<24, 3>(instruction); - const u32 cmpOperations[2] = { cmpX, cmpY }; + const u32 cmpOperations[2] = {cmpX, cmpY}; if (idx) Helpers::panic("[PICA] CMP: idx != 0"); vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor); @@ -554,33 +574,34 @@ void PICAShader::cmp(u32 instruction) { for (int i = 0; i < 2; i++) { switch (cmpOperations[i]) { - case 0: // Equal + case 0: // Equal cmpRegister[i] = srcVec1[i] == srcVec2[i]; break; - case 1: // Not equal + case 1: // Not equal cmpRegister[i] = srcVec1[i] != srcVec2[i]; break; - case 2: // Less than + case 2: // Less than cmpRegister[i] = srcVec1[i] < srcVec2[i]; break; - case 3: // Less than or equal + case 3: // Less than or equal cmpRegister[i] = srcVec1[i] <= srcVec2[i]; break; - case 4: // Greater than + case 4: // Greater than cmpRegister[i] = srcVec1[i] > srcVec2[i]; break; - case 5: // Greater than or equal + case 5: // Greater than or equal cmpRegister[i] = srcVec1[i] >= srcVec2[i]; break; - default: + default: { cmpRegister[i] = true; break; + } } } } @@ -604,7 +625,7 @@ void PICAShader::ifc(u32 instruction) { void PICAShader::ifu(u32 instruction) { const u32 dest = getBits<10, 12>(instruction); - const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check if (boolUniform & (1 << bit)) { if (ifIndex >= 8) [[unlikely]] @@ -615,8 +636,7 @@ void PICAShader::ifu(u32 instruction) { auto& block = conditionalInfo[ifIndex++]; block.endingPC = dest; block.newPC = dest + num; - } - else { + } else { pc = dest; } } @@ -637,12 +657,12 @@ void PICAShader::call(u32 instruction) { void PICAShader::callc(u32 instruction) { if (isCondTrue(instruction)) { - call(instruction); // Pls inline + call(instruction); // Pls inline } } void PICAShader::callu(u32 
instruction) { - const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check if (boolUniform & (1 << bit)) { if (callIndex >= 4) [[unlikely]] @@ -664,26 +684,27 @@ void PICAShader::loop(u32 instruction) { Helpers::panic("[PICA] Overflowed loop stack"); u32 dest = getBits<10, 12>(instruction); - auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from - loopCounter = uniform.y(); + auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from + loopCounter = uniform[1]; auto& loop = loopInfo[loopIndex++]; loop.startingPC = pc; - loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here - loop.iterations = uniform.x() + 1; - loop.increment = uniform.z(); + loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here + loop.iterations = uniform[0] + 1; + loop.increment = uniform[2]; } void PICAShader::jmpc(u32 instruction) { - if (isCondTrue(instruction)) + if (isCondTrue(instruction)) { pc = getBits<10, 12>(instruction); + } } void PICAShader::jmpu(u32 instruction) { - const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we want to compare to true, otherwise compare to false + const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we want to compare to true, otherwise compare to false const u32 dest = getBits<10, 12>(instruction); - const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check - if (((boolUniform >> bit) & 1) == test) // Jump if the bool uniform is the value we want + if (((boolUniform >> bit) & 1) == test) // Jump if the bool uniform is the value we want pc = dest; } \ No newline at end of file diff --git a/src/core/PICA/shader_unit.cpp b/src/core/PICA/shader_unit.cpp index 6cbc2693..aa7b4c12 100644 --- a/src/core/PICA/shader_unit.cpp +++ 
b/src/core/PICA/shader_unit.cpp @@ -1,4 +1,5 @@ #include "PICA/shader_unit.hpp" + #include "cityhash.hpp" void ShaderUnit::reset() { @@ -18,18 +19,18 @@ void PICAShader::reset() { opDescriptorIndex = 0; f32UniformTransfer = false; - const vec4f zero = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); + const vec4f zero = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); inputs.fill(zero); floatUniforms.fill(zero); outputs.fill(zero); tempRegisters.fill(zero); for (auto& e : intUniforms) { - e.x() = e.y() = e.z() = e.w() = 0; + e[0] = e[1] = e[2] = e[3] = 0; } - addrRegister.x() = 0; - addrRegister.y() = 0; + addrRegister[0] = 0; + addrRegister[1] = 0; loopCounter = 0; codeHashDirty = true; diff --git a/src/core/action_replay.cpp b/src/core/action_replay.cpp new file mode 100644 index 00000000..ad391b36 --- /dev/null +++ b/src/core/action_replay.cpp @@ -0,0 +1,210 @@ +#include "action_replay.hpp" + +ActionReplay::ActionReplay(Memory& mem, HIDService& hid) : mem(mem), hid(hid) { reset(); } + +void ActionReplay::reset() { + // Default value of storage regs is 0 + storage1 = 0; + storage2 = 0; + + // TODO: Is the active storage persistent or not? + activeStorage = &storage1; +} + +void ActionReplay::runCheat(const Cheat& cheat) { + // Set offset and data registers to 0 at the start of a cheat + data1 = data2 = offset1 = offset2 = 0; + pc = 0; + ifStackIndex = 0; + loopStackIndex = 0; + running = true; + + activeOffset = &offset1; + activeData = &data1; + + while (running) { + // See if we can fetch 1 64-bit opcode, otherwise we're out of bounds. Cheats seem to end when going out of bounds? 
+ if (pc + 1 >= cheat.size()) { + return; + } + // Fetch instruction + const u32 instruction = cheat[pc++]; + + // Instructions D0000000 00000000 and D2000000 00000000 are unconditional + bool isUnconditional = cheat[pc] == 0 && (instruction == 0xD0000000 || instruction == 0xD2000000); + if (ifStackIndex > 0 && !isUnconditional && !ifStack[ifStackIndex - 1]) { + pc++; // Eat up dummy word + continue; // Skip conditional instructions where the condition is false + } + + runInstruction(cheat, instruction); + } +} + +u8 ActionReplay::read8(u32 addr) { return mem.read8(addr); } +u16 ActionReplay::read16(u32 addr) { return mem.read16(addr); } +u32 ActionReplay::read32(u32 addr) { return mem.read32(addr); } + +// Some AR cheats seem to want to write to unmapped memory or memory that straight up does not exist + +#define MAKE_WRITE_HANDLER(size) \ + void ActionReplay::write##size(u32 addr, u##size value) { \ + auto pointerWrite = mem.getWritePointer(addr); \ + if (pointerWrite) { \ + *(u##size*)pointerWrite = value; \ + } else { \ + auto pointerRead = mem.getReadPointer(addr); \ + if (pointerRead) { \ + *(u##size*)pointerRead = value; \ + } else { \ + Helpers::warn("AR code tried to write to invalid address: %08X\n", addr); \ + } \ + } \ + } + +MAKE_WRITE_HANDLER(8) +MAKE_WRITE_HANDLER(16) +MAKE_WRITE_HANDLER(32) +#undef MAKE_WRITE_HANDLER + +void ActionReplay::runInstruction(const Cheat& cheat, u32 instruction) { + // Top nibble determines the instruction type + const u32 type = instruction >> 28; + + switch (type) { + // 32-bit write to [XXXXXXX + offset] + case 0x0: { + const u32 baseAddr = Helpers::getBits<0, 28>(instruction); + const u32 value = cheat[pc++]; + write32(baseAddr + *activeOffset, value); + break; + } + + // 16-bit write to [XXXXXXX + offset] + case 0x1: { + const u32 baseAddr = Helpers::getBits<0, 28>(instruction); + const u16 value = u16(cheat[pc++]); + write16(baseAddr + *activeOffset, value); + break; + } + + // 8-bit write to [XXXXXXX + offset] + 
case 0x2: { + const u32 baseAddr = Helpers::getBits<0, 28>(instruction); + const u8 value = u8(cheat[pc++]); + write8(baseAddr + *activeOffset, value); + break; + } + + // Less Than (YYYYYYYY < [XXXXXXX + offset]) + case 0x4: { + const u32 baseAddr = Helpers::getBits<0, 28>(instruction); + const u32 imm = cheat[pc++]; + const u32 value = read32(baseAddr + *activeOffset); + Helpers::panic("TODO: How do ActionReplay conditional blocks work?"); + break; + } + + case 0xD: executeDType(cheat, instruction); break; + default: Helpers::panic("Unimplemented ActionReplay instruction type %X", type); break; + } +} + +void ActionReplay::executeDType(const Cheat& cheat, u32 instruction) { + switch (instruction) { + case 0xD3000000: offset1 = cheat[pc++]; break; + case 0xD3000001: offset2 = cheat[pc++]; break; + case 0xDC000000: *activeOffset += cheat[pc++]; break; + + // DD000000 XXXXXXXX - if KEYPAD has value XXXXXXXX execute next block + case 0xDD000000: { + const u32 mask = cheat[pc++]; + const u32 buttons = hid.getOldButtons(); + + pushConditionBlock((buttons & mask) == mask); + break; + } + + // Offset register ops + case 0xDF000000: { + const u32 subopcode = cheat[pc++]; + switch (subopcode) { + case 0x00000000: activeOffset = &offset1; break; + case 0x00000001: activeOffset = &offset2; break; + case 0x00010000: offset2 = offset1; break; + case 0x00010001: offset1 = offset2; break; + case 0x00020000: data1 = offset1; break; + case 0x00020001: data2 = offset2; break; + default: + Helpers::warn("Unknown ActionReplay offset operation"); + running = false; + break; + } + break; + } + + // Data register operations + case 0xDF000001: { + const u32 subopcode = cheat[pc++]; + switch (subopcode) { + case 0x00000000: activeData = &data1; break; + case 0x00000001: activeData = &data2; break; + + case 0x00010000: data2 = data1; break; + case 0x00010001: data1 = data2; break; + case 0x00020000: offset1 = data1; break; + case 0x00020001: offset2 = data2; break; + default: + 
Helpers::warn("Unknown ActionReplay data operation"); + running = false; + break; + } + break; + } + + // Storage register operations + case 0xDF000002: { + const u32 subopcode = cheat[pc++]; + switch (subopcode) { + case 0x00000000: activeStorage = &storage1; break; + case 0x00000001: activeStorage = &storage2; break; + + case 0x00010000: data1 = storage1; break; + case 0x00010001: data2 = storage2; break; + case 0x00020000: storage1 = data1; break; + case 0x00020001: storage2 = data2; break; + default: + Helpers::warn("Unknown ActionReplay data operation: %08X", subopcode); + running = false; + break; + } + break; + } + + // Control flow block operations + case 0xD2000000: { + const u32 subopcode = cheat[pc++]; + switch (subopcode) { + // Ends all loop/execute blocks + case 0: + loopStackIndex = 0; + ifStackIndex = 0; + break; + default: Helpers::panic("Unknown ActionReplay control flow operation: %08X", subopcode); break; + } + break; + } + + default: Helpers::panic("ActionReplay: Unimplemented d-type opcode: %08X", instruction); break; + } +} + +void ActionReplay::pushConditionBlock(bool condition) { + if (ifStackIndex >= 32) { + Helpers::warn("ActionReplay if stack overflowed"); + running = false; + return; + } + + ifStack[ifStackIndex++] = condition; +} \ No newline at end of file diff --git a/src/core/cheats.cpp b/src/core/cheats.cpp new file mode 100644 index 00000000..4c63652b --- /dev/null +++ b/src/core/cheats.cpp @@ -0,0 +1,28 @@ +#include "cheats.hpp" + +Cheats::Cheats(Memory& mem, HIDService& hid) : ar(mem, hid) { reset(); } + +void Cheats::reset() { + cheats.clear(); // Unload loaded cheats + ar.reset(); // Reset ActionReplay +} + +void Cheats::addCheat(const Cheat& cheat) { cheats.push_back(cheat); } + +void Cheats::run() { + for (const Cheat& cheat : cheats) { + switch (cheat.type) { + case CheatType::ActionReplay: { + ar.runCheat(cheat.instructions); + break; + } + + case CheatType::Gateway: { + Helpers::panic("Gateway cheats not supported yet! 
Only Action Replay is supported!"); + break; + } + + default: Helpers::panic("Unknown cheat type"); + } + } +} \ No newline at end of file diff --git a/src/core/kernel/directory_operations.cpp b/src/core/kernel/directory_operations.cpp index 2d5d7abc..d4cac064 100644 --- a/src/core/kernel/directory_operations.cpp +++ b/src/core/kernel/directory_operations.cpp @@ -1,3 +1,10 @@ +#include +#include +#include +#include +#include + +#include "ipc.hpp" #include "kernel.hpp" namespace DirectoryOps { @@ -7,6 +14,79 @@ namespace DirectoryOps { }; } +// Helper to convert std::string to an 8.3 filename to mimic how Directory::Read works +using ShortFilename = std::array; +using ShortExtension = std::array; +using Filename83 = std::pair; + +// The input string should be the stem and extension together, not separately +// Eg something like "boop.png", "panda.txt", etc +Filename83 convertTo83(const std::string& path) { + ShortFilename filename; + ShortExtension extension; + + // Convert a character to add it to the 8.3 name + // "Characters such as + are changed to the underscore _, and letters are put in uppercase" + // For now we put letters in uppercase until we find out what is supposed to be converted to _ and so on + auto convertCharacter = [](char c) { return (char) std::toupper(c); }; + + // List of forbidden character for 8.3 filenames, from Citra + // TODO: Use constexpr when C++20 support is solid + const std::string forbiddenChars = ".\"/\\[]:;=, "; + + // By default space-initialize the whole name, append null terminator in the end for both the filename and extension + filename.fill(' '); + extension.fill(' '); + filename[filename.size() - 1] = '\0'; + extension[extension.size() - 1] = '\0'; + + // Find the position of the dot in the string + auto dotPos = path.rfind('.'); + // Wikipedia: If a file name has no extension, a trailing . 
has no effect + // Thus check if the last character is a dot and ignore it, prefering the previous dot if it exists + if (dotPos == path.size() - 1) { + dotPos = path.rfind('.', dotPos); // Get previous dot + } + + // If pointPos is not npos we have a valid dot character, and as such an extension + bool haveExtension = dotPos != std::string::npos; + int validCharacterCount = 0; + bool filenameTooBig = false; + + // Parse characters until we're done OR until we reach 9 characters, in which case according to Wikipedia we must truncate to 6 letters + // And append ~1 in the end + for (auto c : path.substr(0, dotPos)) { + // Character is forbidden, we must ignore it + if (forbiddenChars.find(c) != std::string::npos) { + continue; + } + + // We already have capped the amount of characters, thus our filename is too big + if (validCharacterCount == 8) { + filenameTooBig = true; + break; + } + filename[validCharacterCount++] = convertCharacter(c); // Append character to filename + } + + // Truncate name to 6 characters and denote that it is too big + // TODO: Wikipedia says we should also do this if the filename contains an invalid character, including spaces. 
Must test + if (filenameTooBig) { + filename[6] = '~'; + filename[7] = '1'; + } + + if (haveExtension) { + int extensionLen = 0; + // Copy up to 3 characters from the dot onwards to the extension + for (auto c : path.substr(dotPos + 1, 3)) { + extension[extensionLen++] = convertCharacter(c); + } + } + + return {filename, extension}; +} + void Kernel::handleDirectoryOperation(u32 messagePointer, Handle directory) { const u32 cmd = mem.read32(messagePointer); switch (cmd) { @@ -25,16 +105,77 @@ void Kernel::closeDirectory(u32 messagePointer, Handle directory) { } p->getData()->isOpen = false; + mem.write32(messagePointer, IPC::responseHeader(0x802, 1, 0)); mem.write32(messagePointer + 4, Result::Success); } - void Kernel::readDirectory(u32 messagePointer, Handle directory) { const u32 entryCount = mem.read32(messagePointer + 4); const u32 outPointer = mem.read32(messagePointer + 12); logFileIO("Directory::Read (handle = %X, entry count = %d, out pointer = %08X)\n", directory, entryCount, outPointer); - Helpers::panicDev("Unimplemented FsDir::Read"); + + const auto p = getObject(directory, KernelObjectType::Directory); + if (p == nullptr) [[unlikely]] { + Helpers::panic("Called ReadDirectory on non-existent directory"); + } + DirectorySession* session = p->getData(); + if (!session->pathOnDisk.has_value()) [[unlikely]] { + Helpers::panic("Called ReadDirectory on directory that doesn't have a path on disk"); + } + + std::filesystem::path dirPath = session->pathOnDisk.value(); + + int count = 0; + while (count < entryCount && session->currentEntry < session->entries.size()) { + const auto& entry = session->entries[session->currentEntry]; + std::filesystem::path path = entry.path; + std::filesystem::path filename = path.filename(); + + std::filesystem::path relative = path.lexically_relative(dirPath); + bool isDirectory = std::filesystem::is_directory(relative); + + std::u16string nameU16 = relative.u16string(); + bool isHidden = nameU16[0] == u'.'; // If the first 
character is a dot then this is a hidden file/folder + + const u32 entryPointer = outPointer + (count * 0x228); // 0x228 is the size of a single entry + u32 utfPointer = entryPointer; + u32 namePointer = entryPointer + 0x20C; + u32 extensionPointer = entryPointer + 0x216; + u32 attributePointer = entryPointer + 0x21C; + u32 sizePointer = entryPointer + 0x220; + + std::string filenameString = filename.string(); + auto [shortFilename, shortExtension] = convertTo83(filenameString); + + for (auto c : nameU16) { + mem.write16(utfPointer, u16(c)); + utfPointer += sizeof(u16); + } + mem.write16(utfPointer, 0); // Null terminate the UTF16 name + + // Write 8.3 filename-extension + for (auto c : shortFilename) { + mem.write8(namePointer, u8(c)); + namePointer += sizeof(u8); + } + + for (auto c : shortExtension) { + mem.write8(extensionPointer, u8(c)); + extensionPointer += sizeof(u8); + } + + mem.write8(outPointer + 0x21A, 1); // Always 1 according to 3DBrew + mem.write8(attributePointer, entry.isDirectory ? 1 : 0); // "Is directory" attribute + mem.write8(attributePointer + 1, isHidden ? 1 : 0); // "Is hidden" attribute + mem.write8(attributePointer + 2, entry.isDirectory ? 
0 : 1); // "Is archive" attribute + mem.write8(attributePointer + 3, 0); // "Is read-only" attribute + + count++; // Increment number of read directories + session->currentEntry++; // Increment index of the entry currently being read + } + + mem.write32(messagePointer, IPC::responseHeader(0x801, 2, 2)); mem.write32(messagePointer + 4, Result::Success); - mem.write32(messagePointer + 8, 0); + mem.write32(messagePointer + 8, count); } diff --git a/src/core/kernel/kernel.cpp b/src/core/kernel/kernel.cpp index 1402b468..8f3aeda0 100644 --- a/src/core/kernel/kernel.cpp +++ b/src/core/kernel/kernel.cpp @@ -95,14 +95,29 @@ KernelObject* Kernel::getProcessFromPID(Handle handle) { } void Kernel::deleteObjectData(KernelObject& object) { - using enum KernelObjectType; - - // Resource limit and thread objects do not allocate heap data, so we don't delete anything - if (object.data == nullptr || object.type == ResourceLimit || object.type == Thread) { + if (object.data == nullptr) { return; } - delete object.data; + // Resource limit and thread objects do not allocate heap data, so we don't delete anything + + switch (object.type) { + case KernelObjectType::AddressArbiter: delete object.getData(); return; + case KernelObjectType::Archive: delete object.getData(); return; + case KernelObjectType::Directory: delete object.getData(); return; + case KernelObjectType::Event: delete object.getData(); return; + case KernelObjectType::File: delete object.getData(); return; + case KernelObjectType::MemoryBlock: delete object.getData(); return; + case KernelObjectType::Port: delete object.getData(); return; + case KernelObjectType::Process: delete object.getData(); return; + case KernelObjectType::ResourceLimit: return; + case KernelObjectType::Session: delete object.getData(); return; + case KernelObjectType::Mutex: delete object.getData(); return; + case KernelObjectType::Semaphore: delete object.getData(); return; + case KernelObjectType::Thread: return; + case 
KernelObjectType::Dummy: return; + default: [[unlikely]] Helpers::warn("unknown object type"); return; + } } void Kernel::reset() { @@ -240,4 +255,4 @@ std::string Kernel::getProcessName(u32 pid) { } else { Helpers::panic("Attempted to name non-current process"); } -} \ No newline at end of file +} diff --git a/src/core/renderer_gl/etc1.cpp b/src/core/renderer_gl/etc1.cpp index 82f06724..8aefd622 100644 --- a/src/core/renderer_gl/etc1.cpp +++ b/src/core/renderer_gl/etc1.cpp @@ -9,7 +9,7 @@ static constexpr u32 signExtend3To32(u32 val) { return (u32)(s32(val) << 29 >> 29); } -u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data) { +u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span data) { // Pixel offset of the 8x8 tile based on u, v and the width of the texture u32 offs = ((u & ~7) * 8) + ((v & ~7) * width); if (!hasAlpha) @@ -30,8 +30,7 @@ u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* dat offs += subTileSize * subTileIndex; u32 alpha; - const u8* tmp = static_cast(data) + offs; // Pointer to colour and alpha data as u8* - const u64* ptr = reinterpret_cast(tmp); // Cast to u64* + const u64* ptr = reinterpret_cast(data.data() + offs); // Cast to u64* if (hasAlpha) { // First 64 bits of the 4x4 subtile are alpha data @@ -118,4 +117,4 @@ u32 Texture::decodeETC(u32 alpha, u32 u, u32 v, u64 colourData) { b = std::clamp(b + modifier, 0, 255); return (alpha << 24) | (u32(b) << 16) | (u32(g) << 8) | u32(r); -} \ No newline at end of file +} diff --git a/src/gl_state.cpp b/src/core/renderer_gl/gl_state.cpp similarity index 96% rename from src/gl_state.cpp rename to src/core/renderer_gl/gl_state.cpp index 612ae44d..691eb7b6 100644 --- a/src/gl_state.cpp +++ b/src/core/renderer_gl/gl_state.cpp @@ -1,4 +1,4 @@ -#include "gl_state.hpp" +#include "renderer_gl/gl_state.hpp" void GLStateManager::resetBlend() { blendEnabled = false; diff --git a/src/core/renderer_gl/renderer_gl.cpp 
b/src/core/renderer_gl/renderer_gl.cpp index 3a13b31d..bef3fe93 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -1,582 +1,22 @@ #include "renderer_gl/renderer_gl.hpp" + +#include + +#include + #include "PICA/float_types.hpp" #include "PICA/gpu.hpp" #include "PICA/regs.hpp" +CMRC_DECLARE(RendererGL); + using namespace Floats; using namespace Helpers; using namespace PICA; -const char* vertexShader = R"( - #version 410 core - - layout (location = 0) in vec4 a_coords; - layout (location = 1) in vec4 a_quaternion; - layout (location = 2) in vec4 a_vertexColour; - layout (location = 3) in vec2 a_texcoord0; - layout (location = 4) in vec2 a_texcoord1; - layout (location = 5) in float a_texcoord0_w; - layout (location = 6) in vec3 a_view; - layout (location = 7) in vec2 a_texcoord2; +RendererGL::~RendererGL() {} - out vec3 v_normal; - out vec3 v_tangent; - out vec3 v_bitangent; - out vec4 v_colour; - out vec3 v_texcoord0; - out vec2 v_texcoord1; - out vec3 v_view; - out vec2 v_texcoord2; - flat out vec4 v_textureEnvColor[6]; - flat out vec4 v_textureEnvBufferColor; - - out float gl_ClipDistance[2]; - - // TEV uniforms - uniform uint u_textureEnvColor[6]; - uniform uint u_picaRegs[0x200 - 0x48]; - - // Helper so that the implementation of u_pica_regs can be changed later - uint readPicaReg(uint reg_addr){ - return u_picaRegs[reg_addr - 0x48]; - } - - vec4 abgr8888ToVec4(uint abgr) { - const float scale = 1.0 / 255.0; - - return scale * vec4( - float(abgr & 0xffu), - float((abgr >> 8) & 0xffu), - float((abgr >> 16) & 0xffu), - float(abgr >> 24) - ); - } - - vec3 rotateVec3ByQuaternion(vec3 v, vec4 q){ - vec3 u = q.xyz; - float s = q.w; - return 2.0 * dot(u, v) * u + (s * s - dot(u, u))* v + 2.0 * s * cross(u, v); - } - - // Convert an arbitrary-width floating point literal to an f32 - float decodeFP(uint hex, uint E, uint M){ - uint width = M + E + 1u; - uint bias = 128u - (1u << (E - 1u)); - uint exponent = (hex >> M) & ((1u 
<< E) - 1u); - uint mantissa = hex & ((1u << M) - 1u); - uint sign = (hex >> (E + M)) << 31u; - - if ((hex & ((1u << (width - 1u)) - 1u)) != 0) { - if (exponent == (1u << E) - 1u) exponent = 255u; - else exponent += bias; - hex = sign | (mantissa << (23u - M)) | (exponent << 23u); - } else { - hex = sign; - } - - return uintBitsToFloat(hex); - } - - void main() { - gl_Position = a_coords; - v_colour = a_vertexColour; - - // Flip y axis of UVs because OpenGL uses an inverted y for texture sampling compared to the PICA - v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w); - v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y); - v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y); - v_view = a_view; - - v_normal = normalize(rotateVec3ByQuaternion(vec3(0.0, 0.0, 1.0), a_quaternion)); - v_tangent = normalize(rotateVec3ByQuaternion(vec3(1.0, 0.0, 0.0), a_quaternion)); - v_bitangent = normalize(rotateVec3ByQuaternion(vec3(0.0, 1.0, 0.0), a_quaternion)); - - for (int i = 0; i < 6; i++) { - v_textureEnvColor[i] = abgr8888ToVec4(u_textureEnvColor[i]); - } - - v_textureEnvBufferColor = abgr8888ToVec4(readPicaReg(0xFD)); - - // Parse clipping plane registers - // The plane registers describe a clipping plane in the form of Ax + By + Cz + D = 0 - // With n = (A, B, C) being the normal vector and D being the origin point distance - // Therefore, for the second clipping plane, we can just pass the dot product of the clip vector and the input coordinates to gl_ClipDistance[1] - vec4 clipData = vec4( - decodeFP(readPicaReg(0x48) & 0xffffffu, 7, 16), - decodeFP(readPicaReg(0x49) & 0xffffffu, 7, 16), - decodeFP(readPicaReg(0x4A) & 0xffffffu, 7, 16), - decodeFP(readPicaReg(0x4B) & 0xffffffu, 7, 16) - ); - - // There's also another, always-on clipping plane based on vertex z - gl_ClipDistance[0] = -a_coords.z; - gl_ClipDistance[1] = dot(clipData, a_coords); - } -)"; - -const char* fragmentShader = R"( - #version 410 core - - in vec3 v_tangent; - in vec3 
v_normal; - in vec3 v_bitangent; - in vec4 v_colour; - in vec3 v_texcoord0; - in vec2 v_texcoord1; - in vec3 v_view; - in vec2 v_texcoord2; - flat in vec4 v_textureEnvColor[6]; - flat in vec4 v_textureEnvBufferColor; - - out vec4 fragColour; - - // TEV uniforms - uniform uint u_textureEnvSource[6]; - uniform uint u_textureEnvOperand[6]; - uniform uint u_textureEnvCombiner[6]; - uniform uint u_textureEnvScale[6]; - - // Depth control uniforms - uniform float u_depthScale; - uniform float u_depthOffset; - uniform bool u_depthmapEnable; - - uniform sampler2D u_tex0; - uniform sampler2D u_tex1; - uniform sampler2D u_tex2; - uniform sampler1DArray u_tex_lighting_lut; - - uniform uint u_picaRegs[0x200 - 0x48]; - - // Helper so that the implementation of u_pica_regs can be changed later - uint readPicaReg(uint reg_addr){ - return u_picaRegs[reg_addr - 0x48]; - } - - vec4 tevSources[16]; - vec4 tevNextPreviousBuffer; - bool tevUnimplementedSourceFlag = false; - - // OpenGL ES 1.1 reference pages for TEVs (this is what the PICA200 implements): - // https://registry.khronos.org/OpenGL-Refpages/es1.1/xhtml/glTexEnv.xml - - vec4 tevFetchSource(uint src_id) { - if (src_id >= 6u && src_id < 13u) { - tevUnimplementedSourceFlag = true; - } - - return tevSources[src_id]; - } - - vec4 tevGetColorAndAlphaSource(int tev_id, int src_id) { - vec4 result; - - vec4 colorSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4)) & 15u); - vec4 alphaSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4 + 16)) & 15u); - - uint colorOperand = (u_textureEnvOperand[tev_id] >> (src_id * 4)) & 15u; - uint alphaOperand = (u_textureEnvOperand[tev_id] >> (12 + src_id * 4)) & 7u; - - // TODO: figure out what the undocumented values do - switch (colorOperand) { - case 0u: result.rgb = colorSource.rgb; break; // Source color - case 1u: result.rgb = 1.0 - colorSource.rgb; break; // One minus source color - case 2u: result.rgb = vec3(colorSource.a); break; // Source alpha - case 
3u: result.rgb = vec3(1.0 - colorSource.a); break; // One minus source alpha - case 4u: result.rgb = vec3(colorSource.r); break; // Source red - case 5u: result.rgb = vec3(1.0 - colorSource.r); break; // One minus source red - case 8u: result.rgb = vec3(colorSource.g); break; // Source green - case 9u: result.rgb = vec3(1.0 - colorSource.g); break; // One minus source green - case 12u: result.rgb = vec3(colorSource.b); break; // Source blue - case 13u: result.rgb = vec3(1.0 - colorSource.b); break; // One minus source blue - default: break; - } - - // TODO: figure out what the undocumented values do - switch (alphaOperand) { - case 0u: result.a = alphaSource.a; break; // Source alpha - case 1u: result.a = 1.0 - alphaSource.a; break; // One minus source alpha - case 2u: result.a = alphaSource.r; break; // Source red - case 3u: result.a = 1.0 - alphaSource.r; break; // One minus source red - case 4u: result.a = alphaSource.g; break; // Source green - case 5u: result.a = 1.0 - alphaSource.g; break; // One minus source green - case 6u: result.a = alphaSource.b; break; // Source blue - case 7u: result.a = 1.0 - alphaSource.b; break; // One minus source blue - default: break; - } - - return result; - } - - vec4 tevCalculateCombiner(int tev_id) { - vec4 source0 = tevGetColorAndAlphaSource(tev_id, 0); - vec4 source1 = tevGetColorAndAlphaSource(tev_id, 1); - vec4 source2 = tevGetColorAndAlphaSource(tev_id, 2); - - uint colorCombine = u_textureEnvCombiner[tev_id] & 15u; - uint alphaCombine = (u_textureEnvCombiner[tev_id] >> 16) & 15u; - - vec4 result = vec4(1.0); - - // TODO: figure out what the undocumented values do - switch (colorCombine) { - case 0u: result.rgb = source0.rgb; break; // Replace - case 1u: result.rgb = source0.rgb * source1.rgb; break; // Modulate - case 2u: result.rgb = min(vec3(1.0), source0.rgb + source1.rgb); break; // Add - case 3u: result.rgb = clamp(source0.rgb + source1.rgb - 0.5, 0.0, 1.0); break; // Add signed - case 4u: result.rgb = 
mix(source1.rgb, source0.rgb, source2.rgb); break; // Interpolate - case 5u: result.rgb = max(source0.rgb - source1.rgb, 0.0); break; // Subtract - case 6u: result.rgb = vec3(4.0 * dot(source0.rgb - 0.5 , source1.rgb - 0.5)); break; // Dot3 RGB - case 7u: result = vec4(4.0 * dot(source0.rgb - 0.5 , source1.rgb - 0.5)); break; // Dot3 RGBA - case 8u: result.rgb = min(source0.rgb * source1.rgb + source2.rgb, 1.0); break; // Multiply then add - case 9u: result.rgb = min((source0.rgb + source1.rgb) * source2.rgb, 1.0); break; // Add then multiply - default: break; - } - - if (colorCombine != 7u) { // The color combiner also writes the alpha channel in the "Dot3 RGBA" mode. - // TODO: figure out what the undocumented values do - // TODO: test if the alpha combiner supports all the same modes as the color combiner. - switch (alphaCombine) { - case 0u: result.a = source0.a; break; // Replace - case 1u: result.a = source0.a * source1.a; break; // Modulate - case 2u: result.a = min(1.0, source0.a + source1.a); break; // Add - case 3u: result.a = clamp(source0.a + source1.a - 0.5, 0.0, 1.0); break; // Add signed - case 4u: result.a = mix(source1.a, source0.a, source2.a); break; // Interpolate - case 5u: result.a = max(0.0, source0.a - source1.a); break; // Subtract - case 8u: result.a = min(1.0, source0.a * source1.a + source2.a); break; // Multiply then add - case 9u: result.a = min(1.0, (source0.a + source1.a) * source2.a); break; // Add then multiply - default: break; - } - } - - result.rgb *= float(1 << (u_textureEnvScale[tev_id] & 3u)); - result.a *= float(1 << ((u_textureEnvScale[tev_id] >> 16) & 3u)); - - return result; - } - - #define D0_LUT 0u - #define D1_LUT 1u - #define SP_LUT 2u - #define FR_LUT 3u - #define RB_LUT 4u - #define RG_LUT 5u - #define RR_LUT 6u - - float lutLookup(uint lut, uint light, float value){ - if (lut >= FR_LUT && lut <= RR_LUT) - lut -= 1; - if (lut==SP_LUT) - lut = light + 8; - return texture(u_tex_lighting_lut, vec2(value, lut)).r; - } - 
- vec3 regToColor(uint reg) { - // Normalization scale to convert from [0...255] to [0.0...1.0] - const float scale = 1.0 / 255.0; - - return scale * vec3( - float(bitfieldExtract(reg, 20, 8)), - float(bitfieldExtract(reg, 10, 8)), - float(bitfieldExtract(reg, 00, 8)) - ); - } - - // Convert an arbitrary-width floating point literal to an f32 - float decodeFP(uint hex, uint E, uint M){ - uint width = M + E + 1u; - uint bias = 128u - (1u << (E - 1u)); - uint exponent = (hex >> M) & ((1u << E) - 1u); - uint mantissa = hex & ((1u << M) - 1u); - uint sign = (hex >> (E + M)) << 31u; - - if ((hex & ((1u << (width - 1u)) - 1u)) != 0) { - if (exponent == (1u << E) - 1u) exponent = 255u; - else exponent += bias; - hex = sign | (mantissa << (23u - M)) | (exponent << 23u); - } else { - hex = sign; - } - - return uintBitsToFloat(hex); - } - - // Implements the following algorthm: https://mathb.in/26766 - void calcLighting(out vec4 primary_color, out vec4 secondary_color){ - // Quaternions describe a transformation from surface-local space to eye space. - // In surface-local space, by definition (and up to permutation) the normal vector is (0,0,1), - // the tangent vector is (1,0,0), and the bitangent vector is (0,1,0). 
- vec3 normal = normalize(v_normal ); - vec3 tangent = normalize(v_tangent ); - vec3 bitangent = normalize(v_bitangent); - vec3 view = normalize(v_view); - - uint GPUREG_LIGHTING_ENABLE = readPicaReg(0x008F); - if (bitfieldExtract(GPUREG_LIGHTING_ENABLE, 0, 1) == 0){ - primary_color = secondary_color = vec4(1.0); - return; - } - - uint GPUREG_LIGHTING_AMBIENT = readPicaReg(0x01C0); - uint GPUREG_LIGHTING_NUM_LIGHTS = (readPicaReg(0x01C2) & 0x7u) +1; - uint GPUREG_LIGHTING_LIGHT_PERMUTATION = readPicaReg(0x01D9); - - primary_color = vec4(vec3(0.0),1.0); - secondary_color = vec4(vec3(0.0),1.0); - - primary_color.rgb += regToColor(GPUREG_LIGHTING_AMBIENT); - - uint GPUREG_LIGHTING_LUTINPUT_ABS = readPicaReg(0x01D0); - uint GPUREG_LIGHTING_LUTINPUT_SELECT = readPicaReg(0x01D1); - uint GPUREG_LIGHTING_CONFIG0 = readPicaReg(0x01C3); - uint GPUREG_LIGHTING_CONFIG1 = readPicaReg(0x01C4); - uint GPUREG_LIGHTING_LUTINPUT_SCALE = readPicaReg(0x01D2); - float d[7]; - - bool error_unimpl = false; - - for (uint i = 0; i < GPUREG_LIGHTING_NUM_LIGHTS; i++) { - uint light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION,int(i*3),3); - - uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140 + 0x10 * light_id); - uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141 + 0x10 * light_id); - uint GPUREG_LIGHTi_DIFFUSE = readPicaReg(0x0142 + 0x10 * light_id); - uint GPUREG_LIGHTi_AMBIENT = readPicaReg(0x0143 + 0x10 * light_id); - uint GPUREG_LIGHTi_VECTOR_LOW = readPicaReg(0x0144 + 0x10 * light_id); - uint GPUREG_LIGHTi_VECTOR_HIGH= readPicaReg(0x0145 + 0x10 * light_id); - uint GPUREG_LIGHTi_CONFIG = readPicaReg(0x0149 + 0x10 * light_id); - - vec3 light_vector = normalize(vec3( - decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5, 10), - decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5, 10), - decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5, 10) - )); - - // Positional Light - if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0) - error_unimpl = true; - 
- vec3 half_vector = normalize(normalize(light_vector) + view); - - for (int c = 0; c < 7; c++) { - if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0){ - uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3); - float scale = float(1u << scale_id); - if (scale_id >= 6u) - scale/=256.0; - - uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3); - if (input_id == 0u) d[c] = dot(normal,half_vector); - else if (input_id == 1u) d[c] = dot(view,half_vector); - else if (input_id == 2u) d[c] = dot(normal,view); - else if (input_id == 3u) d[c] = dot(light_vector,normal); - else if (input_id == 4u){ - uint GPUREG_LIGHTi_SPOTDIR_LOW = readPicaReg(0x0146 + 0x10 * light_id); - uint GPUREG_LIGHTi_SPOTDIR_HIGH= readPicaReg(0x0147 + 0x10 * light_id); - vec3 spot_light_vector = normalize(vec3( - decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1, 11), - decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1, 11), - decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1, 11) - )); - d[c] = dot(-light_vector, spot_light_vector); // -L dot P (aka Spotlight aka SP); - } else if (input_id == 5u) { - d[c] = 1.0; // TODO: cos (aka CP); - error_unimpl = true; - } else { - d[c] = 1.0; - } - - d[c] = lutLookup(c, light_id, d[c] * 0.5 + 0.5) * scale; - if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) - d[c] = abs(d[c]); - } else { - d[c] = 1.0; - } - } - - uint lookup_config = bitfieldExtract(GPUREG_LIGHTi_CONFIG,4,4); - if (lookup_config == 0) { - d[D1_LUT] = 0.0; - d[FR_LUT] = 0.0; - d[RG_LUT]= d[RB_LUT] = d[RR_LUT]; - } else if (lookup_config == 1) { - d[D0_LUT] = 0.0; - d[D1_LUT] = 0.0; - d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; - } else if (lookup_config == 2) { - d[FR_LUT] = 0.0; - d[SP_LUT] = 0.0; - d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; - } else if (lookup_config == 3) { - d[SP_LUT] = 0.0; - d[RG_LUT]= d[RB_LUT] = d[RR_LUT] = 1.0; - } else if (lookup_config == 4) { - d[FR_LUT] = 0.0; - } else if 
(lookup_config == 5) { - d[D1_LUT] = 0.0; - } else if (lookup_config == 6) { - d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; - } - - float distance_factor = 1.0; // a - float indirect_factor = 1.0; // fi - float shadow_factor = 1.0; // o - - float NdotL = dot(normal, light_vector); //Li dot N - - // Two sided diffuse - if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0) NdotL = max(0.0, NdotL); - else NdotL = abs(NdotL); - - float light_factor = distance_factor*d[SP_LUT]*indirect_factor*shadow_factor; - - primary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_AMBIENT) + regToColor(GPUREG_LIGHTi_DIFFUSE)*NdotL); - secondary_color.rgb += light_factor * ( - regToColor(GPUREG_LIGHTi_SPECULAR0) * d[D0_LUT] + - regToColor(GPUREG_LIGHTi_SPECULAR1) * d[D1_LUT] * vec3(d[RR_LUT], d[RG_LUT], d[RB_LUT]) - ); - } - uint fresnel_output1 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 2, 1); - uint fresnel_output2 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 3, 1); - - if (fresnel_output1 == 1u) primary_color.a = d[FR_LUT]; - if (fresnel_output2 == 1u) secondary_color.a = d[FR_LUT]; - - if (error_unimpl) { - secondary_color = primary_color = vec4(1.0,0.,1.0,1.0); - } - } - - void main() { - // TODO: what do invalid sources and disabled textures read as? - // And what does the "previous combiner" source read initially? - tevSources[0] = v_colour; // Primary/vertex color - calcLighting(tevSources[1],tevSources[2]); - - uint textureConfig = readPicaReg(0x80); - vec2 tex2UV = (textureConfig & (1u << 13)) != 0u ? 
v_texcoord1 : v_texcoord2; - - if ((textureConfig & 1u) != 0u) tevSources[3] = texture(u_tex0, v_texcoord0.xy); - if ((textureConfig & 2u) != 0u) tevSources[4] = texture(u_tex1, v_texcoord1); - if ((textureConfig & 4u) != 0u) tevSources[5] = texture(u_tex2, tex2UV); - tevSources[13] = vec4(0.0); // Previous buffer - tevSources[15] = vec4(0.0); // Previous combiner - - tevNextPreviousBuffer = v_textureEnvBufferColor; - uint textureEnvUpdateBuffer = readPicaReg(0xE0); - - for (int i = 0; i < 6; i++) { - tevSources[14] = v_textureEnvColor[i]; // Constant color - tevSources[15] = tevCalculateCombiner(i); - tevSources[13] = tevNextPreviousBuffer; - - if (i < 4) { - if ((textureEnvUpdateBuffer & (0x100u << i)) != 0u) { - tevNextPreviousBuffer.rgb = tevSources[15].rgb; - } - - if ((textureEnvUpdateBuffer & (0x1000u << i)) != 0u) { - tevNextPreviousBuffer.a = tevSources[15].a; - } - } - } - - fragColour = tevSources[15]; - - if (tevUnimplementedSourceFlag) { - // fragColour = vec4(1.0, 0.0, 1.0, 1.0); - } - // fragColour.rg = texture(u_tex_lighting_lut,vec2(gl_FragCoord.x/200.,float(int(gl_FragCoord.y/2)%24))).rr; - - - // Get original depth value by converting from [near, far] = [0, 1] to [-1, 1] - // We do this by converting to [0, 2] first and subtracting 1 to go to [-1, 1] - float z_over_w = gl_FragCoord.z * 2.0f - 1.0f; - float depth = z_over_w * u_depthScale + u_depthOffset; - - if (!u_depthmapEnable) // Divide z by w if depthmap enable == 0 (ie using W-buffering) - depth /= gl_FragCoord.w; - - // Write final fragment depth - gl_FragDepth = depth; - - // Perform alpha test - uint alphaControl = readPicaReg(0x104); - if ((alphaControl & 1u) != 0u) { // Check if alpha test is on - uint func = (alphaControl >> 4u) & 7u; - float reference = float((alphaControl >> 8u) & 0xffu) / 255.0; - float alpha = fragColour.a; - - switch (func) { - case 0: discard; // Never pass alpha test - case 1: break; // Always pass alpha test - case 2: // Pass if equal - if (alpha != reference) 
- discard; - break; - case 3: // Pass if not equal - if (alpha == reference) - discard; - break; - case 4: // Pass if less than - if (alpha >= reference) - discard; - break; - case 5: // Pass if less than or equal - if (alpha > reference) - discard; - break; - case 6: // Pass if greater than - if (alpha <= reference) - discard; - break; - case 7: // Pass if greater than or equal - if (alpha < reference) - discard; - break; - } - } - } -)"; - -const char* displayVertexShader = R"( - #version 410 core - out vec2 UV; - - void main() { - const vec4 positions[4] = vec4[]( - vec4(-1.0, 1.0, 1.0, 1.0), // Top-left - vec4(1.0, 1.0, 1.0, 1.0), // Top-right - vec4(-1.0, -1.0, 1.0, 1.0), // Bottom-left - vec4(1.0, -1.0, 1.0, 1.0) // Bottom-right - ); - - // The 3DS displays both screens' framebuffer rotated 90 deg counter clockwise - // So we adjust our texcoords accordingly - const vec2 texcoords[4] = vec2[]( - vec2(1.0, 1.0), // Top-right - vec2(1.0, 0.0), // Bottom-right - vec2(0.0, 1.0), // Top-left - vec2(0.0, 0.0) // Bottom-left - ); - - gl_Position = positions[gl_VertexID]; - UV = texcoords[gl_VertexID]; - } -)"; - -const char* displayFragmentShader = R"( - #version 410 core - in vec2 UV; - out vec4 FragColor; - - uniform sampler2D u_texture; - void main() { - FragColor = texture(u_texture, UV); - } -)"; - -void Renderer::reset() { +void RendererGL::reset() { depthBufferCache.reset(); colourBufferCache.reset(); textureCache.reset(); @@ -592,10 +32,10 @@ void Renderer::reset() { const auto oldProgram = OpenGL::getProgram(); gl.useProgram(triangleProgram); - - oldDepthScale = -1.0; // Default depth scale to -1.0, which is what games typically use - oldDepthOffset = 0.0; // Default depth offset to 0 - oldDepthmapEnable = false; // Enable w buffering + + oldDepthScale = -1.0; // Default depth scale to -1.0, which is what games typically use + oldDepthOffset = 0.0; // Default depth offset to 0 + oldDepthmapEnable = false; // Enable w buffering glUniform1f(depthScaleLoc, 
oldDepthScale); glUniform1f(depthOffsetLoc, oldDepthOffset); @@ -605,10 +45,17 @@ void Renderer::reset() { } } -void Renderer::initGraphicsContext() { - OpenGL::Shader vert(vertexShader, OpenGL::Vertex); - OpenGL::Shader frag(fragmentShader, OpenGL::Fragment); - triangleProgram.create({ vert, frag }); +void RendererGL::initGraphicsContext() { + gl.reset(); + + auto gl_resources = cmrc::RendererGL::get_filesystem(); + + auto vertexShaderSource = gl_resources.open("opengl_vertex_shader.vert"); + auto fragmentShaderSource = gl_resources.open("opengl_fragment_shader.frag"); + + OpenGL::Shader vert({vertexShaderSource.begin(), vertexShaderSource.size()}, OpenGL::Vertex); + OpenGL::Shader frag({fragmentShaderSource.begin(), fragmentShaderSource.size()}, OpenGL::Fragment); + triangleProgram.create({vert, frag}); gl.useProgram(triangleProgram); textureEnvSourceLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvSource"); @@ -628,12 +75,15 @@ void Renderer::initGraphicsContext() { glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex2"), 2); glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex_lighting_lut"), 3); - OpenGL::Shader vertDisplay(displayVertexShader, OpenGL::Vertex); - OpenGL::Shader fragDisplay(displayFragmentShader, OpenGL::Fragment); - displayProgram.create({ vertDisplay, fragDisplay }); + auto displayVertexShaderSource = gl_resources.open("opengl_display.vert"); + auto displayFragmentShaderSource = gl_resources.open("opengl_display.frag"); + + OpenGL::Shader vertDisplay({displayVertexShaderSource.begin(), displayVertexShaderSource.size()}, OpenGL::Vertex); + OpenGL::Shader fragDisplay({displayFragmentShaderSource.begin(), displayFragmentShaderSource.size()}, OpenGL::Fragment); + displayProgram.create({vertDisplay, fragDisplay}); gl.useProgram(displayProgram); - glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object + glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init 
sampler object vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW); gl.bindVBO(vbo); @@ -669,10 +119,10 @@ void Renderer::initGraphicsContext() { dummyVAO.create(); // Create texture and framebuffer for the 3DS screen - const u32 screenTextureWidth = 400; // Top screen is 400 pixels wide, bottom is 320 - const u32 screenTextureHeight = 2 * 240; // Both screens are 240 pixels tall - - glGenTextures(1,&lightLUTTextureArray); + const u32 screenTextureWidth = 400; // Top screen is 400 pixels wide, bottom is 320 + const u32 screenTextureHeight = 2 * 240; // Both screens are 240 pixels tall + + glGenTextures(1, &lightLUTTextureArray); auto prevTexture = OpenGL::getTex2D(); screenTexture.create(screenTextureWidth, screenTextureHeight, GL_RGBA8); @@ -684,8 +134,7 @@ void Renderer::initGraphicsContext() { screenFramebuffer.createWithDrawTexture(screenTexture); screenFramebuffer.bind(OpenGL::DrawAndReadFramebuffer); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) - Helpers::panic("Incomplete framebuffer"); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) Helpers::panic("Incomplete framebuffer"); // TODO: This should not clear the framebuffer contents. It should load them from VRAM. GLint oldViewport[4]; @@ -699,19 +148,32 @@ void Renderer::initGraphicsContext() { } // Set up the OpenGL blending context to match the emulated PICA -void Renderer::setupBlending() { +void RendererGL::setupBlending() { const bool blendingEnabled = (regs[PICA::InternalRegs::ColourOperation] & (1 << 8)) != 0; - + // Map of PICA blending equations to OpenGL blending equations. 
The unused blending equations are equivalent to equation 0 (add) static constexpr std::array blendingEquations = { - GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD + GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD, }; - + // Map of PICA blending funcs to OpenGL blending funcs. Func = 15 is undocumented and stubbed to GL_ONE for now static constexpr std::array blendingFuncs = { - GL_ZERO, GL_ONE, GL_SRC_COLOR, GL_ONE_MINUS_SRC_COLOR, GL_DST_COLOR, GL_ONE_MINUS_DST_COLOR, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, - GL_DST_ALPHA, GL_ONE_MINUS_DST_ALPHA, GL_CONSTANT_COLOR, GL_ONE_MINUS_CONSTANT_COLOR, GL_CONSTANT_ALPHA, GL_ONE_MINUS_CONSTANT_ALPHA, - GL_SRC_ALPHA_SATURATE, GL_ONE + GL_ZERO, + GL_ONE, + GL_SRC_COLOR, + GL_ONE_MINUS_SRC_COLOR, + GL_DST_COLOR, + GL_ONE_MINUS_DST_COLOR, + GL_SRC_ALPHA, + GL_ONE_MINUS_SRC_ALPHA, + GL_DST_ALPHA, + GL_ONE_MINUS_DST_ALPHA, + GL_CONSTANT_COLOR, + GL_ONE_MINUS_CONSTANT_COLOR, + GL_CONSTANT_ALPHA, + GL_ONE_MINUS_CONSTANT_ALPHA, + GL_SRC_ALPHA_SATURATE, + GL_ONE, }; if (!blendingEnabled) { @@ -743,13 +205,12 @@ void Renderer::setupBlending() { } } -void Renderer::setupTextureEnvState() { +void RendererGL::setupTextureEnvState() { // TODO: Only update uniforms when the TEV config changed. Use an UBO potentially. 
static constexpr std::array ioBases = { - PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source, - PICA::InternalRegs::TexEnv2Source, PICA::InternalRegs::TexEnv3Source, - PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source + PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source, PICA::InternalRegs::TexEnv2Source, + PICA::InternalRegs::TexEnv3Source, PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source, }; u32 textureEnvSourceRegs[6]; @@ -775,9 +236,11 @@ void Renderer::setupTextureEnvState() { glUniform1uiv(textureEnvScaleLoc, 6, textureEnvScaleRegs); } -void Renderer::bindTexturesToSlots() { +void RendererGL::bindTexturesToSlots() { static constexpr std::array ioBases = { - PICA::InternalRegs::Tex0BorderColor, PICA::InternalRegs::Tex1BorderColor, PICA::InternalRegs::Tex2BorderColor + PICA::InternalRegs::Tex0BorderColor, + PICA::InternalRegs::Tex1BorderColor, + PICA::InternalRegs::Tex2BorderColor, }; for (int i = 0; i < 3; i++) { @@ -805,13 +268,13 @@ void Renderer::bindTexturesToSlots() { glActiveTexture(GL_TEXTURE0); } -void Renderer::updateLightingLUT() { +void RendererGL::updateLightingLUT() { gpu.lightingLUTDirty = false; - std::array u16_lightinglut; - + std::array u16_lightinglut; + for (int i = 0; i < gpu.lightingLUT.size(); i++) { - uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1); - u16_lightinglut[i] = value * 65535 / 4095; + uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1); + u16_lightinglut[i] = value * 65535 / 4095; } glActiveTexture(GL_TEXTURE0 + 3); @@ -824,19 +287,22 @@ void Renderer::updateLightingLUT() { glActiveTexture(GL_TEXTURE0); } -void Renderer::drawVertices(PICA::PrimType primType, std::span vertices) { +void RendererGL::drawVertices(PICA::PrimType primType, std::span vertices) { // The fourth type is meant to be "Geometry primitive". 
TODO: Find out what that is static constexpr std::array primTypes = { - OpenGL::Triangle, OpenGL::TriangleStrip, OpenGL::TriangleFan, OpenGL::Triangle + OpenGL::Triangle, + OpenGL::TriangleStrip, + OpenGL::TriangleFan, + OpenGL::Triangle, }; - const auto primitiveTopology = primTypes[static_cast(primType)]; + const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); gl.bindVBO(vbo); gl.bindVAO(vao); gl.useProgram(triangleProgram); - OpenGL::enableClipPlane(0); // Clipping plane 0 is always enabled + OpenGL::enableClipPlane(0); // Clipping plane 0 is always enabled if (regs[PICA::InternalRegs::ClipEnable] & 1) { OpenGL::enableClipPlane(1); } @@ -852,9 +318,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span ver const int colourMask = getBits<8, 4>(depthControl); gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8); - static constexpr std::array depthModes = { - GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL - }; + static constexpr std::array depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL}; const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32(); const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32(); @@ -865,7 +329,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span ver oldDepthScale = depthScale; glUniform1f(depthScaleLoc, depthScale); } - + if (oldDepthOffset != depthOffset) { oldDepthOffset = depthOffset; glUniform1f(depthOffsetLoc, depthOffset); @@ -917,7 +381,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span ver constexpr u32 topScreenBuffer = 0x1f000000; constexpr u32 bottomScreenBuffer = 0x1f05dc00; -void Renderer::display() { +void RendererGL::display() { gl.disableScissor(); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); @@ -925,7 +389,7 @@ void Renderer::display() { 
glBlitFramebuffer(0, 0, 400, 480, 0, 0, 400, 480, GL_COLOR_BUFFER_BIT, GL_LINEAR); } -void Renderer::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) { +void RendererGL::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) { return; log("GPU: Clear buffer\nStart: %08X End: %08X\nValue: %08X Control: %08X\n", startAddress, endAddress, value, control); @@ -947,10 +411,10 @@ void Renderer::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 cont OpenGL::clearColor(); } -OpenGL::Framebuffer Renderer::getColourFBO() { - //We construct a colour buffer object and see if our cache has any matching colour buffers in it - // If not, we allocate a texture & FBO for our framebuffer and store it in the cache - ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize.x(), fbSize.y()); +OpenGL::Framebuffer RendererGL::getColourFBO() { + // We construct a colour buffer object and see if our cache has any matching colour buffers in it + // If not, we allocate a texture & FBO for our framebuffer and store it in the cache + ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize[0], fbSize[1]); auto buffer = colourBufferCache.find(sampleBuffer); if (buffer.has_value()) { @@ -960,9 +424,9 @@ OpenGL::Framebuffer Renderer::getColourFBO() { } } -void Renderer::bindDepthBuffer() { +void RendererGL::bindDepthBuffer() { // Similar logic as the getColourFBO function - DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize.x(), fbSize.y()); + DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize[0], fbSize[1]); auto buffer = depthBufferCache.find(sampleBuffer); GLuint tex; @@ -979,14 +443,14 @@ void Renderer::bindDepthBuffer() { glFramebufferTexture2D(GL_FRAMEBUFFER, attachment, GL_TEXTURE_2D, tex, 0); } -OpenGL::Texture Renderer::getTexture(Texture& tex) { +OpenGL::Texture RendererGL::getTexture(Texture& tex) { // Similar logic as the getColourFBO/bindDepthBuffer functions auto buffer = 
textureCache.find(tex); if (buffer.has_value()) { return buffer.value().get().texture; } else { - const void* textureData = gpu.getPointerPhys(tex.location); // Get pointer to the texture data in 3DS memory + const auto textureData = std::span{gpu.getPointerPhys(tex.location), tex.sizeInBytes()}; // Get pointer to the texture data in 3DS memory Texture& newTex = textureCache.add(tex); newTex.decodeTexture(textureData); @@ -994,7 +458,7 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) { } } -void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) { +void RendererGL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) { const u32 inputWidth = inputSize & 0xffff; const u32 inputGap = inputSize >> 16; @@ -1022,21 +486,21 @@ void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 // Hack: Detect whether we are writing to the top or bottom screen by checking output gap and drawing to the proper part of the output texture // We consider output gap == 320 to mean bottom, and anything else to mean top if (outputGap == 320) { - OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport + OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport } else { - OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport + OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport } - OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen + OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen } -void Renderer::screenshot(const std::string& name) { +void RendererGL::screenshot(const std::string& name) { constexpr uint width = 400; constexpr uint height = 2 * 240; std::vector pixels, flippedPixels; - pixels.resize(width * height * 4); - flippedPixels.resize(pixels.size());; + pixels.resize(width * height * 4); + flippedPixels.resize(pixels.size()); OpenGL::bindScreenFramebuffer(); glReadPixels(0, 0, width, height, 
GL_BGRA, GL_UNSIGNED_BYTE, pixels.data()); @@ -1053,4 +517,4 @@ void Renderer::screenshot(const std::string& name) { } stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0); -} \ No newline at end of file +} diff --git a/src/core/renderer_gl/textures.cpp b/src/core/renderer_gl/textures.cpp index 819bf783..9e303fd9 100644 --- a/src/core/renderer_gl/textures.cpp +++ b/src/core/renderer_gl/textures.cpp @@ -112,12 +112,11 @@ u32 Texture::getSwizzledOffset_4bpp(u32 u, u32 v, u32 width) { // Get the texel at position (u, v) // fmt: format of the texture // data: texture data of the texture -u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { +u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, std::span data) { switch (fmt) { case PICA::TextureFmt::RGBA4: { u32 offset = getSwizzledOffset(u, v, size.u(), 2); - auto ptr = static_cast(data); - u16 texel = u16(ptr[offset]) | (u16(ptr[offset + 1]) << 8); + u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8); u8 alpha = Colour::convert4To8Bit(getBits<0, 4, u8>(texel)); u8 b = Colour::convert4To8Bit(getBits<4, 4, u8>(texel)); @@ -128,9 +127,8 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { } case PICA::TextureFmt::RGBA5551: { - u32 offset = getSwizzledOffset(u, v, size.u(), 2); - auto ptr = static_cast(data); - u16 texel = u16(ptr[offset]) | (u16(ptr[offset + 1]) << 8); + const u32 offset = getSwizzledOffset(u, v, size.u(), 2); + const u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8); u8 alpha = getBit<0>(texel) ? 
0xff : 0; u8 b = Colour::convert5To8Bit(getBits<1, 5, u8>(texel)); @@ -141,56 +139,47 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { } case PICA::TextureFmt::RGB565: { - u32 offset = getSwizzledOffset(u, v, size.u(), 2); - auto ptr = static_cast(data); - u16 texel = u16(ptr[offset]) | (u16(ptr[offset + 1]) << 8); + const u32 offset = getSwizzledOffset(u, v, size.u(), 2); + const u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8); - u8 b = Colour::convert5To8Bit(getBits<0, 5, u8>(texel)); - u8 g = Colour::convert6To8Bit(getBits<5, 6, u8>(texel)); - u8 r = Colour::convert5To8Bit(getBits<11, 5, u8>(texel)); + const u8 b = Colour::convert5To8Bit(getBits<0, 5, u8>(texel)); + const u8 g = Colour::convert6To8Bit(getBits<5, 6, u8>(texel)); + const u8 r = Colour::convert5To8Bit(getBits<11, 5, u8>(texel)); return (0xff << 24) | (b << 16) | (g << 8) | r; } case PICA::TextureFmt::RG8: { u32 offset = getSwizzledOffset(u, v, size.u(), 2); - auto ptr = static_cast(data); - constexpr u8 b = 0; - u8 g = ptr[offset]; - u8 r = ptr[offset + 1]; + const u8 g = data[offset]; + const u8 r = data[offset + 1]; return (0xff << 24) | (b << 16) | (g << 8) | r; } case PICA::TextureFmt::RGB8: { - u32 offset = getSwizzledOffset(u, v, size.u(), 3); - auto ptr = static_cast(data); - - u8 b = ptr[offset]; - u8 g = ptr[offset + 1]; - u8 r = ptr[offset + 2]; + const u32 offset = getSwizzledOffset(u, v, size.u(), 3); + const u8 b = data[offset]; + const u8 g = data[offset + 1]; + const u8 r = data[offset + 2]; return (0xff << 24) | (b << 16) | (g << 8) | r; } case PICA::TextureFmt::RGBA8: { - u32 offset = getSwizzledOffset(u, v, size.u(), 4); - auto ptr = static_cast(data); - - u8 alpha = ptr[offset]; - u8 b = ptr[offset + 1]; - u8 g = ptr[offset + 2]; - u8 r = ptr[offset + 3]; + const u32 offset = getSwizzledOffset(u, v, size.u(), 4); + const u8 alpha = data[offset]; + const u8 b = data[offset + 1]; + const u8 g = data[offset + 2]; + const u8 r = 
data[offset + 3]; return (alpha << 24) | (b << 16) | (g << 8) | r; } case PICA::TextureFmt::IA4: { - u32 offset = getSwizzledOffset(u, v, size.u(), 1); - auto ptr = static_cast(data); - - const u8 texel = ptr[offset]; + const u32 offset = getSwizzledOffset(u, v, size.u(), 1); + const u8 texel = data[offset]; const u8 alpha = Colour::convert4To8Bit(texel & 0xf); const u8 intensity = Colour::convert4To8Bit(texel >> 4); @@ -199,11 +188,10 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { } case PICA::TextureFmt::A4: { - u32 offset = getSwizzledOffset_4bpp(u, v, size.u()); - auto ptr = static_cast(data); + const u32 offset = getSwizzledOffset_4bpp(u, v, size.u()); // For odd U coordinates, grab the top 4 bits, and the low 4 bits for even coordinates - u8 alpha = ptr[offset] >> ((u % 2) ? 4 : 0); + u8 alpha = data[offset] >> ((u % 2) ? 4 : 0); alpha = Colour::convert4To8Bit(getBits<0, 4>(alpha)); // A8 sets RGB to 0 @@ -212,8 +200,7 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { case PICA::TextureFmt::A8: { u32 offset = getSwizzledOffset(u, v, size.u(), 1); - auto ptr = static_cast(data); - const u8 alpha = ptr[offset]; + const u8 alpha = data[offset]; // A8 sets RGB to 0 return (alpha << 24) | (0 << 16) | (0 << 8) | 0; @@ -221,10 +208,9 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { case PICA::TextureFmt::I4: { u32 offset = getSwizzledOffset_4bpp(u, v, size.u()); - auto ptr = static_cast(data); // For odd U coordinates, grab the top 4 bits, and the low 4 bits for even coordinates - u8 intensity = ptr[offset] >> ((u % 2) ? 4 : 0); + u8 intensity = data[offset] >> ((u % 2) ? 
4 : 0); intensity = Colour::convert4To8Bit(getBits<0, 4>(intensity)); // Intensity formats just copy the intensity value to every colour channel @@ -233,8 +219,7 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { case PICA::TextureFmt::I8: { u32 offset = getSwizzledOffset(u, v, size.u(), 1); - auto ptr = static_cast(data); - const u8 intensity = ptr[offset]; + const u8 intensity = data[offset]; // Intensity formats just copy the intensity value to every colour channel return (0xff << 24) | (intensity << 16) | (intensity << 8) | intensity; @@ -242,11 +227,10 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { case PICA::TextureFmt::IA8: { u32 offset = getSwizzledOffset(u, v, size.u(), 2); - auto ptr = static_cast(data); // Same as I8 except each pixel gets its own alpha value too - const u8 alpha = ptr[offset]; - const u8 intensity = ptr[offset + 1]; + const u8 alpha = data[offset]; + const u8 intensity = data[offset + 1]; return (alpha << 24) | (intensity << 16) | (intensity << 8) | intensity; } @@ -258,7 +242,7 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { } } -void Texture::decodeTexture(const void* data) { +void Texture::decodeTexture(std::span data) { std::vector decoded; decoded.reserve(u64(size.u()) * u64(size.v())); @@ -272,4 +256,4 @@ void Texture::decodeTexture(const void* data) { texture.bind(); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, size.u(), size.v(), GL_RGBA, GL_UNSIGNED_BYTE, decoded.data()); -} \ No newline at end of file +} diff --git a/src/core/renderer_null/renderer_null.cpp b/src/core/renderer_null/renderer_null.cpp new file mode 100644 index 00000000..9df2ddeb --- /dev/null +++ b/src/core/renderer_null/renderer_null.cpp @@ -0,0 +1,12 @@ +#include "renderer_null/renderer_null.hpp" + +RendererNull::RendererNull(GPU& gpu, const std::array& internalRegs) : Renderer(gpu, internalRegs) {} +RendererNull::~RendererNull() {} + +void 
RendererNull::reset() {} +void RendererNull::display() {} +void RendererNull::initGraphicsContext() {} +void RendererNull::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {} +void RendererNull::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {} +void RendererNull::drawVertices(PICA::PrimType primType, std::span vertices) {} +void RendererNull::screenshot(const std::string& name) {} \ No newline at end of file diff --git a/src/emulator.cpp b/src/emulator.cpp index 0ae60543..ce42d273 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -1,6 +1,8 @@ #include "emulator.hpp" -#include +#ifdef PANDA3DS_ENABLE_OPENGL +#include +#endif #ifdef _WIN32 #include @@ -12,7 +14,9 @@ __declspec(dllexport) DWORD AmdPowerXpressRequestHighPerformance = 1; } #endif -Emulator::Emulator() : kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory, gl, config), memory(cpu.getTicksRef()) { +Emulator::Emulator() + : config(std::filesystem::current_path() / "config.toml"), kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory, config), + memory(cpu.getTicksRef()), cheats(memory, kernel.getServiceManager().getHID()) { if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) < 0) { Helpers::panic("Failed to initialize SDL2"); } @@ -23,25 +27,29 @@ Emulator::Emulator() : kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory Helpers::warn("Failed to initialize SDL2 GameController: %s", SDL_GetError()); } - // Request OpenGL 4.1 Core (Max available on MacOS) - // MacOS gets mad if we don't explicitly demand a core profile - SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); - SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4); - SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 1); - window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, width, height, SDL_WINDOW_OPENGL); +#ifdef PANDA3DS_ENABLE_OPENGL + if (config.rendererType == RendererType::OpenGL) { + // Request OpenGL 
4.1 Core (Max available on MacOS) + // MacOS gets mad if we don't explicitly demand a core profile + SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 1); + window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, width, height, SDL_WINDOW_OPENGL); - if (window == nullptr) { - Helpers::panic("Window creation failed: %s", SDL_GetError()); - } + if (window == nullptr) { + Helpers::panic("Window creation failed: %s", SDL_GetError()); + } - glContext = SDL_GL_CreateContext(window); - if (glContext == nullptr) { - Helpers::panic("OpenGL context creation failed: %s", SDL_GetError()); - } + glContext = SDL_GL_CreateContext(window); + if (glContext == nullptr) { + Helpers::panic("OpenGL context creation failed: %s", SDL_GetError()); + } - if (!gladLoadGL(reinterpret_cast(SDL_GL_GetProcAddress))) { - Helpers::panic("OpenGL init failed: %s", SDL_GetError()); + if (!gladLoadGL(reinterpret_cast(SDL_GL_GetProcAddress))) { + Helpers::panic("OpenGL init failed: %s", SDL_GetError()); + } } +#endif if (SDL_WasInit(SDL_INIT_GAMECONTROLLER)) { gameController = SDL_GameControllerOpen(0); @@ -52,7 +60,6 @@ Emulator::Emulator() : kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory } } - config.load(std::filesystem::current_path() / "config.toml"); reset(ReloadOption::NoReload); } @@ -69,6 +76,12 @@ void Emulator::reset(ReloadOption reload) { // Otherwise resetting the kernel or cpu might nuke them cpu.setReg(13, VirtualAddrs::StackTop); // Set initial SP + // We're resetting without reloading the ROM, so yeet cheats + if (reload == ReloadOption::NoReload) { + haveCheats = false; + cheats.reset(); + } + // If a ROM is active and we reset, with the reload option enabled then reload it. 
// This is necessary to set up stack, executable memory, .data/.rodata/.bss all over again if (reload == ReloadOption::Reload && romType != ROMType::None && romPath.has_value()) { @@ -91,19 +104,8 @@ void Emulator::run() { #endif while (running) { - ServiceManager& srv = kernel.getServiceManager(); - - if (romType != ROMType::None) { -#ifdef PANDA3DS_ENABLE_HTTP_SERVER - pollHttpServer(); -#endif - runFrame(); // Run 1 frame of instructions - gpu.display(); // Display graphics - - // Send VBlank interrupts - srv.sendGPUInterrupt(GPUInterrupt::VBlank0); - srv.sendGPUInterrupt(GPUInterrupt::VBlank1); - } + runFrame(); + HIDService& hid = kernel.getServiceManager().getHID(); SDL_Event event; while (SDL_PollEvent(&event)) { @@ -119,41 +121,41 @@ void Emulator::run() { if (romType == ROMType::None) break; switch (event.key.keysym.sym) { - case SDLK_l: srv.pressKey(Keys::A); break; - case SDLK_k: srv.pressKey(Keys::B); break; - case SDLK_o: srv.pressKey(Keys::X); break; - case SDLK_i: srv.pressKey(Keys::Y); break; + case SDLK_l: hid.pressKey(Keys::A); break; + case SDLK_k: hid.pressKey(Keys::B); break; + case SDLK_o: hid.pressKey(Keys::X); break; + case SDLK_i: hid.pressKey(Keys::Y); break; - case SDLK_q: srv.pressKey(Keys::L); break; - case SDLK_p: srv.pressKey(Keys::R); break; + case SDLK_q: hid.pressKey(Keys::L); break; + case SDLK_p: hid.pressKey(Keys::R); break; - case SDLK_RIGHT: srv.pressKey(Keys::Right); break; - case SDLK_LEFT: srv.pressKey(Keys::Left); break; - case SDLK_UP: srv.pressKey(Keys::Up); break; - case SDLK_DOWN: srv.pressKey(Keys::Down); break; + case SDLK_RIGHT: hid.pressKey(Keys::Right); break; + case SDLK_LEFT: hid.pressKey(Keys::Left); break; + case SDLK_UP: hid.pressKey(Keys::Up); break; + case SDLK_DOWN: hid.pressKey(Keys::Down); break; case SDLK_w: - srv.setCirclepadY(0x9C); + hid.setCirclepadY(0x9C); keyboardAnalogY = true; break; case SDLK_a: - srv.setCirclepadX(-0x9C); + hid.setCirclepadX(-0x9C); keyboardAnalogX = true; break; case SDLK_s: 
- srv.setCirclepadY(-0x9C); + hid.setCirclepadY(-0x9C); keyboardAnalogY = true; break; case SDLK_d: - srv.setCirclepadX(0x9C); + hid.setCirclepadX(0x9C); keyboardAnalogX = true; break; - case SDLK_RETURN: srv.pressKey(Keys::Start); break; - case SDLK_BACKSPACE: srv.pressKey(Keys::Select); break; + case SDLK_RETURN: hid.pressKey(Keys::Start); break; + case SDLK_BACKSPACE: hid.pressKey(Keys::Select); break; } break; @@ -161,34 +163,34 @@ void Emulator::run() { if (romType == ROMType::None) break; switch (event.key.keysym.sym) { - case SDLK_l: srv.releaseKey(Keys::A); break; - case SDLK_k: srv.releaseKey(Keys::B); break; - case SDLK_o: srv.releaseKey(Keys::X); break; - case SDLK_i: srv.releaseKey(Keys::Y); break; + case SDLK_l: hid.releaseKey(Keys::A); break; + case SDLK_k: hid.releaseKey(Keys::B); break; + case SDLK_o: hid.releaseKey(Keys::X); break; + case SDLK_i: hid.releaseKey(Keys::Y); break; - case SDLK_q: srv.releaseKey(Keys::L); break; - case SDLK_p: srv.releaseKey(Keys::R); break; + case SDLK_q: hid.releaseKey(Keys::L); break; + case SDLK_p: hid.releaseKey(Keys::R); break; - case SDLK_RIGHT: srv.releaseKey(Keys::Right); break; - case SDLK_LEFT: srv.releaseKey(Keys::Left); break; - case SDLK_UP: srv.releaseKey(Keys::Up); break; - case SDLK_DOWN: srv.releaseKey(Keys::Down); break; + case SDLK_RIGHT: hid.releaseKey(Keys::Right); break; + case SDLK_LEFT: hid.releaseKey(Keys::Left); break; + case SDLK_UP: hid.releaseKey(Keys::Up); break; + case SDLK_DOWN: hid.releaseKey(Keys::Down); break; // Err this is probably not ideal case SDLK_w: case SDLK_s: - srv.setCirclepadY(0); + hid.setCirclepadY(0); keyboardAnalogY = false; break; case SDLK_a: case SDLK_d: - srv.setCirclepadX(0); + hid.setCirclepadX(0); keyboardAnalogX = false; break; - case SDLK_RETURN: srv.releaseKey(Keys::Start); break; - case SDLK_BACKSPACE: srv.releaseKey(Keys::Select); break; + case SDLK_RETURN: hid.releaseKey(Keys::Start); break; + case SDLK_BACKSPACE: hid.releaseKey(Keys::Select); break; } 
break; @@ -205,9 +207,9 @@ void Emulator::run() { u16 x_converted = static_cast(x) - 40; u16 y_converted = static_cast(y) - 240; - srv.setTouchScreenPress(x_converted, y_converted); + hid.setTouchScreenPress(x_converted, y_converted); } else { - srv.releaseTouchScreen(); + hid.releaseTouchScreen(); } } else if (event.button.button == SDL_BUTTON_RIGHT) { holdingRightClick = true; @@ -219,7 +221,7 @@ void Emulator::run() { if (romType == ROMType::None) break; if (event.button.button == SDL_BUTTON_LEFT) { - srv.releaseTouchScreen(); + hid.releaseTouchScreen(); } else if (event.button.button == SDL_BUTTON_RIGHT) { holdingRightClick = false; } @@ -262,9 +264,9 @@ void Emulator::run() { if (key != 0) { if (event.cbutton.state == SDL_PRESSED) { - srv.pressKey(key); + hid.pressKey(key); } else { - srv.releaseKey(key); + hid.releaseKey(key); } } break; @@ -283,8 +285,8 @@ void Emulator::run() { // So up until then, we will set the gyroscope euler angles to fixed values based on the direction of the relative motion const s32 roll = motionX > 0 ? 0x7f : -0x7f; const s32 pitch = motionY > 0 ? 
0x7f : -0x7f; - srv.setRoll(roll); - srv.setPitch(pitch); + hid.setRoll(roll); + hid.setPitch(pitch); break; } @@ -311,19 +313,19 @@ void Emulator::run() { // Avoid overriding the keyboard's circlepad input if (abs(stickX) < deadzone && !keyboardAnalogX) { - srv.setCirclepadX(0); + hid.setCirclepadX(0); } else { - srv.setCirclepadX(stickX / div); + hid.setCirclepadX(stickX / div); } if (abs(stickY) < deadzone && !keyboardAnalogY) { - srv.setCirclepadY(0); + hid.setCirclepadY(0); } else { - srv.setCirclepadY(-(stickY / div)); + hid.setCirclepadY(-(stickY / div)); } } - srv.updateInputs(cpu.getTicks()); + hid.updateInputs(cpu.getTicks()); } // Update inputs in the HID module @@ -331,7 +333,24 @@ void Emulator::run() { } } -void Emulator::runFrame() { cpu.runFrame(); } +void Emulator::runFrame() { + if (romType != ROMType::None) { +#ifdef PANDA3DS_ENABLE_HTTP_SERVER + pollHttpServer(); +#endif + cpu.runFrame(); // Run 1 frame of instructions + gpu.display(); // Display graphics + + // Send VBlank interrupts + ServiceManager& srv = kernel.getServiceManager(); + srv.sendGPUInterrupt(GPUInterrupt::VBlank0); + srv.sendGPUInterrupt(GPUInterrupt::VBlank1); + + if (haveCheats) [[unlikely]] { + cheats.run(); + } + } +} bool Emulator::loadROM(const std::filesystem::path& path) { // Reset the emulator if we've already loaded a ROM @@ -427,15 +446,13 @@ bool Emulator::loadELF(std::ifstream& file) { } // Reset our graphics context and initialize the GPU's graphics context -void Emulator::initGraphicsContext() { - gl.reset(); // TODO (For when we have multiple backends): Only do this if we are using OpenGL - gpu.initGraphicsContext(); -} +void Emulator::initGraphicsContext() { gpu.initGraphicsContext(); } #ifdef PANDA3DS_ENABLE_HTTP_SERVER void Emulator::pollHttpServer() { std::scoped_lock lock(httpServer.actionMutex); - ServiceManager& srv = kernel.getServiceManager(); + + HIDService& hid = kernel.getServiceManager().getHID(); if (httpServer.pendingAction) { switch 
(httpServer.action) { @@ -443,14 +460,14 @@ void Emulator::pollHttpServer() { case HttpAction::PressKey: if (httpServer.pendingKey != 0) { - srv.pressKey(httpServer.pendingKey); + hid.pressKey(httpServer.pendingKey); httpServer.pendingKey = 0; } break; case HttpAction::ReleaseKey: if (httpServer.pendingKey != 0) { - srv.releaseKey(httpServer.pendingKey); + hid.releaseKey(httpServer.pendingKey); httpServer.pendingKey = 0; } break; diff --git a/src/host_shaders/opengl_display.frag b/src/host_shaders/opengl_display.frag new file mode 100644 index 00000000..612671c8 --- /dev/null +++ b/src/host_shaders/opengl_display.frag @@ -0,0 +1,8 @@ +#version 410 core +in vec2 UV; +out vec4 FragColor; + +uniform sampler2D u_texture; +void main() { + FragColor = texture(u_texture, UV); +} \ No newline at end of file diff --git a/src/host_shaders/opengl_display.vert b/src/host_shaders/opengl_display.vert new file mode 100644 index 00000000..990e2f80 --- /dev/null +++ b/src/host_shaders/opengl_display.vert @@ -0,0 +1,23 @@ +#version 410 core +out vec2 UV; + +void main() { + const vec4 positions[4] = vec4[]( + vec4(-1.0, 1.0, 1.0, 1.0), // Top-left + vec4(1.0, 1.0, 1.0, 1.0), // Top-right + vec4(-1.0, -1.0, 1.0, 1.0), // Bottom-left + vec4(1.0, -1.0, 1.0, 1.0) // Bottom-right + ); + + // The 3DS displays both screens' framebuffer rotated 90 deg counter clockwise + // So we adjust our texcoords accordingly + const vec2 texcoords[4] = vec2[]( + vec2(1.0, 1.0), // Top-right + vec2(1.0, 0.0), // Bottom-right + vec2(0.0, 1.0), // Top-left + vec2(0.0, 0.0) // Bottom-left + ); + + gl_Position = positions[gl_VertexID]; + UV = texcoords[gl_VertexID]; +} \ No newline at end of file diff --git a/src/host_shaders/opengl_fragment_shader.frag b/src/host_shaders/opengl_fragment_shader.frag new file mode 100644 index 00000000..f6461094 --- /dev/null +++ b/src/host_shaders/opengl_fragment_shader.frag @@ -0,0 +1,409 @@ +#version 410 core + +in vec3 v_tangent; +in vec3 v_normal; +in vec3 v_bitangent; 
+in vec4 v_colour; +in vec3 v_texcoord0; +in vec2 v_texcoord1; +in vec3 v_view; +in vec2 v_texcoord2; +flat in vec4 v_textureEnvColor[6]; +flat in vec4 v_textureEnvBufferColor; + +out vec4 fragColour; + +// TEV uniforms +uniform uint u_textureEnvSource[6]; +uniform uint u_textureEnvOperand[6]; +uniform uint u_textureEnvCombiner[6]; +uniform uint u_textureEnvScale[6]; + +// Depth control uniforms +uniform float u_depthScale; +uniform float u_depthOffset; +uniform bool u_depthmapEnable; + +uniform sampler2D u_tex0; +uniform sampler2D u_tex1; +uniform sampler2D u_tex2; +uniform sampler1DArray u_tex_lighting_lut; + +uniform uint u_picaRegs[0x200 - 0x48]; + +// Helper so that the implementation of u_pica_regs can be changed later +uint readPicaReg(uint reg_addr) { return u_picaRegs[reg_addr - 0x48]; } + +vec4 tevSources[16]; +vec4 tevNextPreviousBuffer; +bool tevUnimplementedSourceFlag = false; + +// OpenGL ES 1.1 reference pages for TEVs (this is what the PICA200 implements): +// https://registry.khronos.org/OpenGL-Refpages/es1.1/xhtml/glTexEnv.xml + +vec4 tevFetchSource(uint src_id) { + if (src_id >= 6u && src_id < 13u) { + tevUnimplementedSourceFlag = true; + } + + return tevSources[src_id]; +} + +vec4 tevGetColorAndAlphaSource(int tev_id, int src_id) { + vec4 result; + + vec4 colorSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4)) & 15u); + vec4 alphaSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4 + 16)) & 15u); + + uint colorOperand = (u_textureEnvOperand[tev_id] >> (src_id * 4)) & 15u; + uint alphaOperand = (u_textureEnvOperand[tev_id] >> (12 + src_id * 4)) & 7u; + + // TODO: figure out what the undocumented values do + switch (colorOperand) { + case 0u: result.rgb = colorSource.rgb; break; // Source color + case 1u: result.rgb = 1.0 - colorSource.rgb; break; // One minus source color + case 2u: result.rgb = vec3(colorSource.a); break; // Source alpha + case 3u: result.rgb = vec3(1.0 - colorSource.a); break; // One minus source 
alpha + case 4u: result.rgb = vec3(colorSource.r); break; // Source red + case 5u: result.rgb = vec3(1.0 - colorSource.r); break; // One minus source red + case 8u: result.rgb = vec3(colorSource.g); break; // Source green + case 9u: result.rgb = vec3(1.0 - colorSource.g); break; // One minus source green + case 12u: result.rgb = vec3(colorSource.b); break; // Source blue + case 13u: result.rgb = vec3(1.0 - colorSource.b); break; // One minus source blue + default: break; + } + + // TODO: figure out what the undocumented values do + switch (alphaOperand) { + case 0u: result.a = alphaSource.a; break; // Source alpha + case 1u: result.a = 1.0 - alphaSource.a; break; // One minus source alpha + case 2u: result.a = alphaSource.r; break; // Source red + case 3u: result.a = 1.0 - alphaSource.r; break; // One minus source red + case 4u: result.a = alphaSource.g; break; // Source green + case 5u: result.a = 1.0 - alphaSource.g; break; // One minus source green + case 6u: result.a = alphaSource.b; break; // Source blue + case 7u: result.a = 1.0 - alphaSource.b; break; // One minus source blue + default: break; + } + + return result; +} + +vec4 tevCalculateCombiner(int tev_id) { + vec4 source0 = tevGetColorAndAlphaSource(tev_id, 0); + vec4 source1 = tevGetColorAndAlphaSource(tev_id, 1); + vec4 source2 = tevGetColorAndAlphaSource(tev_id, 2); + + uint colorCombine = u_textureEnvCombiner[tev_id] & 15u; + uint alphaCombine = (u_textureEnvCombiner[tev_id] >> 16) & 15u; + + vec4 result = vec4(1.0); + + // TODO: figure out what the undocumented values do + switch (colorCombine) { + case 0u: result.rgb = source0.rgb; break; // Replace + case 1u: result.rgb = source0.rgb * source1.rgb; break; // Modulate + case 2u: result.rgb = min(vec3(1.0), source0.rgb + source1.rgb); break; // Add + case 3u: result.rgb = clamp(source0.rgb + source1.rgb - 0.5, 0.0, 1.0); break; // Add signed + case 4u: result.rgb = mix(source1.rgb, source0.rgb, source2.rgb); break; // Interpolate + case 5u: 
result.rgb = max(source0.rgb - source1.rgb, 0.0); break; // Subtract + case 6u: result.rgb = vec3(4.0 * dot(source0.rgb - 0.5, source1.rgb - 0.5)); break; // Dot3 RGB + case 7u: result = vec4(4.0 * dot(source0.rgb - 0.5, source1.rgb - 0.5)); break; // Dot3 RGBA + case 8u: result.rgb = min(source0.rgb * source1.rgb + source2.rgb, 1.0); break; // Multiply then add + case 9u: result.rgb = min((source0.rgb + source1.rgb) * source2.rgb, 1.0); break; // Add then multiply + default: break; + } + + if (colorCombine != 7u) { // The color combiner also writes the alpha channel in the "Dot3 RGBA" mode. + // TODO: figure out what the undocumented values do + // TODO: test if the alpha combiner supports all the same modes as the color combiner. + switch (alphaCombine) { + case 0u: result.a = source0.a; break; // Replace + case 1u: result.a = source0.a * source1.a; break; // Modulate + case 2u: result.a = min(1.0, source0.a + source1.a); break; // Add + case 3u: result.a = clamp(source0.a + source1.a - 0.5, 0.0, 1.0); break; // Add signed + case 4u: result.a = mix(source1.a, source0.a, source2.a); break; // Interpolate + case 5u: result.a = max(0.0, source0.a - source1.a); break; // Subtract + case 8u: result.a = min(1.0, source0.a * source1.a + source2.a); break; // Multiply then add + case 9u: result.a = min(1.0, (source0.a + source1.a) * source2.a); break; // Add then multiply + default: break; + } + } + + result.rgb *= float(1 << (u_textureEnvScale[tev_id] & 3u)); + result.a *= float(1 << ((u_textureEnvScale[tev_id] >> 16) & 3u)); + + return result; +} + +#define D0_LUT 0u +#define D1_LUT 1u +#define SP_LUT 2u +#define FR_LUT 3u +#define RB_LUT 4u +#define RG_LUT 5u +#define RR_LUT 6u + +float lutLookup(uint lut, uint light, float value) { + if (lut >= FR_LUT && lut <= RR_LUT) lut -= 1; + if (lut == SP_LUT) lut = light + 8; + return texture(u_tex_lighting_lut, vec2(value, lut)).r; +} + +vec3 regToColor(uint reg) { + // Normalization scale to convert from [0...255] to 
[0.0...1.0] + const float scale = 1.0 / 255.0; + + return scale * vec3(float(bitfieldExtract(reg, 20, 8)), float(bitfieldExtract(reg, 10, 8)), float(bitfieldExtract(reg, 00, 8))); +} + +// Convert an arbitrary-width floating point literal to an f32 +float decodeFP(uint hex, uint E, uint M) { + uint width = M + E + 1u; + uint bias = 128u - (1u << (E - 1u)); + uint exponent = (hex >> M) & ((1u << E) - 1u); + uint mantissa = hex & ((1u << M) - 1u); + uint sign = (hex >> (E + M)) << 31u; + + if ((hex & ((1u << (width - 1u)) - 1u)) != 0) { + if (exponent == (1u << E) - 1u) + exponent = 255u; + else + exponent += bias; + hex = sign | (mantissa << (23u - M)) | (exponent << 23u); + } else { + hex = sign; + } + + return uintBitsToFloat(hex); +} + +// Implements the following algorithm: https://mathb.in/26766 +void calcLighting(out vec4 primary_color, out vec4 secondary_color) { + // Quaternions describe a transformation from surface-local space to eye space. + // In surface-local space, by definition (and up to permutation) the normal vector is (0,0,1), + // the tangent vector is (1,0,0), and the bitangent vector is (0,1,0). 
+ vec3 normal = normalize(v_normal); + vec3 tangent = normalize(v_tangent); + vec3 bitangent = normalize(v_bitangent); + vec3 view = normalize(v_view); + + uint GPUREG_LIGHTING_ENABLE = readPicaReg(0x008F); + if (bitfieldExtract(GPUREG_LIGHTING_ENABLE, 0, 1) == 0) { + primary_color = secondary_color = vec4(1.0); + return; + } + + uint GPUREG_LIGHTING_AMBIENT = readPicaReg(0x01C0); + uint GPUREG_LIGHTING_NUM_LIGHTS = (readPicaReg(0x01C2) & 0x7u) + 1; + uint GPUREG_LIGHTING_LIGHT_PERMUTATION = readPicaReg(0x01D9); + + primary_color = vec4(vec3(0.0), 1.0); + secondary_color = vec4(vec3(0.0), 1.0); + + primary_color.rgb += regToColor(GPUREG_LIGHTING_AMBIENT); + + uint GPUREG_LIGHTING_LUTINPUT_ABS = readPicaReg(0x01D0); + uint GPUREG_LIGHTING_LUTINPUT_SELECT = readPicaReg(0x01D1); + uint GPUREG_LIGHTING_CONFIG0 = readPicaReg(0x01C3); + uint GPUREG_LIGHTING_CONFIG1 = readPicaReg(0x01C4); + uint GPUREG_LIGHTING_LUTINPUT_SCALE = readPicaReg(0x01D2); + float d[7]; + + bool error_unimpl = false; + + for (uint i = 0; i < GPUREG_LIGHTING_NUM_LIGHTS; i++) { + uint light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i * 3), 3); + + uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140 + 0x10 * light_id); + uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141 + 0x10 * light_id); + uint GPUREG_LIGHTi_DIFFUSE = readPicaReg(0x0142 + 0x10 * light_id); + uint GPUREG_LIGHTi_AMBIENT = readPicaReg(0x0143 + 0x10 * light_id); + uint GPUREG_LIGHTi_VECTOR_LOW = readPicaReg(0x0144 + 0x10 * light_id); + uint GPUREG_LIGHTi_VECTOR_HIGH = readPicaReg(0x0145 + 0x10 * light_id); + uint GPUREG_LIGHTi_CONFIG = readPicaReg(0x0149 + 0x10 * light_id); + + vec3 light_vector = normalize(vec3( + decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5, 10), decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5, 10), + decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5, 10) + )); + + // Positional Light + if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0) error_unimpl = 
true; + + vec3 half_vector = normalize(normalize(light_vector) + view); + + for (int c = 0; c < 7; c++) { + if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0) { + uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3); + float scale = float(1u << scale_id); + if (scale_id >= 6u) scale /= 256.0; + + uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3); + if (input_id == 0u) + d[c] = dot(normal, half_vector); + else if (input_id == 1u) + d[c] = dot(view, half_vector); + else if (input_id == 2u) + d[c] = dot(normal, view); + else if (input_id == 3u) + d[c] = dot(light_vector, normal); + else if (input_id == 4u) { + uint GPUREG_LIGHTi_SPOTDIR_LOW = readPicaReg(0x0146 + 0x10 * light_id); + uint GPUREG_LIGHTi_SPOTDIR_HIGH = readPicaReg(0x0147 + 0x10 * light_id); + vec3 spot_light_vector = normalize(vec3( + decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1, 11), + decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1, 11), + decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1, 11) + )); + d[c] = dot(-light_vector, spot_light_vector); // -L dot P (aka Spotlight aka SP); + } else if (input_id == 5u) { + d[c] = 1.0; // TODO: cos (aka CP); + error_unimpl = true; + } else { + d[c] = 1.0; + } + + d[c] = lutLookup(c, light_id, d[c] * 0.5 + 0.5) * scale; + if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) d[c] = abs(d[c]); + } else { + d[c] = 1.0; + } + } + + uint lookup_config = bitfieldExtract(GPUREG_LIGHTi_CONFIG, 4, 4); + if (lookup_config == 0) { + d[D1_LUT] = 0.0; + d[FR_LUT] = 0.0; + d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; + } else if (lookup_config == 1) { + d[D0_LUT] = 0.0; + d[D1_LUT] = 0.0; + d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; + } else if (lookup_config == 2) { + d[FR_LUT] = 0.0; + d[SP_LUT] = 0.0; + d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; + } else if (lookup_config == 3) { + d[SP_LUT] = 0.0; + d[RG_LUT] = d[RB_LUT] = d[RR_LUT] = 1.0; + } else if (lookup_config == 4) { + 
d[FR_LUT] = 0.0; + } else if (lookup_config == 5) { + d[D1_LUT] = 0.0; + } else if (lookup_config == 6) { + d[RG_LUT] = d[RB_LUT] = d[RR_LUT]; + } + + float distance_factor = 1.0; // a + float indirect_factor = 1.0; // fi + float shadow_factor = 1.0; // o + + float NdotL = dot(normal, light_vector); // Li dot N + + // Two sided diffuse + if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0) + NdotL = max(0.0, NdotL); + else + NdotL = abs(NdotL); + + float light_factor = distance_factor * d[SP_LUT] * indirect_factor * shadow_factor; + + primary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_AMBIENT) + regToColor(GPUREG_LIGHTi_DIFFUSE) * NdotL); + secondary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_SPECULAR0) * d[D0_LUT] + + regToColor(GPUREG_LIGHTi_SPECULAR1) * d[D1_LUT] * vec3(d[RR_LUT], d[RG_LUT], d[RB_LUT])); + } + uint fresnel_output1 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 2, 1); + uint fresnel_output2 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 3, 1); + + if (fresnel_output1 == 1u) primary_color.a = d[FR_LUT]; + if (fresnel_output2 == 1u) secondary_color.a = d[FR_LUT]; + + if (error_unimpl) { + secondary_color = primary_color = vec4(1.0, 0., 1.0, 1.0); + } +} + +void main() { + // TODO: what do invalid sources and disabled textures read as? + // And what does the "previous combiner" source read initially? + tevSources[0] = v_colour; // Primary/vertex color + calcLighting(tevSources[1], tevSources[2]); + + uint textureConfig = readPicaReg(0x80); + vec2 tex2UV = (textureConfig & (1u << 13)) != 0u ? 
v_texcoord1 : v_texcoord2; + + if ((textureConfig & 1u) != 0u) tevSources[3] = texture(u_tex0, v_texcoord0.xy); + if ((textureConfig & 2u) != 0u) tevSources[4] = texture(u_tex1, v_texcoord1); + if ((textureConfig & 4u) != 0u) tevSources[5] = texture(u_tex2, tex2UV); + tevSources[13] = vec4(0.0); // Previous buffer + tevSources[15] = vec4(0.0); // Previous combiner + + tevNextPreviousBuffer = v_textureEnvBufferColor; + uint textureEnvUpdateBuffer = readPicaReg(0xE0); + + for (int i = 0; i < 6; i++) { + tevSources[14] = v_textureEnvColor[i]; // Constant color + tevSources[15] = tevCalculateCombiner(i); + tevSources[13] = tevNextPreviousBuffer; + + if (i < 4) { + if ((textureEnvUpdateBuffer & (0x100u << i)) != 0u) { + tevNextPreviousBuffer.rgb = tevSources[15].rgb; + } + + if ((textureEnvUpdateBuffer & (0x1000u << i)) != 0u) { + tevNextPreviousBuffer.a = tevSources[15].a; + } + } + } + + fragColour = tevSources[15]; + + if (tevUnimplementedSourceFlag) { + // fragColour = vec4(1.0, 0.0, 1.0, 1.0); + } + // fragColour.rg = texture(u_tex_lighting_lut,vec2(gl_FragCoord.x/200.,float(int(gl_FragCoord.y/2)%24))).rr; + + // Get original depth value by converting from [near, far] = [0, 1] to [-1, 1] + // We do this by converting to [0, 2] first and subtracting 1 to go to [-1, 1] + float z_over_w = gl_FragCoord.z * 2.0f - 1.0f; + float depth = z_over_w * u_depthScale + u_depthOffset; + + if (!u_depthmapEnable) // Divide z by w if depthmap enable == 0 (ie using W-buffering) + depth /= gl_FragCoord.w; + + // Write final fragment depth + gl_FragDepth = depth; + + // Perform alpha test + uint alphaControl = readPicaReg(0x104); + if ((alphaControl & 1u) != 0u) { // Check if alpha test is on + uint func = (alphaControl >> 4u) & 7u; + float reference = float((alphaControl >> 8u) & 0xffu) / 255.0; + float alpha = fragColour.a; + + switch (func) { + case 0: discard; // Never pass alpha test + case 1: break; // Always pass alpha test + case 2: // Pass if equal + if (alpha != reference) 
discard; + break; + case 3: // Pass if not equal + if (alpha == reference) discard; + break; + case 4: // Pass if less than + if (alpha >= reference) discard; + break; + case 5: // Pass if less than or equal + if (alpha > reference) discard; + break; + case 6: // Pass if greater than + if (alpha <= reference) discard; + break; + case 7: // Pass if greater than or equal + if (alpha < reference) discard; + break; + } + } +} \ No newline at end of file diff --git a/src/host_shaders/opengl_vertex_shader.vert b/src/host_shaders/opengl_vertex_shader.vert new file mode 100644 index 00000000..cbf992c4 --- /dev/null +++ b/src/host_shaders/opengl_vertex_shader.vert @@ -0,0 +1,97 @@ +#version 410 core + +layout(location = 0) in vec4 a_coords; +layout(location = 1) in vec4 a_quaternion; +layout(location = 2) in vec4 a_vertexColour; +layout(location = 3) in vec2 a_texcoord0; +layout(location = 4) in vec2 a_texcoord1; +layout(location = 5) in float a_texcoord0_w; +layout(location = 6) in vec3 a_view; +layout(location = 7) in vec2 a_texcoord2; + +out vec3 v_normal; +out vec3 v_tangent; +out vec3 v_bitangent; +out vec4 v_colour; +out vec3 v_texcoord0; +out vec2 v_texcoord1; +out vec3 v_view; +out vec2 v_texcoord2; +flat out vec4 v_textureEnvColor[6]; +flat out vec4 v_textureEnvBufferColor; + +out float gl_ClipDistance[2]; + +// TEV uniforms +uniform uint u_textureEnvColor[6]; +uniform uint u_picaRegs[0x200 - 0x48]; + +// Helper so that the implementation of u_pica_regs can be changed later +uint readPicaReg(uint reg_addr) { return u_picaRegs[reg_addr - 0x48]; } + +vec4 abgr8888ToVec4(uint abgr) { + const float scale = 1.0 / 255.0; + + return scale * vec4(float(abgr & 0xffu), float((abgr >> 8) & 0xffu), float((abgr >> 16) & 0xffu), float(abgr >> 24)); +} + +vec3 rotateVec3ByQuaternion(vec3 v, vec4 q) { + vec3 u = q.xyz; + float s = q.w; + return 2.0 * dot(u, v) * u + (s * s - dot(u, u)) * v + 2.0 * s * cross(u, v); +} + +// Convert an arbitrary-width floating point literal to an 
f32 +float decodeFP(uint hex, uint E, uint M) { + uint width = M + E + 1u; + uint bias = 128u - (1u << (E - 1u)); + uint exponent = (hex >> M) & ((1u << E) - 1u); + uint mantissa = hex & ((1u << M) - 1u); + uint sign = (hex >> (E + M)) << 31u; + + if ((hex & ((1u << (width - 1u)) - 1u)) != 0) { + if (exponent == (1u << E) - 1u) + exponent = 255u; + else + exponent += bias; + hex = sign | (mantissa << (23u - M)) | (exponent << 23u); + } else { + hex = sign; + } + + return uintBitsToFloat(hex); +} + +void main() { + gl_Position = a_coords; + v_colour = a_vertexColour; + + // Flip y axis of UVs because OpenGL uses an inverted y for texture sampling compared to the PICA + v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w); + v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y); + v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y); + v_view = a_view; + + v_normal = normalize(rotateVec3ByQuaternion(vec3(0.0, 0.0, 1.0), a_quaternion)); + v_tangent = normalize(rotateVec3ByQuaternion(vec3(1.0, 0.0, 0.0), a_quaternion)); + v_bitangent = normalize(rotateVec3ByQuaternion(vec3(0.0, 1.0, 0.0), a_quaternion)); + + for (int i = 0; i < 6; i++) { + v_textureEnvColor[i] = abgr8888ToVec4(u_textureEnvColor[i]); + } + + v_textureEnvBufferColor = abgr8888ToVec4(readPicaReg(0xFD)); + + // Parse clipping plane registers + // The plane registers describe a clipping plane in the form of Ax + By + Cz + D = 0 + // With n = (A, B, C) being the normal vector and D being the origin point distance + // Therefore, for the second clipping plane, we can just pass the dot product of the clip vector and the input coordinates to gl_ClipDistance[1] + vec4 clipData = vec4( + decodeFP(readPicaReg(0x48) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x49) & 0xffffffu, 7, 16), + decodeFP(readPicaReg(0x4A) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x4B) & 0xffffffu, 7, 16) + ); + + // There's also another, always-on clipping plane based on vertex z + gl_ClipDistance[0] = -a_coords.z; + 
gl_ClipDistance[1] = dot(clipData, a_coords); +} \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 1559565a..66a04b9e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,9 +1,9 @@ #include "emulator.hpp" -int main (int argc, char *argv[]) { - Emulator emu; +int main(int argc, char *argv[]) { + Emulator emu; - emu.initGraphicsContext(); + emu.initGraphicsContext(); if (argc > 1) { auto romPath = std::filesystem::current_path() / argv[1]; diff --git a/src/renderer.cpp b/src/renderer.cpp new file mode 100644 index 00000000..3ba29aea --- /dev/null +++ b/src/renderer.cpp @@ -0,0 +1,35 @@ +#include "renderer.hpp" + +#include +#include + +Renderer::Renderer(GPU& gpu, const std::array& internalRegs) : gpu(gpu), regs(internalRegs) {} +Renderer::~Renderer() {} + +std::optional Renderer::typeFromString(std::string inString) { + // Transform to lower-case to make the setting case-insensitive + std::transform(inString.begin(), inString.end(), inString.begin(), [](unsigned char c) { return std::tolower(c); }); + + // Huge table of possible names and misspellings + // Please stop misspelling Vulkan as Vulcan + static const std::unordered_map map = { + {"null", RendererType::Null}, {"nil", RendererType::Null}, {"none", RendererType::Null}, + {"gl", RendererType::OpenGL}, {"ogl", RendererType::OpenGL}, {"opengl", RendererType::OpenGL}, + {"vk", RendererType::Vulkan}, {"vulkan", RendererType::Vulkan}, {"vulcan", RendererType::Vulkan}, + }; + + if (auto search = map.find(inString); search != map.end()) { + return search->second; + } + + return std::nullopt; +} + +const char* Renderer::typeToString(RendererType rendererType) { + switch (rendererType) { + case RendererType::Null: return "null"; + case RendererType::OpenGL: return "opengl"; + case RendererType::Vulkan: return "vulkan"; + default: return "Invalid"; + } +} \ No newline at end of file diff --git a/third_party/cmrc b/third_party/cmrc new file mode 160000 index 00000000..9a339644 --- /dev/null +++ 
b/third_party/cmrc @@ -0,0 +1 @@ +Subproject commit 9a3396444e0478bd6f261075e74d1ecf70964029