From 0e94eae4838a45e4cf1401761a87205b36264879 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:10:34 +0200 Subject: [PATCH 1/2] Force-inline SIMD index buffer functions --- include/PICA/pica_simd.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/PICA/pica_simd.hpp b/include/PICA/pica_simd.hpp index ae7d04eb..efb00d43 100644 --- a/include/PICA/pica_simd.hpp +++ b/include/PICA/pica_simd.hpp @@ -3,6 +3,7 @@ #include #include +#include "compiler_builtins.hpp" #include "helpers.hpp" #if defined(_M_AMD64) || defined(__x86_64__) @@ -43,7 +44,7 @@ namespace PICA::IndexBuffer { #ifdef PICA_SIMD_ARM64 template - std::pair analyzeNEON(u8* indexBuffer, u32 vertexCount) { + ALWAYS_INLINE std::pair analyzeNEON(u8* indexBuffer, u32 vertexCount) { // We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16; @@ -134,7 +135,7 @@ namespace PICA::IndexBuffer { #if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__)) template - std::pair analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) { + ALWAYS_INLINE std::pair analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) { // We process 16 bytes per iteration, which is 8 vertices if we're using u16 // indices or 16 vertices if we're using u8 indices constexpr u32 vertsPerLoop = (useShortIndices) ? 
8 : 16; From bea7b00c7d69905551d50bce79b9a08fa44da647 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:40:00 +0200 Subject: [PATCH 2/2] Draw acceleration: Replace multiplication for component sizes with left shift --- src/core/PICA/draw_acceleration.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index d7df3b77..fe21fe1a 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -90,7 +90,11 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { const u32 size = (attribInfo >> 2) + 1; // Total number of components // Size of each component based on the attribute type - static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + [[maybe_unused]] static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + // To avoid a multiplication, we shift left by the values below instead of multiplying by the values above + // So multiplication by 1 becomes a shift by 0, mul by 2 becomes a shift by 1, and mul by 4 becomes a shift by 2 + static constexpr u32 sizeShiftPerComponent[4] = {0, 0, 1, 2}; + const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf; // Mark the attribute as enabled accel.enabledAttributeMask |= 1 << inputReg; @@ -100,7 +104,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.offset = attributeOffset + loaderOffset; attr.stride = loaderData.size; attr.type = attribType; - attributeOffset += size * sizePerComponent[attribType]; + attributeOffset += size << sizeShiftPerComponent[attribType]; } loaderOffset += loader.size; @@ -134,4 +138,4 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { } accel.canBeAccelerated = true; -} \ No newline at end of file +}