From 5d28f11ccfeab671efb62d491a36758bbbe657d0 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 20 Oct 2024 20:02:02 +0300 Subject: [PATCH] GPU: Add optimized NEON path for analyzing index buffers (#613) * Implement ARM NEON index buffer analysis * NEON: Fix initial index buffer minima/maxima * NEON: Fix vertex count comparison for index buffer analysis * GPU: Add SSE4.1 path for index buffer analysis * Fix oopsie * Fix oopsie, again --- CMakeLists.txt | 10 +- include/PICA/pica_simd.hpp | 253 ++++++++++++++++++++++++++++ src/core/PICA/draw_acceleration.cpp | 23 +-- 3 files changed, 268 insertions(+), 18 deletions(-) create mode 100644 include/PICA/pica_simd.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 641a0055..f72e8cf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(ENABLE_GIT_VERSIONING "Enables querying git for the emulator version" ON) option(BUILD_HYDRA_CORE "Build a Hydra core" OFF) option(BUILD_LIBRETRO_CORE "Build a Libretro core" OFF) option(ENABLE_RENDERDOC_API "Build with support for Renderdoc's capture API for graphics debugging" ON) +option(DISABLE_SSE4 "Build with SSE4 instructions disabled, may reduce performance" OFF) set(OPENGL_PROFILE ${DEFAULT_OPENGL_PROFILE} CACHE STRING "OpenGL profile to use if OpenGL is enabled. Valid values are 'OpenGL' and 'OpenGLES'.") set_property(CACHE OPENGL_PROFILE PROPERTY STRINGS OpenGL OpenGLES) @@ -211,6 +212,13 @@ else() set(HOST_ARM64 FALSE) endif() +# Enable SSE4.1 if it's not explicitly disabled +# Annoyingly, we can't easily do this if we're using MSVC cause there's no SSE4.1 flag, only SSE4.1 +if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DISABLE_SSE4 AND HOST_X64) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1") +endif() + if(ENABLE_RENDERDOC_API) find_package(RenderDoc 1.6.0 MODULE REQUIRED) add_compile_definitions(PANDA3DS_ENABLE_RENDERDOC) @@ -318,7 +326,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp - include/align.hpp include/audio/aac_decoder.hpp + include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp ) cmrc_add_resource_library( diff --git a/include/PICA/pica_simd.hpp b/include/PICA/pica_simd.hpp new file mode 100644 index 00000000..dfd528fc --- /dev/null +++ b/include/PICA/pica_simd.hpp @@ -0,0 +1,253 @@ +#pragma once +#include +#include +#include + +#include "helpers.hpp" + +#if defined(_M_AMD64) || defined(__x86_64__) +#define PICA_SIMD_X64 +#include +#elif defined(_M_ARM64) || defined(__aarch64__) +#define PICA_SIMD_ARM64 +#include +#endif + +// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them) +namespace PICA::IndexBuffer { + // Non-SIMD, portable algorithm + template + std::pair analyzePortable(u8* indexBuffer, u32 vertexCount) { + u16 minimumIndex = std::numeric_limits::max(); + u16 maximumIndex = 0; + + // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them + if constexpr (useShortIndices) { + u16* indexBuffer16 = reinterpret_cast(indexBuffer); + + for (u32 i = 0; i < vertexCount; i++) { + u16 index = indexBuffer16[i]; + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + } + } else { + for (u32 i = 0; i < vertexCount; i++) { + u16 index = u16(indexBuffer[i]); + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + } + } + + return {minimumIndex, maximumIndex}; + } + +#ifdef PICA_SIMD_ARM64 + template + std::pair analyzeNEON(u8* indexBuffer, u32 vertexCount) { + // We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices + constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16; + + if (vertexCount < vertsPerLoop) { + return analyzePortable(indexBuffer, vertexCount); + } + + u16 minimumIndex, maximumIndex; + + if constexpr (useShortIndices) { + // 16-bit indices + uint16x8_t minima = vdupq_n_u16(0xffff); + uint16x8_t maxima = vdupq_n_u16(0); + + while (vertexCount >= vertsPerLoop) { + const uint16x8_t data = vld1q_u16(reinterpret_cast(indexBuffer)); + minima = vminq_u16(data, minima); + maxima = vmaxq_u16(data, maxima); + + indexBuffer += 16; + vertexCount -= vertsPerLoop; + } + + // Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD + // We want to gather the actual minimum and maximum in the line bottom lane of the minima/maxima vectors + // uint16x4_t foldedMinima1 = vmin_u16(vget_high_u16(minima), vget_low_u16(minima)); + // uint16x4_t foldedMaxima1 = vmax_u16(vget_high_u16(maxima), vget_low_u16(maxima)); + + uint16x8_t foldedMinima1 = vpminq_u16(minima, minima); + uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1); + uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2); + + uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima); + uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1); + uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2); + + minimumIndex = vgetq_lane_u16(foldedMinima3, 0); + maximumIndex = vgetq_lane_u16(foldedMaxima3, 0); + } else { + // 8-bit indices + uint8x16_t minima = vdupq_n_u8(0xff); + uint8x16_t maxima = vdupq_n_u8(0); + + while (vertexCount >= vertsPerLoop) { + uint8x16_t data = vld1q_u8(indexBuffer); + minima = vminq_u8(data, minima); + maxima = vmaxq_u8(data, maxima); + + indexBuffer += 16; + vertexCount -= vertsPerLoop; + } + + // Do a similar horizontal min/max as in the u16 case, except now we're working uint8x16 instead of uint16x4 so we need 4 folds + uint8x16_t foldedMinima1 = vpminq_u8(minima, minima); + uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1); + uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2); + uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3); + + uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima); + uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1); + uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2); + uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3); + + minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0)); + maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0)); + } + + // If any indices could not be processed cause the buffer size is not 16-byte aligned, process them the naive way + // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them + while (vertexCount > 0) { + if constexpr (useShortIndices) { + u16 index = *reinterpret_cast(indexBuffer); + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + indexBuffer += 2; + } else { + u16 index = u16(*indexBuffer++); + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + } + + vertexCount -= 1; + } + + return {minimumIndex, maximumIndex}; + } +#endif + +#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__)) + template + std::pair analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) { + // We process 16 bytes per iteration, which is 8 vertices if we're using u16 + // indices or 16 vertices if we're using u8 indices + constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16; + + if (vertexCount < vertsPerLoop) { + return analyzePortable(indexBuffer, vertexCount); + } + + u16 minimumIndex, maximumIndex; + + if constexpr (useShortIndices) { + // Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers. + // Based on https://stackoverflow.com/a/22259607 + auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); }; + + auto horizontalMax16 = [](__m128i vector) -> u16 { + // We have an instruction to compute horizontal minimum but not maximum, so we use it. + // To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum + __m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu)); + u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped))); + return u16(min ^ 0xffff); + }; + + // 16-bit indices + // Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane) + // And the maxima vector to all 0s (0 for each 16-bit lane) + __m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu); + __m128i maxima = _mm_set_epi32(0, 0, 0, 0); + + while (vertexCount >= vertsPerLoop) { + const __m128i data = _mm_loadu_si128(reinterpret_cast(indexBuffer)); + minima = _mm_min_epu16(data, minima); + maxima = _mm_max_epu16(data, maxima); + + indexBuffer += 16; + vertexCount -= vertsPerLoop; + } + + minimumIndex = u16(horizontalMin16(minima)); + maximumIndex = u16(horizontalMax16(maxima)); + } else { + // Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers. + // Based on https://stackoverflow.com/a/22259607 + auto horizontalMin8 = [](__m128i vector) -> u8 { + vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2))); + vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1))); + vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1))); + vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8)); + return u8(_mm_cvtsi128_si32(vector)); + }; + + auto horizontalMax8 = [](__m128i vector) -> u8 { + vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2))); + vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1))); + vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1))); + vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8)); + return u8(_mm_cvtsi128_si32(vector)); + }; + + // 8-bit indices + // Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane) + // And the maxima vector to all 0s (0 for each 8-bit lane) + __m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu); + __m128i maxima = _mm_set_epi32(0, 0, 0, 0); + + while (vertexCount >= vertsPerLoop) { + const __m128i data = _mm_loadu_si128(reinterpret_cast(indexBuffer)); + minima = _mm_min_epu8(data, minima); + maxima = _mm_max_epu8(data, maxima); + + indexBuffer += 16; + vertexCount -= vertsPerLoop; + } + + minimumIndex = u16(horizontalMin8(minima)); + maximumIndex = u16(horizontalMax8(maxima)); + } + + // If any indices could not be processed cause the buffer size + // is not 16-byte aligned, process them the naive way + // Calculate the minimum and maximum indices used in the index + // buffer, so we'll only upload them + while (vertexCount > 0) { + if constexpr (useShortIndices) { + u16 index = *reinterpret_cast(indexBuffer); + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + indexBuffer += 2; + } else { + u16 index = u16(*indexBuffer++); + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + } + + vertexCount -= 1; + } + + return {minimumIndex, maximumIndex}; + } +#endif + + // Analyzes a PICA index buffer to get the minimum and maximum indices in the + // buffer, and returns them in a pair in the form [min, max]. Takes a template + // parameter to decide whether the indices in the buffer are u8 or u16 + template + std::pair analyze(u8* indexBuffer, u32 vertexCount) { +#if defined(PICA_SIMD_ARM64) + return analyzeNEON(indexBuffer, vertexCount); +#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__)) + // Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX + return analyzeSSE4_1(indexBuffer, vertexCount); +#else + return analyzePortable(indexBuffer, vertexCount); +#endif + } +} // namespace PICA::IndexBuffer diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 1850d819..d7df3b77 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -1,16 +1,17 @@ #include "PICA/draw_acceleration.hpp" #include -#include +#include #include "PICA/gpu.hpp" +#include "PICA/pica_simd.hpp" #include "PICA/regs.hpp" void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.indexed = indexed; accel.totalAttribCount = totalAttribCount; accel.enabledAttributeMask = 0; - + const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer @@ -27,24 +28,12 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them if (accel.useShortIndices) { - u16* indexBuffer16 = reinterpret_cast(indexBuffer); - - for (int i = 0; i < vertexCount; i++) { - u16 index = indexBuffer16[i]; - minimumIndex = std::min(minimumIndex, index); - maximumIndex = std::max(maximumIndex, index); - } + std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze(indexBuffer, vertexCount); } else { - for (int i = 0; i < vertexCount; i++) { - u16 index = u16(indexBuffer[i]); - minimumIndex = std::min(minimumIndex, index); - maximumIndex = std::max(maximumIndex, index); - } + std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze(indexBuffer, vertexCount); } accel.indexBuffer = indexBuffer; - accel.minimumIndex = minimumIndex; - accel.maximumIndex = maximumIndex; } else { accel.indexBuffer = nullptr; accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg]; @@ -76,7 +65,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Add it to the total vertex data size, aligned to 4 bytes. accel.vertexDataSize += (bytes + 3) & ~3; - + // Get a pointer to the data where this loader's data is stored const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size); loader.data = getPointerPhys(loaderAddress);