diff --git a/CMakeLists.txt b/CMakeLists.txt
index f780387d..c5e76d5a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,7 +355,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
     include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
     include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
     include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
-    include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp
+    include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp include/audio/dsp_simd.hpp
 )
 
 cmrc_add_resource_library(
diff --git a/include/audio/dsp_simd.hpp b/include/audio/dsp_simd.hpp
new file mode 100644
index 00000000..48823485
--- /dev/null
+++ b/include/audio/dsp_simd.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <cstring>
+
+#include "audio/hle_mixer.hpp"
+#include "compiler_builtins.hpp"
+#include "helpers.hpp"
+
+#if defined(_M_AMD64) || defined(__x86_64__)
+#define DSP_SIMD_X64
+#include <immintrin.h>
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define DSP_SIMD_ARM64
+#include <arm_neon.h>
+#endif
+
+// Optimized SIMD functions for mixing the stereo output of a DSP voice into a quadraphonic intermediate mix
+// Every implementation is expected to produce the same results as mixPortable, which is the reference algorithm
+namespace DSP::MixIntoQuad {
+	using IntermediateMix = Audio::DSPMixer::IntermediateMix;
+	using StereoFrame16 = Audio::DSPMixer::StereoFrame;
+
+	// Non-SIMD, portable algorithm
+	// gains must point to 4 floats, one for each channel of the quad mix
+	ALWAYS_INLINE static void mixPortable(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+		for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
+			// Stereo samples are in the format: (l, r)
+			// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
+			// The float -> s32 cast truncates towards zero
+			mix[sampleIndex][0] += s32(frame[sampleIndex][0] * gains[0]);
+			mix[sampleIndex][1] += s32(frame[sampleIndex][1] * gains[1]);
+			mix[sampleIndex][2] += s32(frame[sampleIndex][0] * gains[2]);
+			mix[sampleIndex][3] += s32(frame[sampleIndex][1] * gains[3]);
+		}
+	}
+
+#if defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+	// SSE4.1 implementation, which mixes all 4 channels of a quad sample at once
+	// gains must point to 4 floats aligned to 16 bytes, and every quad sample in mix must also be aligned to 16 bytes
+	ALWAYS_INLINE static void mixSSE4_1(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+		static_assert(sizeof(frame[0]) == sizeof(s32), "A stereo sample must be exactly 2 16-bit samples");
+
+		// The gains are the same for every sample in the frame, so we only load them once
+		const __m128 gainVector = _mm_load_ps(gains);
+
+		for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
+			// Read the (l, r) pair as a single 32-bit value. We use memcpy rather than casting the pointer to float*
+			// Since reading s16 data through a float pointer violates strict aliasing
+			s32 packedSamples;
+			std::memcpy(&packedSamples, &frame[sampleIndex][0], sizeof(packedSamples));
+
+			// The stereo samples, repeated every 4 bytes inside the vector register
+			const __m128i stereoSamples = _mm_set1_epi32(packedSamples);
+
+			// Sign-extend the low 4 16-bit lanes into (l, r, l, r) and convert them to float
+			const __m128 currentFrame = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(stereoSamples));
+			// Convert back to s32 with truncation (cvttps, not cvtps) so the result matches the s32 cast in mixPortable
+			const __m128i offset = _mm_cvttps_epi32(_mm_mul_ps(currentFrame, gainVector));
+			const __m128i intermediateMixPrev = _mm_load_si128((__m128i*)&mix[sampleIndex][0]);
+			const __m128i result = _mm_add_epi32(intermediateMixPrev, offset);
+			_mm_store_si128((__m128i*)&mix[sampleIndex][0], result);
+		}
+	}
+#endif
+
+#ifdef DSP_SIMD_ARM64
+	// TODO: NEON implementation. For now this forwards to the portable algorithm
+	ALWAYS_INLINE static void mixNEON(IntermediateMix& mix, StereoFrame16& frame, const float* gains) { mixPortable(mix, frame, gains); }
+#endif
+
+	// Mixes the stereo output of a DSP voice into a quadraphonic intermediate mix
+	// Picks the implementation at compile time depending on the target architecture
+	static void mix(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+#if defined(DSP_SIMD_ARM64)
+		return mixNEON(mix, frame, gains);
+#elif defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+		return mixSSE4_1(mix, frame, gains);
+#else
+		return mixPortable(mix, frame, gains);
+#endif
+	}
+} // namespace DSP::MixIntoQuad
diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index 5868a5d0..32bbaae8 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -50,7 +50,9 @@ namespace Audio {
 		using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
 		using InterpolationState = Audio::Interpolation::State;
 
-		DSPMixer::StereoFrame currentFrame;
+		// The samples this voice output for this audio frame.
+		// Aligned to 4 for SIMD purposes.
+		alignas(4) DSPMixer::StereoFrame currentFrame;
 		BufferQueue buffers;
 
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
@@ -60,7 +62,8 @@ namespace Audio {
 
 		// There's one gain configuration for each of the 3 intermediate mixing stages
 		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
-		std::array<std::array<float, 4>, 3> gains;
+		// Aligned to 16 for SIMD purposes
+		alignas(16) std::array<std::array<float, 4>, 3> gains;
 		// Of the 3 intermediate mix stages, typically only the first one is actually enabled and the other ones do nothing
 		// Ie their gain is vec4(0.0). We track which stages are disabled (have a gain of all 0s) using this bitfield and skip them
 		// In order to save up on CPU time.
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index fec795f8..7e82a139 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -8,5 +8,6 @@
 
 #include "audio/aac_decoder.hpp"
+#include "audio/dsp_simd.hpp"
 #include "services/dsp.hpp"
 
 namespace Audio {
@@ -228,7 +229,9 @@ namespace Audio {
 		// The DSP checks the DSP configuration dirty bits on every frame, applies them, and clears them
 		read.dspConfiguration.dirtyRaw = 0;
 		read.dspConfiguration.dirtyRaw2 = 0;
-		std::array<IntermediateMix, 3> mixes{};
+
+		// The intermediate mix buffer is aligned to 16 for SIMD purposes
+		alignas(16) std::array<IntermediateMix, 3> mixes{};
 
 		for (int i = 0; i < sourceCount; i++) {
 			// Update source configuration from the read region of shared memory
@@ -263,15 +266,7 @@ namespace Audio {
 					IntermediateMix& intermediateMix = mixes[mix];
 					const std::array<float, 4>& gains = source.gains[mix];
 
-					// TODO: SIMD implementations
-					for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
-						// Mono samples are in the format: (l, r)
-						// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
-						intermediateMix[sampleIndex][0] += s32(source.currentFrame[sampleIndex][0] * gains[0]);
-						intermediateMix[sampleIndex][1] += s32(source.currentFrame[sampleIndex][1] * gains[1]);
-						intermediateMix[sampleIndex][2] += s32(source.currentFrame[sampleIndex][0] * gains[2]);
-						intermediateMix[sampleIndex][3] += s32(source.currentFrame[sampleIndex][1] * gains[3]);
-					}
+					DSP::MixIntoQuad::mix(intermediateMix, source.currentFrame, gains.data());
 				}
 			}
 		}