#include "renderer_gl/renderer_gl.hpp" #include #include "PICA/float_types.hpp" #include "PICA/gpu.hpp" #include "PICA/regs.hpp" using namespace Floats; using namespace Helpers; using namespace PICA; const char* vertexShader = R"( #version 410 core layout (location = 0) in vec4 a_coords; layout (location = 1) in vec4 a_quaternion; layout (location = 2) in vec4 a_vertexColour; layout (location = 3) in vec2 a_texcoord0; layout (location = 4) in vec2 a_texcoord1; layout (location = 5) in float a_texcoord0_w; layout (location = 6) in vec3 a_view; layout (location = 7) in vec2 a_texcoord2; out vec3 v_normal; out vec3 v_tangent; out vec3 v_bitangent; out vec4 v_colour; out vec3 v_texcoord0; out vec2 v_texcoord1; out vec3 v_view; out vec2 v_texcoord2; flat out vec4 v_textureEnvColor[6]; flat out vec4 v_textureEnvBufferColor; out float gl_ClipDistance[2]; // TEV uniforms uniform uint u_textureEnvColor[6]; uniform uint u_picaRegs[0x200 - 0x48]; // Helper so that the implementation of u_pica_regs can be changed later uint readPicaReg(uint reg_addr){ return u_picaRegs[reg_addr - 0x48]; } vec4 abgr8888ToVec4(uint abgr) { const float scale = 1.0 / 255.0; return scale * vec4( float(abgr & 0xffu), float((abgr >> 8) & 0xffu), float((abgr >> 16) & 0xffu), float(abgr >> 24) ); } vec3 rotateVec3ByQuaternion(vec3 v, vec4 q){ vec3 u = q.xyz; float s = q.w; return 2.0 * dot(u, v) * u + (s * s - dot(u, u))* v + 2.0 * s * cross(u, v); } // Convert an arbitrary-width floating point literal to an f32 float decodeFP(uint hex, uint E, uint M){ uint width = M + E + 1u; uint bias = 128u - (1u << (E - 1u)); uint exponent = (hex >> M) & ((1u << E) - 1u); uint mantissa = hex & ((1u << M) - 1u); uint sign = (hex >> (E + M)) << 31u; if ((hex & ((1u << (width - 1u)) - 1u)) != 0) { if (exponent == (1u << E) - 1u) exponent = 255u; else exponent += bias; hex = sign | (mantissa << (23u - M)) | (exponent << 23u); } else { hex = sign; } return uintBitsToFloat(hex); } void main() { gl_Position = a_coords; v_colour = a_vertexColour; // Flip y axis of UVs because OpenGL uses an inverted y for texture sampling compared to the PICA v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w); v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y); v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y); v_view = a_view; v_normal = normalize(rotateVec3ByQuaternion(vec3(0.0, 0.0, 1.0), a_quaternion)); v_tangent = normalize(rotateVec3ByQuaternion(vec3(1.0, 0.0, 0.0), a_quaternion)); v_bitangent = normalize(rotateVec3ByQuaternion(vec3(0.0, 1.0, 0.0), a_quaternion)); for (int i = 0; i < 6; i++) { v_textureEnvColor[i] = abgr8888ToVec4(u_textureEnvColor[i]); } v_textureEnvBufferColor = abgr8888ToVec4(readPicaReg(0xFD)); // Parse clipping plane registers // The plane registers describe a clipping plane in the form of Ax + By + Cz + D = 0 // With n = (A, B, C) being the normal vector and D being the origin point distance // Therefore, for the second clipping plane, we can just pass the dot product of the clip vector and the input coordinates to gl_ClipDistance[1] vec4 clipData = vec4( decodeFP(readPicaReg(0x48) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x49) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x4A) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x4B) & 0xffffffu, 7, 16) ); // There's also another, always-on clipping plane based on vertex z gl_ClipDistance[0] = -a_coords.z; gl_ClipDistance[1] = dot(clipData, a_coords); } )"; const char* fragmentShader = R"( #version 410 core in vec3 v_tangent; in vec3 v_normal; in vec3 v_bitangent; in 
const char* fragmentShader = R"(
    #version 410 core

    in vec3 v_tangent;
    in vec3 v_normal;
    in vec3 v_bitangent;
    in vec4 v_colour;
    in vec3 v_texcoord0;
    in vec2 v_texcoord1;
    in vec3 v_view;
    in vec2 v_texcoord2;
    flat in vec4 v_textureEnvColor[6];
    flat in vec4 v_textureEnvBufferColor;

    out vec4 fragColour;

    // TEV uniforms
    uniform uint u_textureEnvSource[6];
    uniform uint u_textureEnvOperand[6];
    uniform uint u_textureEnvCombiner[6];
    uniform uint u_textureEnvScale[6];

    // Depth control uniforms
    uniform float u_depthScale;
    uniform float u_depthOffset;
    uniform bool u_depthmapEnable;

    uniform sampler2D u_tex0;
    uniform sampler2D u_tex1;
    uniform sampler2D u_tex2;
    uniform sampler1DArray u_tex_lighting_lut;

    uniform uint u_picaRegs[0x200 - 0x48];

    // Helper so that the implementation of u_pica_regs can be changed later
    uint readPicaReg(uint reg_addr) {
        return u_picaRegs[reg_addr - 0x48];
    }

    vec4 tevSources[16];
    vec4 tevNextPreviousBuffer;
    bool tevUnimplementedSourceFlag = false;

    // OpenGL ES 1.1 reference pages for TEVs (this is what the PICA200 implements):
    // https://registry.khronos.org/OpenGL-Refpages/es1.1/xhtml/glTexEnv.xml

    vec4 tevFetchSource(uint src_id) {
        if (src_id >= 6u && src_id < 13u) {
            tevUnimplementedSourceFlag = true;
        }

        return tevSources[src_id];
    }

    vec4 tevGetColorAndAlphaSource(int tev_id, int src_id) {
        vec4 result;

        vec4 colorSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4)) & 15u);
        vec4 alphaSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4 + 16)) & 15u);

        uint colorOperand = (u_textureEnvOperand[tev_id] >> (src_id * 4)) & 15u;
        uint alphaOperand = (u_textureEnvOperand[tev_id] >> (12 + src_id * 4)) & 7u;

        // TODO: figure out what the undocumented values do
        switch (colorOperand) {
            case 0u: result.rgb = colorSource.rgb; break;             // Source color
            case 1u: result.rgb = 1.0 - colorSource.rgb; break;       // One minus source color
            case 2u: result.rgb = vec3(colorSource.a); break;         // Source alpha
            case 3u: result.rgb = vec3(1.0 - colorSource.a); break;   // One minus source alpha
            case 4u: result.rgb = vec3(colorSource.r); break;         // Source red
            case 5u: result.rgb = vec3(1.0 - colorSource.r); break;   // One minus source red
            case 8u: result.rgb = vec3(colorSource.g); break;         // Source green
            case 9u: result.rgb = vec3(1.0 - colorSource.g); break;   // One minus source green
            case 12u: result.rgb = vec3(colorSource.b); break;        // Source blue
            case 13u: result.rgb = vec3(1.0 - colorSource.b); break;  // One minus source blue
            default: break;
        }

        // TODO: figure out what the undocumented values do
        switch (alphaOperand) {
            case 0u: result.a = alphaSource.a; break;        // Source alpha
            case 1u: result.a = 1.0 - alphaSource.a; break;  // One minus source alpha
            case 2u: result.a = alphaSource.r; break;        // Source red
            case 3u: result.a = 1.0 - alphaSource.r; break;  // One minus source red
            case 4u: result.a = alphaSource.g; break;        // Source green
            case 5u: result.a = 1.0 - alphaSource.g; break;  // One minus source green
            case 6u: result.a = alphaSource.b; break;        // Source blue
            case 7u: result.a = 1.0 - alphaSource.b; break;  // One minus source blue
            default: break;
        }

        return result;
    }
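    // Summary of how main() below populates tevSources (added for reference):
    //   tevSources[0]     = primary (vertex) colour
    //   tevSources[1..2]  = primary/secondary fragment lighting colour from calcLighting
    //   tevSources[3..5]  = textures 0-2
    //   tevSources[6..12] = not implemented yet (tevFetchSource sets tevUnimplementedSourceFlag)
    //   tevSources[13]    = previous buffer, [14] = constant colour, [15] = previous combiner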
    vec4 tevCalculateCombiner(int tev_id) {
        vec4 source0 = tevGetColorAndAlphaSource(tev_id, 0);
        vec4 source1 = tevGetColorAndAlphaSource(tev_id, 1);
        vec4 source2 = tevGetColorAndAlphaSource(tev_id, 2);

        uint colorCombine = u_textureEnvCombiner[tev_id] & 15u;
        uint alphaCombine = (u_textureEnvCombiner[tev_id] >> 16) & 15u;

        vec4 result = vec4(1.0);

        // TODO: figure out what the undocumented values do
        switch (colorCombine) {
            case 0u: result.rgb = source0.rgb; break;                                            // Replace
            case 1u: result.rgb = source0.rgb * source1.rgb; break;                              // Modulate
            case 2u: result.rgb = min(vec3(1.0), source0.rgb + source1.rgb); break;              // Add
            case 3u: result.rgb = clamp(source0.rgb + source1.rgb - 0.5, 0.0, 1.0); break;       // Add signed
            case 4u: result.rgb = mix(source1.rgb, source0.rgb, source2.rgb); break;             // Interpolate
            case 5u: result.rgb = max(source0.rgb - source1.rgb, 0.0); break;                    // Subtract
            case 6u: result.rgb = vec3(4.0 * dot(source0.rgb - 0.5, source1.rgb - 0.5)); break;  // Dot3 RGB
            case 7u: result = vec4(4.0 * dot(source0.rgb - 0.5, source1.rgb - 0.5)); break;      // Dot3 RGBA
            case 8u: result.rgb = min(source0.rgb * source1.rgb + source2.rgb, 1.0); break;      // Multiply then add
            case 9u: result.rgb = min((source0.rgb + source1.rgb) * source2.rgb, 1.0); break;    // Add then multiply
            default: break;
        }

        if (colorCombine != 7u) {  // The color combiner also writes the alpha channel in the "Dot3 RGBA" mode.
            // TODO: figure out what the undocumented values do
            // TODO: test if the alpha combiner supports all the same modes as the color combiner.
            switch (alphaCombine) {
                case 0u: result.a = source0.a; break;                                      // Replace
                case 1u: result.a = source0.a * source1.a; break;                          // Modulate
                case 2u: result.a = min(1.0, source0.a + source1.a); break;                // Add
                case 3u: result.a = clamp(source0.a + source1.a - 0.5, 0.0, 1.0); break;   // Add signed
                case 4u: result.a = mix(source1.a, source0.a, source2.a); break;           // Interpolate
                case 5u: result.a = max(0.0, source0.a - source1.a); break;                // Subtract
                case 8u: result.a = min(1.0, source0.a * source1.a + source2.a); break;    // Multiply then add
                case 9u: result.a = min(1.0, (source0.a + source1.a) * source2.a); break;  // Add then multiply
                default: break;
            }
        }

        result.rgb *= float(1 << (u_textureEnvScale[tev_id] & 3u));
        result.a *= float(1 << ((u_textureEnvScale[tev_id] >> 16) & 3u));

        return result;
    }

    #define D0_LUT 0u
    #define D1_LUT 1u
    #define SP_LUT 2u
    #define FR_LUT 3u
    #define RB_LUT 4u
    #define RG_LUT 5u
    #define RR_LUT 6u

    float lutLookup(uint lut, uint light, float value) {
        if (lut >= FR_LUT && lut <= RR_LUT) lut -= 1;
        if (lut == SP_LUT) lut = light + 8;
        return texture(u_tex_lighting_lut, vec2(value, lut)).r;
    }

    vec3 regToColor(uint reg) {
        // Normalization scale to convert from [0...255] to [0.0...1.0]
        const float scale = 1.0 / 255.0;

        return scale * vec3(
            float(bitfieldExtract(reg, 20, 8)),
            float(bitfieldExtract(reg, 10, 8)),
            float(bitfieldExtract(reg, 00, 8))
        );
    }

    // Convert an arbitrary-width floating point literal to an f32
    float decodeFP(uint hex, uint E, uint M) {
        uint width = M + E + 1u;
        uint bias = 128u - (1u << (E - 1u));
        uint exponent = (hex >> M) & ((1u << E) - 1u);
        uint mantissa = hex & ((1u << M) - 1u);
        uint sign = (hex >> (E + M)) << 31u;

        if ((hex & ((1u << (width - 1u)) - 1u)) != 0) {
            if (exponent == (1u << E) - 1u) exponent = 255u;
            else exponent += bias;
            hex = sign | (mantissa << (23u - M)) | (exponent << 23u);
        } else {
            hex = sign;
        }

        return uintBitsToFloat(hex);
    }

    // Implements the following algorithm: https://mathb.in/26766
    void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
        // Quaternions describe a transformation from surface-local space to eye space.
        // In surface-local space, by definition (and up to permutation) the normal vector is (0,0,1),
        // the tangent vector is (1,0,0), and the bitangent vector is (0,1,0).
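        // Illustrative sanity check (added, not from the original source): the vertex shader builds
        // v_normal/v_tangent/v_bitangent by rotating that canonical basis with rotateVec3ByQuaternion.
        // With the identity quaternion (0, 0, 0, 1) the rotation returns its input unchanged, and with
        // a 90-degree rotation about +Z, q = (0, 0, sqrt(0.5), sqrt(0.5)), the tangent (1, 0, 0) maps to (0, 1, 0):
        //   2*dot(u,v)*u = 0,  (s*s - dot(u,u))*v = 0,  2*s*cross(u,v) = (0, 1, 0)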
        vec3 normal = normalize(v_normal);
        vec3 tangent = normalize(v_tangent);
        vec3 bitangent = normalize(v_bitangent);
        vec3 view = normalize(v_view);

        uint GPUREG_LIGHTING_ENABLE = readPicaReg(0x008F);
        if (bitfieldExtract(GPUREG_LIGHTING_ENABLE, 0, 1) == 0) {
            primary_color = secondary_color = vec4(1.0);
            return;
        }

        uint GPUREG_LIGHTING_AMBIENT = readPicaReg(0x01C0);
        uint GPUREG_LIGHTING_NUM_LIGHTS = (readPicaReg(0x01C2) & 0x7u) + 1;
        uint GPUREG_LIGHTING_LIGHT_PERMUTATION = readPicaReg(0x01D9);

        primary_color = vec4(vec3(0.0), 1.0);
        secondary_color = vec4(vec3(0.0), 1.0);

        primary_color.rgb += regToColor(GPUREG_LIGHTING_AMBIENT);

        uint GPUREG_LIGHTING_LUTINPUT_ABS = readPicaReg(0x01D0);
        uint GPUREG_LIGHTING_LUTINPUT_SELECT = readPicaReg(0x01D1);
        uint GPUREG_LIGHTING_CONFIG0 = readPicaReg(0x01C3);
        uint GPUREG_LIGHTING_CONFIG1 = readPicaReg(0x01C4);
        uint GPUREG_LIGHTING_LUTINPUT_SCALE = readPicaReg(0x01D2);

        float d[7];
        bool error_unimpl = false;

        for (uint i = 0; i < GPUREG_LIGHTING_NUM_LIGHTS; i++) {
            uint light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i * 3), 3);

            uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140 + 0x10 * light_id);
            uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141 + 0x10 * light_id);
            uint GPUREG_LIGHTi_DIFFUSE = readPicaReg(0x0142 + 0x10 * light_id);
            uint GPUREG_LIGHTi_AMBIENT = readPicaReg(0x0143 + 0x10 * light_id);
            uint GPUREG_LIGHTi_VECTOR_LOW = readPicaReg(0x0144 + 0x10 * light_id);
            uint GPUREG_LIGHTi_VECTOR_HIGH = readPicaReg(0x0145 + 0x10 * light_id);
            uint GPUREG_LIGHTi_CONFIG = readPicaReg(0x0149 + 0x10 * light_id);

            vec3 light_vector = normalize(vec3(
                decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5, 10),
                decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5, 10),
                decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5, 10)
            ));

            // Positional light
            if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0) error_unimpl = true;

            vec3 half_vector = normalize(normalize(light_vector) + view);

            for (int c = 0; c < 7; c++) {
                if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0) {
                    uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3);
                    float scale = float(1u << scale_id);
                    if (scale_id >= 6u) scale /= 256.0;

                    uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3);
                    if (input_id == 0u) d[c] = dot(normal, half_vector);
                    else if (input_id == 1u) d[c] = dot(view, half_vector);
                    else if (input_id == 2u) d[c] = dot(normal, view);
                    else if (input_id == 3u) d[c] = dot(light_vector, normal);
                    else if (input_id == 4u) {
                        uint GPUREG_LIGHTi_SPOTDIR_LOW = readPicaReg(0x0146 + 0x10 * light_id);
                        uint GPUREG_LIGHTi_SPOTDIR_HIGH = readPicaReg(0x0147 + 0x10 * light_id);
                        vec3 spot_light_vector = normalize(vec3(
                            decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1, 11),
                            decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1, 11),
                            decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1, 11)
                        ));
                        d[c] = dot(-light_vector, spot_light_vector);  // -L dot P (aka Spotlight aka SP)
                    } else if (input_id == 5u) {
                        d[c] = 1.0;  // TODO: cos (aka CP)
                        error_unimpl = true;
                    } else {
                        d[c] = 1.0;
                    }

                    d[c] = lutLookup(c, light_id, d[c] * 0.5 + 0.5) * scale;
                    if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) d[c] = abs(d[c]);
                } else {
                    d[c] = 1.0;
                }
            }
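            // Summary (added, derived from the loop above): d[0..6] now hold the D0, D1, SP, FR, RB,
            // RG and RR LUT outputs for this light, each scaled by the per-channel scale factor
            // (1/2/4/8 for scale_id 0-3, and 0.25/0.5 for scale_id 6/7 via the /256 path above),
            // or forced to 1.0 for channels disabled in GPUREG_LIGHTING_CONFIG1.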
            uint lookup_config = bitfieldExtract(GPUREG_LIGHTi_CONFIG, 4, 4);
            if (lookup_config == 0) {
                d[D1_LUT] = 0.0;
                d[FR_LUT] = 0.0;
                d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
            } else if (lookup_config == 1) {
                d[D0_LUT] = 0.0;
                d[D1_LUT] = 0.0;
                d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
            } else if (lookup_config == 2) {
                d[FR_LUT] = 0.0;
                d[SP_LUT] = 0.0;
                d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
            } else if (lookup_config == 3) {
                d[SP_LUT] = 0.0;
                d[RG_LUT] = d[RB_LUT] = d[RR_LUT] = 1.0;
            } else if (lookup_config == 4) {
                d[FR_LUT] = 0.0;
            } else if (lookup_config == 5) {
                d[D1_LUT] = 0.0;
            } else if (lookup_config == 6) {
                d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
            }

            float distance_factor = 1.0;  // a
            float indirect_factor = 1.0;  // fi
            float shadow_factor = 1.0;    // o

            float NdotL = dot(normal, light_vector);  // Li dot N

            // Two sided diffuse
            if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0) NdotL = max(0.0, NdotL);
            else NdotL = abs(NdotL);

            float light_factor = distance_factor * d[SP_LUT] * indirect_factor * shadow_factor;

            primary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_AMBIENT) + regToColor(GPUREG_LIGHTi_DIFFUSE) * NdotL);
            secondary_color.rgb += light_factor * (
                regToColor(GPUREG_LIGHTi_SPECULAR0) * d[D0_LUT] +
                regToColor(GPUREG_LIGHTi_SPECULAR1) * d[D1_LUT] * vec3(d[RR_LUT], d[RG_LUT], d[RB_LUT])
            );
        }

        uint fresnel_output1 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 2, 1);
        uint fresnel_output2 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 3, 1);

        if (fresnel_output1 == 1u) primary_color.a = d[FR_LUT];
        if (fresnel_output2 == 1u) secondary_color.a = d[FR_LUT];

        if (error_unimpl) {
            secondary_color = primary_color = vec4(1.0, 0.0, 1.0, 1.0);
        }
    }

    void main() {
        // TODO: what do invalid sources and disabled textures read as?
        // And what does the "previous combiner" source read initially?
        tevSources[0] = v_colour;  // Primary/vertex color
        calcLighting(tevSources[1], tevSources[2]);

        uint textureConfig = readPicaReg(0x80);
        vec2 tex2UV = (textureConfig & (1u << 13)) != 0u ? v_texcoord1 : v_texcoord2;

        if ((textureConfig & 1u) != 0u) tevSources[3] = texture(u_tex0, v_texcoord0.xy);
        if ((textureConfig & 2u) != 0u) tevSources[4] = texture(u_tex1, v_texcoord1);
        if ((textureConfig & 4u) != 0u) tevSources[5] = texture(u_tex2, tex2UV);

        tevSources[13] = vec4(0.0);  // Previous buffer
        tevSources[15] = vec4(0.0);  // Previous combiner

        tevNextPreviousBuffer = v_textureEnvBufferColor;
        uint textureEnvUpdateBuffer = readPicaReg(0xE0);

        for (int i = 0; i < 6; i++) {
            tevSources[14] = v_textureEnvColor[i];  // Constant color
            tevSources[15] = tevCalculateCombiner(i);
            tevSources[13] = tevNextPreviousBuffer;

            if (i < 4) {
                if ((textureEnvUpdateBuffer & (0x100u << i)) != 0u) {
                    tevNextPreviousBuffer.rgb = tevSources[15].rgb;
                }

                if ((textureEnvUpdateBuffer & (0x1000u << i)) != 0u) {
                    tevNextPreviousBuffer.a = tevSources[15].a;
                }
            }
        }

        fragColour = tevSources[15];

        if (tevUnimplementedSourceFlag) {
            // fragColour = vec4(1.0, 0.0, 1.0, 1.0);
        }
        // fragColour.rg = texture(u_tex_lighting_lut, vec2(gl_FragCoord.x / 200., float(int(gl_FragCoord.y / 2) % 24))).rr;

        // Get original depth value by converting from [near, far] = [0, 1] to [-1, 1]
        // We do this by converting to [0, 2] first and subtracting 1 to go to [-1, 1]
        float z_over_w = gl_FragCoord.z * 2.0f - 1.0f;
        float depth = z_over_w * u_depthScale + u_depthOffset;

        if (!u_depthmapEnable)  // Divide z by w if depthmap enable == 0 (ie using W-buffering)
            depth /= gl_FragCoord.w;

        // Write final fragment depth
        gl_FragDepth = depth;

        // Perform alpha test
        uint alphaControl = readPicaReg(0x104);
        if ((alphaControl & 1u) != 0u) {  // Check if alpha test is on
            uint func = (alphaControl >> 4u) & 7u;
            float reference = float((alphaControl >> 8u) & 0xffu) / 255.0;
            float alpha = fragColour.a;

            switch (func) {
                case 0: discard;  // Never pass alpha test
                case 1: break;    // Always pass alpha test
                case 2:           // Pass if equal
                    if (alpha != reference) discard;
                    break;
                case 3:  // Pass if not equal
                    if (alpha == reference) discard;
                    break;
                case 4:  // Pass if less than
                    if (alpha >= reference) discard;
                    break;
                case 5:  // Pass if less than or equal
                    if (alpha > reference) discard;
                    break;
                case 6:  // Pass if greater than
                    if (alpha <= reference) discard;
                    break;
                case 7:  // Pass if greater than or equal
                    if (alpha < reference) discard;
                    break;
            }
        }
    }
)";

const char* displayVertexShader = R"(
    #version 410 core
    out vec2 UV;

    void main() {
        const vec4 positions[4] = vec4[](
            vec4(-1.0, 1.0, 1.0, 1.0),   // Top-left
            vec4(1.0, 1.0, 1.0, 1.0),    // Top-right
            vec4(-1.0, -1.0, 1.0, 1.0),  // Bottom-left
            vec4(1.0, -1.0, 1.0, 1.0)    // Bottom-right
        );

        // The 3DS displays both screens' framebuffer rotated 90 deg counter clockwise
        // So we adjust our texcoords accordingly
        const vec2 texcoords[4] = vec2[](
            vec2(1.0, 1.0),  // Top-right
            vec2(1.0, 0.0),  // Bottom-right
            vec2(0.0, 1.0),  // Top-left
            vec2(0.0, 0.0)   // Bottom-left
        );

        gl_Position = positions[gl_VertexID];
        UV = texcoords[gl_VertexID];
    }
)";

const char* displayFragmentShader = R"(
    #version 410 core
    in vec2 UV;
    out vec4 FragColor;

    uniform sampler2D u_texture;
    void main() {
        FragColor = texture(u_texture, UV);
    }
)";

void RendererGL::reset() {
    depthBufferCache.reset();
    colourBufferCache.reset();
    textureCache.reset();

    // Init the colour/depth buffer settings to some random defaults on reset
    colourBufferLoc = 0;
    colourBufferFormat = PICA::ColorFmt::RGBA8;

    depthBufferLoc = 0;
    depthBufferFormat = PICA::DepthFmt::Depth16;

    if (triangleProgram.exists()) {
        const auto oldProgram = OpenGL::getProgram();

        gl.useProgram(triangleProgram);

        oldDepthScale = -1.0;       // Default depth scale to -1.0, which is what games typically use
        oldDepthOffset = 0.0;       // Default depth offset to 0
        oldDepthmapEnable = false;  // Enable w buffering

        glUniform1f(depthScaleLoc, oldDepthScale);
        glUniform1f(depthOffsetLoc, oldDepthOffset);
        glUniform1i(depthmapEnableLoc, oldDepthmapEnable);

        gl.useProgram(oldProgram);  // Switch to old GL program
    }
}

void RendererGL::initGraphicsContext() {
    gl.reset();

    OpenGL::Shader vert(vertexShader, OpenGL::Vertex);
    OpenGL::Shader frag(fragmentShader, OpenGL::Fragment);
    triangleProgram.create({vert, frag});
    gl.useProgram(triangleProgram);

    textureEnvSourceLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvSource");
    textureEnvOperandLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvOperand");
    textureEnvCombinerLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvCombiner");
    textureEnvColorLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvColor");
    textureEnvScaleLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvScale");

    depthScaleLoc = OpenGL::uniformLocation(triangleProgram, "u_depthScale");
    depthOffsetLoc = OpenGL::uniformLocation(triangleProgram, "u_depthOffset");
    depthmapEnableLoc = OpenGL::uniformLocation(triangleProgram, "u_depthmapEnable");
    picaRegLoc = OpenGL::uniformLocation(triangleProgram, "u_picaRegs");
    // Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
    glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex0"), 0);
    glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex1"), 1);
    glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex2"), 2);
    glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex_lighting_lut"), 3);

    OpenGL::Shader vertDisplay(displayVertexShader, OpenGL::Vertex);
    OpenGL::Shader fragDisplay(displayFragmentShader, OpenGL::Fragment);
    displayProgram.create({vertDisplay, fragDisplay});

    gl.useProgram(displayProgram);
    glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object

    vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
    gl.bindVBO(vbo);
    vao.create();
    gl.bindVAO(vao);

    // Position (x, y, z, w) attributes
    vao.setAttributeFloat(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
    vao.enableAttribute(0);
    // Quaternion attribute
    vao.setAttributeFloat(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
    vao.enableAttribute(1);
    // Colour attribute
    vao.setAttributeFloat(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
    vao.enableAttribute(2);
    // UV 0 attribute
    vao.setAttributeFloat(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
    vao.enableAttribute(3);
    // UV 1 attribute
    vao.setAttributeFloat(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
    vao.enableAttribute(4);
    // UV 0 W-component attribute
    vao.setAttributeFloat(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
    vao.enableAttribute(5);
    // View
    vao.setAttributeFloat(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
    vao.enableAttribute(6);
    // UV 2 attribute
    vao.setAttributeFloat(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
    vao.enableAttribute(7);

    dummyVBO.create();
    dummyVAO.create();

    // Create texture and framebuffer for the 3DS screen
    const u32 screenTextureWidth = 400;       // Top screen is 400 pixels wide, bottom is 320
    const u32 screenTextureHeight = 2 * 240;  // Both screens are 240 pixels tall

    glGenTextures(1, &lightLUTTextureArray);

    auto prevTexture = OpenGL::getTex2D();
    screenTexture.create(screenTextureWidth, screenTextureHeight, GL_RGBA8);
    screenTexture.bind();
    screenTexture.setMinFilter(OpenGL::Linear);
    screenTexture.setMagFilter(OpenGL::Linear);
    glBindTexture(GL_TEXTURE_2D, prevTexture);

    screenFramebuffer.createWithDrawTexture(screenTexture);
    screenFramebuffer.bind(OpenGL::DrawAndReadFramebuffer);

    if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) Helpers::panic("Incomplete framebuffer");

    // TODO: This should not clear the framebuffer contents. It should load them from VRAM.
    GLint oldViewport[4];
    glGetIntegerv(GL_VIEWPORT, oldViewport);
    OpenGL::setViewport(screenTextureWidth, screenTextureHeight);
    OpenGL::setClearColor(0.0, 0.0, 0.0, 1.0);
    OpenGL::clearColor();
    OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);

    reset();
}

// Set up the OpenGL blending context to match the emulated PICA
void RendererGL::setupBlending() {
    const bool blendingEnabled = (regs[PICA::InternalRegs::ColourOperation] & (1 << 8)) != 0;

    // Map of PICA blending equations to OpenGL blending equations. The unused blending equations are equivalent to equation 0 (add)
    static constexpr std::array blendingEquations = {
        GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD,
    };

    // Map of PICA blending funcs to OpenGL blending funcs. Func = 15 is undocumented and stubbed to GL_ONE for now
    static constexpr std::array blendingFuncs = {
        GL_ZERO,
        GL_ONE,
        GL_SRC_COLOR,
        GL_ONE_MINUS_SRC_COLOR,
        GL_DST_COLOR,
        GL_ONE_MINUS_DST_COLOR,
        GL_SRC_ALPHA,
        GL_ONE_MINUS_SRC_ALPHA,
        GL_DST_ALPHA,
        GL_ONE_MINUS_DST_ALPHA,
        GL_CONSTANT_COLOR,
        GL_ONE_MINUS_CONSTANT_COLOR,
        GL_CONSTANT_ALPHA,
        GL_ONE_MINUS_CONSTANT_ALPHA,
        GL_SRC_ALPHA_SATURATE,
        GL_ONE,
    };
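    // Worked example (illustrative, not an actual register dump): a typical alpha-blend configuration
    // with blendControl = 0x76760000 decodes below as rgb/alpha equation 0 (GL_FUNC_ADD) and
    // source/dest funcs 6/7, i.e. GL_SRC_ALPHA / GL_ONE_MINUS_SRC_ALPHA for both colour and alpha.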
    if (!blendingEnabled) {
        gl.disableBlend();
    } else {
        gl.enableBlend();

        // Get blending equations
        const u32 blendControl = regs[PICA::InternalRegs::BlendFunc];
        const u32 rgbEquation = blendControl & 0x7;
        const u32 alphaEquation = getBits<8, 3>(blendControl);

        // Get blending functions
        const u32 rgbSourceFunc = getBits<16, 4>(blendControl);
        const u32 rgbDestFunc = getBits<20, 4>(blendControl);
        const u32 alphaSourceFunc = getBits<24, 4>(blendControl);
        const u32 alphaDestFunc = getBits<28, 4>(blendControl);

        const u32 constantColor = regs[PICA::InternalRegs::BlendColour];
        const u32 r = constantColor & 0xff;
        const u32 g = getBits<8, 8>(constantColor);
        const u32 b = getBits<16, 8>(constantColor);
        const u32 a = getBits<24, 8>(constantColor);
        OpenGL::setBlendColor(float(r) / 255.f, float(g) / 255.f, float(b) / 255.f, float(a) / 255.f);

        // Translate equations and funcs to their GL equivalents and set them
        glBlendEquationSeparate(blendingEquations[rgbEquation], blendingEquations[alphaEquation]);
        glBlendFuncSeparate(blendingFuncs[rgbSourceFunc], blendingFuncs[rgbDestFunc], blendingFuncs[alphaSourceFunc], blendingFuncs[alphaDestFunc]);
    }
}

void RendererGL::setupTextureEnvState() {
    // TODO: Only update uniforms when the TEV config changed. Use a UBO potentially.
    static constexpr std::array ioBases = {
        PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source, PICA::InternalRegs::TexEnv2Source,
        PICA::InternalRegs::TexEnv3Source, PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source,
    };

    u32 textureEnvSourceRegs[6];
    u32 textureEnvOperandRegs[6];
    u32 textureEnvCombinerRegs[6];
    u32 textureEnvColourRegs[6];
    u32 textureEnvScaleRegs[6];

    for (int i = 0; i < 6; i++) {
        const u32 ioBase = ioBases[i];

        textureEnvSourceRegs[i] = regs[ioBase];
        textureEnvOperandRegs[i] = regs[ioBase + 1];
        textureEnvCombinerRegs[i] = regs[ioBase + 2];
        textureEnvColourRegs[i] = regs[ioBase + 3];
        textureEnvScaleRegs[i] = regs[ioBase + 4];
    }

    glUniform1uiv(textureEnvSourceLoc, 6, textureEnvSourceRegs);
    glUniform1uiv(textureEnvOperandLoc, 6, textureEnvOperandRegs);
    glUniform1uiv(textureEnvCombinerLoc, 6, textureEnvCombinerRegs);
    glUniform1uiv(textureEnvColorLoc, 6, textureEnvColourRegs);
    glUniform1uiv(textureEnvScaleLoc, 6, textureEnvScaleRegs);
}

void RendererGL::bindTexturesToSlots() {
    static constexpr std::array ioBases = {
        PICA::InternalRegs::Tex0BorderColor, PICA::InternalRegs::Tex1BorderColor, PICA::InternalRegs::Tex2BorderColor,
    };

    for (int i = 0; i < 3; i++) {
        if ((regs[PICA::InternalRegs::TexUnitCfg] & (1 << i)) == 0) {
            continue;
        }

        const size_t ioBase = ioBases[i];

        const u32 dim = regs[ioBase + 1];
        const u32 config = regs[ioBase + 2];
        const u32 height = dim & 0x7ff;
        const u32 width = getBits<16, 11>(dim);
        const u32 addr = (regs[ioBase + 4] & 0x0FFFFFFF) << 3;
        u32 format = regs[ioBase + (i == 0 ? 13 : 5)] & 0xF;

        glActiveTexture(GL_TEXTURE0 + i);
        Texture targetTex(addr, static_cast<PICA::TextureFmt>(format), width, height, config);
        OpenGL::Texture tex = getTexture(targetTex);
        tex.bind();
    }

    glActiveTexture(GL_TEXTURE0 + 3);
    glBindTexture(GL_TEXTURE_1D_ARRAY, lightLUTTextureArray);

    glActiveTexture(GL_TEXTURE0);
}

void RendererGL::updateLightingLUT() {
    gpu.lightingLUTDirty = false;
    std::array<u16, 256 * Lights::LUT_Count> u16_lightinglut;  // Sized to match the R16 texture upload below

    for (int i = 0; i < gpu.lightingLUT.size(); i++) {
        uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1);
        u16_lightinglut[i] = value * 65535 / 4095;
    }

    glActiveTexture(GL_TEXTURE0 + 3);
    glBindTexture(GL_TEXTURE_1D_ARRAY, lightLUTTextureArray);
    glTexImage2D(GL_TEXTURE_1D_ARRAY, 0, GL_R16, 256, Lights::LUT_Count, 0, GL_RED, GL_UNSIGNED_SHORT, u16_lightinglut.data());
    glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    glActiveTexture(GL_TEXTURE0);
}

void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> vertices) {
    // The fourth type is meant to be "Geometry primitive". TODO: Find out what that is
    static constexpr std::array primTypes = {
        OpenGL::Triangle,
        OpenGL::TriangleStrip,
        OpenGL::TriangleFan,
        OpenGL::Triangle,
    };
    const auto primitiveTopology = primTypes[static_cast<u32>(primType)];

    gl.disableScissor();
    gl.bindVBO(vbo);
    gl.bindVAO(vao);
    gl.useProgram(triangleProgram);

    OpenGL::enableClipPlane(0);  // Clipping plane 0 is always enabled
    if (regs[PICA::InternalRegs::ClipEnable] & 1) {
        OpenGL::enableClipPlane(1);
    }

    setupBlending();
    OpenGL::Framebuffer poop = getColourFBO();
    poop.bind(OpenGL::DrawAndReadFramebuffer);

    const u32 depthControl = regs[PICA::InternalRegs::DepthAndColorMask];
    const bool depthEnable = depthControl & 1;
    const bool depthWriteEnable = getBit<12>(depthControl);
    const int depthFunc = getBits<4, 3>(depthControl);
    const int colourMask = getBits<8, 4>(depthControl);
    gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);

    static constexpr std::array depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};

    const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
    const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
    const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;

    // Update depth uniforms
    if (oldDepthScale != depthScale) {
        oldDepthScale = depthScale;
        glUniform1f(depthScaleLoc, depthScale);
    }

    if (oldDepthOffset != depthOffset) {
        oldDepthOffset = depthOffset;
        glUniform1f(depthOffsetLoc, depthOffset);
    }

    if (oldDepthmapEnable != depthMapEnable) {
        oldDepthmapEnable = depthMapEnable;
        glUniform1i(depthmapEnableLoc, depthMapEnable);
    }

    setupTextureEnvState();
    bindTexturesToSlots();

    // Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48),
    // the texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
    glUniform1uiv(picaRegLoc, 0x200 - 0x48, &regs[0x48]);
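    // For reference (illustrative): the GLSL helper readPicaReg(addr) indexes this array as
    // u_picaRegs[addr - 0x48], so e.g. the alpha test register 0x104 read in the fragment shader
    // lands at u_picaRegs[0x104 - 0x48] = u_picaRegs[0xBC] of the 0x200 - 0x48 uploaded registers.
    static_assert(0x104 - 0x48 == 0xBC, "readPicaReg indexing example");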
    if (gpu.lightingLUTDirty) {
        updateLightingLUT();
    }

    // TODO: Actually use this
    GLsizei viewportWidth = GLsizei(f24::fromRaw(regs[PICA::InternalRegs::ViewportWidth] & 0xffffff).toFloat32() * 2.0f);
    GLsizei viewportHeight = GLsizei(f24::fromRaw(regs[PICA::InternalRegs::ViewportHeight] & 0xffffff).toFloat32() * 2.0f);
    OpenGL::setViewport(viewportWidth, viewportHeight);

    // Note: The code below must execute after we've bound the colour buffer & its framebuffer
    // Because it attaches a depth texture to the aforementioned colour buffer
    if (depthEnable) {
        gl.enableDepth();
        gl.setDepthMask(depthWriteEnable ? GL_TRUE : GL_FALSE);
        gl.setDepthFunc(depthModes[depthFunc]);
        bindDepthBuffer();
    } else {
        if (depthWriteEnable) {
            gl.enableDepth();
            gl.setDepthMask(GL_TRUE);
            gl.setDepthFunc(GL_ALWAYS);
            bindDepthBuffer();
        } else {
            gl.disableDepth();
        }
    }

    vbo.bufferVertsSub(vertices);
    OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
}

constexpr u32 topScreenBuffer = 0x1f000000;
constexpr u32 bottomScreenBuffer = 0x1f05dc00;

void RendererGL::display() {
    gl.disableScissor();

    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
    screenFramebuffer.bind(OpenGL::ReadFramebuffer);
    glBlitFramebuffer(0, 0, 400, 480, 0, 0, 400, 480, GL_COLOR_BUFFER_BIT, GL_LINEAR);
}

void RendererGL::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
    return;  // Stubbed: the code below is currently disabled

    log("GPU: Clear buffer\nStart: %08X End: %08X\nValue: %08X Control: %08X\n", startAddress, endAddress, value, control);

    const float r = float(getBits<24, 8>(value)) / 255.0f;
    const float g = float(getBits<16, 8>(value)) / 255.0f;
    const float b = float(getBits<8, 8>(value)) / 255.0f;
    const float a = float(value & 0xff) / 255.0f;

    if (startAddress == topScreenBuffer) {
        log("GPU: Cleared top screen\n");
    } else if (startAddress == bottomScreenBuffer) {
        log("GPU: Tried to clear bottom screen\n");
        return;
    } else {
        log("GPU: Clearing some unknown buffer\n");
    }

    OpenGL::setClearColor(r, g, b, a);
    OpenGL::clearColor();
}

OpenGL::Framebuffer RendererGL::getColourFBO() {
    // We construct a colour buffer object and see if our cache has any matching colour buffers in it
    // If not, we allocate a texture & FBO for our framebuffer and store it in the cache
    ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize[0], fbSize[1]);

    auto buffer = colourBufferCache.find(sampleBuffer);
    if (buffer.has_value()) {
        return buffer.value().get().fbo;
    } else {
        return colourBufferCache.add(sampleBuffer).fbo;
    }
}

void RendererGL::bindDepthBuffer() {
    // Similar logic as the getColourFBO function
    DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize[0], fbSize[1]);
    auto buffer = depthBufferCache.find(sampleBuffer);
    GLuint tex;

    if (buffer.has_value()) {
        tex = buffer.value().get().texture.m_handle;
    } else {
        tex = depthBufferCache.add(sampleBuffer).texture.m_handle;
    }

    if (PICA::DepthFmt::Depth24Stencil8 != depthBufferFormat) {
        Helpers::panicDev("TODO: Should we remove stencil attachment?");
    }
    auto attachment = depthBufferFormat == PICA::DepthFmt::Depth24Stencil8 ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT;
    glFramebufferTexture2D(GL_FRAMEBUFFER, attachment, GL_TEXTURE_2D, tex, 0);
}

OpenGL::Texture RendererGL::getTexture(Texture& tex) {
    // Similar logic as the getColourFBO/bindDepthBuffer functions
    auto buffer = textureCache.find(tex);

    if (buffer.has_value()) {
        return buffer.value().get().texture;
    } else {
        const void* textureData = gpu.getPointerPhys(tex.location);  // Get pointer to the texture data in 3DS memory
        Texture& newTex = textureCache.add(tex);
        newTex.decodeTexture(textureData);

        return newTex.texture;
    }
}

void RendererGL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
    const u32 inputWidth = inputSize & 0xffff;
    const u32 inputGap = inputSize >> 16;

    const u32 outputWidth = outputSize & 0xffff;
    const u32 outputGap = outputSize >> 16;

    auto framebuffer = colourBufferCache.findFromAddress(inputAddr);
    // If there's a framebuffer at this address, use it. Otherwise go back to our old hack and display framebuffer 0
    // Displays are hard, I really don't want to try implementing them because getting a fast solution is terrible
    OpenGL::Texture& tex = framebuffer.has_value() ? framebuffer.value().get().texture : colourBufferCache[0].texture;

    tex.bind();
    screenFramebuffer.bind(OpenGL::DrawFramebuffer);

    gl.disableBlend();
    gl.disableDepth();
    gl.disableScissor();
    gl.setColourMask(true, true, true, true);
    gl.useProgram(displayProgram);
    gl.bindVAO(dummyVAO);

    OpenGL::disableClipPlane(0);
    OpenGL::disableClipPlane(1);

    // Hack: Detect whether we are writing to the top or bottom screen by checking the output gap and drawing to the proper part of the output texture
    // We consider output gap == 320 to mean bottom, and anything else to mean top
    if (outputGap == 320) {
        OpenGL::setViewport(40, 0, 320, 240);  // Bottom screen viewport
    } else {
        OpenGL::setViewport(0, 240, 400, 240);  // Top screen viewport
    }

    OpenGL::draw(OpenGL::TriangleStrip, 4);  // Actually draw our 3DS screen
}

void RendererGL::screenshot(const std::string& name) {
    constexpr uint width = 400;
    constexpr uint height = 2 * 240;

    std::vector<u8> pixels, flippedPixels;
    pixels.resize(width * height * 4);
    flippedPixels.resize(pixels.size());

    OpenGL::bindScreenFramebuffer();
    glReadPixels(0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, pixels.data());

    // Flip the image vertically
    for (int y = 0; y < height; y++) {
        memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
        // Swap R and B channels
        for (int x = 0; x < width; x++) {
            std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);
            // Set alpha to 0xFF
            flippedPixels[y * width * 4 + x * 4 + 3] = 0xFF;
        }
    }

    stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
}
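// Rough usage sketch (illustrative assumption about the calling code, not part of this renderer):
// the GPU front-end is expected to call initGraphicsContext() once after creating the GL context,
// reset() whenever the emulated GPU is reset, and then, per frame, some number of drawVertices(...)
// calls followed by displayTransfer(...) and display() to blit the internal screen texture out.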