From e5a024ba852b5d79bfd7a5d903be5f931c855d97 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 12 Apr 2024 17:21:09 +1000 Subject: [PATCH] MetalDevice: Add support for framebuffer fetch --- src/core/gpu_hw.cpp | 5 +- src/util/metal_device.h | 3 +- src/util/metal_device.mm | 153 +++++++++++++++++++++++++++++++++++-- src/util/shadergen.cpp | 25 ++++++ src/util/vulkan_device.cpp | 2 + 5 files changed, 179 insertions(+), 9 deletions(-) diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 1099b6d74..b6ae728ef 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -1062,7 +1062,8 @@ bool GPU_HW::CompilePipelines() return false; plconfig.fragment_shader = fs.get(); - plconfig.depth = GPUPipeline::DepthState::GetAlwaysWriteState(); + plconfig.depth = needs_depth_buffer ? GPUPipeline::DepthState::GetAlwaysWriteState() : + GPUPipeline::DepthState::GetNoTestsState(); if (!(m_vram_fill_pipelines[wrapped][interlaced] = g_gpu_device->CreatePipeline(plconfig))) return false; @@ -1137,7 +1138,7 @@ bool GPU_HW::CompilePipelines() return false; plconfig.fragment_shader = fs.get(); - plconfig.depth = GPUPipeline::DepthState::GetAlwaysWriteState(); + plconfig.depth = GPUPipeline::DepthState::GetNoTestsState(); if (!(m_vram_write_replacement_pipeline = g_gpu_device->CreatePipeline(plconfig))) return false; diff --git a/src/util/metal_device.h b/src/util/metal_device.h index be51cb798..83e6ec97e 100644 --- a/src/util/metal_device.h +++ b/src/util/metal_device.h @@ -376,8 +376,9 @@ private: id m_render_cmdbuf = nil; id m_render_encoder = nil; + u8 m_num_current_render_targets = 0; + GPUPipeline::RenderPassFlag m_current_feedback_loop = GPUPipeline::NoRenderPassFlags; std::array m_current_render_targets = {}; - u32 m_num_current_render_targets = 0; MetalTexture* m_current_depth_target = nullptr; MetalPipeline* m_current_pipeline = nullptr; diff --git a/src/util/metal_device.mm b/src/util/metal_device.mm index cc631fe9b..a6c605cdf 100644 --- a/src/util/metal_device.mm +++ 
b/src/util/metal_device.mm @@ -222,6 +222,13 @@ void MetalDevice::SetFeatures(FeatureMask disabled_features) m_max_texture_size = 8192; } + // Framebuffer fetch requires MSL 2.3 and an Apple GPU family. + const bool supports_fbfetch = [m_device supportsFamily:MTLGPUFamilyApple1]; + + // If fbfetch is disabled, barriers aren't supported on Apple GPUs. + const bool supports_barriers = + ([m_device supportsFamily:MTLGPUFamilyMac1] && ![m_device supportsFamily:MTLGPUFamilyApple3]); + m_max_multisamples = 0; for (u32 multisamples = 1; multisamples < 16; multisamples *= 2) { @@ -231,13 +238,13 @@ void MetalDevice::SetFeatures(FeatureMask disabled_features) } m_features.dual_source_blend = !(disabled_features & FEATURE_MASK_DUAL_SOURCE_BLEND); - m_features.framebuffer_fetch = !(disabled_features & FEATURE_MASK_FRAMEBUFFER_FETCH) && false; // TODO + m_features.framebuffer_fetch = !(disabled_features & FEATURE_MASK_FRAMEBUFFER_FETCH) && supports_fbfetch; m_features.per_sample_shading = true; m_features.noperspective_interpolation = true; m_features.texture_copy_to_self = !(disabled_features & FEATURE_MASK_TEXTURE_COPY_TO_SELF); m_features.supports_texture_buffers = !(disabled_features & FEATURE_MASK_TEXTURE_BUFFERS); m_features.texture_buffers_emulated_with_ssbo = true; - m_features.feedback_loops = false; + m_features.feedback_loops = (m_features.framebuffer_fetch || supports_barriers); m_features.geometry_shaders = false; m_features.partial_msaa_resolve = false; m_features.memory_import = true; @@ -687,6 +694,9 @@ std::unique_ptr MetalDevice::CreateShaderFromSource(GPUShaderStage st spirv_cross::CompilerMSL compiler(result.cbegin(), std::distance(result.cbegin(), result.cend())); spirv_cross::CompilerMSL::Options msl_options = compiler.get_msl_options(); msl_options.pad_fragment_output_components = true; + msl_options.use_framebuffer_fetch_subpasses = m_features.framebuffer_fetch; + if (m_features.framebuffer_fetch) + msl_options.set_msl_version(2, 3); if (stage == 
GPUShaderStage::Fragment) { @@ -702,6 +712,16 @@ std::unique_ptr MetalDevice::CreateShaderFromSource(GPUShaderStage st rb.msl_buffer = i; compiler.add_msl_resource_binding(rb); } + + if (!m_features.framebuffer_fetch) + { + spirv_cross::MSLResourceBinding rb; + rb.stage = spv::ExecutionModelFragment; + rb.desc_set = 2; + rb.binding = 0; + rb.msl_texture = MAX_TEXTURE_SAMPLERS; + compiler.add_msl_resource_binding(rb); + } } compiler.set_msl_options(msl_options); @@ -1764,8 +1784,9 @@ void MetalDevice::UnmapUniformBuffer(u32 size) void MetalDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, GPUPipeline::RenderPassFlag feedback_loop) { - DebugAssert(!feedback_loop); - bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds); + bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds || + (!m_features.framebuffer_fetch && ((feedback_loop & GPUPipeline::ColorFeedbackLoop) != + (m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop)))); bool needs_ds_clear = (ds && ds->IsClearedOrInvalidated()); bool needs_rt_clear = false; @@ -1779,7 +1800,8 @@ void MetalDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTextu } for (u32 i = num_rts; i < m_num_current_render_targets; i++) m_current_render_targets[i] = nullptr; - m_num_current_render_targets = num_rts; + m_num_current_render_targets = static_cast(num_rts); + m_current_feedback_loop = feedback_loop; if (changed || needs_rt_clear || needs_ds_clear) { @@ -2077,6 +2099,13 @@ void MetalDevice::SetInitialEncoderState() [m_render_encoder setFragmentSamplerStates:m_current_samplers.data() withRange:NSMakeRange(0, MAX_TEXTURE_SAMPLERS)]; if (m_current_ssbo) [m_render_encoder setFragmentBuffer:m_current_ssbo offset:0 atIndex:1]; + + if (!m_features.framebuffer_fetch && (m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop)) + { + DebugAssert(m_current_render_targets[0]); + [m_render_encoder 
setFragmentTexture:m_current_render_targets[0]->GetMTLTexture() atIndex:MAX_TEXTURE_SAMPLERS];
+  }
+
   SetViewportInRenderEncoder();
   SetScissorInRenderEncoder();
 }
@@ -2138,7 +2167,118 @@ void MetalDevice::DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex)
 
 void MetalDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type)
 {
-  Panic("Barriers are not supported");
+  // Barrier-synchronised draw for render-target feedback loops; not for use with framebuffer fetch.
+  DebugAssert(!m_features.framebuffer_fetch);
+
+  // Sample the pass state before PreDrawCheck() (which presumably begins a pass when none is active):
+  // a barrier ahead of the very first draw of a fresh render pass is unnecessary.
+  const bool skip_first_barrier = !InRenderPass();
+  PreDrawCheck();
+
+  u32 index_offset = base_index * sizeof(u16); // byte offset into the u16 index buffer
+
+  switch (type)
+  {
+    case GPUDevice::DrawBarrier::None:
+    {
+      s_stats.num_draws++;
+
+      [m_render_encoder drawIndexedPrimitives:m_current_pipeline->GetPrimitive()
+                                   indexCount:index_count
+                                    indexType:MTLIndexTypeUInt16
+                                  indexBuffer:m_index_buffer.GetBuffer()
+                            indexBufferOffset:index_offset
+                                instanceCount:1
+                                   baseVertex:base_vertex
+                                 baseInstance:0];
+    }
+    break;
+
+    case GPUDevice::DrawBarrier::One:
+    {
+      DebugAssert(m_num_current_render_targets == 1);
+      s_stats.num_draws++;
+
+      if (!skip_first_barrier)
+      {
+        s_stats.num_barriers++;
+        [m_render_encoder memoryBarrierWithScope:MTLBarrierScopeRenderTargets
+                                     afterStages:MTLRenderStageFragment
+                                    beforeStages:MTLRenderStageFragment];
+      }
+
+      [m_render_encoder drawIndexedPrimitives:m_current_pipeline->GetPrimitive()
+                                   indexCount:index_count
+                                    indexType:MTLIndexTypeUInt16
+                                  indexBuffer:m_index_buffer.GetBuffer()
+                            indexBufferOffset:index_offset
+                                instanceCount:1
+                                   baseVertex:base_vertex
+                                 baseInstance:0];
+    }
+    break;
+
+    case GPUDevice::DrawBarrier::Full:
+    {
+      DebugAssert(m_num_current_render_targets == 1);
+      // Rows are indexed by MTLPrimitiveType: {indices in one primitive, indices between primitive starts}.
+      static constexpr const u8 vertices_per_primitive[][2] = {
+        {1, 1}, // MTLPrimitiveTypePoint
+        {2, 2}, // MTLPrimitiveTypeLine
+        {2, 1}, // MTLPrimitiveTypeLineStrip
+        {3, 3}, // MTLPrimitiveTypeTriangle
+        {3, 1}, // MTLPrimitiveTypeTriangleStrip
+      };
+      // NOTE(review): strips are drawn one primitive at a time, so strip winding fix-up is lost; verify culling.
+      const auto& steps = vertices_per_primitive[static_cast<size_t>(m_current_pipeline->GetPrimitive())];
+      const u32 prim_indices = steps[0];                 // index count submitted per draw
+      const u32 prim_bytes = prim_indices * sizeof(u16); // bytes one primitive occupies in the index buffer
+      const u32 step_bytes = steps[1] * sizeof(u16);     // byte stride from one primitive's start to the next
+      const u32 end_offset = (base_index + index_count) * sizeof(u16);
+
+      // first primitive only (clamped so a short batch never reads beyond its own indices)
+      if (!skip_first_barrier)
+      {
+        s_stats.num_barriers++;
+        [m_render_encoder memoryBarrierWithScope:MTLBarrierScopeRenderTargets
+                                     afterStages:MTLRenderStageFragment
+                                    beforeStages:MTLRenderStageFragment];
+      }
+      s_stats.num_draws++;
+      [m_render_encoder drawIndexedPrimitives:m_current_pipeline->GetPrimitive()
+                                   indexCount:((index_count < prim_indices) ? index_count : prim_indices)
+                                    indexType:MTLIndexTypeUInt16
+                                  indexBuffer:m_index_buffer.GetBuffer()
+                            indexBufferOffset:index_offset
+                                instanceCount:1
+                                   baseVertex:base_vertex
+                                 baseInstance:0];
+
+      index_offset += step_bytes;
+
+      // remaining primitives: one barrier + one single-primitive draw each, while a whole primitive still fits
+      for (; (index_offset + prim_bytes) <= end_offset; index_offset += step_bytes)
+      {
+        s_stats.num_barriers++;
+        s_stats.num_draws++;
+
+        [m_render_encoder memoryBarrierWithScope:MTLBarrierScopeRenderTargets
+                                     afterStages:MTLRenderStageFragment
+                                    beforeStages:MTLRenderStageFragment];
+        [m_render_encoder drawIndexedPrimitives:m_current_pipeline->GetPrimitive()
+                                     indexCount:prim_indices
+                                      indexType:MTLIndexTypeUInt16
+                                    indexBuffer:m_index_buffer.GetBuffer()
+                              indexBufferOffset:index_offset
+                                  instanceCount:1
+                                     baseVertex:base_vertex
+                                   baseInstance:0];
+      }
+    }
+    break;
+
+    DefaultCaseIsUnreachable();
+  }
 }
 
 id MetalDevice::GetBlitEncoder(bool is_inline)
@@ -2199,6 +2339,7 @@ bool MetalDevice::BeginPresent(bool skip_present)
     s_stats.num_render_passes++;
     std::memset(m_current_render_targets.data(), 0, sizeof(m_current_render_targets));
     m_num_current_render_targets = 0;
+    m_current_feedback_loop = GPUPipeline::NoRenderPassFlags;
     m_current_depth_target = nullptr;
     m_current_pipeline = nullptr;
     m_current_depth_state = nil;
diff --git a/src/util/shadergen.cpp 
b/src/util/shadergen.cpp index 3c4c76ca4..b36141201 100644 --- a/src/util/shadergen.cpp +++ b/src/util/shadergen.cpp @@ -123,6 +123,15 @@ void ShaderGen::WriteHeader(std::stringstream& ss) else if (m_spirv) ss << "#version 450 core\n\n"; +#ifdef __APPLE__ + // TODO: Do this for Vulkan as well. + if (m_render_api == RenderAPI::Metal) + { + if (!m_supports_framebuffer_fetch) + ss << "#extension GL_EXT_samplerless_texture_functions : require\n"; + } +#endif + #ifdef ENABLE_OPENGL // Extension enabling for OpenGL. if (m_render_api == RenderAPI::OpenGL || m_render_api == RenderAPI::OpenGLES) @@ -587,6 +596,22 @@ void ShaderGen::DeclareFragmentEntryPoint( ss << "layout(input_attachment_index = 0, set = 2, binding = 0) uniform subpassInput u_input_rt;\n"; ss << "#define LAST_FRAG_COLOR subpassLoad(u_input_rt)\n"; } +#endif +#ifdef __APPLE__ + if (m_render_api == RenderAPI::Metal) + { + if (m_supports_framebuffer_fetch) + { + // Set doesn't matter, because it's transformed to color0. + ss << "layout(input_attachment_index = 0, set = 2, binding = 0) uniform subpassInput u_input_rt;\n"; + ss << "#define LAST_FRAG_COLOR subpassLoad(u_input_rt)\n"; + } + else + { + ss << "layout(set = 2, binding = 0) uniform texture2D u_input_rt;\n"; + ss << "#define LAST_FRAG_COLOR texelFetch(u_input_rt, int2(gl_FragCoord.xy), 0)\n"; + } + } #endif } diff --git a/src/util/vulkan_device.cpp b/src/util/vulkan_device.cpp index eca673fd2..8db1197ca 100644 --- a/src/util/vulkan_device.cpp +++ b/src/util/vulkan_device.cpp @@ -3886,5 +3886,7 @@ void VulkanDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 b } } break; + + DefaultCaseIsUnreachable(); } }