GPU/HW: Use R16U format for downloads

More parallelism.
wip
Stenzek 4 days ago
parent 9977e1dca0
commit 0261a78cbd
No known key found for this signature in database

@@ -47,6 +47,7 @@ static constexpr GPUTexture::Format VRAM_RT_FORMAT = GPUTexture::Format::RGBA8;
static constexpr GPUTexture::Format VRAM_DS_FORMAT = GPUTexture::Format::D16;
static constexpr GPUTexture::Format VRAM_DS_DEPTH_FORMAT = GPUTexture::Format::D32F;
static constexpr GPUTexture::Format VRAM_DS_COLOR_FORMAT = GPUTexture::Format::R32F;
static constexpr GPUTexture::Format VRAM_TRANSFER_FORMAT = GPUTexture::Format::R16U;
#if defined(_DEBUG) || defined(_DEVEL)
@@ -927,8 +928,8 @@ bool GPU_HW::CreateBuffers(Error* error)
g_gpu_device->FetchTexture(texture_width, texture_height, 1, 1, 1, GPUTexture::Type::Texture, VRAM_RT_FORMAT,
read_texture_flags, nullptr, 0, error)) ||
!(m_vram_readback_texture =
g_gpu_device->FetchTexture(VRAM_WIDTH / 2, VRAM_HEIGHT, 1, 1, 1, GPUTexture::Type::RenderTarget,
VRAM_RT_FORMAT, GPUTexture::Flags::None, nullptr, 0, error)))
g_gpu_device->FetchTexture(VRAM_WIDTH, VRAM_HEIGHT, 1, 1, 1, GPUTexture::Type::RenderTarget,
VRAM_TRANSFER_FORMAT, GPUTexture::Flags::None, nullptr, 0, error)))
{
Error::AddPrefix(error, "Failed to create VRAM textures: ");
return false;
@@ -1773,7 +1774,7 @@ bool GPU_HW::CompileResolutionDependentPipelines(Error* error)
plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
plconfig.vertex_shader = m_fullscreen_quad_vertex_shader.get();
plconfig.SetTargetFormats(VRAM_RT_FORMAT);
plconfig.SetTargetFormats(VRAM_TRANSFER_FORMAT);
// VRAM read
{
@@ -1792,6 +1793,7 @@ bool GPU_HW::CompileResolutionDependentPipelines(Error* error)
}
// Display
plconfig.SetTargetFormats(VRAM_RT_FORMAT);
{
for (u8 shader = 0; shader < 3; shader++)
{
@@ -3386,26 +3388,16 @@ void GPU_HW::DownloadVRAMFromGPU(u32 x, u32 y, u32 width, u32 height)
// TODO: Only read if it's in the drawn area
// Get bounds with wrap-around handled.
GSVector4i copy_rect = GetVRAMTransferBounds(x, y, width, height);
// Has to be aligned to an even pixel for the download, due to 32-bit packing.
if (copy_rect.left & 1)
copy_rect.left--;
if (copy_rect.right & 1)
copy_rect.right++;
DebugAssert((copy_rect.left % 2) == 0 && (copy_rect.width() % 2) == 0);
const u32 encoded_left = copy_rect.left / 2;
const u32 encoded_top = copy_rect.top;
const u32 encoded_width = copy_rect.width() / 2;
const u32 encoded_height = copy_rect.height();
const GSVector4i copy_rect = GetVRAMTransferBounds(x, y, width, height);
const u32 copy_width = static_cast<u32>(copy_rect.width());
const u32 copy_height = static_cast<u32>(copy_rect.height());
// Encode the 24-bit texture as 16-bit.
const s32 uniforms[4] = {copy_rect.left, copy_rect.top, copy_rect.width(), copy_rect.height()};
g_gpu_device->SetRenderTarget(m_vram_readback_texture.get());
g_gpu_device->SetPipeline(m_vram_readback_pipeline.get());
g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
g_gpu_device->SetViewportAndScissor(0, 0, encoded_width, encoded_height);
g_gpu_device->SetViewportAndScissor(0, 0, copy_width, copy_height);
g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
g_gpu_device->Draw(3, 0);
@@ -3413,18 +3405,17 @@ void GPU_HW::DownloadVRAMFromGPU(u32 x, u32 y, u32 width, u32 height)
if (m_vram_readback_download_texture->IsImported())
{
// Fast path, read directly.
m_vram_readback_download_texture->CopyFromTexture(encoded_left, encoded_top, m_vram_readback_texture.get(), 0, 0,
encoded_width, encoded_height, 0, 0, false);
m_vram_readback_download_texture->CopyFromTexture(copy_rect.x, copy_rect.y, m_vram_readback_texture.get(), 0, 0,
copy_width, copy_height, 0, 0, false);
m_vram_readback_download_texture->Flush();
}
else
{
// Copy to staging buffer, then to VRAM.
m_vram_readback_download_texture->CopyFromTexture(0, 0, m_vram_readback_texture.get(), 0, 0, encoded_width,
encoded_height, 0, 0, true);
m_vram_readback_download_texture->ReadTexels(0, 0, encoded_width, encoded_height,
&g_vram[copy_rect.top * VRAM_WIDTH + copy_rect.left],
VRAM_WIDTH * sizeof(u16));
m_vram_readback_download_texture->CopyFromTexture(0, 0, m_vram_readback_texture.get(), 0, 0, copy_width,
copy_height, 0, 0, true);
m_vram_readback_download_texture->ReadTexels(
0, 0, copy_width, copy_height, &g_vram[copy_rect.top * VRAM_WIDTH + copy_rect.left], VRAM_WIDTH * sizeof(u16));
}
RestoreDeviceContext();
@@ -3477,7 +3468,7 @@ void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* da
{
map_index = 0;
upload_texture =
g_gpu_device->FetchAutoRecycleTexture(width, height, 1, 1, 1, GPUTexture::Type::Texture, GPUTexture::Format::R16U,
g_gpu_device->FetchAutoRecycleTexture(width, height, 1, 1, 1, GPUTexture::Type::Texture, VRAM_TRANSFER_FORMAT,
GPUTexture::Flags::None, data, data_pitch);
if (!upload_texture)
{

@@ -1395,13 +1395,13 @@ float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
if (uv_limits)
{
DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "float4 v_uv_limits"}}, true, num_fragment_outputs,
use_dual_source, write_mask_as_depth, msaa, per_sample_shading, false,
false, use_dual_source, write_mask_as_depth, msaa, per_sample_shading, false,
disable_color_perspective, shader_blending && !use_rov, use_rov);
}
else
{
DeclareFragmentEntryPoint(ss, 1, 1, {}, true, num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa,
per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
DeclareFragmentEntryPoint(ss, 1, 1, {}, true, num_fragment_outputs, false, use_dual_source, write_mask_as_depth,
msaa, per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
use_rov);
}
}
@@ -1415,21 +1415,22 @@ float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
DeclareFragmentEntryPoint(ss, 1, 1,
{{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
{"nointerpolation", "float4 v_uv_limits"}},
true, num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa,
true, num_fragment_outputs, false, use_dual_source, write_mask_as_depth, msaa,
per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
use_rov);
}
else
{
DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, true,
num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa, per_sample_shading,
false, disable_color_perspective, shader_blending && !use_rov, use_rov);
num_fragment_outputs, false, use_dual_source, write_mask_as_depth, msaa,
per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
use_rov);
}
}
else
{
DeclareFragmentEntryPoint(ss, 1, 0, {}, true, num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa,
per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
DeclareFragmentEntryPoint(ss, 1, 0, {}, true, num_fragment_outputs, false, use_dual_source, write_mask_as_depth,
msaa, per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
use_rov);
}
@@ -1933,19 +1934,12 @@ uint SampleVRAM(uint2 coords)
}
)";
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true);
ss << R"(
{
uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y));
sample_coords += u_base_coords;
// We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel.
uint left = SampleVRAM(sample_coords);
uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y));
o_col0 = float4(float(left & 0xFFu), float((left >> 8) & 0xFFu),
float(right & 0xFFu), float((right >> 8) & 0xFFu))
/ float4(255.0, 255.0, 255.0, 255.0);
uint2 sample_coords = uint2(v_pos.xy) + u_base_coords;
uint value = SampleVRAM(sample_coords);
o_col0 = uint4(value, 0u, 0u, 0u);
})";
return std::move(ss).str();
@@ -1997,7 +1991,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, b
ss << "#define GET_VALUE(buffer_offset) (LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r)\n\n";
}
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1 + BoolToUInt32(write_depth_as_rt), false, write_mask_as_depth);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1 + BoolToUInt32(write_depth_as_rt), false, false, write_mask_as_depth);
ss << R"(
{
float2 coords = floor(v_pos.xy / u_resolution_scale);
@@ -2051,8 +2045,8 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader(bool write_mask_as_
true);
DeclareTexture(ss, "samp0", 0, msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1 + BoolToUInt32(write_depth_as_rt), false, write_mask_as_depth, false,
false, msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1 + BoolToUInt32(write_depth_as_rt), false, false, write_mask_as_depth,
false, false, msaa);
ss << R"(
{
float2 dst_coords = floor(v_pos.xy);
@@ -2105,7 +2099,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool
DeclareUniformBuffer(
ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);
DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1 + BoolToUInt32(write_depth_as_rt), false,
DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1 + BoolToUInt32(write_depth_as_rt), false, false,
write_mask_as_depth, false, false, false);
ss << R"(
{
@@ -2144,7 +2138,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader(bool msaa) c
WriteHeader(ss);
DefineMacro(ss, "MULTISAMPLING", msaa);
DeclareTexture(ss, "samp0", 0, msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, false, true, false, false, msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, false, false, true, false, false, msaa);
ss << R"(
{
@@ -2165,7 +2159,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyDepthFragmentShader(bool msaa) con
WriteHeader(ss);
DefineMacro(ss, "MULTISAMPLED", msaa);
DeclareTexture(ss, "samp0", 0, msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, msaa, 1, false, false, msaa, msaa, msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, msaa, 1, false, false, false, msaa, msaa, msaa);
ss << R"(
{
@@ -2185,7 +2179,8 @@ std::string GPU_HW_ShaderGen::GenerateVRAMClearDepthFragmentShader(bool write_de
std::stringstream ss;
WriteHeader(ss);
DefineMacro(ss, "WRITE_DEPTH_AS_RT", write_depth_as_rt);
DeclareFragmentEntryPoint(ss, 0, 1, {}, false, BoolToUInt32(write_depth_as_rt), false, false, false, false, false);
DeclareFragmentEntryPoint(ss, 0, 1, {}, false, BoolToUInt32(write_depth_as_rt), false, false, false, false, false,
false);
ss << R"(
{

@@ -625,10 +625,10 @@ void ShaderGen::DeclareVertexEntryPoint(
void ShaderGen::DeclareFragmentEntryPoint(
std::stringstream& ss, u32 num_color_inputs, u32 num_texcoord_inputs,
const std::initializer_list<std::pair<const char*, const char*>>& additional_inputs /* = */,
bool declare_fragcoord /* = false */, u32 num_color_outputs /* = 1 */, bool dual_source_output /* = false */,
bool depth_output /* = false */, bool msaa /* = false */, bool ssaa /* = false */,
bool declare_sample_id /* = false */, bool noperspective_color /* = false */, bool feedback_loop /* = false */,
bool rov /* = false */) const
bool declare_fragcoord /* = false */, u32 num_color_outputs /* = 1 */, bool integer_color_output /* = false */,
bool dual_source_output /* = false */, bool depth_output /* = false */, bool msaa /* = false */,
bool ssaa /* = false */, bool declare_sample_id /* = false */, bool noperspective_color /* = false */,
bool feedback_loop /* = false */, bool rov /* = false */) const
{
if (m_glsl)
{
@@ -761,23 +761,26 @@ void ShaderGen::DeclareFragmentEntryPoint(
{
for (u32 i = 0; i < num_color_outputs; i++)
{
ss << "layout(location = 0, index = " << i << ") " << ((i == 0) ? target_0_qualifier : "out")
<< " float4 o_col" << i << ";\n";
ss << "layout(location = 0, index = " << i << ") " << ((i == 0) ? target_0_qualifier : "out") << " "
<< (integer_color_output ? "uint4" : "float4") << " o_col" << i << ";\n";
}
}
else
{
for (u32 i = 0; i < num_color_outputs; i++)
{
ss << "layout(location = " << i << ") " << ((i == 0) ? target_0_qualifier : "out") << " float4 o_col" << i
<< ";\n";
ss << "layout(location = " << i << ") " << ((i == 0) ? target_0_qualifier : "out") << " "
<< (integer_color_output ? "uint4" : "float4") << " o_col" << i << ";\n";
}
}
}
else
{
for (u32 i = 0; i < num_color_outputs; i++)
ss << ((i == 0) ? target_0_qualifier : "out") << " float4 o_col" << i << ";\n";
{
ss << ((i == 0) ? target_0_qualifier : "out") << " " << (integer_color_output ? "uint4" : "float4") << " o_col"
<< i << ";\n";
}
}
ss << "\n";
@@ -839,7 +842,8 @@ void ShaderGen::DeclareFragmentEntryPoint(
}
for (u32 i = 0; i < num_color_outputs; i++)
{
ss << (first ? "" : ",\n") << " out float4 o_col" << i << " : SV_Target" << i;
ss << (first ? "" : ",\n") << " out " << (integer_color_output ? "uint4" : "float4") << " o_col" << i
<< " : SV_Target" << i;
first = false;
}

@@ -61,7 +61,8 @@ public:
void
DeclareFragmentEntryPoint(std::stringstream& ss, u32 num_color_inputs, u32 num_texcoord_inputs,
const std::initializer_list<std::pair<const char*, const char*>>& additional_inputs = {},
bool declare_fragcoord = false, u32 num_color_outputs = 1, bool dual_source_output = false,
bool declare_fragcoord = false, u32 num_color_outputs = 1,
bool integer_color_output = false, bool dual_source_output = false,
bool depth_output = false, bool msaa = false, bool ssaa = false,
bool declare_sample_id = false, bool noperspective_color = false,
bool feedback_loop = false, bool rov = false) const;

Loading…
Cancel
Save