GPU: More accurate texture blending in non-true-colour-mode

Fixes Silent Hill loading animation, Advanced V.G. intro fade-out, and probably others.
5 years ago · bf3c83658a
parent c114873ed8
commit bf3c83658a
4 changed files with 85 additions and 40 deletions
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@ -95,7 +95,8 @@ public:
    MAX_PRIMITIVE_HEIGHT = 512,
    DOT_TIMER_INDEX = 0,
    HBLANK_TIMER_INDEX = 1,
-    MAX_RESOLUTION_SCALE = 16
+    MAX_RESOLUTION_SCALE = 16,
+    DITHER_MATRIX_SIZE = 4
  };

  enum : u16
@ -107,10 +108,10 @@ public:
  };

  // 4x4 dither matrix.
-  static constexpr s32 DITHER_MATRIX[4][4] = {{-4, +0, -3, +1},  // row 0
-                                              {+2, -2, +3, -1},  // row 1
-                                              {-3, +1, -4, +0},  // row 2
-                                              {+4, -1, +2, -2}}; // row 3
+  static constexpr s32 DITHER_MATRIX[DITHER_MATRIX_SIZE][DITHER_MATRIX_SIZE] = {{-4, +0, -3, +1},  // row 0
+                                                                                {+2, -2, +3, -1},  // row 1
+                                                                                {-3, +1, -4, +0},  // row 2
+                                                                                {+4, -1, +2, -2}}; // row 3

  // Base class constructor.
  GPU();
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@ -581,17 +581,20 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMod
    ss << "};\n";

  ss << R"(
-int3 ApplyDithering(uint2 coord, int3 icol)
+uint3 ApplyDithering(uint2 coord, uint3 icol)
 {
-  uint2 fc = coord & uint2(3u, 3u);
+  #if DITHERING_SCALED
+    uint2 fc = coord & uint2(3u, 3u);
+  #else
+    uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u);
+  #endif
  int offset = s_dither_values[fc.y * 4u + fc.x];
-  return icol + int3(offset, offset, offset);
-}

-int3 TruncateTo15Bit(int3 icol)
-{
-  icol = clamp(icol, int3(0, 0, 0), int3(255, 255, 255));
-  return (icol & int3(~7, ~7, ~7)) | ((icol >> 3) & int3(7, 7, 7));
+  #if !TRUE_COLOR
+    return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31));
+  #else
+    return uint3(clamp(int3(icol) + int3(offset, offset, offset), 0, 255));
+  #endif
 }

 #if TEXTURED
@ -654,10 +657,10 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)

  ss << R"(
 {
-  int3 vertcol = int3(v_col0.rgb * float3(255.0, 255.0, 255.0));
+  uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0));

  bool semitransparent;
-  int3 icolor;
+  uint3 icolor;
  float ialpha;
  float oalpha;

@ -707,10 +710,27 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)
      ialpha = 1.0;
    #endif

-    #if RAW_TEXTURE
-      icolor = int3(texcol.rgb * float3(255.0, 255.0, 255.0));
+    // If not using true color, truncate the framebuffer colors to 5-bit.
+    #if !TRUE_COLOR
+      icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
+      #if !RAW_TEXTURE
+        icolor = (icolor * vertcol) >> 4;
+        #if DITHERING
+          icolor = ApplyDithering(uint2(v_pos.xy), icolor);
+        #else
+          icolor = min(icolor >> 3, uint3(31u, 31u, 31u));
+        #endif
+      #endif
    #else
-      icolor = (vertcol * int3(texcol.rgb * float3(255.0, 255.0, 255.0))) >> 7;
+      icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0));
+      #if !RAW_TEXTURE
+        icolor = (icolor * vertcol) >> 7;
+        #if DITHERING
+          icolor = ApplyDithering(uint2(v_pos.xy), icolor);
+        #else
+          icolor = min(icolor, uint3(255u, 255u, 255u));
+        #endif
+      #endif
    #endif

    // Compute output alpha (mask bit)
@ -721,17 +741,16 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)
    icolor = vertcol;
    ialpha = 1.0;

-    // However, the mask bit is cleared if set mask bit is false.
-    oalpha = float(u_set_mask_while_drawing);
-  #endif
-
-  // Apply dithering
-  #if DITHERING
-    #if DITHERING_SCALED
+    #if DITHERING
      icolor = ApplyDithering(uint2(v_pos.xy), icolor);
    #else
-      icolor = ApplyDithering(uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE), icolor);
+      #if !TRUE_COLOR
+        icolor >>= 3;
+      #endif
    #endif
+
+    // However, the mask bit is cleared if set mask bit is false.
+    oalpha = float(u_set_mask_while_drawing);
  #endif

  // Premultiply alpha so we don't need to use a colour output for it.
@ -744,11 +763,10 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)
  #if !TRUE_COLOR
    // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color
    // into the blend unit, which can cause a small amount of error to accumulate.
-    icolor = int3(((float3(icolor) / float3(255.0, 255.0, 255.0)) * premultiply_alpha) * float3(255.0, 255.0, 255.0));
-    color = (float3(icolor >> 3) / float3(31.0, 31.0, 31.0));
+    color = floor(float3(icolor) * premultiply_alpha) / float3(31.0, 31.0, 31.0);
  #else
    // True color is actually simpler here since we want to preserve the precision.
-    color = (float3(icolor) / float3(255.0, 255.0, 255.0)) * premultiply_alpha;
+    color = (float3(icolor) * premultiply_alpha) / float3(255.0, 255.0, 255.0);
  #endif

  #if TRANSPARENCY
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@ -551,6 +551,25 @@ void GPU_SW::DrawRectangle(s32 origin_x, s32 origin_y, u32 width, u32 height, u8
  }
 }

+constexpr GPU_SW::DitherLUT GPU_SW::ComputeDitherLUT() // builds the [row][column][value] dither lookup table
+{
+  DitherLUT lut = {};
+  for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++) // i: dither matrix row
+  {
+    for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++) // j: dither matrix column
+    {
+      for (s32 value = 0; value < DITHER_LUT_SIZE; value++) // every possible table index
+      {
+        const s32 dithered_value = (value + DITHER_MATRIX[i][j]) >> 3; // add the dither offset, then drop the low 3 bits
+        lut[i][j][value] = static_cast<u8>((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 31 : dithered_value)); // clamp to 0..31
+      }
+    }
+  }
+  return lut;
+}
+
+static constexpr GPU_SW::DitherLUT s_dither_lut = GPU_SW::ComputeDitherLUT();
+
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
 void GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y)
 {
@ -612,23 +631,25 @@ void GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 tex
    }
    else
    {
-      const u8 r = Truncate8(std::min<u16>((ZeroExtend16(texture_color.GetR8()) * ZeroExtend16(color_r)) >> 7, 0xFF));
-      const u8 g = Truncate8(std::min<u16>((ZeroExtend16(texture_color.GetG8()) * ZeroExtend16(color_g)) >> 7, 0xFF));
-      const u8 b = Truncate8(std::min<u16>((ZeroExtend16(texture_color.GetB8()) * ZeroExtend16(color_b)) >> 7, 0xFF));
-      if constexpr (dithering_enable)
-        color.SetRGB24Dithered(x, y, r, g, b, texture_color.c);
-      else
-        color.SetRGB24(r, g, b, texture_color.c);
+      const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
+      const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
+
+      color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) |
+                   (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) |
+                   (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) |
+                   (texture_color.bits & 0x8000u);
    }
  }
  else
  {
    transparent = true;

-    if constexpr (dithering_enable)
-      color.SetRGB24Dithered(x, y, color_r, color_g, color_b);
-    else
-      color.SetRGB24(color_r, color_g, color_b);
+    const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
+    const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
+
+    color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) |
+                 (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) |
+                 (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10);
  }

  const VRAMPixel bg_color{GetPixel(static_cast<u32>(x), static_cast<u32>(y))};
--- a/src/core/gpu_sw.h
+++ b/src/core/gpu_sw.h
@ -23,6 +23,11 @@ public:
  u16* GetPixelPtr(u32 x, u32 y) { return &m_vram[VRAM_WIDTH * y + x]; }
  void SetPixel(u32 x, u32 y, u16 value) { m_vram[VRAM_WIDTH * y + x] = value; }

+  // the largest index is ((31 * 255) >> 4) == 494 (so 495 entries are needed), but to simplify addressing we use the next power of two (512)
+  static constexpr u32 DITHER_LUT_SIZE = 512;
+  using DitherLUT = std::array<std::array<std::array<u8, 512>, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>;
+  static constexpr DitherLUT ComputeDitherLUT();
+
 protected:
  struct SWVertex
  {