Image: Vectorize RGBA8 conversion and add A1BGR5

2 weeks ago · b68d7c8adb
parent 9b2cf0abbd
commit b68d7c8adb
3 changed files with 156 additions and 10 deletions
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@ -44,6 +44,17 @@
 #include <malloc.h> // alloca
 #endif

+/// Helper to disable loop vectorization.
+#if defined(__clang__)
+#define DONT_VECTORIZE_THIS_LOOP _Pragma("clang loop vectorize(disable)")
+#elif defined(_MSC_VER)
+#define DONT_VECTORIZE_THIS_LOOP __pragma(loop(no_vector))
+#elif defined(__GNUC__)
+#define DONT_VECTORIZE_THIS_LOOP _Pragma("GCC novector")
+#else
+#define DONT_VECTORIZE_THIS_LOOP
+#endif
+
 /// Only currently using 128-bit vectors at max.
 static constexpr u32 VECTOR_ALIGNMENT = 16;

--- a/src/util/image.cpp
+++ b/src/util/image.cpp
@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0

 #include "image.h"
@ -583,12 +583,46 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
    case ImageFormat::RGB565:
    {
      ret = Image(m_width, m_height, ImageFormat::RGBA8);
+
+      constexpr u32 pixels_per_vec = 8;
+      [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
+
      for (u32 y = 0; y < m_height; y++)
      {
        const u8* pixels_in = GetRowPixels(y);
        u8* pixels_out = ret->GetRowPixels(y);
+        u32 x = 0;

-        for (u32 x = 0; x < m_width; x++)
+#ifdef CPU_ARCH_SIMD
+        for (; x < aligned_width; x += pixels_per_vec)
+        {
+          GSVector4i rgb565 = GSVector4i::load<false>(pixels_in);
+          pixels_in += sizeof(u16) * pixels_per_vec;
+
+          GSVector4i r = rgb565.srl16<11>();
+          r = r.sll16<3>() | r.sll16<13>().srl16<13>();
+
+          GSVector4i g = rgb565.sll16<5>().srl16<10>();
+          g = g.sll16<2>() | g.sll16<14>().srl16<14>();
+
+          GSVector4i b = rgb565.sll16<11>().srl16<11>();
+          b = b.sll16<3>() | b.sll16<13>().srl16<13>();
+
+          const GSVector4i low =
+            r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
+          const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
+                                  b.uph64().u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
+
+          GSVector4i::store<false>(pixels_out, low);
+          pixels_out += sizeof(GSVector4i);
+
+          GSVector4i::store<false>(pixels_out, high);
+          pixels_out += sizeof(GSVector4i);
+        }
+#endif
+
+        DONT_VECTORIZE_THIS_LOOP
+        for (; x < m_width; x++)
        {
          // RGB565 -> RGBA8
          u16 pixel_in;
@ -609,12 +643,48 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
    case ImageFormat::RGB5A1:
    {
      ret = Image(m_width, m_height, ImageFormat::RGBA8);
+
+      constexpr u32 pixels_per_vec = 8;
+      [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
+
      for (u32 y = 0; y < m_height; y++)
      {
        const u8* pixels_in = GetRowPixels(y);
        u8* pixels_out = ret->GetRowPixels(y);
+        u32 x = 0;

-        for (u32 x = 0; x < m_width; x++)
+#ifdef CPU_ARCH_SIMD
+        for (; x < aligned_width; x += pixels_per_vec)
+        {
+          GSVector4i rgb5a1 = GSVector4i::load<false>(pixels_in);
+          pixels_in += sizeof(u16) * pixels_per_vec;
+
+          GSVector4i r = rgb5a1.sll16<1>().srl16<11>();
+          r = r.sll16<3>() | r.sll16<13>().srl16<13>();
+
+          GSVector4i g = rgb5a1.sll16<6>().srl16<11>();
+          g = g.sll16<3>() | g.sll16<13>().srl16<13>();
+
+          GSVector4i b = rgb5a1.sll16<11>().srl16<11>();
+          b = b.sll16<3>() | b.sll16<13>().srl16<13>();
+
+          GSVector4i a = rgb5a1.sra16<7>().srl16<8>();
+
+          const GSVector4i low =
+            r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
+          const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
+                                  b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
+
+          GSVector4i::store<false>(pixels_out, low);
+          pixels_out += sizeof(GSVector4i);
+
+          GSVector4i::store<false>(pixels_out, high);
+          pixels_out += sizeof(GSVector4i);
+        }
+#endif
+
+        DONT_VECTORIZE_THIS_LOOP
+        for (; x < m_width; x++)
        {
          // RGB5A1 -> RGBA8
          u16 pixel_in;
@ -633,6 +703,69 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
    }
    break;

+    case ImageFormat::A1BGR5:
+    {
+      ret = Image(m_width, m_height, ImageFormat::RGBA8);
+
+      constexpr u32 pixels_per_vec = 8;
+      [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
+
+      for (u32 y = 0; y < m_height; y++)
+      {
+        const u8* pixels_in = GetRowPixels(y);
+        u8* pixels_out = ret->GetRowPixels(y);
+        u32 x = 0;
+
+#ifdef CPU_ARCH_SIMD
+        for (; x < aligned_width; x += pixels_per_vec)
+        {
+          GSVector4i a1bgr5 = GSVector4i::load<false>(pixels_in);
+          pixels_in += sizeof(u16) * pixels_per_vec;
+
+          GSVector4i r = a1bgr5.srl16<11>();
+          r = r.sll16<3>() | r.sll16<13>().srl16<13>();
+
+          GSVector4i g = a1bgr5.sll16<5>().srl16<11>();
+          g = g.sll16<3>() | g.sll16<13>().srl16<13>();
+
+          GSVector4i b = a1bgr5.sll16<10>().srl16<11>();
+          b = b.sll16<3>() | b.sll16<13>().srl16<13>();
+
+          GSVector4i a = a1bgr5.sll16<15>().sra16<7>().srl16<8>();
+
+          const GSVector4i low =
+            r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
+          const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
+                                  b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
+
+          GSVector4i::store<false>(pixels_out, low);
+          pixels_out += sizeof(GSVector4i);
+
+          GSVector4i::store<false>(pixels_out, high);
+          pixels_out += sizeof(GSVector4i);
+        }
+#endif
+
+        DONT_VECTORIZE_THIS_LOOP
+        for (; x < m_width; x++)
+        {
+          // RGB5A1 -> RGBA8
+          u16 pixel_in;
+          std::memcpy(&pixel_in, pixels_in, sizeof(u16));
+          pixels_in += sizeof(u16);
+          const u8 a1 = Truncate8(pixel_in & 0x01);
+          const u8 r5 = Truncate8((pixel_in >> 11) & 0x1F);
+          const u8 g6 = Truncate8((pixel_in >> 6) & 0x1F);
+          const u8 b5 = Truncate8((pixel_in >> 1) & 0x1F);
+          const u32 rgba8 = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 3) | (g6 & 7)) << 8) |
+                            (ZeroExtend32((b5 << 3) | (b5 & 7)) << 16) | (a1 ? 0xFF000000u : 0u);
+          std::memcpy(pixels_out, &rgba8, sizeof(u32));
+          pixels_out += sizeof(u32);
+        }
+      }
+    }
+    break;
+
    case ImageFormat::BGR8:
    {
      ret = Image(m_width, m_height, ImageFormat::RGBA8);
--- a/src/util/image.h
+++ b/src/util/image.h
@ -1,10 +1,11 @@
-// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0

 #pragma once

 #include "common/align.h"
 #include "common/heap_array.h"
+#include "common/intrin.h"
 #include "common/types.h"

 #include <cstdio>
@ -60,12 +61,13 @@ public:
  ALWAYS_INLINE u32 GetHeight() const { return m_height; }
  ALWAYS_INLINE u32 GetPitch() const { return m_pitch; }
  ALWAYS_INLINE ImageFormat GetFormat() const { return m_format; }
-  ALWAYS_INLINE const u8* GetPixels() const { return m_pixels.get(); }
-  ALWAYS_INLINE u8* GetPixels() { return m_pixels.get(); }
-  ALWAYS_INLINE const u8* GetRowPixels(u32 y) const { return &m_pixels[y * m_pitch]; }
-  ALWAYS_INLINE u8* GetRowPixels(u32 y) { return &m_pixels[y * m_pitch]; }
-  // ALWAYS_INLINE void SetPixel(u32 x, u32 y, PixelType pixel) { m_pixels[y * m_width + x] = pixel; }
-  // ALWAYS_INLINE PixelType GetPixel(u32 x, u32 y) const { return m_pixels[y * m_width + x]; }
+  ALWAYS_INLINE const u8* GetPixels() const { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
+  ALWAYS_INLINE u8* GetPixels() { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
+  ALWAYS_INLINE const u8* GetRowPixels(u32 y) const
+  {
+    return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]);
+  }
+  ALWAYS_INLINE u8* GetRowPixels(u32 y) { return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]); }

  u32 GetBlocksWide() const;
  u32 GetBlocksHigh() const;