Image: Vectorize RGBA8 conversion and add A1BGR5

wip2
Stenzek 2 weeks ago
parent 9b2cf0abbd
commit b68d7c8adb
No known key found for this signature in database

@ -44,6 +44,17 @@
#include <malloc.h> // alloca
#endif
/// Helper to disable loop vectorization.
///
/// Place DONT_VECTORIZE_THIS_LOOP immediately before a loop statement. It expands to the
/// compiler-specific "do not auto-vectorize this loop" pragma, or to nothing when the compiler
/// has no known equivalent, so it is always safe to use.
#if defined(__clang__)
// Checked first: clang also defines __GNUC__ (and _MSC_VER when targeting the MSVC ABI).
#define DONT_VECTORIZE_THIS_LOOP _Pragma("clang loop vectorize(disable)")
#elif defined(_MSC_VER)
#define DONT_VECTORIZE_THIS_LOOP __pragma(loop(no_vector))
#elif defined(__GNUC__) && (__GNUC__ >= 14)
// "#pragma GCC novector" only exists from GCC 14 onwards; older releases diagnose it as an
// unknown pragma, so they fall through to the empty definition below instead.
#define DONT_VECTORIZE_THIS_LOOP _Pragma("GCC novector")
#else
#define DONT_VECTORIZE_THIS_LOOP
#endif
/// Only currently using 128-bit vectors at max.
/// Value is in bytes: 16 bytes == 128 bits, i.e. the size of one such vector.
static constexpr u32 VECTOR_ALIGNMENT = 16;

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
#include "image.h"
@ -583,12 +583,46 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
case ImageFormat::RGB565:
{
ret = Image(m_width, m_height, ImageFormat::RGBA8);
constexpr u32 pixels_per_vec = 8;
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
for (u32 y = 0; y < m_height; y++)
{
const u8* pixels_in = GetRowPixels(y);
u8* pixels_out = ret->GetRowPixels(y);
u32 x = 0;
for (u32 x = 0; x < m_width; x++)
#ifdef CPU_ARCH_SIMD
for (; x < aligned_width; x += pixels_per_vec)
{
GSVector4i rgb565 = GSVector4i::load<false>(pixels_in);
pixels_in += sizeof(u16) * pixels_per_vec;
GSVector4i r = rgb565.srl16<11>();
r = r.sll16<3>() | r.sll16<13>().srl16<13>();
GSVector4i g = rgb565.sll16<5>().srl16<10>();
g = g.sll16<2>() | g.sll16<14>().srl16<14>();
GSVector4i b = rgb565.sll16<11>().srl16<11>();
b = b.sll16<3>() | b.sll16<13>().srl16<13>();
const GSVector4i low =
r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
b.uph64().u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
GSVector4i::store<false>(pixels_out, low);
pixels_out += sizeof(GSVector4i);
GSVector4i::store<false>(pixels_out, high);
pixels_out += sizeof(GSVector4i);
}
#endif
DONT_VECTORIZE_THIS_LOOP
for (; x < m_width; x++)
{
// RGB565 -> RGBA8
u16 pixel_in;
@ -609,12 +643,48 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
case ImageFormat::RGB5A1:
{
ret = Image(m_width, m_height, ImageFormat::RGBA8);
constexpr u32 pixels_per_vec = 8;
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
for (u32 y = 0; y < m_height; y++)
{
const u8* pixels_in = GetRowPixels(y);
u8* pixels_out = ret->GetRowPixels(y);
u32 x = 0;
for (u32 x = 0; x < m_width; x++)
#ifdef CPU_ARCH_SIMD
for (; x < aligned_width; x += pixels_per_vec)
{
GSVector4i rgb5a1 = GSVector4i::load<false>(pixels_in);
pixels_in += sizeof(u16) * pixels_per_vec;
GSVector4i r = rgb5a1.sll16<1>().srl16<11>();
r = r.sll16<3>() | r.sll16<13>().srl16<13>();
GSVector4i g = rgb5a1.sll16<6>().srl16<11>();
g = g.sll16<3>() | g.sll16<13>().srl16<13>();
GSVector4i b = rgb5a1.sll16<11>().srl16<11>();
b = b.sll16<3>() | b.sll16<13>().srl16<13>();
GSVector4i a = rgb5a1.sra16<7>().srl16<8>();
const GSVector4i low =
r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
GSVector4i::store<false>(pixels_out, low);
pixels_out += sizeof(GSVector4i);
GSVector4i::store<false>(pixels_out, high);
pixels_out += sizeof(GSVector4i);
}
#endif
DONT_VECTORIZE_THIS_LOOP
for (; x < m_width; x++)
{
// RGB5A1 -> RGBA8
u16 pixel_in;
@ -633,6 +703,69 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
}
break;
// A1BGR5 -> RGBA8 conversion.
// Source pixels are 16 bits each; as decoded by the scalar loop below the layout is:
//   bits 15..11 = red, bits 10..6 = green, bits 5..1 = blue, bit 0 = alpha.
// Each 5-bit channel c is widened to 8 bits as (c << 3) | (c & 7), and alpha becomes 0xFF or 0x00.
// NOTE(review): the low three bits are replicated (c & 7) rather than the more common high three
// bits (c >> 2). The SIMD path does the same, so both paths agree - confirm this is intended.
case ImageFormat::A1BGR5:
{
ret = Image(m_width, m_height, ImageFormat::RGBA8);
// The SIMD loop consumes sizeof(u16) * pixels_per_vec = 16 input bytes per iteration.
constexpr u32 pixels_per_vec = 8;
// Only referenced inside the CPU_ARCH_SIMD block, hence [[maybe_unused]].
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
for (u32 y = 0; y < m_height; y++)
{
const u8* pixels_in = GetRowPixels(y);
u8* pixels_out = ret->GetRowPixels(y);
// x is shared by both loops so the scalar loop resumes where the SIMD loop stopped.
u32 x = 0;
#ifdef CPU_ARCH_SIMD
// Vector path: converts pixels_per_vec pixels per iteration, up to aligned_width.
for (; x < aligned_width; x += pixels_per_vec)
{
// presumably load<false>/store<false> are the unaligned variants - TODO confirm against GSVector4i.
GSVector4i a1bgr5 = GSVector4i::load<false>(pixels_in);
pixels_in += sizeof(u16) * pixels_per_vec;
// Per 16-bit lane: isolate each 5-bit channel with a shift pair, then widen it to 8 bits.
// x.sll16<13>().srl16<13>() keeps only the low three bits of x, i.e. the (c & 7) term.
GSVector4i r = a1bgr5.srl16<11>();
r = r.sll16<3>() | r.sll16<13>().srl16<13>();
GSVector4i g = a1bgr5.sll16<5>().srl16<11>();
g = g.sll16<3>() | g.sll16<13>().srl16<13>();
GSVector4i b = a1bgr5.sll16<10>().srl16<11>();
b = b.sll16<3>() | b.sll16<13>().srl16<13>();
// Moves bit 0 to the top of the lane, then shifts back down. Intended to match the scalar path:
// 0xFF when the alpha bit is set, else 0 (assuming sra16 is an arithmetic and srl16 a logical shift).
GSVector4i a = a1bgr5.sll16<15>().sra16<7>().srl16<8>();
// Pack R | G << 8 | B << 16 | A << 24 into 32-bit lanes.
// presumably u16to32() widens the lower four 16-bit lanes and uph64() selects the upper four - TODO confirm.
const GSVector4i low =
r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
// Two vector stores per iteration, advancing by sizeof(GSVector4i) after each.
GSVector4i::store<false>(pixels_out, low);
pixels_out += sizeof(GSVector4i);
GSVector4i::store<false>(pixels_out, high);
pixels_out += sizeof(GSVector4i);
}
#endif
// Scalar path: handles the remaining pixels of the row (the whole row when CPU_ARCH_SIMD is not
// defined). The loop is explicitly excluded from compiler auto-vectorization.
DONT_VECTORIZE_THIS_LOOP
for (; x < m_width; x++)
{
// A1BGR5 -> RGBA8
// memcpy is used for the 16-bit read and 32-bit write instead of casting the u8 pointers.
u16 pixel_in;
std::memcpy(&pixel_in, pixels_in, sizeof(u16));
pixels_in += sizeof(u16);
const u8 a1 = Truncate8(pixel_in & 0x01);
const u8 r5 = Truncate8((pixel_in >> 11) & 0x1F);
// Despite the "g6" name, green is masked with 0x1F here, i.e. it is a 5-bit channel.
const u8 g6 = Truncate8((pixel_in >> 6) & 0x1F);
const u8 b5 = Truncate8((pixel_in >> 1) & 0x1F);
const u32 rgba8 = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 3) | (g6 & 7)) << 8) |
(ZeroExtend32((b5 << 3) | (b5 & 7)) << 16) | (a1 ? 0xFF000000u : 0u);
std::memcpy(pixels_out, &rgba8, sizeof(u32));
pixels_out += sizeof(u32);
}
}
}
break;
case ImageFormat::BGR8:
{
ret = Image(m_width, m_height, ImageFormat::RGBA8);

@ -1,10 +1,11 @@
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
#pragma once
#include "common/align.h"
#include "common/heap_array.h"
#include "common/intrin.h"
#include "common/types.h"
#include <cstdio>
@ -60,12 +61,13 @@ public:
// Plain accessors returning the stored height, row pitch and pixel format.
// presumably the pitch is in bytes: GetRowPixels() uses y * m_pitch as an index into m_pixels and returns u8*.
ALWAYS_INLINE u32 GetHeight() const { return m_height; }
ALWAYS_INLINE u32 GetPitch() const { return m_pitch; }
ALWAYS_INLINE ImageFormat GetFormat() const { return m_format; }
ALWAYS_INLINE const u8* GetPixels() const { return m_pixels.get(); }
ALWAYS_INLINE u8* GetPixels() { return m_pixels.get(); }
ALWAYS_INLINE const u8* GetRowPixels(u32 y) const { return &m_pixels[y * m_pitch]; }
ALWAYS_INLINE u8* GetRowPixels(u32 y) { return &m_pixels[y * m_pitch]; }
// ALWAYS_INLINE void SetPixel(u32 x, u32 y, PixelType pixel) { m_pixels[y * m_width + x] = pixel; }
// ALWAYS_INLINE PixelType GetPixel(u32 x, u32 y) const { return m_pixels[y * m_width + x]; }
// Returns the pointer held by m_pixels, passed through std::assume_aligned<VECTOR_ALIGNMENT>.
// NOTE(review): std::assume_aligned is undefined behaviour if the pointer is not actually aligned
// to VECTOR_ALIGNMENT - confirm that m_pixels is always allocated with at least that alignment.
ALWAYS_INLINE const u8* GetPixels() const { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
ALWAYS_INLINE u8* GetPixels() { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
// Returns the address of element y * m_pitch in m_pixels (the start of row y), passed through
// std::assume_aligned<VECTOR_ALIGNMENT>. No bounds check on y is performed here.
// NOTE(review): for the alignment assumption to hold for every row, both the buffer start and
// m_pitch must be multiples of VECTOR_ALIGNMENT; otherwise this is undefined behaviour - confirm
// that the pitch computation guarantees this for all formats.
ALWAYS_INLINE const u8* GetRowPixels(u32 y) const
{
return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]);
}
ALWAYS_INLINE u8* GetRowPixels(u32 y) { return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]); }
u32 GetBlocksWide() const;
u32 GetBlocksHigh() const;

Loading…
Cancel
Save