From 2a8cfc79226e83216abe2253915dcf2a741eae82 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Fri, 27 Dec 2024 13:29:12 +1000
Subject: [PATCH] CPU/CodeCache: Simplify code LUT addressing

One more instruction on x86/ARM32, no additional instructions on ARM64.
Worth it so that the application doesn't crash if the game jumps to an
invalid PC.

Note that the lower 2 bits are truncated, so an unaligned jump will round
down to the nearest instruction. Obviously not correct, but if a game ends
up doing this, it's a lost cause anyway.
---
 src/core/cpu_code_cache.cpp         | 55 +++++------------------------
 src/core/cpu_recompiler_arm32.cpp   |  3 +-
 src/core/cpu_recompiler_arm64.cpp   |  2 +-
 src/core/cpu_recompiler_riscv64.cpp |  6 ++--
 src/core/cpu_recompiler_x64.cpp     |  3 +-
 5 files changed, 18 insertions(+), 51 deletions(-)

diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 593807f26..23b823095 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -52,10 +52,6 @@ static constexpr u32 RECOMPILE_FRAMES_FOR_INTERPRETER_FALLBACK = 15;
 static constexpr u32 INVALIDATE_COUNT_FOR_MANUAL_PROTECTION = 4;
 static constexpr u32 INVALIDATE_FRAMES_FOR_MANUAL_PROTECTION = 60;
 
-static CodeLUT DecodeCodeLUTPointer(u32 slot, CodeLUT ptr);
-static CodeLUT EncodeCodeLUTPointer(u32 slot, CodeLUT ptr);
-static CodeLUT OffsetCodeLUTPointer(CodeLUT fake_ptr, u32 pc);
-
 static void AllocateLUTs();
 static void DeallocateLUTs();
 static void ResetCodeLUT();
@@ -277,31 +273,6 @@ static constexpr u32 GetLUTSlotCount(bool include_unreachable)
 }
 } // namespace CPU::CodeCache
 
-CPU::CodeCache::CodeLUT CPU::CodeCache::DecodeCodeLUTPointer(u32 slot, CodeLUT ptr)
-{
-  if constexpr (sizeof(void*) == 8)
-    return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) + (static_cast<u64>(slot) << 17));
-  else
-    return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) + (slot << 16));
-}
-
-CPU::CodeCache::CodeLUT CPU::CodeCache::EncodeCodeLUTPointer(u32 slot, CodeLUT ptr)
-{
-  if constexpr (sizeof(void*) == 8)
-    return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) - (static_cast<u64>(slot) << 17));
-  else
-    return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) - (slot << 16));
-}
-
-CPU::CodeCache::CodeLUT CPU::CodeCache::OffsetCodeLUTPointer(CodeLUT fake_ptr, u32 pc)
-{
-  u8* fake_byte_ptr = reinterpret_cast<u8*>(fake_ptr);
-  if constexpr (sizeof(void*) == 8)
-    return reinterpret_cast<CodeLUT>(fake_byte_ptr + (static_cast<u64>(pc) << 1));
-  else
-    return reinterpret_cast<CodeLUT>(fake_byte_ptr + pc);
-}
-
 void CPU::CodeCache::AllocateLUTs()
 {
   constexpr u32 num_code_slots = GetLUTSlotCount(true);
@@ -323,9 +294,11 @@ void CPU::CodeCache::AllocateLUTs()
   // Mark everything as unreachable to begin with.
   for (u32 i = 0; i < LUT_TABLE_COUNT; i++)
   {
-    g_code_lut[i] = EncodeCodeLUTPointer(i, code_table_ptr);
+    g_code_lut[i] = code_table_ptr;
     s_block_lut[i] = nullptr;
   }
+
+  // Exclude unreachable.
   code_table_ptr += LUT_TABLE_SIZE;
 
   // Allocate ranges.
@@ -337,7 +310,7 @@ void CPU::CodeCache::AllocateLUTs()
     {
       const u32 slot = start_slot + i;
 
-      g_code_lut[slot] = EncodeCodeLUTPointer(slot, code_table_ptr);
+      g_code_lut[slot] = code_table_ptr;
       code_table_ptr += LUT_TABLE_SIZE;
 
       s_block_lut[slot] = block_table_ptr;
@@ -357,15 +330,13 @@ void CPU::CodeCache::DeallocateLUTs()
 
 void CPU::CodeCache::ResetCodeLUT()
 {
-  if (!s_lut_code_pointers)
-    return;
-
   // Make the unreachable table jump to the invalid code callback.
   MemsetPtrs(s_lut_code_pointers.get(), g_interpret_block, LUT_TABLE_COUNT);
 
   for (u32 i = 0; i < LUT_TABLE_COUNT; i++)
   {
-    CodeLUT ptr = DecodeCodeLUTPointer(i, g_code_lut[i]);
+    // Don't overwrite anything bound to unreachable.
+    CodeLUT ptr = g_code_lut[i];
     if (ptr == s_lut_code_pointers.get())
       continue;
 
@@ -375,18 +346,10 @@ void CPU::CodeCache::ResetCodeLUT()
 
 void CPU::CodeCache::SetCodeLUT(u32 pc, const void* function)
 {
-  if (!s_lut_code_pointers)
-    return;
-
   const u32 table = pc >> LUT_TABLE_SHIFT;
-  CodeLUT encoded_ptr = g_code_lut[table];
-
-#ifdef _DEBUG
-  const CodeLUT table_ptr = DecodeCodeLUTPointer(table, encoded_ptr);
-  DebugAssert(table_ptr != nullptr && table_ptr != s_lut_code_pointers.get());
-#endif
-
-  *OffsetCodeLUTPointer(encoded_ptr, pc) = function;
+  const u32 idx = (pc & 0xFFFF) >> 2;
+  DebugAssert(g_code_lut[table] != s_lut_code_pointers.get());
+  g_code_lut[table][idx] = function;
 }
 
 CPU::CodeCache::Block* CPU::CodeCache::LookupBlock(u32 pc)
diff --git a/src/core/cpu_recompiler_arm32.cpp b/src/core/cpu_recompiler_arm32.cpp
index 06fb84fc4..1748fb631 100644
--- a/src/core/cpu_recompiler_arm32.cpp
+++ b/src/core/cpu_recompiler_arm32.cpp
@@ -290,10 +290,11 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     armAsm->ldr(RARG1, PTR(&g_state.pc));
     armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
     armAsm->lsr(RARG2, RARG1, 16);
+    armAsm->ubfx(RARG1, RARG1, 2, 14);
     armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));
 
     // blr(x9[pc * 2]) (fast_map[pc >> 2])
-    armAsm->ldr(RARG1, MemOperand(RARG2, RARG1));
+    armAsm->ldr(RARG1, MemOperand(RARG2, RARG1, LSL, 2));
     armAsm->blx(RARG1);
   }
 
diff --git a/src/core/cpu_recompiler_arm64.cpp b/src/core/cpu_recompiler_arm64.cpp
index ac47aa082..a50705b38 100644
--- a/src/core/cpu_recompiler_arm64.cpp
+++ b/src/core/cpu_recompiler_arm64.cpp
@@ -478,7 +478,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     armAsm->ldr(RWARG1, PTR(&g_state.pc));
     armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
     armAsm->lsr(RWARG2, RWARG1, 16);
-    armAsm->lsr(RWARG1, RWARG1, 2);
+    armAsm->ubfx(RWARG1, RWARG1, 2, 14);
     armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));
 
     // blr(x9[pc * 2]) (fast_map[pc >> 2])
diff --git a/src/core/cpu_recompiler_riscv64.cpp b/src/core/cpu_recompiler_riscv64.cpp
index c78356617..628692f82 100644
--- a/src/core/cpu_recompiler_riscv64.cpp
+++ b/src/core/cpu_recompiler_riscv64.cpp
@@ -279,12 +279,14 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     rvAsm->LWU(RARG1, PTR(&g_state.pc));
     rvMoveAddressToReg(rvAsm, RARG3, g_code_lut.data());
     rvAsm->SRLI(RARG2, RARG1, 16);
-    rvAsm->SLLI(RARG1, RARG1, 1);
     rvAsm->SLLI(RARG2, RARG2, 3);
     rvAsm->ADD(RARG2, RARG2, RARG3);
     rvAsm->LD(RARG2, 0, RARG2);
+    rvAsm->SLLI(RARG1, RARG1, 48); // idx = (pc & 0xFFFF) >> 2
+    rvAsm->SRLI(RARG1, RARG1, 50);
+    rvAsm->SLLI(RARG1, RARG1, 3);
 
-    // blr(x9[pc * 2]) (fast_map[pc >> 2])
+    // blr(x9[pc * 2]) (fast_map[idx])
     rvAsm->ADD(RARG1, RARG1, RARG2);
     rvAsm->LD(RARG1, 0, RARG1);
     rvAsm->JR(RARG1);
diff --git a/src/core/cpu_recompiler_x64.cpp b/src/core/cpu_recompiler_x64.cpp
index e9da3db2c..1201ad73e 100644
--- a/src/core/cpu_recompiler_x64.cpp
+++ b/src/core/cpu_recompiler_x64.cpp
@@ -156,8 +156,9 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
   cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
   cg->lea(RXARG2, cg->dword[PTR(g_code_lut.data())]);
   cg->mov(RWARG3, RWARG1);
-  cg->shr(RWARG3, 16);
+  cg->shr(RWARG3, LUT_TABLE_SHIFT);
   cg->mov(RXARG2, cg->qword[RXARG2 + RXARG3 * 8]);
+  cg->and_(RWARG1, (LUT_TABLE_SIZE - 1) << 2); // 0xFFFC
 
   // call(rcx[pc * 2]) (fast_map[pc >> 2])
   cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]);
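
Note (not part of the commit): the dispatch rewrite boils down to the C++
sketch below. This is an illustrative reconstruction; LUT_TABLE_SHIFT == 16,
LUT_TABLE_SIZE == 0x4000, and LUT_TABLE_COUNT == 0x10000 are inferred from
the 16-bit table shift and the 0xFFFC mask in the diff, and LookupOld/
LookupNew are hypothetical names, not functions in the tree.

  #include <cstdint>

  using u8 = std::uint8_t;
  using u32 = std::uint32_t;
  using u64 = std::uint64_t;
  using CodeLUT = const void**;

  extern CodeLUT g_code_lut[0x10000]; // one table per 64KiB page of PC space

  // Old scheme (64-bit host): each g_code_lut[] entry was "encoded" with
  // (slot << 17) pre-subtracted, so the dispatcher could add (pc << 1) bytes
  // with no masking. An unaligned pc made the 8-byte load straddle two
  // adjacent entries, and the dispatcher then jumped to the garbage pointer
  // that resulted.
  const void* LookupOld(u32 pc)
  {
    const u8* fake = reinterpret_cast<const u8*>(g_code_lut[pc >> 16]);
    return *reinterpret_cast<const void* const*>(fake + (static_cast<u64>(pc) << 1));
  }

  // New scheme: plain table pointers plus a masked index. The low two bits
  // of pc are truncated, so an unaligned jump rounds down to the previous
  // instruction slot, and the load always lands on an entry boundary inside
  // the 0x4000-entry table.
  const void* LookupNew(u32 pc)
  {
    return g_code_lut[pc >> 16][(pc & 0xFFFF) >> 2];
  }

This is where the per-architecture costs in the message come from: x86 gains
one and, ARM32 gains one ubfx, and ARM64 swaps its lsr for a ubfx at no extra
cost. On RISC-V, the shift-left-48 / shift-right-50 / shift-left-3 sequence
computes ((pc & 0xFFFF) >> 2) * 8 without materializing a mask constant.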