CPU/CodeCache: Simplify code LUT addressing

One more instruction on x86/ARM32, no additional instructions on ARM64.

Worth it so that the application doesn't crash if the game jumps to an
invalid PC. Note that the lower 2 bits are truncated, so an unaligned
jump will round down to the closest instruction. Obviously not correct,
but if a game ends up doing this, it's a lost cause anyway.
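For reference, the lookup now boils down to the following (a sketch mirroring
the updated SetCodeLUT, not the emitted assembly; "target" is only an
illustrative local):

  const u32 table = pc >> LUT_TABLE_SHIFT; // pc >> 16, selects the 64K page
  const u32 idx = (pc & 0xFFFF) >> 2;      // word index within that page
  const void* target = g_code_lut[table][idx];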
Stenzek 2 months ago
parent 4e5b4ba071
commit 2a8cfc7922

@@ -52,10 +52,6 @@ static constexpr u32 RECOMPILE_FRAMES_FOR_INTERPRETER_FALLBACK = 15;
static constexpr u32 INVALIDATE_COUNT_FOR_MANUAL_PROTECTION = 4;
static constexpr u32 INVALIDATE_FRAMES_FOR_MANUAL_PROTECTION = 60;
-static CodeLUT DecodeCodeLUTPointer(u32 slot, CodeLUT ptr);
-static CodeLUT EncodeCodeLUTPointer(u32 slot, CodeLUT ptr);
-static CodeLUT OffsetCodeLUTPointer(CodeLUT fake_ptr, u32 pc);
static void AllocateLUTs();
static void DeallocateLUTs();
static void ResetCodeLUT();
@@ -277,31 +273,6 @@ static constexpr u32 GetLUTSlotCount(bool include_unreachable)
}
} // namespace CPU::CodeCache
-CPU::CodeCache::CodeLUT CPU::CodeCache::DecodeCodeLUTPointer(u32 slot, CodeLUT ptr)
-{
-if constexpr (sizeof(void*) == 8)
-return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) + (static_cast<u64>(slot) << 17));
-else
-return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) + (slot << 16));
-}
-CPU::CodeCache::CodeLUT CPU::CodeCache::EncodeCodeLUTPointer(u32 slot, CodeLUT ptr)
-{
-if constexpr (sizeof(void*) == 8)
-return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) - (static_cast<u64>(slot) << 17));
-else
-return reinterpret_cast<CodeLUT>(reinterpret_cast<u8*>(ptr) - (slot << 16));
-}
-CPU::CodeCache::CodeLUT CPU::CodeCache::OffsetCodeLUTPointer(CodeLUT fake_ptr, u32 pc)
-{
-u8* fake_byte_ptr = reinterpret_cast<u8*>(fake_ptr);
-if constexpr (sizeof(void*) == 8)
-return reinterpret_cast<const void**>(fake_byte_ptr + (static_cast<u64>(pc) << 1));
-else
-return reinterpret_cast<const void**>(fake_byte_ptr + pc);
-}
void CPU::CodeCache::AllocateLUTs()
{
constexpr u32 num_code_slots = GetLUTSlotCount(true);
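The removed helpers implemented a small pointer-compression trick: each table
pointer was stored pre-biased by its slot, so the dispatcher could index it
with the full, scaled PC and no masking. Roughly, for the 64-bit case and a
4-byte-aligned pc (a worked illustration in byte addresses, not project code):

  encoded[slot] = table_base[slot] - (slot << 17)          // slot = pc >> 16
  encoded[pc >> 16] + (pc << 1)
      = table_base[pc >> 16] + ((pc & 0xFFFF) << 1)        // the slot bias cancels
      = address of entry ((pc & 0xFFFF) >> 2)              // 8-byte entries

Storing plain pointers instead means the dispatcher has to strip the table bits
itself (the extra ubfx/and in the hunks below), but an unaligned pc now rounds
down to the previous instruction's entry instead of indexing between entries.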
@@ -323,9 +294,11 @@ void CPU::CodeCache::AllocateLUTs()
// Mark everything as unreachable to begin with.
for (u32 i = 0; i < LUT_TABLE_COUNT; i++)
{
-g_code_lut[i] = EncodeCodeLUTPointer(i, code_table_ptr);
+g_code_lut[i] = code_table_ptr;
s_block_lut[i] = nullptr;
}
// Exclude unreachable.
code_table_ptr += LUT_TABLE_SIZE;
// Allocate ranges.
@@ -337,7 +310,7 @@
{
const u32 slot = start_slot + i;
-g_code_lut[slot] = EncodeCodeLUTPointer(slot, code_table_ptr);
+g_code_lut[slot] = code_table_ptr;
code_table_ptr += LUT_TABLE_SIZE;
s_block_lut[slot] = block_table_ptr;
@@ -357,15 +330,13 @@ void CPU::CodeCache::DeallocateLUTs()
void CPU::CodeCache::ResetCodeLUT()
{
if (!s_lut_code_pointers)
return;
// Make the unreachable table jump to the invalid code callback.
MemsetPtrs(s_lut_code_pointers.get(), g_interpret_block, LUT_TABLE_COUNT);
for (u32 i = 0; i < LUT_TABLE_COUNT; i++)
{
-CodeLUT ptr = DecodeCodeLUTPointer(i, g_code_lut[i]);
// Don't overwrite anything bound to unreachable.
+CodeLUT ptr = g_code_lut[i];
if (ptr == s_lut_code_pointers.get())
continue;
@@ -375,18 +346,10 @@ void CPU::CodeCache::ResetCodeLUT()
void CPU::CodeCache::SetCodeLUT(u32 pc, const void* function)
{
if (!s_lut_code_pointers)
return;
const u32 table = pc >> LUT_TABLE_SHIFT;
-CodeLUT encoded_ptr = g_code_lut[table];
-#ifdef _DEBUG
-const CodeLUT table_ptr = DecodeCodeLUTPointer(table, encoded_ptr);
-DebugAssert(table_ptr != nullptr && table_ptr != s_lut_code_pointers.get());
-#endif
-*OffsetCodeLUTPointer(encoded_ptr, pc) = function;
+const u32 idx = (pc & 0xFFFF) >> 2;
+DebugAssert(g_code_lut[table] != s_lut_code_pointers.get());
+g_code_lut[table][idx] = function;
}
CPU::CodeCache::Block* CPU::CodeCache::LookupBlock(u32 pc)

@@ -290,10 +290,11 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->ldr(RARG1, PTR(&g_state.pc));
armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
armAsm->lsr(RARG2, RARG1, 16);
+armAsm->ubfx(RARG1, RARG1, 2, 14);
armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));
// blr(x9[pc * 2]) (fast_map[pc >> 2])
-armAsm->ldr(RARG1, MemOperand(RARG2, RARG1));
+armAsm->ldr(RARG1, MemOperand(RARG2, RARG1, LSL, 2));
armAsm->blx(RARG1);
}
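On 32-bit ARM the pointer table holds 4-byte entries, so the new ubfx extracts
the 14-bit word index and the load scales it back by 4 (a sketch of the
arithmetic, not project code):

  const u32 idx = (pc >> 2) & 0x3FFF;  // ubfx #2, #14 == (pc & 0xFFFF) >> 2
  // entry address = table + idx * 4   // the LSL 2 in the ldr operand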

@@ -478,7 +478,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->ldr(RWARG1, PTR(&g_state.pc));
armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
armAsm->lsr(RWARG2, RWARG1, 16);
-armAsm->lsr(RWARG1, RWARG1, 2);
+armAsm->ubfx(RWARG1, RWARG1, 2, 14);
armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));
// blr(x9[pc * 2]) (fast_map[pc >> 2])

@@ -279,12 +279,14 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
rvAsm->LWU(RARG1, PTR(&g_state.pc));
rvMoveAddressToReg(rvAsm, RARG3, g_code_lut.data());
rvAsm->SRLI(RARG2, RARG1, 16);
-rvAsm->SLLI(RARG1, RARG1, 1);
rvAsm->SLLI(RARG2, RARG2, 3);
rvAsm->ADD(RARG2, RARG2, RARG3);
rvAsm->LD(RARG2, 0, RARG2);
+rvAsm->SLLI(RARG1, RARG1, 48); // idx = (pc & 0xFFFF) >> 2
+rvAsm->SRLI(RARG1, RARG1, 50);
+rvAsm->SLLI(RARG1, RARG1, 3);
-// blr(x9[pc * 2]) (fast_map[pc >> 2])
+// blr(x9[pc * 2]) (fast_map[idx])
rvAsm->ADD(RARG1, RARG1, RARG2);
rvAsm->LD(RARG1, 0, RARG1);
rvAsm->JR(RARG1);
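The RISC-V version computes the same index with a shift pair instead of a
mask; the equivalence, with pc zero-extended into a 64-bit register (a sketch
of the arithmetic, not project code):

  const u64 idx = (static_cast<u64>(pc) << 48) >> 50; // == (pc & 0xFFFF) >> 2
  const u64 off = idx << 3;                           // byte offset of the 8-byte entry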

@@ -156,8 +156,9 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
cg->lea(RXARG2, cg->dword[PTR(g_code_lut.data())]);
cg->mov(RWARG3, RWARG1);
-cg->shr(RWARG3, 16);
+cg->shr(RWARG3, LUT_TABLE_SHIFT);
cg->mov(RXARG2, cg->qword[RXARG2 + RXARG3 * 8]);
+cg->and_(RWARG1, (LUT_TABLE_SIZE - 1) << 2); // 0xFFFC
// call(rcx[pc * 2]) (fast_map[pc >> 2])
cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]);
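On x86-64 the single added instruction is the and_; the masked pc stays a byte
offset, so the existing *2 scale in the jmp's addressing mode does the rest
(a worked check of the arithmetic, not project code):

  const u32 masked = pc & ((LUT_TABLE_SIZE - 1) << 2); // pc & 0xFFFC
  // masked * 2 == ((pc & 0xFFFF) >> 2) * 8, the byte offset of this pc's
  // 8-byte entry, which is exactly what [RXARG2 + RXARG1 * 2] indexes.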
