
Compare commits

3 commits

xtophyr  1a17235ffd  2025-09-20 08:39:03 -04:00
  Merge eeec5ecd10 into 6e4a5a6d94
Christopher Wright  eeec5ecd10  2025-09-20 08:38:40 -04:00
  undo this change
Christopher Wright  93f5067999  2025-09-20 08:32:32 -04:00
  minor Aarch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc)
2 changed files with 111 additions and 123 deletions

View file

@@ -67,7 +67,6 @@ constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000;
constexpr uint32_t MOV_REG = 0xAA0003E0;
constexpr uint32_t MOV_VREG_EL = 0x6E080400;
constexpr uint32_t FADD = 0x4E60D400;
constexpr uint32_t FSUB = 0x4EE0D400;
constexpr uint32_t FEOR = 0x6E201C00;
@@ -102,7 +101,7 @@ static size_t CalcDatasetItemSize()
((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result);
}
constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
constexpr uint8_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) :
hugePages(hugePagesJIT && hugePagesEnable),
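Note: the IntRegMap change only narrows the storage type; every entry already fits in a byte, and entries are widened back to 32 bits when OR'd into an opcode word, so the emitted encodings are unchanged. A minimal standalone sketch of why the narrowing is lossless (not the project's code):

    #include <cstdint>
    constexpr uint8_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
    static_assert(sizeof(IntRegMap) == 8, "8 bytes instead of 32; every entry fits in a byte");
    // When an entry is OR'd into a 32-bit opcode word it is promoted back to
    // uint32_t, so the generated instructions are identical.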
@@ -128,11 +127,12 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
uint32_t codePos = MainLoopBegin + 4;
uint32_t mask = ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10);
// and w16, w10, ScratchpadL3Mask64
emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
emit32(0x121A0000 | 16 | (10 << 5) | mask, code, codePos);
// and w17, w20, ScratchpadL3Mask64
emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
emit32(0x121A0000 | 17 | (20 << 5) | mask, code, codePos);
codePos = PrologueSize;
literalPos = ImulRcpLiteralsEnd;
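Note: both emits share the same immediate field, so the shifted value is now computed once into mask. An illustrative sketch of what that field encodes; assuming the default 2 MiB L3 scratchpad (Log2_ScratchpadL3 = 21) the word should decode to "and wD, wN, #0x1FFFC0", i.e. ScratchpadL3Mask64 (helper name is not the JIT's):

    #include <cstdint>
    // 0x121A0000 already carries the immr field of AND (immediate);
    // (log2_l3 - 7) << 10 supplies the imms field.
    constexpr uint32_t andScratchpadL3Mask64(uint32_t rd, uint32_t rn, uint32_t log2_l3) {
        return 0x121A0000 | rd | (rn << 5) | ((log2_l3 - 7) << 10);
    }
    static_assert(andScratchpadL3Mask64(16, 10, 21) == 0x121A3950, "matches the first emit above");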
@@ -155,13 +155,14 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
emit32(ARMV8A::B | (offset / 4), code, codePos);
// and w20, w20, CacheLineAlignMask
mask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10);
// and w20, w9, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
emit32(0x121A0000 | 20 | (9 << 5) | mask, code, codePos);
// and w10, w10, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 10 | (10 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
emit32(0x121A0000 | 10 | (10 << 5) | mask, code, codePos);
// Update spMix1
// eor x10, config.readReg0, config.readReg1
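Note: this hunk only swaps the Rn field (bits [9:5]) of the first cacheline-align AND from w20 to w9, which is what lets the template drop its "mov w20, w9" copy (see the cacheline_align_mask1 hunk in the second file). A small, purely illustrative check (the Log2_DatasetBaseSize immediate field is left out for brevity):

    #include <cstdint>
    constexpr uint32_t and_w20_w9  = 0x121A0000 | 20 | (9u << 5);   // new: read w9 directly
    constexpr uint32_t and_w20_w20 = 0x121A0000 | 20 | (20u << 5);  // old: read the w20 copy
    static_assert((and_w20_w9 ^ and_w20_w20) == ((9u ^ 20u) << 5), "only the Rn field differs");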
@@ -497,9 +498,12 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
if (src != dst)
{
imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
emitAddImmediate(tmp_reg, src, imm, code, k);
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
if (imm)
emitAddImmediate(tmp_reg, src, imm, code, k);
else
t = 0x927d0000 | tmp_reg | (src << 5);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
@@ -511,10 +515,18 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
else
{
imm = (imm & ScratchpadL3Mask) >> 3;
emitMovImmediate(tmp_reg, imm, code, k);
if (imm)
{
emitMovImmediate(tmp_reg, imm, code, k);
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
}
else
{
// ldr tmp_reg, [x2]
emit32(0xf9400040 | tmp_reg, code, k);
}
}
codePos = k;
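Note: when the masked L3 displacement is zero, a plain base-register load replaces the mov-immediate plus register-offset load pair. Both encodings appear verbatim above; a side-by-side for illustration (helper names are not the JIT's):

    #include <cstdint>
    // ldr xt, [x2, xm, lsl 3] -- scaled register-offset form used when imm != 0
    constexpr uint32_t ldrScaled(uint32_t rt, uint32_t rm) { return 0xf8607840 | rt | (rm << 16); }
    // ldr xt, [x2]            -- single instruction when the offset is zero
    constexpr uint32_t ldrBase(uint32_t rt)                { return 0xf9400040 | rt; }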
@@ -529,25 +541,22 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
constexpr uint32_t tmp_reg = 19;
imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
emitAddImmediate(tmp_reg, src, imm, code, k);
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
if (imm)
emitAddImmediate(tmp_reg, src, imm, code, k);
else
t = 0x927d0000 | tmp_reg | (src << 5);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
// add tmp_reg, x2, tmp_reg
emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k);
// ldr tmp_reg_fp, [x2, tmp_reg]
emit32(0x3ce06800 | tmp_reg_fp | (2 << 5) | (tmp_reg << 16), code, k);
// ldpsw tmp_reg, tmp_reg + 1, [tmp_reg]
emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k);
// ins tmp_reg_fp.d[0], tmp_reg
emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k);
// ins tmp_reg_fp.d[1], tmp_reg + 1
emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k);
// sxtl.2d tmp_reg_fp, tmp_reg_fp
emit32(0x0f20a400 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
// scvtf tmp_reg_fp.2d, tmp_reg_fp.2d
emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
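Note: the group-F load path now uses one vector load plus sxtl.2d instead of ldpsw and two lane inserts, so the sign-extension happens in the SIMD unit right before scvtf. A scalar model of what the emitted sequence computes (not the generated code itself):

    #include <cstdint>
    // Two signed 32-bit scratchpad words become the two double lanes of the
    // destination register: sign-extend, then int64 -> double convert.
    void memLoadFPModel(const int32_t* src, double out[2]) {
        out[0] = static_cast<double>(static_cast<int64_t>(src[0]));
        out[1] = static_cast<double>(static_cast<int64_t>(src[1]));
    }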
@@ -835,7 +844,8 @@ void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos)
else
{
// ror dst, dst, imm
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
if ((instr.getImm32() & 63))
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
}
reg_changed_offset[instr.dst] = codePos;
@@ -861,7 +871,8 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
else
{
// ror dst, dst, imm
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
if ((instr.getImm32() & 63))
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
}
reg_changed_offset[instr.dst] = k;
@@ -894,13 +905,8 @@ void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos)
const uint32_t dst = instr.dst + 16;
constexpr uint32_t tmp_reg_fp = 28;
constexpr uint32_t src_index1 = 1 << 14;
constexpr uint32_t dst_index1 = 1 << 20;
emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k);
emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k);
emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k);
// ext dst.16b, dst.16b, dst.16b, #0x8
emit32(0x6e004000 | dst | (dst << 5) | (dst << 16), code, k);
codePos = k;
}
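Note: FSWAP_R now swaps the two 64-bit lanes with a single EXT instead of three element moves through a temporary register. An intrinsics sketch of the same lane swap (illustrative only):

    #include <arm_neon.h>
    // ext v.16b, v.16b, v.16b, #8 rotates the 128-bit register by 8 bytes,
    // exchanging the low and high double lanes in place.
    float64x2_t fswapModel(float64x2_t x) {
        return vextq_f64(x, x, 1);
    }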
@@ -1029,11 +1035,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20;
constexpr uint32_t fpcr_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
if (instr.getImm32() & 63)
{
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
}
else // no rotation
{
// bfi fpcr_tmp_reg, src, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (src << 5), code, k);
}
// rbit tmp_reg, fpcr_tmp_reg
emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k);
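Note: a rotate amount of 0 makes the ror a no-op, so the bfi now reads the source register directly in that case; either way the two low bits of the (rotated) value select the rounding mode. A scalar model of the value fed into the bitfield insert, assuming rotate-right semantics as in the RandomX specification:

    #include <cstdint>
    static inline uint64_t rotr64(uint64_t v, unsigned r) {
        return r ? (v >> r) | (v << (64 - r)) : v;
    }
    // Bits [1:0] of this value end up in the staged FPCR image.
    uint64_t cfroundSource(uint64_t src, uint32_t imm) {
        const unsigned r = imm & 63;
        return r ? rotr64(src, r) : src;
    }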
@@ -1059,9 +1073,12 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
else
imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1;
emitAddImmediate(tmp_reg, dst, imm, code, k);
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
if (imm)
emitAddImmediate(tmp_reg, dst, imm, code, k);
else
t = 0x927d0000 | tmp_reg | (dst << 5);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10);

View file

@@ -100,9 +100,9 @@
# v26 -> "a2"
# v27 -> "a3"
# v28 -> temporary
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
# v30 -> E 'or' mask = 0x3*00000000******3*00000000******
# v31 -> scale mask = 0x81f000000000000081f0000000000000
# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
# v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
# v31 -> scale mask = 0x80f0000000000000'80f0000000000000
.balign 4
DECL(randomx_program_aarch64):
@@ -142,17 +142,14 @@ DECL(randomx_program_aarch64):
ldp q26, q27, [x0, 224]
# Load E 'and' mask
mov x16, 0x00FFFFFFFFFFFFFF
ins v29.d[0], x16
ins v29.d[1], x16
movi.2d v29, #0x00FFFFFFFFFFFFFF
# Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64]
# Load scale mask
mov x16, 0x80f0000000000000
ins v31.d[0], x16
ins v31.d[1], x16
dup v31.2d, x16
# Read fpcr
mrs x8, fpcr
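Note: the E 'and' mask is now materialized with a single movi.2d (legal because every byte of 0x00FFFFFFFFFFFFFF is either 0x00 or 0xFF, which the 64-bit movi byte-mask immediate can express), and the scale mask uses "dup v31.2d, x16" to broadcast the GPR into both lanes, replacing two ins. An intrinsics sketch of the resulting register contents (not the template's code):

    #include <arm_neon.h>
    uint64x2_t eAndMask(void)  { return vdupq_n_u64(0x00FFFFFFFFFFFFFFULL); } // movi.2d v29, #0x00FFFFFFFFFFFFFF
    uint64x2_t scaleMask(void) { return vdupq_n_u64(0x80F0000000000000ULL); } // dup v31.2d, x16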
@@ -162,35 +159,22 @@ DECL(randomx_program_aarch64):
str x0, [sp, -16]!
# Read literals
ldr x0, literal_x0
ldr x11, literal_x11
ldr x21, literal_x21
ldr x22, literal_x22
ldr x23, literal_x23
ldr x24, literal_x24
ldr x25, literal_x25
ldr x26, literal_x26
ldr x27, literal_x27
ldr x28, literal_x28
ldr x29, literal_x29
ldr x30, literal_x30
adr x30, literal_v0
ldp q0, q1, [x30]
ldp q2, q3, [x30, 32]
ldp q4, q5, [x30, 64]
ldp q6, q7, [x30, 96]
ldp q8, q9, [x30, 128]
ldp q10, q11, [x30, 160]
ldp q12, q13, [x30, 192]
ldp q14, q15, [x30, 224]
ldr q0, literal_v0
ldr q1, literal_v1
ldr q2, literal_v2
ldr q3, literal_v3
ldr q4, literal_v4
ldr q5, literal_v5
ldr q6, literal_v6
ldr q7, literal_v7
ldr q8, literal_v8
ldr q9, literal_v9
ldr q10, literal_v10
ldr q11, literal_v11
ldr q12, literal_v12
ldr q13, literal_v13
ldr q14, literal_v14
ldr q15, literal_v15
ldp x0, x11, [x30, -96] // literal_x0
ldp x21, x22, [x30, -80] // literal_x21
ldp x23, x24, [x30, -64] // literal_x23
ldp x25, x26, [x30, -48] // literal_x25
ldp x27, x28, [x30, -32] // literal_x27
ldp x29, x30, [x30, -16] // literal_x29
DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64;
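Note: the literal block is now addressed once with "adr x30, literal_v0"; the eight q-register pairs sit at positive offsets, while the twelve 64-bit x-register literals are assumed to sit immediately below literal_v0 (12 * 8 = 96 bytes), so six ldp instructions at offsets -96 through -16 replace twelve individual ldr loads. x30 itself is reloaded by the final pair, so the base register stays valid until the last load. A trivial check of the assumed layout, for illustration:

    #include <cstdint>
    // literal_x0 is assumed to sit 96 bytes below literal_v0, matching the
    // ldp offsets -96, -80, -64, -48, -32, -16 used above.
    static_assert(12 * sizeof(uint64_t) == 96, "six ldp pairs cover all twelve integer literals");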
@@ -221,40 +205,31 @@ DECL(randomx_program_aarch64_main_loop):
eor x15, x15, x19
# Load group F registers (spAddr1)
ldpsw x20, x19, [x17]
ins v16.d[0], x20
ins v16.d[1], x19
ldpsw x20, x19, [x17, 8]
ins v17.d[0], x20
ins v17.d[1], x19
ldpsw x20, x19, [x17, 16]
ins v18.d[0], x20
ins v18.d[1], x19
ldpsw x20, x19, [x17, 24]
ins v19.d[0], x20
ins v19.d[1], x19
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d
ldr q17, [x17]
sxtl.2d v16, v17
scvtf.2d v16, v16
sxtl2.2d v17, v17
scvtf.2d v17, v17
ldr q19, [x17, 16]
sxtl.2d v18, v19
scvtf.2d v18, v18
sxtl2.2d v19, v19
scvtf.2d v19, v19
# Load group E registers (spAddr1)
ldpsw x20, x19, [x17, 32]
ins v20.d[0], x20
ins v20.d[1], x19
ldpsw x20, x19, [x17, 40]
ins v21.d[0], x20
ins v21.d[1], x19
ldpsw x20, x19, [x17, 48]
ins v22.d[0], x20
ins v22.d[1], x19
ldpsw x20, x19, [x17, 56]
ins v23.d[0], x20
ins v23.d[1], x19
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d
ldr q21, [x17, 32]
sxtl.2d v20, v21
scvtf.2d v20, v20
sxtl2.2d v21, v21
scvtf.2d v21, v21
ldr q23, [x17, 48]
sxtl.2d v22, v23
scvtf.2d v22, v22
sxtl2.2d v23, v23
scvtf.2d v23, v23
and v20.16b, v20.16b, v29.16b
and v21.16b, v21.16b, v29.16b
and v22.16b, v22.16b, v29.16b
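Note: the same sxtl/sxtl2 rewrite is applied in the main loop: one 128-bit load now feeds two registers, with sxtl converting the low int32 pair and sxtl2 the high pair before scvtf. An intrinsics sketch of one such pair (illustrative, not the template):

    #include <arm_neon.h>
    void loadFPair(const int32_t* spAddr1, float64x2_t* lo, float64x2_t* hi) {
        int32x4_t q = vld1q_s32(spAddr1);                    // ldr q17, [x17]
        *lo = vcvtq_f64_s64(vmovl_s32(vget_low_s32(q)));     // sxtl.2d  v16, v17 ; scvtf.2d v16, v16
        *hi = vcvtq_f64_s64(vmovl_high_s32(q));              // sxtl2.2d v17, v17 ; scvtf.2d v17, v17
    }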
@@ -310,10 +285,9 @@ DECL(randomx_program_aarch64_vm_instructions_end):
eor x9, x9, x20
# Calculate dataset pointer for dataset prefetch
mov w20, w9
DECL(randomx_program_aarch64_cacheline_align_mask1):
# Actual mask will be inserted by JIT compiler
and x20, x20, 1
and x20, x9, 1
add x20, x20, x1
# Prefetch dataset data
@@ -491,42 +465,39 @@ DECL(randomx_calc_dataset_item_aarch64):
stp x10, x11, [sp, 80]
stp x12, x13, [sp, 96]
ldr x12, superscalarMul0
adr x7, superscalarMul0
# superscalarMul0, superscalarAdd1
ldp x12, x13, [x7]
mov x8, x0
mov x9, x1
ldp x8, x9, [sp]
mov x10, x2
# rl[0] = (itemNumber + 1) * superscalarMul0;
madd x0, x2, x12, x12
# rl[1] = rl[0] ^ superscalarAdd1;
ldr x12, superscalarAdd1
eor x1, x0, x12
eor x1, x0, x13
# rl[2] = rl[0] ^ superscalarAdd2;
ldr x12, superscalarAdd2
ldp x12, x13, [x7, 16]
eor x2, x0, x12
# rl[3] = rl[0] ^ superscalarAdd3;
ldr x12, superscalarAdd3
eor x3, x0, x12
eor x3, x0, x13
# rl[4] = rl[0] ^ superscalarAdd4;
ldr x12, superscalarAdd4
ldp x12, x13, [x7, 32]
eor x4, x0, x12
# rl[5] = rl[0] ^ superscalarAdd5;
ldr x12, superscalarAdd5
eor x5, x0, x12
eor x5, x0, x13
# rl[6] = rl[0] ^ superscalarAdd6;
ldr x12, superscalarAdd6
ldp x12, x13, [x7, 48]
eor x6, x0, x12
# rl[7] = rl[0] ^ superscalarAdd7;
ldr x12, superscalarAdd7
eor x7, x0, x12
eor x7, x0, x13
b DECL(randomx_calc_dataset_item_aarch64_prefetch)
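Note: the superscalar constants are now fetched pairwise through an adr base held in x7 (x7 is only overwritten by the final eor), and the register line is seeded exactly as the comments describe. A scalar model of that seeding, with names taken from the comments above (this is not the generated code):

    #include <cstdint>
    void seedRegisterLine(uint64_t itemNumber, const uint64_t c[8], uint64_t rl[8]) {
        rl[0] = (itemNumber + 1) * c[0];      // c[0] = superscalarMul0
        for (int i = 1; i < 8; ++i)
            rl[i] = rl[0] ^ c[i];             // c[1..7] = superscalarAdd1..superscalarAdd7
    }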