From 93f506799948f3c476bae2a3eb45a75647f29e8d Mon Sep 17 00:00:00 2001 From: Christopher Wright <22761542+xtophyr@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:15:01 -0400 Subject: [PATCH] minor Aarch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc) --- src/crypto/randomx/jit_compiler_a64.cpp | 95 +++++++------ src/crypto/randomx/jit_compiler_a64_static.S | 139 ++++++++----------- src/crypto/randomx/reciprocal.c | 5 +- 3 files changed, 112 insertions(+), 127 deletions(-) diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index 860503081..6192cdeca 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -67,7 +67,6 @@ constexpr uint32_t LDR_LITERAL = 0x58000000; constexpr uint32_t ROR = 0x9AC02C00; constexpr uint32_t ROR_IMM = 0x93C00000; constexpr uint32_t MOV_REG = 0xAA0003E0; -constexpr uint32_t MOV_VREG_EL = 0x6E080400; constexpr uint32_t FADD = 0x4E60D400; constexpr uint32_t FSUB = 0x4EE0D400; constexpr uint32_t FEOR = 0x6E201C00; @@ -102,7 +101,7 @@ static size_t CalcDatasetItemSize() ((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result); } -constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; +constexpr uint8_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) : hugePages(hugePagesJIT && hugePagesEnable), @@ -128,11 +127,12 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con uint32_t codePos = MainLoopBegin + 4; + uint32_t mask = ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10); // and w16, w10, ScratchpadL3Mask64 - emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); + emit32(0x121A0000 | 16 | (10 << 5) | mask, code, codePos); // and w17, w20, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); + emit32(0x121A0000 | 17 | (20 << 5) | mask, code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -155,13 +155,14 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; emit32(ARMV8A::B | (offset / 4), code, codePos); - // and w20, w20, CacheLineAlignMask + mask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10); + // and w20, w9, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 20 | (20 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); + emit32(0x121A0000 | 20 | (9 << 5) | mask, code, codePos); // and w10, w10, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 10 | (10 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); + emit32(0x121A0000 | 10 | (10 << 5) | mask, code, codePos); // Update spMix1 // eor x10, config.readReg0, config.readReg1 @@ -497,9 +498,12 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, if (src != dst) { imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); - emitAddImmediate(tmp_reg, src, imm, code, k); + uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + if (imm) + emitAddImmediate(tmp_reg, src, imm, code, k); + else + t = 0x927d0000 | tmp_reg | (src << 5); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); @@ -511,10 +515,18 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, else { imm = (imm & ScratchpadL3Mask) >> 3; - emitMovImmediate(tmp_reg, imm, code, k); + if (imm) + { + emitMovImmediate(tmp_reg, imm, code, k); - // ldr tmp_reg, [x2, tmp_reg, lsl 3] - emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + // ldr tmp_reg, [x2, tmp_reg, lsl 3] + emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + } + else + { + // ldr tmp_reg, [x2] + emit32(0xf9400040 | tmp_reg, code, k); + } } codePos = k; @@ -529,25 +541,22 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co constexpr uint32_t tmp_reg = 19; imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); - emitAddImmediate(tmp_reg, src, imm, code, k); + uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + if (imm) + emitAddImmediate(tmp_reg, src, imm, code, k); + else + t = 0x927d0000 | tmp_reg | (src << 5); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k); - // add tmp_reg, x2, tmp_reg - emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k); + // ldr tmp_reg_fp, [x2, tmp_reg] + emit32(0x3ce06800 | tmp_reg_fp | (2 << 5) | (tmp_reg << 16), code, k); - // ldpsw tmp_reg, tmp_reg + 1, [tmp_reg] - emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k); - - // ins tmp_reg_fp.d[0], tmp_reg - emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k); - - // ins tmp_reg_fp.d[1], tmp_reg + 1 - emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k); + // sxtl.2d tmp_reg_fp, tmp_reg_fp + emit32(0x0f20a400 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); // scvtf tmp_reg_fp.2d, tmp_reg_fp.2d emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); @@ -835,7 +844,8 @@ void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos) else { // ror dst, dst, imm - emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); + if ((instr.getImm32() & 63)) + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); } reg_changed_offset[instr.dst] = codePos; @@ -861,7 +871,8 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) else { // ror dst, dst, imm - emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); + if ((instr.getImm32() & 63)) + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); } reg_changed_offset[instr.dst] = k; @@ -894,13 +905,8 @@ void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos) const uint32_t dst = instr.dst + 16; - constexpr uint32_t tmp_reg_fp = 28; - constexpr uint32_t src_index1 = 1 << 14; - constexpr uint32_t dst_index1 = 1 << 20; - - emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k); - emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k); - emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k); + // ext dst.16b, dst.16b, dst.16b, #0x8 + emit32(0x6e004000 | dst | (dst << 5) | (dst << 16), code, k); codePos = k; } @@ -1029,11 +1035,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; - // ror tmp_reg, src, imm - emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); + if (instr.getImm32() & 63) + { + // ror tmp_reg, src, imm + emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); - // bfi fpcr_tmp_reg, tmp_reg, 40, 2 - emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); + // bfi fpcr_tmp_reg, tmp_reg, 40, 2 + emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); + } + else // no rotation + { + // bfi fpcr_tmp_reg, src, 40, 2 + emit32(0xB3580400 | fpcr_tmp_reg | (src << 5), code, k); + } // rbit tmp_reg, fpcr_tmp_reg emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k); @@ -1059,9 +1073,12 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) else imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1; - emitAddImmediate(tmp_reg, dst, imm, code, k); + uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + if (imm) + emitAddImmediate(tmp_reg, dst, imm, code, k); + else + t = 0x927d0000 | tmp_reg | (dst << 5); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10); diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S index e019c6b4b..b5d6183f8 100644 --- a/src/crypto/randomx/jit_compiler_a64_static.S +++ b/src/crypto/randomx/jit_compiler_a64_static.S @@ -100,9 +100,9 @@ # v26 -> "a2" # v27 -> "a3" # v28 -> temporary -# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff -# v30 -> E 'or' mask = 0x3*00000000******3*00000000****** -# v31 -> scale mask = 0x81f000000000000081f0000000000000 +# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff +# v30 -> E 'or' mask = 0x3*00000000******'3*00000000****** +# v31 -> scale mask = 0x80f0000000000000'80f0000000000000 .balign 4 DECL(randomx_program_aarch64): @@ -142,17 +142,14 @@ DECL(randomx_program_aarch64): ldp q26, q27, [x0, 224] # Load E 'and' mask - mov x16, 0x00FFFFFFFFFFFFFF - ins v29.d[0], x16 - ins v29.d[1], x16 + movi.2d v29, #0x00FFFFFFFFFFFFFF # Load E 'or' mask (stored in reg.f[0]) ldr q30, [x0, 64] # Load scale mask mov x16, 0x80f0000000000000 - ins v31.d[0], x16 - ins v31.d[1], x16 + dup v31.2d, x16 # Read fpcr mrs x8, fpcr @@ -162,35 +159,22 @@ DECL(randomx_program_aarch64): str x0, [sp, -16]! # Read literals - ldr x0, literal_x0 - ldr x11, literal_x11 - ldr x21, literal_x21 - ldr x22, literal_x22 - ldr x23, literal_x23 - ldr x24, literal_x24 - ldr x25, literal_x25 - ldr x26, literal_x26 - ldr x27, literal_x27 - ldr x28, literal_x28 - ldr x29, literal_x29 - ldr x30, literal_x30 + adr x30, literal_v0 + ldp q0, q1, [x30] + ldp q2, q3, [x30, 32] + ldp q4, q5, [x30, 64] + ldp q6, q7, [x30, 96] + ldp q8, q9, [x30, 128] + ldp q10, q11, [x30, 160] + ldp q12, q13, [x30, 192] + ldp q14, q15, [x30, 224] - ldr q0, literal_v0 - ldr q1, literal_v1 - ldr q2, literal_v2 - ldr q3, literal_v3 - ldr q4, literal_v4 - ldr q5, literal_v5 - ldr q6, literal_v6 - ldr q7, literal_v7 - ldr q8, literal_v8 - ldr q9, literal_v9 - ldr q10, literal_v10 - ldr q11, literal_v11 - ldr q12, literal_v12 - ldr q13, literal_v13 - ldr q14, literal_v14 - ldr q15, literal_v15 + ldp x0, x11, [x30, -96] // literal_x0 + ldp x21, x22, [x30, -80] // literal_x21 + ldp x23, x24, [x30, -64] // literal_x23 + ldp x25, x26, [x30, -48] // literal_x25 + ldp x27, x28, [x30, -32] // literal_x27 + ldp x29, x30, [x30, -16] // literal_x29 DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; @@ -221,40 +205,31 @@ DECL(randomx_program_aarch64_main_loop): eor x15, x15, x19 # Load group F registers (spAddr1) - ldpsw x20, x19, [x17] - ins v16.d[0], x20 - ins v16.d[1], x19 - ldpsw x20, x19, [x17, 8] - ins v17.d[0], x20 - ins v17.d[1], x19 - ldpsw x20, x19, [x17, 16] - ins v18.d[0], x20 - ins v18.d[1], x19 - ldpsw x20, x19, [x17, 24] - ins v19.d[0], x20 - ins v19.d[1], x19 - scvtf v16.2d, v16.2d - scvtf v17.2d, v17.2d - scvtf v18.2d, v18.2d - scvtf v19.2d, v19.2d + ldr q17, [x17] + sxtl.2d v16, v17 + scvtf.2d v16, v16 + sxtl2.2d v17, v17 + scvtf.2d v17, v17 + + ldr q19, [x17, 16] + sxtl.2d v18, v19 + scvtf.2d v18, v18 + sxtl2.2d v19, v19 + scvtf.2d v19, v19 # Load group E registers (spAddr1) - ldpsw x20, x19, [x17, 32] - ins v20.d[0], x20 - ins v20.d[1], x19 - ldpsw x20, x19, [x17, 40] - ins v21.d[0], x20 - ins v21.d[1], x19 - ldpsw x20, x19, [x17, 48] - ins v22.d[0], x20 - ins v22.d[1], x19 - ldpsw x20, x19, [x17, 56] - ins v23.d[0], x20 - ins v23.d[1], x19 - scvtf v20.2d, v20.2d - scvtf v21.2d, v21.2d - scvtf v22.2d, v22.2d - scvtf v23.2d, v23.2d + ldr q21, [x17, 32] + sxtl.2d v20, v21 + scvtf.2d v20, v20 + sxtl2.2d v21, v21 + scvtf.2d v21, v21 + + ldr q23, [x17, 48] + sxtl.2d v22, v23 + scvtf.2d v22, v22 + sxtl2.2d v23, v23 + scvtf.2d v23, v23 + and v20.16b, v20.16b, v29.16b and v21.16b, v21.16b, v29.16b and v22.16b, v22.16b, v29.16b @@ -310,10 +285,9 @@ DECL(randomx_program_aarch64_vm_instructions_end): eor x9, x9, x20 # Calculate dataset pointer for dataset prefetch - mov w20, w9 DECL(randomx_program_aarch64_cacheline_align_mask1): # Actual mask will be inserted by JIT compiler - and x20, x20, 1 + and x20, x9, 1 add x20, x20, x1 # Prefetch dataset data @@ -491,42 +465,39 @@ DECL(randomx_calc_dataset_item_aarch64): stp x10, x11, [sp, 80] stp x12, x13, [sp, 96] - ldr x12, superscalarMul0 + adr x7, superscalarMul0 + # superscalarMul0, superscalarAdd1 + ldp x12, x13, [x7] - mov x8, x0 - mov x9, x1 + ldp x8, x9, [sp] mov x10, x2 # rl[0] = (itemNumber + 1) * superscalarMul0; madd x0, x2, x12, x12 # rl[1] = rl[0] ^ superscalarAdd1; - ldr x12, superscalarAdd1 - eor x1, x0, x12 + eor x1, x0, x13 # rl[2] = rl[0] ^ superscalarAdd2; - ldr x12, superscalarAdd2 + ldp x12, x13, [x7, 16] eor x2, x0, x12 # rl[3] = rl[0] ^ superscalarAdd3; - ldr x12, superscalarAdd3 - eor x3, x0, x12 + eor x3, x0, x13 # rl[4] = rl[0] ^ superscalarAdd4; - ldr x12, superscalarAdd4 + ldp x12, x13, [x7, 32] eor x4, x0, x12 # rl[5] = rl[0] ^ superscalarAdd5; - ldr x12, superscalarAdd5 - eor x5, x0, x12 + eor x5, x0, x13 # rl[6] = rl[0] ^ superscalarAdd6; - ldr x12, superscalarAdd6 + ldp x12, x13, [x7, 48] eor x6, x0, x12 # rl[7] = rl[0] ^ superscalarAdd7; - ldr x12, superscalarAdd7 - eor x7, x0, x12 + eor x7, x0, x13 b DECL(randomx_calc_dataset_item_aarch64_prefetch) diff --git a/src/crypto/randomx/reciprocal.c b/src/crypto/randomx/reciprocal.c index 87cda2677..4b4e772fb 100644 --- a/src/crypto/randomx/reciprocal.c +++ b/src/crypto/randomx/reciprocal.c @@ -52,10 +52,7 @@ uint64_t randomx_reciprocal(uint64_t divisor) { uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; - unsigned bsr = 0; //highest set bit in divisor - - for (uint64_t bit = divisor; bit > 0; bit >>= 1) - bsr++; + unsigned bsr = 64 - __builtin_clzll(divisor); //highest set bit in divisor for (unsigned shift = 0; shift < bsr; shift++) { if (remainder >= divisor - remainder) {