0
0
Fork 0
mirror of synced 2025-09-23 12:18:15 +00:00

Compare commits

...

2 commits

Author SHA1 Message Date
xtophyr
740cf37552
Merge ab57a1d621 into 6e4a5a6d94 2025-09-17 15:32:06 -04:00
Christopher Wright
ab57a1d621 minor Aarch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc) 2025-09-17 15:27:47 -04:00
3 changed files with 112 additions and 127 deletions

View file

@ -67,7 +67,6 @@ constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00; constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000; constexpr uint32_t ROR_IMM = 0x93C00000;
constexpr uint32_t MOV_REG = 0xAA0003E0; constexpr uint32_t MOV_REG = 0xAA0003E0;
constexpr uint32_t MOV_VREG_EL = 0x6E080400;
constexpr uint32_t FADD = 0x4E60D400; constexpr uint32_t FADD = 0x4E60D400;
constexpr uint32_t FSUB = 0x4EE0D400; constexpr uint32_t FSUB = 0x4EE0D400;
constexpr uint32_t FEOR = 0x6E201C00; constexpr uint32_t FEOR = 0x6E201C00;
@ -102,7 +101,7 @@ static size_t CalcDatasetItemSize()
((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result); ((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result);
} }
constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; constexpr uint8_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) : JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) :
hugePages(hugePagesJIT && hugePagesEnable), hugePages(hugePagesJIT && hugePagesEnable),
@ -128,11 +127,12 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
uint32_t codePos = MainLoopBegin + 4; uint32_t codePos = MainLoopBegin + 4;
uint32_t mask = ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10);
// and w16, w10, ScratchpadL3Mask64 // and w16, w10, ScratchpadL3Mask64
emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); emit32(0x121A0000 | 16 | (10 << 5) | mask, code, codePos);
// and w17, w20, ScratchpadL3Mask64 // and w17, w20, ScratchpadL3Mask64
emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); emit32(0x121A0000 | 17 | (20 << 5) | mask, code, codePos);
codePos = PrologueSize; codePos = PrologueSize;
literalPos = ImulRcpLiteralsEnd; literalPos = ImulRcpLiteralsEnd;
@ -155,13 +155,14 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
emit32(ARMV8A::B | (offset / 4), code, codePos); emit32(ARMV8A::B | (offset / 4), code, codePos);
// and w20, w20, CacheLineAlignMask mask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10);
// and w20, w9, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); emit32(0x121A0000 | 20 | (9 << 5) | mask, code, codePos);
// and w10, w10, CacheLineAlignMask // and w10, w10, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 10 | (10 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); emit32(0x121A0000 | 10 | (10 << 5) | mask, code, codePos);
// Update spMix1 // Update spMix1
// eor x10, config.readReg0, config.readReg1 // eor x10, config.readReg0, config.readReg1
@ -497,9 +498,12 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
if (src != dst) if (src != dst)
{ {
imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
if (imm)
emitAddImmediate(tmp_reg, src, imm, code, k); emitAddImmediate(tmp_reg, src, imm, code, k);
else
t = 0x927d0000 | tmp_reg | (src << 5);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
@ -511,11 +515,19 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
else else
{ {
imm = (imm & ScratchpadL3Mask) >> 3; imm = (imm & ScratchpadL3Mask) >> 3;
if (imm)
{
emitMovImmediate(tmp_reg, imm, code, k); emitMovImmediate(tmp_reg, imm, code, k);
// ldr tmp_reg, [x2, tmp_reg, lsl 3] // ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
} }
else
{
// ldr tmp_reg, [x2]
emit32(0xf9400040 | tmp_reg, code, k);
}
}
codePos = k; codePos = k;
} }
@ -529,25 +541,22 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
constexpr uint32_t tmp_reg = 19; constexpr uint32_t tmp_reg = 19;
imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
if (imm)
emitAddImmediate(tmp_reg, src, imm, code, k); emitAddImmediate(tmp_reg, src, imm, code, k);
else
t = 0x927d0000 | tmp_reg | (src << 5);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k); emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
// add tmp_reg, x2, tmp_reg // ldr tmp_reg_fp, [x2, tmp_reg]
emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k); emit32(0x3ce06800 | tmp_reg_fp | (2 << 5) | (tmp_reg << 16), code, k);
// ldpsw tmp_reg, tmp_reg + 1, [tmp_reg] // sxtl.2d tmp_reg_fp, tmp_reg_fp
emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k); emit32(0x0f20a400 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
// ins tmp_reg_fp.d[0], tmp_reg
emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k);
// ins tmp_reg_fp.d[1], tmp_reg + 1
emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k);
// scvtf tmp_reg_fp.2d, tmp_reg_fp.2d // scvtf tmp_reg_fp.2d, tmp_reg_fp.2d
emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
@ -835,6 +844,7 @@ void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos)
else else
{ {
// ror dst, dst, imm // ror dst, dst, imm
if ((instr.getImm32() & 63))
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
} }
@ -861,6 +871,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
else else
{ {
// ror dst, dst, imm // ror dst, dst, imm
if ((instr.getImm32() & 63))
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
} }
@ -894,13 +905,8 @@ void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos)
const uint32_t dst = instr.dst + 16; const uint32_t dst = instr.dst + 16;
constexpr uint32_t tmp_reg_fp = 28; // ext dst.16b, dst.16b, dst.16b, #0x8
constexpr uint32_t src_index1 = 1 << 14; emit32(0x6e004000 | dst | (dst << 5) | (dst << 16), code, k);
constexpr uint32_t dst_index1 = 1 << 20;
emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k);
emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k);
emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k);
codePos = k; codePos = k;
} }
@ -1029,11 +1035,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20; constexpr uint32_t tmp_reg = 20;
constexpr uint32_t fpcr_tmp_reg = 8; constexpr uint32_t fpcr_tmp_reg = 8;
if (instr.getImm32() & 63)
{
// ror tmp_reg, src, imm // ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2 // bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
}
else // no rotation
{
// bfi fpcr_tmp_reg, src, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (src << 5), code, k);
}
// rbit tmp_reg, fpcr_tmp_reg // rbit tmp_reg, fpcr_tmp_reg
emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k); emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k);
@ -1059,9 +1073,12 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
else else
imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1; imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1;
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
if (imm)
emitAddImmediate(tmp_reg, dst, imm, code, k); emitAddImmediate(tmp_reg, dst, imm, code, k);
else
t = 0x927d0000 | tmp_reg | (dst << 5);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10); const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10);

View file

@ -100,9 +100,9 @@
# v26 -> "a2" # v26 -> "a2"
# v27 -> "a3" # v27 -> "a3"
# v28 -> temporary # v28 -> temporary
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff # v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
# v30 -> E 'or' mask = 0x3*00000000******3*00000000****** # v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
# v31 -> scale mask = 0x81f000000000000081f0000000000000 # v31 -> scale mask = 0x80f0000000000000'80f0000000000000
.balign 4 .balign 4
DECL(randomx_program_aarch64): DECL(randomx_program_aarch64):
@ -142,17 +142,14 @@ DECL(randomx_program_aarch64):
ldp q26, q27, [x0, 224] ldp q26, q27, [x0, 224]
# Load E 'and' mask # Load E 'and' mask
mov x16, 0x00FFFFFFFFFFFFFF movi.2d v29, #0x00FFFFFFFFFFFFFF
ins v29.d[0], x16
ins v29.d[1], x16
# Load E 'or' mask (stored in reg.f[0]) # Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64] ldr q30, [x0, 64]
# Load scale mask # Load scale mask
mov x16, 0x80f0000000000000 mov x16, 0x80f0000000000000
ins v31.d[0], x16 dup v31.2d, x16
ins v31.d[1], x16
# Read fpcr # Read fpcr
mrs x8, fpcr mrs x8, fpcr
@ -162,35 +159,22 @@ DECL(randomx_program_aarch64):
str x0, [sp, -16]! str x0, [sp, -16]!
# Read literals # Read literals
ldr x0, literal_x0 adr x30, literal_v0
ldr x11, literal_x11 ldp q0, q1, [x30]
ldr x21, literal_x21 ldp q2, q3, [x30, 32]
ldr x22, literal_x22 ldp q4, q5, [x30, 64]
ldr x23, literal_x23 ldp q6, q7, [x30, 96]
ldr x24, literal_x24 ldp q8, q9, [x30, 128]
ldr x25, literal_x25 ldp q10, q11, [x30, 160]
ldr x26, literal_x26 ldp q12, q13, [x30, 192]
ldr x27, literal_x27 ldp q14, q15, [x30, 224]
ldr x28, literal_x28
ldr x29, literal_x29
ldr x30, literal_x30
ldr q0, literal_v0 ldp x0, x11, [x30, -96] // literal_x0
ldr q1, literal_v1 ldp x21, x22, [x30, -80] // literal_x21
ldr q2, literal_v2 ldp x23, x24, [x30, -64] // literal_x23
ldr q3, literal_v3 ldp x25, x26, [x30, -48] // literal_x25
ldr q4, literal_v4 ldp x27, x28, [x30, -32] // literal_x27
ldr q5, literal_v5 ldp x29, x30, [x30, -16] // literal_x29
ldr q6, literal_v6
ldr q7, literal_v7
ldr q8, literal_v8
ldr q9, literal_v9
ldr q10, literal_v10
ldr q11, literal_v11
ldr q12, literal_v12
ldr q13, literal_v13
ldr q14, literal_v14
ldr q15, literal_v15
DECL(randomx_program_aarch64_main_loop): DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr0 = spMix1 & ScratchpadL3Mask64;
@ -221,40 +205,31 @@ DECL(randomx_program_aarch64_main_loop):
eor x15, x15, x19 eor x15, x15, x19
# Load group F registers (spAddr1) # Load group F registers (spAddr1)
ldpsw x20, x19, [x17] ldr q17, [x17]
ins v16.d[0], x20 sxtl.2d v16, v17
ins v16.d[1], x19 scvtf.2d v16, v16
ldpsw x20, x19, [x17, 8] sxtl2.2d v17, v17
ins v17.d[0], x20 scvtf.2d v17, v17
ins v17.d[1], x19
ldpsw x20, x19, [x17, 16] ldr q19, [x17, 16]
ins v18.d[0], x20 sxtl.2d v18, v19
ins v18.d[1], x19 scvtf.2d v18, v18
ldpsw x20, x19, [x17, 24] sxtl2.2d v19, v19
ins v19.d[0], x20 scvtf.2d v19, v19
ins v19.d[1], x19
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d
# Load group E registers (spAddr1) # Load group E registers (spAddr1)
ldpsw x20, x19, [x17, 32] ldr q21, [x17, 32]
ins v20.d[0], x20 sxtl.2d v20, v21
ins v20.d[1], x19 scvtf.2d v20, v20
ldpsw x20, x19, [x17, 40] sxtl2.2d v21, v21
ins v21.d[0], x20 scvtf.2d v21, v21
ins v21.d[1], x19
ldpsw x20, x19, [x17, 48] ldr q23, [x17, 48]
ins v22.d[0], x20 sxtl.2d v22, v23
ins v22.d[1], x19 scvtf.2d v22, v22
ldpsw x20, x19, [x17, 56] sxtl2.2d v23, v23
ins v23.d[0], x20 scvtf.2d v23, v23
ins v23.d[1], x19
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d
and v20.16b, v20.16b, v29.16b and v20.16b, v20.16b, v29.16b
and v21.16b, v21.16b, v29.16b and v21.16b, v21.16b, v29.16b
and v22.16b, v22.16b, v29.16b and v22.16b, v22.16b, v29.16b
@ -310,10 +285,9 @@ DECL(randomx_program_aarch64_vm_instructions_end):
eor x9, x9, x20 eor x9, x9, x20
# Calculate dataset pointer for dataset prefetch # Calculate dataset pointer for dataset prefetch
mov w20, w9
DECL(randomx_program_aarch64_cacheline_align_mask1): DECL(randomx_program_aarch64_cacheline_align_mask1):
# Actual mask will be inserted by JIT compiler # Actual mask will be inserted by JIT compiler
and x20, x20, 1 and x20, x9, 1
add x20, x20, x1 add x20, x20, x1
# Prefetch dataset data # Prefetch dataset data
@ -491,42 +465,39 @@ DECL(randomx_calc_dataset_item_aarch64):
stp x10, x11, [sp, 80] stp x10, x11, [sp, 80]
stp x12, x13, [sp, 96] stp x12, x13, [sp, 96]
ldr x12, superscalarMul0 adr x7, superscalarMul0
# superscalarMul0, superscalarAdd1
ldp x12, x13, [x7]
mov x8, x0 ldp x8, x9, [sp]
mov x9, x1
mov x10, x2 mov x10, x2
# rl[0] = (itemNumber + 1) * superscalarMul0; # rl[0] = (itemNumber + 1) * superscalarMul0;
madd x0, x2, x12, x12 madd x0, x2, x12, x12
# rl[1] = rl[0] ^ superscalarAdd1; # rl[1] = rl[0] ^ superscalarAdd1;
ldr x12, superscalarAdd1 eor x1, x0, x13
eor x1, x0, x12
# rl[2] = rl[0] ^ superscalarAdd2; # rl[2] = rl[0] ^ superscalarAdd2;
ldr x12, superscalarAdd2 ldp x12, x13, [x7, 16]
eor x2, x0, x12 eor x2, x0, x12
# rl[3] = rl[0] ^ superscalarAdd3; # rl[3] = rl[0] ^ superscalarAdd3;
ldr x12, superscalarAdd3 eor x3, x0, x13
eor x3, x0, x12
# rl[4] = rl[0] ^ superscalarAdd4; # rl[4] = rl[0] ^ superscalarAdd4;
ldr x12, superscalarAdd4 ldp x12, x13, [x7, 32]
eor x4, x0, x12 eor x4, x0, x12
# rl[5] = rl[0] ^ superscalarAdd5; # rl[5] = rl[0] ^ superscalarAdd5;
ldr x12, superscalarAdd5 eor x5, x0, x13
eor x5, x0, x12
# rl[6] = rl[0] ^ superscalarAdd6; # rl[6] = rl[0] ^ superscalarAdd6;
ldr x12, superscalarAdd6 ldp x12, x13, [x7, 48]
eor x6, x0, x12 eor x6, x0, x12
# rl[7] = rl[0] ^ superscalarAdd7; # rl[7] = rl[0] ^ superscalarAdd7;
ldr x12, superscalarAdd7 eor x7, x0, x13
eor x7, x0, x12
b DECL(randomx_calc_dataset_item_aarch64_prefetch) b DECL(randomx_calc_dataset_item_aarch64_prefetch)

View file

@ -52,10 +52,7 @@ uint64_t randomx_reciprocal(uint64_t divisor) {
uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor;
unsigned bsr = 0; //highest set bit in divisor unsigned bsr = 64 - __builtin_clzll(divisor); //highest set bit in divisor
for (uint64_t bit = divisor; bit > 0; bit >>= 1)
bsr++;
for (unsigned shift = 0; shift < bsr; shift++) { for (unsigned shift = 0; shift < bsr; shift++) {
if (remainder >= divisor - remainder) { if (remainder >= divisor - remainder) {