erigon-pulse/crypto/bn256/cloudflare/mul_bmi2_amd64.h
Péter Szilágyi 1203c6a237
crypto/bn256: full switchover to cloudflare's code (#16301)
* crypto/bn256: full switchover to cloudflare's code

* crypto/bn256: only use cloudflare for optimized architectures

* crypto/bn256: upstream fallback for non-optimized code

* .travis, build: drop support for Go 1.8 (need type aliases)

* crypto/bn256/cloudflare: enable curve mul lattice optimization
2018-03-20 01:13:54 +09:00

113 lines
2.0 KiB
C

#define mulBMI2(a0,a1,a2,a3, rb) \
MOVQ a0, DX \
MOVQ $0, R13 \
MULXQ 0+rb, R8, R9 \
MULXQ 8+rb, AX, R10 \
ADDQ AX, R9 \
MULXQ 16+rb, AX, R11 \
ADCQ AX, R10 \
MULXQ 24+rb, AX, R12 \
ADCQ AX, R11 \
ADCQ $0, R12 \
ADCQ $0, R13 \
\
MOVQ a1, DX \
MOVQ $0, R14 \
MULXQ 0+rb, AX, BX \
ADDQ AX, R9 \
ADCQ BX, R10 \
MULXQ 16+rb, AX, BX \
ADCQ AX, R11 \
ADCQ BX, R12 \
ADCQ $0, R13 \
MULXQ 8+rb, AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
MULXQ 24+rb, AX, BX \
ADCQ AX, R12 \
ADCQ BX, R13 \
ADCQ $0, R14 \
\
MOVQ a2, DX \
MOVQ $0, R15 \
MULXQ 0+rb, AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
MULXQ 16+rb, AX, BX \
ADCQ AX, R12 \
ADCQ BX, R13 \
ADCQ $0, R14 \
MULXQ 8+rb, AX, BX \
ADDQ AX, R11 \
ADCQ BX, R12 \
MULXQ 24+rb, AX, BX \
ADCQ AX, R13 \
ADCQ BX, R14 \
ADCQ $0, R15 \
\
MOVQ a3, DX \
MULXQ 0+rb, AX, BX \
ADDQ AX, R11 \
ADCQ BX, R12 \
MULXQ 16+rb, AX, BX \
ADCQ AX, R13 \
ADCQ BX, R14 \
ADCQ $0, R15 \
MULXQ 8+rb, AX, BX \
ADDQ AX, R12 \
ADCQ BX, R13 \
MULXQ 24+rb, AX, BX \
ADCQ AX, R14 \
ADCQ BX, R15
#define gfpReduceBMI2() \
\ // m = (T * N') mod R, store m in R8:R9:R10:R11
MOVQ ·np+0(SB), DX \
MULXQ 0(SP), R8, R9 \
MULXQ 8(SP), AX, R10 \
ADDQ AX, R9 \
MULXQ 16(SP), AX, R11 \
ADCQ AX, R10 \
MULXQ 24(SP), AX, BX \
ADCQ AX, R11 \
\
MOVQ ·np+8(SB), DX \
MULXQ 0(SP), AX, BX \
ADDQ AX, R9 \
ADCQ BX, R10 \
MULXQ 16(SP), AX, BX \
ADCQ AX, R11 \
MULXQ 8(SP), AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
\
MOVQ ·np+16(SB), DX \
MULXQ 0(SP), AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
MULXQ 8(SP), AX, BX \
ADDQ AX, R11 \
\
MOVQ ·np+24(SB), DX \
MULXQ 0(SP), AX, BX \
ADDQ AX, R11 \
\
storeBlock(R8,R9,R10,R11, 64(SP)) \
\
\ // m * N
mulBMI2(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \
\
\ // Add the 512-bit intermediate to m*N
MOVQ $0, AX \
ADDQ 0(SP), R8 \
ADCQ 8(SP), R9 \
ADCQ 16(SP), R10 \
ADCQ 24(SP), R11 \
ADCQ 32(SP), R12 \
ADCQ 40(SP), R13 \
ADCQ 48(SP), R14 \
ADCQ 56(SP), R15 \
ADCQ $0, AX \
\
gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)