Diffstat (limited to 'sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S')
 -rw-r--r--  sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S | 837
 1 file changed, 837 insertions, 0 deletions
diff --git a/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S
new file mode 100644
index 000000000000..c9d925a7bc77
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S
@@ -0,0 +1,837 @@
+/* Do not modify. This file is auto-generated from ecp_sm2p256-armv8.pl. */
+#include "arm_arch.h"
+.arch armv8-a
+.section .rodata
+
+.align 5
+// The polynomial p
+.Lpoly:
+.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
+// The order of polynomial n
+.Lord:
+.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
+// (p + 1) / 2
+.Lpoly_div_2:
+.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
+// (n + 1) / 2
+.Lord_div_2:
+.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
+
+.text
+
+// void bn_rshift1(BN_ULONG *a);
+.globl bn_rshift1
+.type bn_rshift1,%function
+.align 5
+bn_rshift1:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x0]
+	ldp x9,x10,[x0,#16]
+
+	// Right shift
+	extr x7,x8,x7,#1
+	extr x8,x9,x8,#1
+	extr x9,x10,x9,#1
+	lsr x10,x10,#1
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+
+	ret
+.size bn_rshift1,.-bn_rshift1
+
+// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl bn_sub
+.type bn_sub,%function
+.align 5
+bn_sub:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Subtraction
+	subs x7,x7,x11
+	sbcs x8,x8,x12
+	sbcs x9,x9,x13
+	sbc x10,x10,x14
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+
+	ret
+.size bn_sub,.-bn_sub
+
+// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2
+.type ecp_sm2p256_div_by_2,%function
+.align 5
+ecp_sm2p256_div_by_2:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+
+	// Save the least significant bit
+	mov x3,x7
+
+	// Right shift 1
+	extr x7,x8,x7,#1
+	extr x8,x9,x8,#1
+	extr x9,x10,x9,#1
+	lsr x10,x10,#1
+
+	// Load mod
+	adrp x2,.Lpoly_div_2
+	add x2,x2,#:lo12:.Lpoly_div_2
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Parity check
+	tst x3,#1
+	csel x11,xzr,x11,eq
+	csel x12,xzr,x12,eq
+	csel x13,xzr,x13,eq
+	csel x14,xzr,x14,eq
+
+	// Add
+	adds x7,x7,x11
+	adcs x8,x8,x12
+	adcs x9,x9,x13
+	adc x10,x10,x14
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+	ret
+.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
+
+// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2_mod_ord
+.type ecp_sm2p256_div_by_2_mod_ord,%function
+.align 5
+ecp_sm2p256_div_by_2_mod_ord:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+
+	// Save the least significant bit
+	mov x3,x7
+
+	// Right shift 1
+	extr x7,x8,x7,#1
+	extr x8,x9,x8,#1
+	extr x9,x10,x9,#1
+	lsr x10,x10,#1
+
+	// Load mod
+	adrp x2,.Lord_div_2
+	add x2,x2,#:lo12:.Lord_div_2
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Parity check
+	tst x3,#1
+	csel x11,xzr,x11,eq
+	csel x12,xzr,x12,eq
+	csel x13,xzr,x13,eq
+	csel x14,xzr,x14,eq
+
+	// Add
+	adds x7,x7,x11
+	adcs x8,x8,x12
+	adcs x9,x9,x13
+	adc x10,x10,x14
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+	ret
+.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
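The two halving routines above avoid a modular inversion of 2 by using the precomputed (p + 1)/2 and (n + 1)/2 tables: for odd a, a/2 mod m equals (a >> 1) + (m + 1)/2, and the sum never overflows 256 bits, which is why the assembly can end the chain with a plain adc. Below is a minimal C sketch of the same idea, assuming little-endian 4x64-bit limbs and a compiler with unsigned __int128; div_by_2_limbs is a hypothetical name for illustration, not an OpenSSL API.

#include <stdint.h>

/* Halving mod m via the (m + 1)/2 table: for odd a,
 * a/2 mod m == (a >> 1) + (m + 1)/2. Hypothetical helper,
 * 4x64-bit little-endian limbs. */
static void div_by_2_limbs(uint64_t r[4], const uint64_t a[4],
                           const uint64_t m_div_2[4])
{
    uint64_t mask = 0 - (a[0] & 1);   /* all-ones iff a is odd */

    /* Multi-word right shift by one bit: the extr/lsr pattern. */
    r[0] = (a[0] >> 1) | (a[1] << 63);
    r[1] = (a[1] >> 1) | (a[2] << 63);
    r[2] = (a[2] >> 1) | (a[3] << 63);
    r[3] = a[3] >> 1;

    /* Branch-free add of (m + 1)/2 or 0, like the csel chain. */
    unsigned __int128 acc = 0;
    for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)r[i] + (m_div_2[i] & mask);
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
}

The csel-on-parity trick keeps the routine constant-time: the constant is either added or replaced by zero, but the add always executes.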
+
+// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_mul_by_3
+.type ecp_sm2p256_mul_by_3,%function
+.align 5
+ecp_sm2p256_mul_by_3:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+
+	// 2*a
+	adds x7,x7,x7
+	adcs x8,x8,x8
+	adcs x9,x9,x9
+	adcs x10,x10,x10
+	adcs x15,xzr,xzr
+
+	mov x3,x7
+	mov x4,x8
+	mov x5,x9
+	mov x6,x10
+
+	// Sub polynomial
+	adrp x2,.Lpoly
+	add x2,x2,#:lo12:.Lpoly
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+	subs x7,x7,x11
+	sbcs x8,x8,x12
+	sbcs x9,x9,x13
+	sbcs x10,x10,x14
+	sbcs x15,x15,xzr
+
+	csel x7,x7,x3,cs
+	csel x8,x8,x4,cs
+	csel x9,x9,x5,cs
+	csel x10,x10,x6,cs
+	eor x15,x15,x15
+
+	// 3*a
+	ldp x11,x12,[x1]
+	ldp x13,x14,[x1,#16]
+	adds x7,x7,x11
+	adcs x8,x8,x12
+	adcs x9,x9,x13
+	adcs x10,x10,x14
+	adcs x15,xzr,xzr
+
+	mov x3,x7
+	mov x4,x8
+	mov x5,x9
+	mov x6,x10
+
+	// Sub polynomial
+	adrp x2,.Lpoly
+	add x2,x2,#:lo12:.Lpoly
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+	subs x7,x7,x11
+	sbcs x8,x8,x12
+	sbcs x9,x9,x13
+	sbcs x10,x10,x14
+	sbcs x15,x15,xzr
+
+	csel x7,x7,x3,cs
+	csel x8,x8,x4,cs
+	csel x9,x9,x5,cs
+	csel x10,x10,x6,cs
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+
+	ret
+.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
+
+// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_add
+.type ecp_sm2p256_add,%function
+.align 5
+ecp_sm2p256_add:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Addition
+	adds x7,x7,x11
+	adcs x8,x8,x12
+	adcs x9,x9,x13
+	adcs x10,x10,x14
+	adc x15,xzr,xzr
+
+	// Load polynomial
+	adrp x2,.Lpoly
+	add x2,x2,#:lo12:.Lpoly
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Backup Addition
+	mov x3,x7
+	mov x4,x8
+	mov x5,x9
+	mov x6,x10
+
+	// Sub polynomial
+	subs x3,x3,x11
+	sbcs x4,x4,x12
+	sbcs x5,x5,x13
+	sbcs x6,x6,x14
+	sbcs x15,x15,xzr
+
+	// Select based on carry
+	csel x7,x7,x3,cc
+	csel x8,x8,x4,cc
+	csel x9,x9,x5,cc
+	csel x10,x10,x6,cc
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+	ret
+.size ecp_sm2p256_add,.-ecp_sm2p256_add
+
+// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub
+.type ecp_sm2p256_sub,%function
+.align 5
+ecp_sm2p256_sub:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Subtraction
+	subs x7,x7,x11
+	sbcs x8,x8,x12
+	sbcs x9,x9,x13
+	sbcs x10,x10,x14
+	sbc x15,xzr,xzr
+
+	// Load polynomial
+	adrp x2,.Lpoly
+	add x2,x2,#:lo12:.Lpoly
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Backup subtraction
+	mov x3,x7
+	mov x4,x8
+	mov x5,x9
+	mov x6,x10
+
+	// Add polynomial
+	adds x3,x3,x11
+	adcs x4,x4,x12
+	adcs x5,x5,x13
+	adcs x6,x6,x14
+	tst x15,x15
+
+	// Select based on carry
+	csel x7,x7,x3,eq
+	csel x8,x8,x4,eq
+	csel x9,x9,x5,eq
+	csel x10,x10,x6,eq
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+	ret
+.size ecp_sm2p256_sub,.-ecp_sm2p256_sub
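ecp_sm2p256_mul_by_3, ecp_sm2p256_add, and ecp_sm2p256_sub all follow one pattern: run the full-width carry chain, do a trial subtraction (or re-addition) of the modulus, then pick the corrected value with csel so no branch is taken. A minimal C model of the add path follows, assuming 4x64-bit little-endian limbs; mod_add_sketch is a hypothetical name, not the OpenSSL API, and the final ternary is written for clarity where the assembly stays branch-free.

#include <stdint.h>

/* Modular add with conditional correction (hypothetical helper):
 * full 256-bit add, trial subtraction of p, keep the reduced
 * value iff a + b >= p. */
static void mod_add_sketch(uint64_t r[4], const uint64_t a[4],
                           const uint64_t b[4], const uint64_t p[4])
{
    uint64_t sum[4], dif[4];

    /* adds/adcs chain, keeping the carry out (adc x15,xzr,xzr). */
    unsigned __int128 acc = 0;
    for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)a[i] + b[i];
        sum[i] = (uint64_t)acc;
        acc >>= 64;
    }
    uint64_t carry = (uint64_t)acc;

    /* subs/sbcs chain: trial subtraction of p, tracking the borrow. */
    uint64_t borrow = 0;
    for (int i = 0; i < 4; i++) {
        unsigned __int128 t = (unsigned __int128)sum[i] - p[i] - borrow;
        dif[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }

    /* a + b >= p iff the 2^256 carry survives the borrow. */
    int reduced = (carry != 0) | (borrow == 0);
    for (int i = 0; i < 4; i++)
        r[i] = reduced ? dif[i] : sum[i];
}

The subtraction path is the mirror image: if the subs/sbcs chain borrows, the modulus is added back once, which is what the tst/csel-eq sequence in ecp_sm2p256_sub selects.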
+
+// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub_mod_ord
+.type ecp_sm2p256_sub_mod_ord,%function
+.align 5
+ecp_sm2p256_sub_mod_ord:
+	AARCH64_VALID_CALL_TARGET
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Subtraction
+	subs x7,x7,x11
+	sbcs x8,x8,x12
+	sbcs x9,x9,x13
+	sbcs x10,x10,x14
+	sbc x15,xzr,xzr
+
+	// Load polynomial
+	adrp x2,.Lord
+	add x2,x2,#:lo12:.Lord
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+	// Backup subtraction
+	mov x3,x7
+	mov x4,x8
+	mov x5,x9
+	mov x6,x10
+
+	// Add polynomial
+	adds x3,x3,x11
+	adcs x4,x4,x12
+	adcs x5,x5,x13
+	adcs x6,x6,x14
+	tst x15,x15
+
+	// Select based on carry
+	csel x7,x7,x3,eq
+	csel x8,x8,x4,eq
+	csel x9,x9,x5,eq
+	csel x10,x10,x6,eq
+
+	// Store results
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+	ret
+.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
+
+.macro RDC
+	// a = | s7 | ... | s0 |, where si are 64-bit quantities
+	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
+	// | s7 | s6 | s5 | s4 |
+	// | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
+	// | s3 | s2 | s1 | s0 |
+	// | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
+	// =================================================
+	// | a8  | a11 | a10 | a9  | a8  |  0  |    s4     | (+)
+	// | a9  | a15 |    s6     | a11 |  0  | a10 | a9  | (+)
+	// | a10 |  0  | a14 | a13 | a12 |  0  |    s5     | (+)
+	// | a11 |  0  |    s7     | a13 |  0  | a12 | a11 | (+)
+	// | a12 |  0  |    s7     | a13 |  0  |    s6     | (+)
+	// | a12 |  0  |  0  | a15 | a14 |  0  | a14 | a13 | (+)
+	// | a13 |  0  |  0  |  0  | a15 |  0  | a14 | a13 | (+)
+	// | a13 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
+	// | a14 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
+	// | a14 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
+	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
+	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  |  0  | (+)
+	// |    s7     |  0  |  0  |  0  |  0  |  0  |  0  | (+)
+	// |  0  |  0  |  0  |  0  |  0  | a8  |  0  |  0  | (-)
+	// |  0  |  0  |  0  |  0  |  0  | a9  |  0  |  0  | (-)
+	// |  0  |  0  |  0  |  0  |  0  | a13 |  0  |  0  | (-)
+	// |  0  |  0  |  0  |  0  |  0  | a14 |  0  |  0  | (-)
+	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
+	// |   V[3]    |   V[2]    |   V[1]    |   V[0]    |
+
+	// 1. 64-bit addition
+	// t2=s6+s7+s7
+	adds x5,x13,x14
+	adcs x4,xzr,xzr
+	adds x5,x5,x14
+	adcs x4,x4,xzr
+	// t3=s4+s5+t2
+	adds x6,x11,x5
+	adcs x15,x4,xzr
+	adds x6,x6,x12
+	adcs x15,x15,xzr
+	// sum
+	adds x7,x7,x6
+	adcs x8,x8,x15
+	adcs x9,x9,x5
+	adcs x10,x10,x14
+	adcs x3,xzr,xzr
+	adds x10,x10,x4
+	adcs x3,x3,xzr
+
+	stp x7,x8,[sp,#32]
+	stp x9,x10,[sp,#48]
+
+	// 2. 64-bit to 32-bit spread
+	mov x4,#0xffffffff
+	mov x7,x11
+	mov x8,x12
+	mov x9,x13
+	mov x10,x14
+	and x7,x7,x4    // a8
+	and x8,x8,x4    // a10
+	and x9,x9,x4    // a12
+	and x10,x10,x4  // a14
+	lsr x11,x11,#32 // a9
+	lsr x12,x12,#32 // a11
+	lsr x13,x13,#32 // a13
+	lsr x14,x14,#32 // a15
+
+	// 3. 32-bit addition
+	add x4,x10,x9   // t1 <- a12 + a14
+	add x5,x14,x13  // t2 <- a13 + a15
+	add x6,x7,x11   // t3 <- a8 + a9
+	add x15,x10,x8  // t4 <- a10 + a14
+	add x14,x14,x12 // a15 <- a11 + a15
+	add x9,x5,x4    // a12 <- a12 + a13 + a14 + a15
+	add x8,x8,x9    // a10 <- a10 + a12 + a13 + a14 + a15
+	add x8,x8,x9    // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
+	add x8,x8,x6    // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
+	add x8,x8,x12   // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+	add x9,x9,x13   // a12 <- a12 + 2*a13 + a14 + a15
+	add x9,x9,x12   // a12 <- a11 + a12 + 2*a13 + a14 + a15
+	add x9,x9,x7    // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
+	add x6,x6,x10   // t3 <- a8 + a9 + a14
+	add x6,x6,x13   // t3 <- a8 + a9 + a13 + a14
+	add x11,x11,x5  // a9 <- a9 + a13 + a15
+	add x12,x12,x11 // a11 <- a9 + a11 + a13 + a15
+	add x12,x12,x5  // a11 <- a9 + a11 + 2*(a13 + a15)
+	add x4,x4,x15   // t1 <- a10 + a12 + 2*a14
+
+	// U[0]  s5  a9 + a11 + 2*(a13 + a15)
+	// U[1]  t1  a10 + a12 + 2*a14
+	// U[2] -t3  a8 + a9 + a13 + a14
+	// U[3]  s2  a8 + a11 + a12 + 2*a13 + a14 + a15
+	// U[4]  s4  a9 + a13 + a15
+	// U[5]  t4  a10 + a14
+	// U[6]  s7  a11 + a15
+	// U[7]  s1  a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+
+	// 4. 32-bit to 64-bit
+	lsl x7,x4,#32
+	extr x4,x9,x4,#32
+	extr x9,x15,x9,#32
+	extr x15,x8,x15,#32
+	lsr x8,x8,#32
+
+	// 5. 64-bit addition
+	adds x12,x12,x7
+	adcs x4,x4,xzr
+	adcs x11,x11,x9
+	adcs x14,x14,x15
+	adcs x3,x3,x8
+
+	// V[0] s5
+	// V[1] t1
+	// V[2] s4
+	// V[3] s7
+	// carry t0
+	// sub t3
+
+	// 6. Process s0-s3
+	ldp x7,x8,[sp,#32]
+	ldp x9,x10,[sp,#48]
+	// add with V0-V3
+	adds x7,x7,x12
+	adcs x8,x8,x4
+	adcs x9,x9,x11
+	adcs x10,x10,x14
+	adcs x3,x3,xzr
+	// sub with t3
+	subs x8,x8,x6
+	sbcs x9,x9,xzr
+	sbcs x10,x10,xzr
+	sbcs x3,x3,xzr
+
+	// 7. MOD
+	// First Mod
+	lsl x4,x3,#32
+	subs x5,x4,x3
+
+	adds x7,x7,x3
+	adcs x8,x8,x5
+	adcs x9,x9,xzr
+	adcs x10,x10,x4
+
+	// Last Mod
+	// return y - p if y > p else y
+	mov x11,x7
+	mov x12,x8
+	mov x13,x9
+	mov x14,x10
+
+	adrp x3,.Lpoly
+	add x3,x3,#:lo12:.Lpoly
+	ldp x4,x5,[x3]
+	ldp x6,x15,[x3,#16]
+
+	adcs x16,xzr,xzr
+
+	subs x7,x7,x4
+	sbcs x8,x8,x5
+	sbcs x9,x9,x6
+	sbcs x10,x10,x15
+	sbcs x16,x16,xzr
+
+	csel x7,x7,x11,cs
+	csel x8,x8,x12,cs
+	csel x9,x9,x13,cs
+	csel x10,x10,x14,cs
+
+.endm
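The RDC macro is the usual fast reduction for the SM2 prime. Because p = 2^256 - 2^224 - 2^96 + 2^64 - 1, any weight at bit 256 or above folds back in via 2^256 == 2^224 + 2^96 - 2^64 + 1 (mod p); the (+)/(-) rows of the table are that congruence applied per 32-bit word a8..a15. The "First Mod" step applies the same identity to a single carry word. Below is a hedged C sketch of just that carry fold, assuming 4x64-bit little-endian limbs; fold_carry is an illustrative name, not the OpenSSL API.

#include <stdint.h>

/* Fold a carry word c of weight 2^256 back into y (hypothetical
 * helper): y += c * (2^224 + 2^96 - 2^64 + 1), built from c,
 * c << 32, and (c << 32) - c exactly as the lsl/subs/adds
 * sequence does. Returns the carry out, which the "Last Mod"
 * conditional subtraction of p consumes. */
static uint64_t fold_carry(uint64_t y[4], uint64_t c)
{
    uint64_t hi = c << 32;    /* lsl x4,x3,#32 */
    uint64_t mid = hi - c;    /* subs x5,x4,x3: c * (2^32 - 1) */

    unsigned __int128 acc = (unsigned __int128)y[0] + c;   /* + c * 1 */
    y[0] = (uint64_t)acc; acc >>= 64;
    acc += (unsigned __int128)y[1] + mid;  /* + c * (2^96 - 2^64) */
    y[1] = (uint64_t)acc; acc >>= 64;
    acc += y[2];
    y[2] = (uint64_t)acc; acc >>= 64;
    acc += (unsigned __int128)y[3] + hi;   /* + c * 2^224 */
    y[3] = (uint64_t)acc;
    return (uint64_t)(acc >> 64);
}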
+
+// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl ecp_sm2p256_mul
+.type ecp_sm2p256_mul,%function
+.align 5
+ecp_sm2p256_mul:
+	AARCH64_SIGN_LINK_REGISTER
+	// Store scalar registers
+	stp x29,x30,[sp,#-80]!
+	add x29,sp,#0
+	stp x16,x17,[sp,#16]
+	stp x19,x20,[sp,#64]
+
+	// Load inputs
+	ldp x7,x8,[x1]
+	ldp x9,x10,[x1,#16]
+	ldp x11,x12,[x2]
+	ldp x13,x14,[x2,#16]
+
+// ### multiplication ###
+	// ========================
+	//             s3 s2 s1 s0
+	// *           s7 s6 s5 s4
+	// ------------------------
+	// +           s0 s0 s0 s0
+	//              *  *  *  *
+	//             s7 s6 s5 s4
+	//          s1 s1 s1 s1
+	//           *  *  *  *
+	//          s7 s6 s5 s4
+	//       s2 s2 s2 s2
+	//        *  *  *  *
+	//       s7 s6 s5 s4
+	//    s3 s3 s3 s3
+	//     *  *  *  *
+	//    s7 s6 s5 s4
+	// ------------------------
+	// s7 s6 s5 s4 s3 s2 s1 s0
+	// ========================
+
+// ### s0*s4 ###
+	mul x16,x7,x11
+	umulh x5,x7,x11
+
+// ### s1*s4 + s0*s5 ###
+	mul x3,x8,x11
+	umulh x4,x8,x11
+	adds x5,x5,x3
+	adcs x6,x4,xzr
+
+	mul x3,x7,x12
+	umulh x4,x7,x12
+	adds x5,x5,x3
+	adcs x6,x6,x4
+	adcs x15,xzr,xzr
+
+// ### s2*s4 + s1*s5 + s0*s6 ###
+	mul x3,x9,x11
+	umulh x4,x9,x11
+	adds x6,x6,x3
+	adcs x15,x15,x4
+
+	mul x3,x8,x12
+	umulh x4,x8,x12
+	adds x6,x6,x3
+	adcs x15,x15,x4
+	adcs x17,xzr,xzr
+
+	mul x3,x7,x13
+	umulh x4,x7,x13
+	adds x6,x6,x3
+	adcs x15,x15,x4
+	adcs x17,x17,xzr
+
+// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
+	mul x3,x10,x11
+	umulh x4,x10,x11
+	adds x15,x15,x3
+	adcs x17,x17,x4
+	adcs x19,xzr,xzr
+
+	mul x3,x9,x12
+	umulh x4,x9,x12
+	adds x15,x15,x3
+	adcs x17,x17,x4
+	adcs x19,x19,xzr
+
+	mul x3,x8,x13
+	umulh x4,x8,x13
+	adds x15,x15,x3
+	adcs x17,x17,x4
+	adcs x19,x19,xzr
+
+	mul x3,x7,x14
+	umulh x4,x7,x14
+	adds x15,x15,x3
+	adcs x17,x17,x4
+	adcs x19,x19,xzr
+
+// ### s3*s5 + s2*s6 + s1*s7 ###
+	mul x3,x10,x12
+	umulh x4,x10,x12
+	adds x17,x17,x3
+	adcs x19,x19,x4
+	adcs x20,xzr,xzr
+
+	mul x3,x9,x13
+	umulh x4,x9,x13
+	adds x17,x17,x3
+	adcs x19,x19,x4
+	adcs x20,x20,xzr
+
+	mul x3,x8,x14
+	umulh x4,x8,x14
+	adds x11,x17,x3
+	adcs x19,x19,x4
+	adcs x20,x20,xzr
+
+// ### s3*s6 + s2*s7 ###
+	mul x3,x10,x13
+	umulh x4,x10,x13
+	adds x19,x19,x3
+	adcs x20,x20,x4
+	adcs x17,xzr,xzr
+
+	mul x3,x9,x14
+	umulh x4,x9,x14
+	adds x12,x19,x3
+	adcs x20,x20,x4
+	adcs x17,x17,xzr
+
+// ### s3*s7 ###
+	mul x3,x10,x14
+	umulh x4,x10,x14
+	adds x13,x20,x3
+	adcs x14,x17,x4
+
+	mov x7,x16
+	mov x8,x5
+	mov x9,x6
+	mov x10,x15
+
+	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+
+// ### Reduction ###
+	RDC
+
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+
+	// Restore scalar registers
+	ldp x16,x17,[sp,#16]
+	ldp x19,x20,[sp,#64]
+	ldp x29,x30,[sp],#80
+
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size ecp_sm2p256_mul,.-ecp_sm2p256_mul
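ecp_sm2p256_mul is plain product-scanning (schoolbook) multiplication: each 64x64 product is split into low/high halves with mul/umulh and summed column by column with adds/adcs chains, spilling into a third accumulator limb (the x15/x17/x19/x20 registers). A compact C model of the same column walk, assuming 4x64-bit little-endian limbs; mul_256_sketch is an illustrative name, not the OpenSSL API.

#include <stdint.h>

/* Product-scanning 256x256 -> 512-bit multiply (hypothetical
 * helper). Column k collects every a[i]*b[j] with i + j == k
 * into a three-limb accumulator, the role the mul/umulh +
 * adds/adcs groups play in the assembly. RDC then reduces the
 * eight-limb result. */
static void mul_256_sketch(uint64_t s[8], const uint64_t a[4],
                           const uint64_t b[4])
{
    uint64_t t0 = 0, t1 = 0, t2 = 0;   /* 192-bit column accumulator */

    for (int k = 0; k < 7; k++) {
        int lo = k <= 3 ? 0 : k - 3;
        int hi = k <= 3 ? k : 3;
        for (int i = lo; i <= hi; i++) {
            unsigned __int128 m = (unsigned __int128)a[i] * b[k - i];
            unsigned __int128 u = (unsigned __int128)t0 + (uint64_t)m;
            t0 = (uint64_t)u;
            u = (unsigned __int128)t1 + (uint64_t)(m >> 64) + (uint64_t)(u >> 64);
            t1 = (uint64_t)u;
            t2 += (uint64_t)(u >> 64);
        }
        s[k] = t0;          /* retire limb k, shift the window */
        t0 = t1; t1 = t2; t2 = 0;
    }
    s[7] = t0;
}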
+
+// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
+.globl ecp_sm2p256_sqr
+.type ecp_sm2p256_sqr,%function
+.align 5
+
+ecp_sm2p256_sqr:
+	AARCH64_SIGN_LINK_REGISTER
+	// Store scalar registers
+	stp x29,x30,[sp,#-80]!
+	add x29,sp,#0
+	stp x16,x17,[sp,#16]
+	stp x19,x20,[sp,#64]
+
+	// Load inputs
+	ldp x11,x12,[x1]
+	ldp x13,x14,[x1,#16]
+
+// ### square ###
+	// ========================
+	//             s7 s6 s5 s4
+	// *           s7 s6 s5 s4
+	// ------------------------
+	// +           s4 s4 s4 s4
+	//              *  *  *  *
+	//             s7 s6 s5 s4
+	//          s5 s5 s5 s5
+	//           *  *  *  *
+	//          s7 s6 s5 s4
+	//       s6 s6 s6 s6
+	//        *  *  *  *
+	//       s7 s6 s5 s4
+	//    s7 s7 s7 s7
+	//     *  *  *  *
+	//    s7 s6 s5 s4
+	// ------------------------
+	// s7 s6 s5 s4 s3 s2 s1 s0
+	// ========================
+
+// ### s4*s5 ###
+	mul x8,x11,x12
+	umulh x9,x11,x12
+
+// ### s4*s6 ###
+	mul x3,x13,x11
+	umulh x10,x13,x11
+	adds x9,x9,x3
+	adcs x10,x10,xzr
+
+// ### s4*s7 + s5*s6 ###
+	mul x3,x14,x11
+	umulh x4,x14,x11
+	adds x10,x10,x3
+	adcs x7,x4,xzr
+
+	mul x3,x13,x12
+	umulh x4,x13,x12
+	adds x10,x10,x3
+	adcs x7,x7,x4
+	adcs x5,xzr,xzr
+
+// ### s5*s7 ###
+	mul x3,x14,x12
+	umulh x4,x14,x12
+	adds x7,x7,x3
+	adcs x5,x5,x4
+
+// ### s6*s7 ###
+	mul x3,x14,x13
+	umulh x4,x14,x13
+	adds x5,x5,x3
+	adcs x6,x4,xzr
+
+// ### 2*(t3,t2,s0,s3,s2,s1) ###
+	adds x8,x8,x8
+	adcs x9,x9,x9
+	adcs x10,x10,x10
+	adcs x7,x7,x7
+	adcs x5,x5,x5
+	adcs x6,x6,x6
+	adcs x15,xzr,xzr
+
+// ### s4*s4 ###
+	mul x16,x11,x11
+	umulh x17,x11,x11
+
+// ### s5*s5 ###
+	mul x11,x12,x12
+	umulh x12,x12,x12
+
+// ### s6*s6 ###
+	mul x3,x13,x13
+	umulh x4,x13,x13
+
+// ### s7*s7 ###
+	mul x19,x14,x14
+	umulh x20,x14,x14
+
+	adds x8,x8,x17
+	adcs x9,x9,x11
+	adcs x10,x10,x12
+	adcs x7,x7,x3
+	adcs x5,x5,x4
+	adcs x6,x6,x19
+	adcs x15,x15,x20
+
+	mov x11,x7
+	mov x7,x16
+	mov x12,x5
+	mov x13,x6
+	mov x14,x15
+
+	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+
+// ### Reduction ###
+	RDC
+
+	stp x7,x8,[x0]
+	stp x9,x10,[x0,#16]
+
+	// Restore scalar registers
+	ldp x16,x17,[sp,#16]
+	ldp x19,x20,[sp,#64]
+	ldp x29,x30,[sp],#80
+
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
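ecp_sm2p256_sqr saves multiplies the standard way: only the six off-diagonal products are computed, the partial result is doubled with a single carry chain (the adds x8,x8,x8 run), and the four diagonal squares are added last before the shared RDC macro reduces the 512-bit result. A C model under the same limb assumptions; sqr_256_sketch is an illustrative name, not the OpenSSL API.

#include <stdint.h>

/* Squaring with the cross-term shortcut (hypothetical helper):
 * six off-diagonal products, one doubling pass, then the four
 * diagonal squares. */
static void sqr_256_sketch(uint64_t s[8], const uint64_t a[4])
{
    uint64_t t[8] = {0};

    /* Off-diagonal products a[i]*a[j], i < j, each taken once. */
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < 4; j++) {
            unsigned __int128 m = (unsigned __int128)a[i] * a[j]
                                + t[i + j] + carry;
            t[i + j] = (uint64_t)m;
            carry = (uint64_t)(m >> 64);
        }
        t[i + 4] = carry;
    }

    /* Double the cross terms: the adds/adcs doubling chain. */
    uint64_t msb = 0;
    for (int i = 0; i < 8; i++) {
        uint64_t next = t[i] >> 63;
        t[i] = (t[i] << 1) | msb;
        msb = next;
    }

    /* Add a[i]^2 into limbs 2i and 2i+1, propagating one carry. */
    unsigned __int128 acc = 0;
    for (int i = 0; i < 4; i++) {
        unsigned __int128 d = (unsigned __int128)a[i] * a[i];
        acc += (unsigned __int128)t[2 * i] + (uint64_t)d;
        t[2 * i] = (uint64_t)acc; acc >>= 64;
        acc += (unsigned __int128)t[2 * i + 1] + (uint64_t)(d >> 64);
        t[2 * i + 1] = (uint64_t)acc; acc >>= 64;
    }

    for (int i = 0; i < 8; i++)
        s[i] = t[i];
}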
