Diffstat (limited to 'sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S')
-rw-r--r--  sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S | 837
1 file changed, 837 insertions(+), 0 deletions(-)
diff --git a/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S
new file mode 100644
index 000000000000..c9d925a7bc77
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S
@@ -0,0 +1,837 @@
+/* Do not modify. This file is auto-generated from ecp_sm2p256-armv8.pl. */
+#include "arm_arch.h"
+.arch armv8-a
+.section .rodata
+
+.align 5
+// The polynomial p
+.Lpoly:
+.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
+// The order of polynomial n
+.Lord:
+.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
+// (p + 1) / 2
+.Lpoly_div_2:
+.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
+// (n + 1) / 2
+.Lord_div_2:
+.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
+
+.text
+
+// void bn_rshift1(BN_ULONG *a);
+.globl bn_rshift1
+.type bn_rshift1,%function
+.align 5
+bn_rshift1:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x0]
+ ldp x9,x10,[x0,#16]
+
+ // Right shift
+ extr x7,x8,x7,#1
+ extr x8,x9,x8,#1
+ extr x9,x10,x9,#1
+ lsr x10,x10,#1
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ ret
+.size bn_rshift1,.-bn_rshift1
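
For reference, a minimal portable C sketch of what bn_rshift1 computes, as I read it (BN_ULONG taken as uint64_t, four little-endian limbs as implied by the ldp/stp pairs; the _ref name is mine, not OpenSSL's). Each extr concatenates two adjacent limbs and extracts 64 bits starting at bit 1, which is the shift/or pair below.

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Shift a 256-bit little-endian number right by one bit, in place. */
static void bn_rshift1_ref(BN_ULONG a[4])
{
    a[0] = (a[0] >> 1) | (a[1] << 63);  /* extr x7,x8,x7,#1  */
    a[1] = (a[1] >> 1) | (a[2] << 63);  /* extr x8,x9,x8,#1  */
    a[2] = (a[2] >> 1) | (a[3] << 63);  /* extr x9,x10,x9,#1 */
    a[3] =  a[3] >> 1;                  /* lsr  x10,x10,#1   */
}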
+
+// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl bn_sub
+.type bn_sub,%function
+.align 5
+bn_sub:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Subtraction
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbc x10,x10,x14
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ ret
+.size bn_sub,.-bn_sub
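
bn_sub is a plain borrow-propagating subtraction over four limbs; the final borrow is discarded, so the result is taken modulo 2^256. A rough C equivalent, under the same uint64_t/little-endian assumptions and with an assumed helper name:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* r = a - b over four little-endian 64-bit limbs; the final borrow is
 * dropped, mirroring the subs/sbcs/sbc chain above. */
static void bn_sub_ref(BN_ULONG r[4], const BN_ULONG a[4], const BN_ULONG b[4])
{
    unsigned borrow = 0;
    for (int i = 0; i < 4; i++) {
        BN_ULONG d = a[i] - b[i];
        unsigned b1 = a[i] < b[i];
        r[i] = d - borrow;
        borrow = b1 | (d < borrow);
    }
}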
+
+// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2
+.type ecp_sm2p256_div_by_2,%function
+.align 5
+ecp_sm2p256_div_by_2:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+
+ // Save the least significant bit
+ mov x3,x7
+
+ // Right shift 1
+ extr x7,x8,x7,#1
+ extr x8,x9,x8,#1
+ extr x9,x10,x9,#1
+ lsr x10,x10,#1
+
+ // Load mod
+ adrp x2,.Lpoly_div_2
+ add x2,x2,#:lo12:.Lpoly_div_2
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Parity check
+ tst x3,#1
+ csel x11,xzr,x11,eq
+ csel x12,xzr,x12,eq
+ csel x13,xzr,x13,eq
+ csel x14,xzr,x14,eq
+
+ // Add
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adc x10,x10,x14
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
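
The halving trick: after the right shift, the dropped low bit is recovered by conditionally adding the precomputed (p + 1)/2 from .Lpoly_div_2, since for odd a we have a/2 = (a - 1)/2 + (p + 1)/2 (mod p); the csel chain keeps the addition branch-free. The _mod_ord variant below is the same routine against .Lord_div_2. A sketch of the same computation (constant and helper names are mine; the conditional is written as a plain ternary rather than branch-free selects):

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* (p + 1) / 2, the value stored at .Lpoly_div_2 (little-endian limbs). */
static const BN_ULONG POLY_DIV_2[4] = {
    0x8000000000000000ULL, 0xffffffff80000000ULL,
    0xffffffffffffffffULL, 0x7fffffff7fffffffULL
};

/* r = a / 2 mod p for 0 <= a < p: shift right one bit, then fold the
 * dropped bit back in by adding (p + 1)/2 when a was odd. */
static void div_by_2_ref(BN_ULONG r[4], const BN_ULONG a[4])
{
    BN_ULONG odd = a[0] & 1;            /* saved least significant bit */
    r[0] = (a[0] >> 1) | (a[1] << 63);
    r[1] = (a[1] >> 1) | (a[2] << 63);
    r[2] = (a[2] >> 1) | (a[3] << 63);
    r[3] =  a[3] >> 1;

    unsigned carry = 0;
    for (int i = 0; i < 4; i++) {
        BN_ULONG add = odd ? POLY_DIV_2[i] : 0;
        BN_ULONG s = r[i] + add;
        unsigned c1 = s < r[i];
        r[i] = s + carry;
        carry = c1 | (r[i] < s);
    }
    /* For a < p the sum stays below p, so no final reduction is needed,
     * which is why the assembly ends with a plain adc. */
}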
+
+// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2_mod_ord
+.type ecp_sm2p256_div_by_2_mod_ord,%function
+.align 5
+ecp_sm2p256_div_by_2_mod_ord:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+
+ // Save the least significant bit
+ mov x3,x7
+
+ // Right shift 1
+ extr x7,x8,x7,#1
+ extr x8,x9,x8,#1
+ extr x9,x10,x9,#1
+ lsr x10,x10,#1
+
+ // Load mod
+ adrp x2,.Lord_div_2
+ add x2,x2,#:lo12:.Lord_div_2
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Parity check
+ tst x3,#1
+ csel x11,xzr,x11,eq
+ csel x12,xzr,x12,eq
+ csel x13,xzr,x13,eq
+ csel x14,xzr,x14,eq
+
+ // Add
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adc x10,x10,x14
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
+
+// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_mul_by_3
+.type ecp_sm2p256_mul_by_3,%function
+.align 5
+ecp_sm2p256_mul_by_3:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+
+ // 2*a
+ adds x7,x7,x7
+ adcs x8,x8,x8
+ adcs x9,x9,x9
+ adcs x10,x10,x10
+ adcs x15,xzr,xzr
+
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Sub polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbcs x15,x15,xzr
+
+ csel x7,x7,x3,cs
+ csel x8,x8,x4,cs
+ csel x9,x9,x5,cs
+ csel x10,x10,x6,cs
+ eor x15,x15,x15
+
+ // 3*a
+ ldp x11,x12,[x1]
+ ldp x13,x14,[x1,#16]
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adcs x10,x10,x14
+ adcs x15,xzr,xzr
+
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Sub polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbcs x15,x15,xzr
+
+ csel x7,x7,x3,cs
+ csel x8,x8,x4,cs
+ csel x9,x9,x5,cs
+ csel x10,x10,x6,cs
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ ret
+.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
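
Multiplication by 3 is performed as two modular additions: 2*a is formed and conditionally reduced, then a is added back in and the sum conditionally reduced again; each reduction is one trial subtraction of p whose result is kept only when it does not borrow (the csel ...,cs pairs). A self-contained C sketch of that flow; the helper names (add4, sub4, cond_sub_p) are mine, not OpenSSL's:

#include <stdint.h>

typedef uint64_t BN_ULONG;

static const BN_ULONG P[4] = {          /* .Lpoly, little-endian limbs */
    0xffffffffffffffffULL, 0xffffffff00000000ULL,
    0xffffffffffffffffULL, 0xfffffffeffffffffULL
};

/* r = a + b over four limbs; returns the carry out of bit 255. */
static unsigned add4(BN_ULONG r[4], const BN_ULONG a[4], const BN_ULONG b[4])
{
    unsigned c = 0;
    for (int i = 0; i < 4; i++) {
        BN_ULONG s = a[i] + b[i];
        unsigned c1 = s < a[i];
        r[i] = s + c;
        c = c1 | (r[i] < s);
    }
    return c;
}

/* r = a - b over four limbs; returns the borrow out of bit 255. */
static unsigned sub4(BN_ULONG r[4], const BN_ULONG a[4], const BN_ULONG b[4])
{
    unsigned bw = 0;
    for (int i = 0; i < 4; i++) {
        BN_ULONG d = a[i] - b[i];
        unsigned b1 = a[i] < b[i];
        r[i] = d - bw;
        bw = b1 | (d < bw);
    }
    return bw;
}

/* Reduce t (plus a carry bit at weight 2^256) modulo p by one conditional
 * subtraction, the equivalent of the sbcs/csel block in the assembly. */
static void cond_sub_p(BN_ULONG t[4], unsigned carry)
{
    BN_ULONG u[4];
    unsigned borrow = sub4(u, t, P);
    if (carry || !borrow)               /* value >= p: keep the subtracted copy */
        for (int i = 0; i < 4; i++)
            t[i] = u[i];
}

/* r = 3*a mod p for 0 <= a < p: double, reduce, add a, reduce.
 * For simplicity this sketch assumes r and a do not alias; the assembly
 * supports aliasing because it reloads a from memory. */
static void mul_by_3_ref(BN_ULONG r[4], const BN_ULONG a[4])
{
    unsigned c = add4(r, a, a);
    cond_sub_p(r, c);
    c = add4(r, r, a);
    cond_sub_p(r, c);
}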
+
+// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_add
+.type ecp_sm2p256_add,%function
+.align 5
+ecp_sm2p256_add:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Addition
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adcs x10,x10,x14
+ adc x15,xzr,xzr
+
+ // Load polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Backup Addition
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Sub polynomial
+ subs x3,x3,x11
+ sbcs x4,x4,x12
+ sbcs x5,x5,x13
+ sbcs x6,x6,x14
+ sbcs x15,x15,xzr
+
+ // Select based on carry
+ csel x7,x7,x3,cc
+ csel x8,x8,x4,cc
+ csel x9,x9,x5,cc
+ csel x10,x10,x6,cc
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_add,.-ecp_sm2p256_add
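
ecp_sm2p256_add follows the same add-then-conditionally-subtract pattern, with the carry out of the addition kept in x15 and folded into the trial subtraction. Reusing the hypothetical add4 and cond_sub_p helpers from the ecp_sm2p256_mul_by_3 sketch above:

/* r = a + b mod p for inputs < p; the sum is < 2p, so at most one
 * subtraction of p is needed. */
static void mod_add_ref(BN_ULONG r[4], const BN_ULONG a[4], const BN_ULONG b[4])
{
    unsigned carry = add4(r, a, b);  /* the bit the assembly keeps in x15 */
    cond_sub_p(r, carry);            /* keep r - p only if it does not go negative */
}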
+
+// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub
+.type ecp_sm2p256_sub,%function
+.align 5
+ecp_sm2p256_sub:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Subtraction
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbc x15,xzr,xzr
+
+ // Load polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Backup subtraction
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Add polynomial
+ adds x3,x3,x11
+ adcs x4,x4,x12
+ adcs x5,x5,x13
+ adcs x6,x6,x14
+ tst x15,x15
+
+ // Select based on carry
+ csel x7,x7,x3,eq
+ csel x8,x8,x4,eq
+ csel x9,x9,x5,eq
+ csel x10,x10,x6,eq
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_sub,.-ecp_sm2p256_sub
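
For subtraction the correction runs the other way: if the raw subtract borrows, p is added back, and the tst/csel pair selects on the borrow word saved in x15. ecp_sm2p256_sub_mod_ord below is the same routine against the group order .Lord. A short sketch, again reusing sub4, add4 and P from the mul_by_3 sketch above:

/* r = a - b mod p for inputs < p. */
static void mod_sub_ref(BN_ULONG r[4], const BN_ULONG a[4], const BN_ULONG b[4])
{
    if (sub4(r, a, b))           /* borrow: the difference went negative... */
        (void)add4(r, r, P);     /* ...so add p back; the carry out cancels it */
}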
+
+// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub_mod_ord
+.type ecp_sm2p256_sub_mod_ord,%function
+.align 5
+ecp_sm2p256_sub_mod_ord:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Subtraction
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbc x15,xzr,xzr
+
+ // Load polynomial
+ adrp x2,.Lord
+ add x2,x2,#:lo12:.Lord
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Backup subtraction
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Add polynomial
+ adds x3,x3,x11
+ adcs x4,x4,x12
+ adcs x5,x5,x13
+ adcs x6,x6,x14
+ tst x15,x15
+
+ // Select based on carry
+ csel x7,x7,x3,eq
+ csel x8,x8,x4,eq
+ csel x9,x9,x5,eq
+ csel x10,x10,x6,eq
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
+
+.macro RDC
+ // a = | s7 | ... | s0 |, where si are 64-bit quantities
+ // = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
+ // | s7 | s6 | s5 | s4 |
+ // | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
+ // | s3 | s2 | s1 | s0 |
+ // | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
+ // =================================================
+ // | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+)
+ // | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+)
+ // | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+)
+ // | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+)
+ // | a12 | 0 | s7 | a13 | 0 | s6 | (+)
+ // | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+)
+ // | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+)
+ // | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ // | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ // | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ // | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ // | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-)
+ // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
+ // | V[3] | V[2] | V[1] | V[0] |
+
+ // 1. 64-bit addition
+ // t2=s6+s7+s7
+ adds x5,x13,x14
+ adcs x4,xzr,xzr
+ adds x5,x5,x14
+ adcs x4,x4,xzr
+ // t3=s4+s5+t2
+ adds x6,x11,x5
+ adcs x15,x4,xzr
+ adds x6,x6,x12
+ adcs x15,x15,xzr
+ // sum
+ adds x7,x7,x6
+ adcs x8,x8,x15
+ adcs x9,x9,x5
+ adcs x10,x10,x14
+ adcs x3,xzr,xzr
+ adds x10,x10,x4
+ adcs x3,x3,xzr
+
+ stp x7,x8,[sp,#32]
+ stp x9,x10,[sp,#48]
+
+ // 2. 64-bit to 32-bit spread
+ mov x4,#0xffffffff
+ mov x7,x11
+ mov x8,x12
+ mov x9,x13
+ mov x10,x14
+ and x7,x7,x4 // a8
+ and x8,x8,x4 // a10
+ and x9,x9,x4 // a12
+ and x10,x10,x4 // a14
+ lsr x11,x11,#32 // a9
+ lsr x12,x12,#32 // a11
+ lsr x13,x13,#32 // a13
+ lsr x14,x14,#32 // a15
+
+ // 3. 32-bit addition
+ add x4,x10,x9 // t1 <- a12 + a14
+ add x5,x14,x13 // t2 <- a13 + a15
+ add x6,x7,x11 // t3 <- a8 + a9
+ add x15,x10,x8 // t4 <- a10 + a14
+ add x14,x14,x12 // a15 <- a11 + a15
+ add x9,x5,x4 // a12 <- a12 + a13 + a14 + a15
+ add x8,x8,x9 // a10 <- a10 + a12 + a13 + a14 + a15
+ add x8,x8,x9 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
+ add x8,x8,x6 // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
+ add x8,x8,x12 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+ add x9,x9,x13 // a12 <- a12 + 2*a13 + a14 + a15
+ add x9,x9,x12 // a12 <- a11 + a12 + 2*a13 + a14 + a15
+ add x9,x9,x7 // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
+ add x6,x6,x10 // t3 <- a8 + a9 + a14
+ add x6,x6,x13 // t3 <- a8 + a9 + a13 + a14
+ add x11,x11,x5 // a9 <- a9 + a13 + a15
+ add x12,x12,x11 // a11 <- a9 + a11 + a13 + a15
+ add x12,x12,x5 // a11 <- a9 + a11 + 2*(a13 + a15)
+ add x4,x4,x15 // t1 <- a10 + a12 + 2*a14
+
+ // U[0] s5 a9 + a11 + 2*(a13 + a15)
+ // U[1] t1 a10 + a12 + 2*a14
+ // U[2] -t3 a8 + a9 + a13 + a14
+ // U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15
+ // U[4] s4 a9 + a13 + a15
+ // U[5] t4 a10 + a14
+ // U[6] s7 a11 + a15
+ // U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+
+ // 4. 32-bit to 64-bit
+ lsl x7,x4,#32
+ extr x4,x9,x4,#32
+ extr x9,x15,x9,#32
+ extr x15,x8,x15,#32
+ lsr x8,x8,#32
+
+ // 5. 64-bit addition
+ adds x12,x12,x7
+ adcs x4,x4,xzr
+ adcs x11,x11,x9
+ adcs x14,x14,x15
+ adcs x3,x3,x8
+
+ // V[0] s5
+ // V[1] t1
+ // V[2] s4
+ // V[3] s7
+ // carry t0
+ // sub t3
+
+ // 5. Process s0-s3
+ ldp x7,x8,[sp,#32]
+ ldp x9,x10,[sp,#48]
+ // add with V0-V3
+ adds x7,x7,x12
+ adcs x8,x8,x4
+ adcs x9,x9,x11
+ adcs x10,x10,x14
+ adcs x3,x3,xzr
+ // sub with t3
+ subs x8,x8,x6
+ sbcs x9,x9,xzr
+ sbcs x10,x10,xzr
+ sbcs x3,x3,xzr
+
+ // 6. MOD
+ // First Mod
+ lsl x4,x3,#32
+ subs x5,x4,x3
+
+ adds x7,x7,x3
+ adcs x8,x8,x5
+ adcs x9,x9,xzr
+ adcs x10,x10,x4
+
+ // Last Mod
+ // return y - p if y > p else y
+ mov x11,x7
+ mov x12,x8
+ mov x13,x9
+ mov x14,x10
+
+ adrp x3,.Lpoly
+ add x3,x3,#:lo12:.Lpoly
+ ldp x4,x5,[x3]
+ ldp x6,x15,[x3,#16]
+
+ adcs x16,xzr,xzr
+
+ subs x7,x7,x4
+ sbcs x8,x8,x5
+ sbcs x9,x9,x6
+ sbcs x10,x10,x15
+ sbcs x16,x16,xzr
+
+ csel x7,x7,x11,cs
+ csel x8,x8,x12,cs
+ csel x9,x9,x13,cs
+ csel x10,x10,x14,cs
+
+.endm
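
The table at the top of RDC is the 32-bit term-by-term expansion of the SM2 fast reduction. The arithmetic behind it is that p = 2^256 - 2^224 - 2^96 + 2^64 - 1, so 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p) and anything above bit 255 can be folded back into the low four limbs. The "First Mod" step does exactly that for the single leftover carry word in x3. A small sketch of just that fold, assuming (as holds here) that the carry word t is far below 2^32 so none of the shifted products overflow; the function name is mine:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Fold one small overflow word t (sitting at weight 2^256) back into a
 * 4-limb residue, using 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p).  This is
 * the "First Mod" step: lsl builds t<<32, subs builds (t<<32) - t, and the
 * adds/adcs chain adds them in at limbs 0, 1 and 3. */
static unsigned fold_carry(BN_ULONG r[4], BN_ULONG t)
{
    const BN_ULONG add[4] = {
        t,                /* t * 1,            contribution to limb 0 */
        (t << 32) - t,    /* t * (2^96 - 2^64), contribution to limb 1 */
        0,                /* nothing lands in limb 2                   */
        t << 32           /* t * 2^224,         contribution to limb 3 */
    };
    unsigned c = 0;
    for (int i = 0; i < 4; i++) {
        BN_ULONG s = r[i] + add[i];
        unsigned c1 = s < r[i];
        r[i] = s + c;
        c = c1 | (r[i] < s);
    }
    return c;  /* absorbed by the final conditional subtraction ("Last Mod") */
}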
+
+// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl ecp_sm2p256_mul
+.type ecp_sm2p256_mul,%function
+.align 5
+ecp_sm2p256_mul:
+ AARCH64_SIGN_LINK_REGISTER
+ // Store scalar registers
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x16,x17,[sp,#16]
+ stp x19,x20,[sp,#64]
+
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+// ### multiplication ###
+ // ========================
+ // s3 s2 s1 s0
+ // * s7 s6 s5 s4
+ // ------------------------
+ // + s0 s0 s0 s0
+ // * * * *
+ // s7 s6 s5 s4
+ // s1 s1 s1 s1
+ // * * * *
+ // s7 s6 s5 s4
+ // s2 s2 s2 s2
+ // * * * *
+ // s7 s6 s5 s4
+ // s3 s3 s3 s3
+ // * * * *
+ // s7 s6 s5 s4
+ // ------------------------
+ // s7 s6 s5 s4 s3 s2 s1 s0
+ // ========================
+
+// ### s0*s4 ###
+ mul x16,x7,x11
+ umulh x5,x7,x11
+
+// ### s1*s4 + s0*s5 ###
+ mul x3,x8,x11
+ umulh x4,x8,x11
+ adds x5,x5,x3
+ adcs x6,x4,xzr
+
+ mul x3,x7,x12
+ umulh x4,x7,x12
+ adds x5,x5,x3
+ adcs x6,x6,x4
+ adcs x15,xzr,xzr
+
+// ### s2*s4 + s1*s5 + s0*s6 ###
+ mul x3,x9,x11
+ umulh x4,x9,x11
+ adds x6,x6,x3
+ adcs x15,x15,x4
+
+ mul x3,x8,x12
+ umulh x4,x8,x12
+ adds x6,x6,x3
+ adcs x15,x15,x4
+ adcs x17,xzr,xzr
+
+ mul x3,x7,x13
+ umulh x4,x7,x13
+ adds x6,x6,x3
+ adcs x15,x15,x4
+ adcs x17,x17,xzr
+
+// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
+ mul x3,x10,x11
+ umulh x4,x10,x11
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,xzr,xzr
+
+ mul x3,x9,x12
+ umulh x4,x9,x12
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,x19,xzr
+
+ mul x3,x8,x13
+ umulh x4,x8,x13
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,x19,xzr
+
+ mul x3,x7,x14
+ umulh x4,x7,x14
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,x19,xzr
+
+// ### s3*s5 + s2*s6 + s1*s7 ###
+ mul x3,x10,x12
+ umulh x4,x10,x12
+ adds x17,x17,x3
+ adcs x19,x19,x4
+ adcs x20,xzr,xzr
+
+ mul x3,x9,x13
+ umulh x4,x9,x13
+ adds x17,x17,x3
+ adcs x19,x19,x4
+ adcs x20,x20,xzr
+
+ mul x3,x8,x14
+ umulh x4,x8,x14
+ adds x11,x17,x3
+ adcs x19,x19,x4
+ adcs x20,x20,xzr
+
+// ### s3*s6 + s2*s7 ###
+ mul x3,x10,x13
+ umulh x4,x10,x13
+ adds x19,x19,x3
+ adcs x20,x20,x4
+ adcs x17,xzr,xzr
+
+ mul x3,x9,x14
+ umulh x4,x9,x14
+ adds x12,x19,x3
+ adcs x20,x20,x4
+ adcs x17,x17,xzr
+
+// ### s3*s7 ###
+ mul x3,x10,x14
+ umulh x4,x10,x14
+ adds x13,x20,x3
+ adcs x14,x17,x4
+
+ mov x7,x16
+ mov x8,x5
+ mov x9,x6
+ mov x10,x15
+
+ // result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+
+// ### Reduction ###
+ RDC
+
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ // Restore scalar registers
+ ldp x16,x17,[sp,#16]
+ ldp x19,x20,[sp,#64]
+ ldp x29,x30,[sp],#80
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_sm2p256_mul,.-ecp_sm2p256_mul
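
The multiplication itself is an ordinary 256x256 to 512-bit schoolbook product: every mul/umulh pair above is one 64x64 to 128-bit partial product, accumulated column by column into s0..s7 before RDC reduces the result. A rough C equivalent of the product stage only, written row by row rather than column by column and using the GCC/Clang unsigned __int128 extension in place of the separate mul/umulh halves:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* r = a * b as an 8-limb (512-bit) result; r must not overlap a or b. */
static void mul_256x256(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4])
{
    unsigned __int128 acc;
    for (int k = 0; k < 8; k++)
        r[k] = 0;
    for (int i = 0; i < 4; i++) {
        BN_ULONG carry = 0;
        for (int j = 0; j < 4; j++) {
            acc = (unsigned __int128)a[i] * b[j] + r[i + j] + carry;
            r[i + j] = (BN_ULONG)acc;
            carry    = (BN_ULONG)(acc >> 64);
        }
        r[i + 4] = carry;
    }
}

The column-wise ordering in the assembly only keeps the carry chains short; the 512-bit value it hands to RDC is the same.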
+
+// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
+.globl ecp_sm2p256_sqr
+.type ecp_sm2p256_sqr,%function
+.align 5
+
+ecp_sm2p256_sqr:
+ AARCH64_SIGN_LINK_REGISTER
+ // Store scalar registers
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x16,x17,[sp,#16]
+ stp x19,x20,[sp,#64]
+
+ // Load inputs
+ ldp x11,x12,[x1]
+ ldp x13,x14,[x1,#16]
+
+// ### square ###
+ // ========================
+ // s7 s6 s5 s4
+ // * s7 s6 s5 s4
+ // ------------------------
+ // + s4 s4 s4 s4
+ // * * * *
+ // s7 s6 s5 s4
+ // s5 s5 s5 s5
+ // * * * *
+ // s7 s6 s5 s4
+ // s6 s6 s6 s6
+ // * * * *
+ // s7 s6 s5 s4
+ // s7 s7 s7 s7
+ // * * * *
+ // s7 s6 s5 s4
+ // ------------------------
+ // s7 s6 s5 s4 s3 s2 s1 s0
+ // ========================
+
+// ### s4*s5 ###
+ mul x8,x11,x12
+ umulh x9,x11,x12
+
+// ### s4*s6 ###
+ mul x3,x13,x11
+ umulh x10,x13,x11
+ adds x9,x9,x3
+ adcs x10,x10,xzr
+
+// ### s4*s7 + s5*s6 ###
+ mul x3,x14,x11
+ umulh x4,x14,x11
+ adds x10,x10,x3
+ adcs x7,x4,xzr
+
+ mul x3,x13,x12
+ umulh x4,x13,x12
+ adds x10,x10,x3
+ adcs x7,x7,x4
+ adcs x5,xzr,xzr
+
+// ### s5*s7 ###
+ mul x3,x14,x12
+ umulh x4,x14,x12
+ adds x7,x7,x3
+ adcs x5,x5,x4
+
+// ### s6*s7 ###
+ mul x3,x14,x13
+ umulh x4,x14,x13
+ adds x5,x5,x3
+ adcs x6,x4,xzr
+
+// ### 2*(t3,t2,s0,s3,s2,s1) ###
+ adds x8,x8,x8
+ adcs x9,x9,x9
+ adcs x10,x10,x10
+ adcs x7,x7,x7
+ adcs x5,x5,x5
+ adcs x6,x6,x6
+ adcs x15,xzr,xzr
+
+// ### s4*s4 ###
+ mul x16,x11,x11
+ umulh x17,x11,x11
+
+// ### s5*s5 ###
+ mul x11,x12,x12
+ umulh x12,x12,x12
+
+// ### s6*s6 ###
+ mul x3,x13,x13
+ umulh x4,x13,x13
+
+// ### s7*s7 ###
+ mul x19,x14,x14
+ umulh x20,x14,x14
+
+ adds x8,x8,x17
+ adcs x9,x9,x11
+ adcs x10,x10,x12
+ adcs x7,x7,x3
+ adcs x5,x5,x4
+ adcs x6,x6,x19
+ adcs x15,x15,x20
+
+ mov x11,x7
+ mov x7,x16
+ mov x12,x5
+ mov x13,x6
+ mov x14,x15
+
+ // result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+
+// ### Reduction ###
+ RDC
+
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ // Restore scalar registers
+ ldp x16,x17,[sp,#16]
+ ldp x19,x20,[sp,#64]
+ ldp x29,x30,[sp],#80
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
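
Squaring saves close to half of the partial products: each cross term a_i*a_j with i < j is computed once and the whole accumulator is doubled (the "2*(t3,t2,s0,s3,s2,s1)" block), after which the diagonal squares a_i^2 are added in. A sketch of that shape for the pre-reduction 512-bit square, under the same __int128 convention as the multiplication sketch; the helper name is mine:

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* r = a * a as an 8-limb (512-bit) result; r must not overlap a. */
static void sqr_256(BN_ULONG r[8], const BN_ULONG a[4])
{
    unsigned __int128 acc;
    BN_ULONG carry;
    for (int k = 0; k < 8; k++)
        r[k] = 0;

    /* off-diagonal products a[i]*a[j], i < j, each taken once */
    for (int i = 0; i < 4; i++) {
        carry = 0;
        for (int j = i + 1; j < 4; j++) {
            acc = (unsigned __int128)a[i] * a[j] + r[i + j] + carry;
            r[i + j] = (BN_ULONG)acc;
            carry    = (BN_ULONG)(acc >> 64);
        }
        r[i + 4] = carry;
    }

    /* double the accumulator, matching the adds/adcs doubling block */
    carry = 0;
    for (int k = 0; k < 8; k++) {
        BN_ULONG t = (r[k] << 1) | carry;
        carry = r[k] >> 63;
        r[k] = t;
    }

    /* add the diagonal squares a[i]^2 at limb position 2*i */
    carry = 0;
    for (int i = 0; i < 4; i++) {
        acc = (unsigned __int128)a[i] * a[i] + r[2 * i] + carry;
        r[2 * i]     = (BN_ULONG)acc;
        acc = (acc >> 64) + r[2 * i + 1];
        r[2 * i + 1] = (BN_ULONG)acc;
        carry        = (BN_ULONG)(acc >> 64);
    }
}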