Diffstat (limited to 'crypto')
-rw-r--r--  crypto/Makefile | 8
-rw-r--r--  crypto/aes/Makefile | 20
-rw-r--r--  crypto/aes/aes_wrap.c | 186
-rw-r--r--  crypto/aes/aes_x86core.c | 6
-rwxr-xr-x  crypto/aes/asm/aes-586.pl | 283
-rwxr-xr-x  crypto/aes/asm/aes-armv4.pl | 139
-rwxr-xr-x  crypto/aes/asm/aes-mips.pl | 813
-rwxr-xr-x  crypto/aes/asm/aes-ppc.pl | 113
-rwxr-xr-x  crypto/aes/asm/aes-x86_64.pl | 250
-rwxr-xr-x  crypto/aes/asm/aesni-mb-x86_64.pl | 1395
-rwxr-xr-x  crypto/aes/asm/aesni-sha1-x86_64.pl | 1121
-rwxr-xr-x  crypto/aes/asm/aesni-sha256-x86_64.pl | 1708
-rwxr-xr-x  crypto/aes/asm/aesni-x86.pl | 644
-rwxr-xr-x  crypto/aes/asm/aesni-x86_64.pl | 2917
-rwxr-xr-x  crypto/aes/asm/aesp8-ppc.pl | 1942
-rwxr-xr-x  crypto/aes/asm/aest4-sparcv9.pl | 919
-rwxr-xr-x  crypto/aes/asm/aesv8-armx.pl | 989
-rwxr-xr-x  crypto/aes/asm/bsaes-armv7.pl | 2469
-rwxr-xr-x  crypto/aes/asm/bsaes-x86_64.pl | 74
-rwxr-xr-x  crypto/aes/asm/vpaes-ppc.pl | 1512
-rwxr-xr-x  crypto/aes/asm/vpaes-x86.pl | 110
-rwxr-xr-x  crypto/aes/asm/vpaes-x86_64.pl | 100
-rw-r--r--  crypto/arm64cpuid.S | 46
-rw-r--r--  crypto/arm_arch.h | 35
-rw-r--r--  crypto/armcap.c | 97
-rw-r--r--  crypto/armv4cpuid.S | 109
-rw-r--r--  crypto/asn1/Makefile | 5
-rw-r--r--  crypto/asn1/a_gentm.c | 46
-rw-r--r--  crypto/asn1/a_time.c | 30
-rw-r--r--  crypto/asn1/a_utctm.c | 92
-rw-r--r--  crypto/asn1/ameth_lib.c | 24
-rw-r--r--  crypto/asn1/asn1.h | 10
-rw-r--r--  crypto/asn1/asn1_locl.h | 3
-rw-r--r--  crypto/asn1/t_x509.c | 15
-rw-r--r--  crypto/asn1/x_crl.c | 4
-rw-r--r--  crypto/asn1/x_x509.c | 20
-rw-r--r--  crypto/bio/b_dump.c | 25
-rw-r--r--  crypto/bio/b_sock.c | 8
-rw-r--r--  crypto/bio/bio.h | 8
-rw-r--r--  crypto/bio/bio_err.c | 2
-rw-r--r--  crypto/bio/bss_acpt.c | 2
-rw-r--r--  crypto/bio/bss_conn.c | 2
-rw-r--r--  crypto/bio/bss_dgram.c | 84
-rw-r--r--  crypto/bio/bss_fd.c | 22
-rw-r--r--  crypto/bn/Makefile | 20
-rwxr-xr-x  crypto/bn/asm/armv4-gf2m.pl | 169
-rwxr-xr-x  crypto/bn/asm/armv4-mont.pl | 484
-rwxr-xr-x  crypto/bn/asm/mips-mont.pl | 2
-rwxr-xr-x  crypto/bn/asm/mips.pl | 2
-rw-r--r--  crypto/bn/asm/mips3.s | 2201
-rwxr-xr-x  crypto/bn/asm/modexp512-x86_64.pl | 1497
-rwxr-xr-x  crypto/bn/asm/ppc-mont.pl | 1
-rw-r--r--  crypto/bn/asm/ppc.pl | 10
-rwxr-xr-x  crypto/bn/asm/ppc64-mont.pl | 660
-rwxr-xr-x  crypto/bn/asm/rsaz-avx2.pl | 1898
-rwxr-xr-x  crypto/bn/asm/rsaz-x86_64.pl | 2144
-rwxr-xr-x  crypto/bn/asm/sparct4-mont.pl | 1222
-rwxr-xr-x  crypto/bn/asm/sparcv9-gf2m.pl | 190
-rwxr-xr-x  crypto/bn/asm/vis3-mont.pl | 373
-rw-r--r--  crypto/bn/asm/x86_64-gcc.c | 110
-rwxr-xr-x  crypto/bn/asm/x86_64-mont.pl | 1289
-rwxr-xr-x  crypto/bn/asm/x86_64-mont5.pl | 2908
-rw-r--r--  crypto/bn/bn.h | 18
-rw-r--r--  crypto/bn/bn_asm.c | 243
-rw-r--r--  crypto/bn/bn_exp.c | 326
-rw-r--r--  crypto/bn/bn_gf2m.c | 3
-rw-r--r--  crypto/bn/bn_lcl.h | 27
-rw-r--r--  crypto/bn/bntest.c | 3
-rw-r--r--  crypto/bn/rsaz_exp.c | 346
-rw-r--r--  crypto/bn/rsaz_exp.h | 56
-rw-r--r--  crypto/buffer/buf_str.c | 11
-rw-r--r--  crypto/buffer/buffer.h | 1
-rw-r--r--  crypto/camellia/Makefile | 2
-rwxr-xr-x  crypto/camellia/asm/cmll-x86_64.pl | 4
-rwxr-xr-x  crypto/camellia/asm/cmllt4-sparcv9.pl | 929
-rw-r--r--  crypto/cast/cast_lcl.h | 2
-rw-r--r--  crypto/cms/Makefile | 49
-rw-r--r--  crypto/cms/cms.h | 52
-rw-r--r--  crypto/cms/cms_asn1.c | 86
-rw-r--r--  crypto/cms/cms_env.c | 320
-rw-r--r--  crypto/cms/cms_err.c | 18
-rw-r--r--  crypto/cms/cms_kari.c | 465
-rw-r--r--  crypto/cms/cms_lcl.h | 32
-rw-r--r--  crypto/cms/cms_lib.c | 59
-rw-r--r--  crypto/cms/cms_sd.c | 205
-rw-r--r--  crypto/cms/cms_smime.c | 54
-rw-r--r--  crypto/cryptlib.c | 35
-rw-r--r--  crypto/cversion.c | 4
-rw-r--r--  crypto/des/Makefile | 2
-rw-r--r--  crypto/des/asm/des-586.pl | 4
-rw-r--r--  crypto/des/asm/des_enc.m4 | 2
-rwxr-xr-x  crypto/des/asm/dest4-sparcv9.pl | 617
-rw-r--r--  crypto/des/des_locl.h | 4
-rw-r--r--  crypto/des/read_pwd.c | 2
-rw-r--r--  crypto/dh/Makefile | 50
-rw-r--r--  crypto/dh/dh.h | 105
-rw-r--r--  crypto/dh/dh_ameth.c | 519
-rw-r--r--  crypto/dh/dh_asn1.c | 98
-rw-r--r--  crypto/dh/dh_check.c | 46
-rw-r--r--  crypto/dh/dh_err.c | 8
-rw-r--r--  crypto/dh/dh_kdf.c | 187
-rw-r--r--  crypto/dh/dh_key.c | 14
-rw-r--r--  crypto/dh/dh_pmeth.c | 322
-rw-r--r--  crypto/dh/dh_rfc5114.c | 285
-rw-r--r--  crypto/dh/dhtest.c | 323
-rw-r--r--  crypto/dsa/dsa.h | 3
-rw-r--r--  crypto/dsa/dsa_ameth.c | 6
-rw-r--r--  crypto/dsa/dsa_err.c | 5
-rw-r--r--  crypto/dsa/dsa_gen.c | 369
-rw-r--r--  crypto/dsa/dsa_locl.h | 8
-rw-r--r--  crypto/dsa/dsa_ossl.c | 6
-rw-r--r--  crypto/dsa/dsa_pmeth.c | 4
-rw-r--r--  crypto/ebcdic.c | 2
-rw-r--r--  crypto/ec/Makefile | 41
-rwxr-xr-x  crypto/ec/asm/ecp_nistz256-avx2.pl | 2093
-rwxr-xr-x  crypto/ec/asm/ecp_nistz256-x86_64.pl | 2997
-rw-r--r--  crypto/ec/ec.h | 91
-rw-r--r--  crypto/ec/ec_ameth.c | 340
-rw-r--r--  crypto/ec/ec_curve.c | 643
-rw-r--r--  crypto/ec/ec_cvt.c | 12
-rw-r--r--  crypto/ec/ec_err.c | 15
-rw-r--r--  crypto/ec/ec_lcl.h | 25
-rw-r--r--  crypto/ec/ec_lib.c | 81
-rw-r--r--  crypto/ec/ec_pmeth.c | 220
-rw-r--r--  crypto/ec/eck_prn.c | 8
-rw-r--r--  crypto/ec/ecp_nistp521.c | 10
-rw-r--r--  crypto/ec/ecp_nistz256.c | 1521
-rw-r--r--  crypto/ec/ecp_nistz256_table.c | 9533
-rw-r--r--  crypto/ecdh/Makefile | 12
-rw-r--r--  crypto/ecdh/ecdh.h | 7
-rw-r--r--  crypto/ecdh/ecdhtest.c | 170
-rw-r--r--  crypto/ecdh/ech_kdf.c | 111
-rw-r--r--  crypto/ecdh/ech_ossl.c | 10
-rw-r--r--  crypto/ecdsa/ecdsa.h | 75
-rw-r--r--  crypto/ecdsa/ecs_err.c | 1
-rw-r--r--  crypto/ecdsa/ecs_lib.c | 77
-rw-r--r--  crypto/ecdsa/ecs_locl.h | 6
-rw-r--r--  crypto/ecdsa/ecs_ossl.c | 28
-rw-r--r--  crypto/engine/Makefile | 18
-rw-r--r--  crypto/engine/eng_all.c | 3
-rw-r--r--  crypto/engine/eng_cryptodev.c | 70
-rw-r--r--  crypto/engine/eng_rsax.c | 701
-rw-r--r--  crypto/engine/engine.h | 1
-rw-r--r--  crypto/evp/Makefile | 52
-rw-r--r--  crypto/evp/c_allc.c | 11
-rw-r--r--  crypto/evp/digest.c | 12
-rw-r--r--  crypto/evp/e_aes.c | 962
-rw-r--r--  crypto/evp/e_aes_cbc_hmac_sha1.c | 460
-rw-r--r--  crypto/evp/e_aes_cbc_hmac_sha256.c | 973
-rw-r--r--  crypto/evp/e_camellia.c | 321
-rw-r--r--  crypto/evp/e_des.c | 59
-rw-r--r--  crypto/evp/e_des3.c | 273
-rw-r--r--  crypto/evp/e_null.c | 3
-rw-r--r--  crypto/evp/encode.c | 2
-rw-r--r--  crypto/evp/evp.h | 56
-rw-r--r--  crypto/evp/evp_enc.c | 17
-rw-r--r--  crypto/evp/evp_err.c | 5
-rw-r--r--  crypto/evp/evp_extra_test.c | 2
-rw-r--r--  crypto/evp/evp_fips.c | 310
-rw-r--r--  crypto/evp/evp_lib.c | 62
-rw-r--r--  crypto/evp/evp_locl.h | 3
-rw-r--r--  crypto/evp/evp_test.c | 202
-rw-r--r--  crypto/evp/evptests.txt | 67
-rw-r--r--  crypto/evp/m_dss.c | 2
-rw-r--r--  crypto/evp/m_dss1.c | 3
-rw-r--r--  crypto/evp/m_ecdsa.c | 2
-rw-r--r--  crypto/evp/m_sha1.c | 28
-rw-r--r--  crypto/evp/m_sigver.c | 44
-rw-r--r--  crypto/evp/p_lib.c | 2
-rw-r--r--  crypto/evp/pmeth_lib.c | 6
-rw-r--r--  crypto/hmac/hm_ameth.c | 2
-rw-r--r--  crypto/hmac/hmac.c | 10
-rw-r--r--  crypto/hmac/hmactest.c | 3
-rw-r--r--  crypto/jpake/jpake.c | 1
-rw-r--r--  crypto/md32_common.h | 38
-rw-r--r--  crypto/md5/Makefile | 3
-rwxr-xr-x  crypto/md5/asm/md5-sparcv9.pl | 430
-rw-r--r--  crypto/md5/md5_locl.h | 2
-rw-r--r--  crypto/modes/Makefile | 24
-rwxr-xr-x  crypto/modes/asm/aesni-gcm-x86_64.pl | 1057
-rwxr-xr-x  crypto/modes/asm/ghash-armv4.pl | 232
-rwxr-xr-x  crypto/modes/asm/ghash-s390x.pl | 6
-rwxr-xr-x  crypto/modes/asm/ghash-sparcv9.pl | 247
-rwxr-xr-x  crypto/modes/asm/ghash-x86.pl | 199
-rwxr-xr-x  crypto/modes/asm/ghash-x86_64.pl | 1149
-rwxr-xr-x  crypto/modes/asm/ghashp8-ppc.pl | 234
-rwxr-xr-x  crypto/modes/asm/ghashv8-armx.pl | 409
-rw-r--r--  crypto/modes/cbc128.c | 2
-rw-r--r--  crypto/modes/gcm128.c | 98
-rw-r--r--  crypto/modes/modes.h | 10
-rw-r--r--  crypto/modes/modes_lcl.h | 50
-rw-r--r--  crypto/modes/wrap128.c | 138
-rw-r--r--  crypto/o_str.c | 2
-rw-r--r--  crypto/o_time.c | 126
-rw-r--r--  crypto/o_time.h | 2
-rw-r--r--  crypto/objects/obj_dat.h | 234
-rw-r--r--  crypto/objects/obj_mac.h | 163
-rw-r--r--  crypto/objects/obj_mac.num | 38
-rw-r--r--  crypto/objects/obj_xref.h | 25
-rw-r--r--  crypto/objects/obj_xref.txt | 12
-rw-r--r--  crypto/objects/objects.txt | 58
-rwxr-xr-x  crypto/objects/objxref.pl | 3
-rw-r--r--  crypto/ocsp/ocsp.h | 17
-rw-r--r--  crypto/ocsp/ocsp_ht.c | 151
-rwxr-xr-x  crypto/ocsp/ocsp_lib.c | 2
-rw-r--r--  crypto/opensslconf.h | 12
-rw-r--r--  crypto/opensslv.h | 6
-rw-r--r--  crypto/ossl_typ.h | 2
-rw-r--r--  crypto/pem/Makefile | 13
-rw-r--r--  crypto/pem/pem.h | 12
-rw-r--r--  crypto/pem/pem_all.c | 5
-rw-r--r--  crypto/pem/pem_err.c | 4
-rw-r--r--  crypto/pem/pem_lib.c | 17
-rw-r--r--  crypto/pem/pem_pkey.c | 50
-rwxr-xr-x  crypto/perlasm/ppc-xlate.pl | 81
-rwxr-xr-x  crypto/perlasm/sparcv9_modes.pl | 1687
-rwxr-xr-x  crypto/perlasm/x86_64-xlate.pl | 86
-rw-r--r--  crypto/perlasm/x86asm.pl | 38
-rwxr-xr-x  crypto/perlasm/x86gas.pl | 13
-rwxr-xr-x  crypto/perlasm/x86masm.pl | 4
-rw-r--r--  crypto/perlasm/x86nasm.pl | 4
-rw-r--r--  crypto/pkcs12/p12_decr.c | 12
-rw-r--r--  crypto/pkcs12/p12_p8e.c | 6
-rw-r--r--  crypto/ppc_arch.h | 10
-rw-r--r--  crypto/ppccap.c | 54
-rwxr-xr-x  crypto/ppccpuid.pl | 16
-rw-r--r--  crypto/rc4/Makefile | 4
-rw-r--r--  crypto/rc4/asm/rc4-586.pl | 8
-rw-r--r--  crypto/rc4/rc4_enc.c | 2
-rw-r--r--  crypto/rc5/rc5_locl.h | 5
-rw-r--r--  crypto/rsa/Makefile | 15
-rw-r--r--  crypto/rsa/rsa.h | 58
-rw-r--r--  crypto/rsa/rsa_ameth.c | 538
-rw-r--r--  crypto/rsa/rsa_asn1.c | 8
-rw-r--r--  crypto/rsa/rsa_err.c | 18
-rw-r--r--  crypto/rsa/rsa_oaep.c | 110
-rw-r--r--  crypto/rsa/rsa_pmeth.c | 159
-rw-r--r--  crypto/rsa/rsa_sign.c | 15
-rw-r--r--  crypto/sha/Makefile | 17
-rw-r--r--  crypto/sha/asm/sha1-586.pl | 357
-rwxr-xr-x  crypto/sha/asm/sha1-armv4-large.pl | 451
-rwxr-xr-x  crypto/sha/asm/sha1-armv8.pl | 338
-rwxr-xr-x  crypto/sha/asm/sha1-mb-x86_64.pl | 1574
-rwxr-xr-x  crypto/sha/asm/sha1-mips.pl | 98
-rwxr-xr-x  crypto/sha/asm/sha1-ppc.pl | 46
-rwxr-xr-x  crypto/sha/asm/sha1-sparcv9.pl | 165
-rwxr-xr-x  crypto/sha/asm/sha1-x86_64.pl | 1210
-rwxr-xr-x  crypto/sha/asm/sha256-586.pl | 1266
-rwxr-xr-x  crypto/sha/asm/sha256-armv4.pl | 644
-rwxr-xr-x  crypto/sha/asm/sha256-mb-x86_64.pl | 1560
-rwxr-xr-x  crypto/sha/asm/sha512-586.pl | 517
-rwxr-xr-x  crypto/sha/asm/sha512-armv4.pl | 69
-rwxr-xr-x  crypto/sha/asm/sha512-armv8.pl | 422
-rwxr-xr-x  crypto/sha/asm/sha512-ia64.pl | 351
-rwxr-xr-x  crypto/sha/asm/sha512-mips.pl | 79
-rwxr-xr-x  crypto/sha/asm/sha512-ppc.pl | 460
-rwxr-xr-x  crypto/sha/asm/sha512-sparcv9.pl | 320
-rwxr-xr-x  crypto/sha/asm/sha512-x86_64.pl | 2043
-rwxr-xr-x  crypto/sha/asm/sha512p8-ppc.pl | 424
-rw-r--r--  crypto/sha/sha512.c | 13
-rw-r--r--  crypto/sparc_arch.h | 101
-rw-r--r--  crypto/sparccpuid.S | 127
-rw-r--r--  crypto/sparcv9cap.c | 157
-rw-r--r--  crypto/srp/Makefile | 3
-rw-r--r--  crypto/srp/srptest.c | 1
-rw-r--r--  crypto/stack/safestack.h | 148
-rw-r--r--  crypto/stack/stack.c | 34
-rw-r--r--  crypto/stack/stack.h | 1
-rw-r--r--  crypto/symhacks.h | 34
-rw-r--r--  crypto/ts/ts_rsp_sign.c | 7
-rw-r--r--  crypto/ts/ts_rsp_verify.c | 2
-rw-r--r--  crypto/ui/ui_openssl.c | 2
-rwxr-xr-x  crypto/whrlpool/asm/wp-mmx.pl | 42
-rwxr-xr-x  crypto/whrlpool/asm/wp-x86_64.pl | 43
-rw-r--r--  crypto/x509/Makefile | 22
-rw-r--r--  crypto/x509/verify_extra_test.c | 3
-rw-r--r--  crypto/x509/vpm_int.h | 70
-rw-r--r--  crypto/x509/x509.h | 26
-rw-r--r--  crypto/x509/x509_cmp.c | 159
-rw-r--r--  crypto/x509/x509_err.c | 10
-rw-r--r--  crypto/x509/x509_lu.c | 26
-rw-r--r--  crypto/x509/x509_set.c | 5
-rw-r--r--  crypto/x509/x509_trs.c | 8
-rw-r--r--  crypto/x509/x509_txt.c | 20
-rw-r--r--  crypto/x509/x509_vfy.c | 338
-rw-r--r--  crypto/x509/x509_vfy.h | 56
-rw-r--r--  crypto/x509/x509_vpm.c | 246
-rw-r--r--  crypto/x509/x_all.c | 14
-rw-r--r--  crypto/x509v3/Makefile | 20
-rw-r--r--  crypto/x509v3/ext_dat.h | 3
-rw-r--r--  crypto/x509v3/v3_lib.c | 22
-rw-r--r--  crypto/x509v3/v3_purp.c | 29
-rw-r--r--  crypto/x509v3/v3_scts.c | 332
-rw-r--r--  crypto/x509v3/v3_utl.c | 427
-rw-r--r--  crypto/x509v3/v3err.c | 9
-rw-r--r--  crypto/x509v3/v3nametest.c | 336
-rw-r--r--  crypto/x509v3/x509v3.h | 44
-rw-r--r--  crypto/x86_64cpuid.pl | 29
-rw-r--r--  crypto/x86cpuid.pl | 31
299 files changed, 77947 insertions, 8500 deletions
diff --git a/crypto/Makefile b/crypto/Makefile
index 618c95878ce4..7869996a9c07 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -74,9 +74,9 @@ ia64cpuid.s: ia64cpuid.S; $(CC) $(CFLAGS) -E ia64cpuid.S > $@
ppccpuid.s: ppccpuid.pl; $(PERL) ppccpuid.pl $(PERLASM_SCHEME) $@
pariscid.s: pariscid.pl; $(PERL) pariscid.pl $(PERLASM_SCHEME) $@
alphacpuid.s: alphacpuid.pl
- (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
+ (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
$(PERL) alphacpuid.pl > $$preproc && \
- $(CC) -E $$preproc > $@ && rm $$preproc)
+ $(CC) -E -P $$preproc > $@ && rm $$preproc)
testapps:
[ -z "$(THIS)" ] || ( if echo $(SDIRS) | fgrep ' des '; \
@@ -88,7 +88,7 @@ subdirs:
@target=all; $(RECURSIVE_MAKE)
files:
- $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+ $(PERL) $(TOP)/util/files.pl "CPUID_OBJ=$(CPUID_OBJ)" Makefile >> $(TOP)/MINFO
@target=files; $(RECURSIVE_MAKE)
links:
@@ -102,7 +102,7 @@ lib: $(LIB)
@touch lib
$(LIB): $(LIBOBJ)
$(AR) $(LIB) $(LIBOBJ)
- [ -z "$(FIPSLIBDIR)" ] || $(AR) $(LIB) $(FIPSLIBDIR)fipscanister.o
+ test -z "$(FIPSLIBDIR)" || $(AR) $(LIB) $(FIPSLIBDIR)fipscanister.o
$(RANLIB) $(LIB) || echo Never mind.
shared: buildinf.h lib subdirs
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index b3a95812af3b..e825c140194f 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -65,12 +65,22 @@ aesni-x86_64.s: asm/aesni-x86_64.pl
$(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl
$(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-sha256-x86_64.s: asm/aesni-sha256-x86_64.pl
+ $(PERL) asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-mb-x86_64.s: asm/aesni-mb-x86_64.pl
+ $(PERL) asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME) > $@
aes-sparcv9.s: asm/aes-sparcv9.pl
$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+aest4-sparcv9.s: asm/aest4-sparcv9.pl ../perlasm/sparcv9_modes.pl
+ $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
aes-ppc.s: asm/aes-ppc.pl
$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
+vpaes-ppc.s: asm/vpaes-ppc.pl
+ $(PERL) asm/vpaes-ppc.pl $(PERLASM_SCHEME) $@
+aesp8-ppc.s: asm/aesp8-ppc.pl
+ $(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@
aes-parisc.s: asm/aes-parisc.pl
$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
@@ -78,12 +88,18 @@ aes-parisc.s: asm/aes-parisc.pl
aes-mips.S: asm/aes-mips.pl
$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
+aesv8-armx.S: asm/aesv8-armx.pl
+ $(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
+aesv8-armx.o: aesv8-armx.S
+
# GNU make "catch all"
aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
aes-armv4.o: aes-armv4.S
+bsaes-%.S: asm/bsaes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+bsaes-armv7.o: bsaes-armv7.S
files:
- $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+ $(PERL) $(TOP)/util/files.pl "AES_ENC=$(AES_ENC)" Makefile >> $(TOP)/MINFO
links:
@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
@@ -149,7 +165,7 @@ aes_wrap.o: ../../e_os.h ../../include/openssl/aes.h
aes_wrap.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
aes_wrap.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
aes_wrap.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
-aes_wrap.o: ../../include/openssl/opensslconf.h
+aes_wrap.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
aes_wrap.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
aes_wrap.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
aes_wrap.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_wrap.c
diff --git a/crypto/aes/aes_wrap.c b/crypto/aes/aes_wrap.c
index b1ab8e25ef37..b7b64d57a487 100644
--- a/crypto/aes/aes_wrap.c
+++ b/crypto/aes/aes_wrap.c
@@ -54,197 +54,19 @@
#include "cryptlib.h"
#include <openssl/aes.h>
-#include <openssl/bio.h>
-
-static const unsigned char default_iv[] = {
- 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
-};
+#include <openssl/modes.h>
int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen)
{
- unsigned char *A, B[16], *R;
- unsigned int i, j, t;
- if ((inlen & 0x7) || (inlen < 8))
- return -1;
- A = B;
- t = 1;
- memcpy(out + 8, in, inlen);
- if (!iv)
- iv = default_iv;
-
- memcpy(A, iv, 8);
-
- for (j = 0; j < 6; j++) {
- R = out + 8;
- for (i = 0; i < inlen; i += 8, t++, R += 8) {
- memcpy(B + 8, R, 8);
- AES_encrypt(B, B, key);
- A[7] ^= (unsigned char)(t & 0xff);
- if (t > 0xff) {
- A[6] ^= (unsigned char)((t >> 8) & 0xff);
- A[5] ^= (unsigned char)((t >> 16) & 0xff);
- A[4] ^= (unsigned char)((t >> 24) & 0xff);
- }
- memcpy(R, B + 8, 8);
- }
- }
- memcpy(out, A, 8);
- return inlen + 8;
+ return CRYPTO_128_wrap(key, iv, out, in, inlen, (block128_f) AES_encrypt);
}
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen)
{
- unsigned char *A, B[16], *R;
- unsigned int i, j, t;
- inlen -= 8;
- if (inlen & 0x7)
- return -1;
- if (inlen < 8)
- return -1;
- A = B;
- t = 6 * (inlen >> 3);
- memcpy(A, in, 8);
- memcpy(out, in + 8, inlen);
- for (j = 0; j < 6; j++) {
- R = out + inlen - 8;
- for (i = 0; i < inlen; i += 8, t--, R -= 8) {
- A[7] ^= (unsigned char)(t & 0xff);
- if (t > 0xff) {
- A[6] ^= (unsigned char)((t >> 8) & 0xff);
- A[5] ^= (unsigned char)((t >> 16) & 0xff);
- A[4] ^= (unsigned char)((t >> 24) & 0xff);
- }
- memcpy(B + 8, R, 8);
- AES_decrypt(B, B, key);
- memcpy(R, B + 8, 8);
- }
- }
- if (!iv)
- iv = default_iv;
- if (memcmp(A, iv, 8)) {
- OPENSSL_cleanse(out, inlen);
- return 0;
- }
- return inlen;
-}
-
-#ifdef AES_WRAP_TEST
-
-int AES_wrap_unwrap_test(const unsigned char *kek, int keybits,
- const unsigned char *iv,
- const unsigned char *eout,
- const unsigned char *key, int keylen)
-{
- unsigned char *otmp = NULL, *ptmp = NULL;
- int r, ret = 0;
- AES_KEY wctx;
- otmp = OPENSSL_malloc(keylen + 8);
- ptmp = OPENSSL_malloc(keylen);
- if (!otmp || !ptmp)
- return 0;
- if (AES_set_encrypt_key(kek, keybits, &wctx))
- goto err;
- r = AES_wrap_key(&wctx, iv, otmp, key, keylen);
- if (r <= 0)
- goto err;
-
- if (eout && memcmp(eout, otmp, keylen))
- goto err;
-
- if (AES_set_decrypt_key(kek, keybits, &wctx))
- goto err;
- r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r);
-
- if (memcmp(key, ptmp, keylen))
- goto err;
-
- ret = 1;
-
- err:
- if (otmp)
- OPENSSL_free(otmp);
- if (ptmp)
- OPENSSL_free(ptmp);
-
- return ret;
-
+ return CRYPTO_128_unwrap(key, iv, out, in, inlen,
+ (block128_f) AES_decrypt);
}
-
-int main(int argc, char **argv)
-{
-
- static const unsigned char kek[] = {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
- 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
- };
-
- static const unsigned char key[] = {
- 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
- 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- };
-
- static const unsigned char e1[] = {
- 0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47,
- 0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82,
- 0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5
- };
-
- static const unsigned char e2[] = {
- 0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35,
- 0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2,
- 0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d
- };
-
- static const unsigned char e3[] = {
- 0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2,
- 0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a,
- 0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7
- };
-
- static const unsigned char e4[] = {
- 0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32,
- 0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc,
- 0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93,
- 0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2
- };
-
- static const unsigned char e5[] = {
- 0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f,
- 0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4,
- 0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95,
- 0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1
- };
-
- static const unsigned char e6[] = {
- 0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4,
- 0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26,
- 0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26,
- 0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b,
- 0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21
- };
-
- AES_KEY wctx, xctx;
- int ret;
- ret = AES_wrap_unwrap_test(kek, 128, NULL, e1, key, 16);
- fprintf(stderr, "Key test result %d\n", ret);
- ret = AES_wrap_unwrap_test(kek, 192, NULL, e2, key, 16);
- fprintf(stderr, "Key test result %d\n", ret);
- ret = AES_wrap_unwrap_test(kek, 256, NULL, e3, key, 16);
- fprintf(stderr, "Key test result %d\n", ret);
- ret = AES_wrap_unwrap_test(kek, 192, NULL, e4, key, 24);
- fprintf(stderr, "Key test result %d\n", ret);
- ret = AES_wrap_unwrap_test(kek, 256, NULL, e5, key, 24);
- fprintf(stderr, "Key test result %d\n", ret);
- ret = AES_wrap_unwrap_test(kek, 256, NULL, e6, key, 32);
- fprintf(stderr, "Key test result %d\n", ret);
-}
-
-#endif
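
The hunks above reduce AES_wrap_key() and AES_unwrap_key() to thin wrappers around the generic CRYPTO_128_wrap()/CRYPTO_128_unwrap() routines declared in <openssl/modes.h> (implemented in the new crypto/modes/wrap128.c), passing AES_encrypt/AES_decrypt as the block128_f callback; the hand-rolled RFC 3394 loops and the compiled-out self-test move out of this file. A minimal caller-side sketch, modelled on the deleted self-test (the key bytes here are illustrative, not the RFC 3394 test vectors):

    #include <openssl/aes.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned char kek[16] = { 0 };      /* key-encryption key */
        unsigned char key[16] = { 1 };      /* key to be wrapped  */
        unsigned char wrapped[16 + 8], clear[16];
        AES_KEY wctx;
        int r;

        if (AES_set_encrypt_key(kek, 128, &wctx) != 0)
            return 1;
        /* NULL iv selects the default 0xA6.. integrity-check value;
         * on success the return value is inlen + 8. */
        r = AES_wrap_key(&wctx, NULL, wrapped, key, sizeof(key));
        if (r != (int)sizeof(key) + 8)
            return 1;

        if (AES_set_decrypt_key(kek, 128, &wctx) != 0)
            return 1;
        /* returns the unwrapped length, or 0 on an ICV mismatch */
        r = AES_unwrap_key(&wctx, NULL, clear, wrapped, r);
        if (r != (int)sizeof(key) || memcmp(clear, key, r) != 0)
            return 1;
        printf("wrap/unwrap ok\n");
        return 0;
    }
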
diff --git a/crypto/aes/aes_x86core.c b/crypto/aes/aes_x86core.c
index 1defbb1abfb2..c869ed719852 100644
--- a/crypto/aes/aes_x86core.c
+++ b/crypto/aes/aes_x86core.c
@@ -89,8 +89,10 @@ typedef unsigned long long u64;
#endif
#undef ROTATE
-#if defined(_MSC_VER) || defined(__ICC)
-# define ROTATE(a,n) _lrotl(a,n)
+#if defined(_MSC_VER)
+# define ROTATE(a,n) _lrotl(a,n)
+#elif defined(__ICC)
+# define ROTATE(a,n) _rotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
# define ROTATE(a,n) ({ register unsigned int ret; \
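
The hunk above splits the rotate intrinsic because _lrotl() under MSC rotates an unsigned long, while the 32-bit rotate offered by the Intel compiler is _rotl(); GCC-class compilers keep the inline-asm path below it. For reference, a portable shift formulation equivalent to ROTATE(a,n) for 0 < n < 32 (a sketch, not the module's actual macro):

    #include <stdint.h>

    /* Rotate a 32-bit value left by n bits (0 < n < 32); modern
     * compilers recognize the pattern and emit a single rotate. */
    static inline uint32_t rotl32(uint32_t a, unsigned int n)
    {
        return (a << n) | (a >> (32 - n));
    }
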
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 687ed811be47..451d0e0ed1e2 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -39,7 +39,7 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
@@ -103,11 +103,12 @@
# byte for 128-bit key.
#
# ECB encrypt ECB decrypt CBC large chunk
-# P4 56[60] 84[100] 23
-# AMD K8 48[44] 70[79] 18
-# PIII 41[50] 61[91] 24
-# Core 2 32[38] 45[70] 18.5
-# Pentium 120 160 77
+# P4 52[54] 83[95] 23
+# AMD K8 46[41] 66[70] 18
+# PIII 41[50] 60[77] 24
+# Core 2 31[36] 45[64] 18.5
+# Atom 76[100] 96[138] 60
+# Pentium 115 150 77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
#
@@ -242,7 +243,7 @@ $vertical_spin=0; # shift "vertically" defaults to 0, because of
sub encvert()
{ my ($te,@s) = @_;
- my $v0 = $acc, $v1 = $key;
+ my ($v0,$v1) = ($acc,$key);
&mov ($v0,$s[3]); # copy s3
&mov (&DWP(4,"esp"),$s[2]); # save s2
@@ -299,7 +300,7 @@ sub encvert()
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack. Strangely enough runs slower...
sub enchoriz()
-{ my $v0 = $key, $v1 = $acc;
+{ my ($v0,$v1) = ($key,$acc);
&movz ($v0,&LB($s0)); # 3, 2, 1, 0*
&rotr ($s2,8); # 8,11,10, 9
@@ -427,7 +428,7 @@ sub sse_encbody()
######################################################################
sub enccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
while ($#_>5) { pop(@_); $Fn=sub{}; }
my ($i,$te,@s)=@_;
my $tmp = $key;
@@ -476,24 +477,25 @@ sub enctransform()
my $tmp = $tbl;
my $r2 = $key ;
- &mov ($acc,$s[$i]);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$s[$i]);
&lea ($r2,&DWP(0,$s[$i],$s[$i]));
- &sub ($acc,$tmp);
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&and ($r2,0xfefefefe);
- &and ($acc,0x1b1b1b1b);
+ &sub ($acc,$tmp);
&mov ($tmp,$s[$i]);
+ &and ($acc,0x1b1b1b1b);
+ &rotr ($tmp,16);
&xor ($acc,$r2); # r2
+ &mov ($r2,$s[$i]);
&xor ($s[$i],$acc); # r0 ^ r2
+ &rotr ($r2,16+8);
+ &xor ($acc,$tmp);
&rotl ($s[$i],24);
- &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2
- &rotr ($tmp,16);
- &xor ($s[$i],$tmp);
- &rotr ($tmp,8);
- &xor ($s[$i],$tmp);
+ &xor ($acc,$r2);
+ &mov ($tmp,0x80808080) if ($i!=1);
+ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
}
&function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+ &mov ($tbl,0x80808080);
&enctransform(2);
&enctransform(3);
&enctransform(0);
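
The rescheduled enctransform() above still computes the MixColumns doubling on four state bytes at once: the preloaded 0x80808080 masks out the high bit of every byte, (hi - (hi >> 7)) & 0x1b1b1b1b turns each set high bit into the AES reduction constant, and (x << 1) & 0xfefefefe is the packed left shift. The same SIMD-within-a-register trick in plain C (an expository sketch of the arithmetic, not code from the module):

    #include <stdint.h>

    /* xtime, i.e. GF(2^8) doubling, applied to four bytes packed in a
     * 32-bit word, mirroring the mask arithmetic in enctransform(). */
    static inline uint32_t xtime4(uint32_t x)
    {
        uint32_t hi  = x & 0x80808080U;                /* high bit of each byte */
        uint32_t red = (hi - (hi >> 7)) & 0x1b1b1b1bU; /* 0x1b where it was set */
        return ((x << 1) & 0xfefefefeU) ^ red;
    }
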
@@ -607,82 +610,84 @@ sub sse_enccompact()
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
&movd ("eax","mm1"); # 5, 4, 1, 0
&movd ("ebx","mm5"); # 15,14,11,10
+ &mov ($__key,$key);
&movz ($acc,&LB("eax")); # 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
&movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
- &shl ("edx",8); # 1
&shr ("eax",16); # 5, 4
+ &shl ("edx",8); # 1
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&shl ($acc,16); # 10
- &or ("ecx",$acc); # 10
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &or ("ecx",$acc); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 5
&shl ($acc,24); # 11
- &or ("edx",$acc); # 11
&shr ("ebx",16); # 15,14
+ &or ("edx",$acc); # 11
- &movz ($acc,&HB("eax")); # 5
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
&shl ($acc,8); # 5
&or ("ecx",$acc); # 5
- &movz ($acc,&HB("ebx")); # 15
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&LB("eax")); # 4
&shl ($acc,24); # 15
&or ("ecx",$acc); # 15
- &movd ("mm0","ecx"); # t[0] collected
- &movz ($acc,&LB("eax")); # 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
- &shl ($acc,16); # 14
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 3
+ &shl ("ecx",16); # 14
+ &movd ("ebx","mm6"); # 13,12, 9, 8
&or ("ecx",$acc); # 14
- &movd ("ebx","mm6"); # 13,12, 9, 8
- &movz ($acc,&HB("eax")); # 3
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
+ &movz ($key,&HB("ebx")); # 9
&shl ($acc,24); # 3
&or ("ecx",$acc); # 3
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("ebx")); # 8
&shl ($acc,8); # 9
+ &shr ("ebx",16); # 13,12
&or ("ecx",$acc); # 9
- &movd ("mm1","ecx"); # t[1] collected
- &movz ($acc,&LB("ebx")); # 8
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
- &shr ("ebx",16); # 13,12
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
- &shl ($acc,16); # 2
- &or ("ecx",$acc); # 2
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
+ &movz ($key,&LB("eax")); # 2
&shr ("eax",16); # 7, 6
+ &movd ("mm1","ecx"); # t[1] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
+ &movz ($key,&HB("eax")); # 7
+ &shl ("ecx",16); # 2
+ &and ("eax",0xff); # 6
+ &or ("ecx",$acc); # 2
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&shl ($acc,24); # 7
- &or ("ecx",$acc); # 7
- &and ("eax",0xff); # 6
+ &and ("ebx",0xff); # 12
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
+ &or ("ecx",$acc); # 7
&shl ("eax",16); # 6
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
&or ("edx","eax"); # 6
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
&shl ($acc,8); # 13
- &or ("ecx",$acc); # 13
- &movd ("mm4","ecx"); # t[2] collected
- &and ("ebx",0xff); # 12
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
+ &or ("ecx",$acc); # 13
&or ("edx","ebx"); # 12
+ &mov ($key,$__key);
+ &movd ("mm4","ecx"); # t[2] collected
&movd ("mm5","edx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
@@ -1222,7 +1227,7 @@ sub enclast()
######################################################################
sub deccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
while ($#_>5) { pop(@_); $Fn=sub{}; }
my ($i,$td,@s)=@_;
my $tmp = $key;
@@ -1270,30 +1275,30 @@ sub dectransform()
my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
my $tp8 = $tbl;
- &mov ($acc,$s[$i]);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$s[$i]);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
- &xor ($acc,$tp2);
- &mov ($tp2,$acc);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &and ($tmp,$tp2);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$s[$i]); # tp2^tp1
- &xor ($acc,$tp4);
- &mov ($tp4,$acc);
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &and ($tmp,$tp4);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp8,&DWP(0,$tp4,$tp4));
&sub ($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()
&xor ($s[$i],$tp2);
&xor ($tp2,$tp8);
- &rotl ($tp2,24);
&xor ($s[$i],$tp4);
&xor ($tp4,$tp8);
- &rotl ($tp4,16);
+ &rotl ($tp2,24);
&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
- &rotl ($tp8,8);
+ &rotl ($tp4,16);
&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &rotl ($tp8,8);
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&mov ($s[0],$__s0) if($i==2); #prefetch $s0
&mov ($s[1],$__s1) if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
sub sse_deccompact()
{
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
+ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
&movd ("eax","mm1"); # 7, 6, 1, 0
+ &movd ("ebx","mm5"); # 13,12,11,10
+ &mov ($__key,$key);
- &pshufw ("mm5","mm4",0x09); # 13,12,11,10
&movz ($acc,&LB("eax")); # 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &movd ("ebx","mm5"); # 13,12,11,10
&movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
+ &shr ("eax",16); # 7, 6
&shl ("edx",8); # 1
- &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&shl ($acc,16); # 10
+ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
&or ("ecx",$acc); # 10
- &shr ("eax",16); # 7, 6
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 7
&shl ($acc,24); # 11
- &or ("edx",$acc); # 11
&shr ("ebx",16); # 13,12
+ &or ("edx",$acc); # 11
- &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&shl ($acc,24); # 7
&or ("ecx",$acc); # 7
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
+ &movz ($key,&LB("eax")); # 6
&shl ($acc,8); # 13
+ &movd ("eax","mm2"); # 3, 2, 5, 4
&or ("ecx",$acc); # 13
- &movd ("mm0","ecx"); # t[0] collected
- &movz ($acc,&LB("eax")); # 6
- &movd ("eax","mm2"); # 3, 2, 5, 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
- &shl ("ecx",16); # 6
- &movz ($acc,&LB("ebx")); # 12
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
+ &movz ($key,&LB("ebx")); # 12
+ &shl ($acc,16); # 6
&movd ("ebx","mm6"); # 9, 8,15,14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
+ &movz ($key,&LB("eax")); # 4
&or ("ecx",$acc); # 12
- &movz ($acc,&LB("eax")); # 4
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&or ("edx",$acc); # 4
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 5
&shl ($acc,16); # 14
+ &shr ("eax",16); # 3, 2
&or ("edx",$acc); # 14
- &movd ("mm1","edx"); # t[1] collected
- &movz ($acc,&HB("eax")); # 5
- &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
- &shl ("edx",8); # 5
- &movz ($acc,&HB("ebx")); # 15
- &shr ("eax",16); # 3, 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
- &shl ($acc,24); # 15
- &or ("edx",$acc); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
&shr ("ebx",16); # 9, 8
+ &shl ($acc,8); # 5
+ &movd ("mm1","edx"); # t[1] collected
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&HB("ebx")); # 9
+ &shl ("edx",24); # 15
+ &and ("ebx",0xff); # 8
+ &or ("edx",$acc); # 15
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("eax")); # 2
&shl ($acc,8); # 9
- &or ("ecx",$acc); # 9
- &and ("ebx",0xff); # 8
+ &movz ("eax",&HB("eax")); # 3
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
+ &or ("ecx",$acc); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
&or ("edx","ebx"); # 8
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
&shl ($acc,16); # 2
- &or ("edx",$acc); # 2
- &movd ("mm4","edx"); # t[2] collected
- &movz ("eax",&HB("eax")); # 3
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
+ &or ("edx",$acc); # 2
&shl ("eax",24); # 3
&or ("ecx","eax"); # 3
+ &mov ($key,$__key);
+ &movd ("mm4","edx"); # t[2] collected
&movd ("mm5","ecx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
@@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&mov ("ecx",240/4);
&xor ("eax","eax");
&align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_ezero")
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_ezero");
&mov ("esp",$_esp);
&popf ();
&set_label("drop_out");
@@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&mov ("ecx",240/4);
&xor ("eax","eax");
&align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_dzero")
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_dzero");
&mov ("esp",$_esp);
&popf ();
&function_end_A();
@@ -2865,32 +2872,32 @@ sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
my $tmp = $tbl;
- &mov ($acc,$tp1);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$tp1);
&lea ($tp2,&DWP(0,$tp1,$tp1));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
- &xor ($acc,$tp2);
- &mov ($tp2,$acc);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$tp2);
&lea ($tp4,&DWP(0,$tp2,$tp2));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$tp1); # tp2^tp1
- &xor ($acc,$tp4);
- &mov ($tp4,$acc);
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$tp4);
&lea ($tp8,&DWP(0,$tp4,$tp4));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&xor ($tp4,$tp1); # tp4^tp1
&sub ($acc,$tmp);
&and ($tp8,0xfefefefe);
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 86b86c4a0fbd..4f8917089f6c 100755
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -51,9 +51,23 @@ $key="r11";
$rounds="r12";
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
.code 32
+# endif
+#endif
.type AES_Te,%object
.align 5
@@ -167,7 +181,11 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_encrypt
+#else
+ adr r3,AES_encrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -409,11 +427,21 @@ _armv4_AES_encrypt:
.align 5
private_AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_set_encrypt_key
+#else
+ adr r3,private_AES_set_encrypt_key
+#endif
teq r0,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
@@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
+#if __ARM_ARCH__>=7
+ itt ne @ Thumb2 thing, sanity check in ARM
+#endif
movne r0,#-1
bne .Labrt
@@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-16]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,$key,#216
beq .Ldone
@@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-24]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,$key,#256
beq .Ldone
@@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key:
str $i3,[$key,#-4]
b .L256_loop
+.align 2
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt: tst lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.global private_AES_set_decrypt_key
@@ -688,34 +731,57 @@ private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
- ldrne lr,[sp],#4 @ pop lr
+ ldr lr,[sp],#4 @ pop lr
bne .Labrt
- stmdb sp!,{r4-r12}
+ mov r0,r2 @ AES_set_encrypt_key preserves r2,
+ mov r1,r2 @ which is AES_KEY *key
+ b _armv4_AES_set_enc2dec_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
- ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
- mov $key,r2 @ which is AES_KEY *key
- mov $i1,r2
- add $i2,r2,$rounds,lsl#4
+@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
+.global AES_set_enc2dec_key
+.type AES_set_enc2dec_key,%function
+.align 5
+AES_set_enc2dec_key:
+_armv4_AES_set_enc2dec_key:
+ stmdb sp!,{r4-r12,lr}
+
+ ldr $rounds,[r0,#240]
+ mov $i1,r0 @ input
+ add $i2,r0,$rounds,lsl#4
+ mov $key,r1 @ output
+ add $tbl,r1,$rounds,lsl#4
+ str $rounds,[r1,#240]
+
+.Linv: ldr $s0,[$i1],#16
+ ldr $s1,[$i1,#-12]
+ ldr $s2,[$i1,#-8]
+ ldr $s3,[$i1,#-4]
+ ldr $t1,[$i2],#-16
+ ldr $t2,[$i2,#16+4]
+ ldr $t3,[$i2,#16+8]
+ ldr $i3,[$i2,#16+12]
+ str $s0,[$tbl],#-16
+ str $s1,[$tbl,#16+4]
+ str $s2,[$tbl,#16+8]
+ str $s3,[$tbl,#16+12]
+ str $t1,[$key],#16
+ str $t2,[$key,#-12]
+ str $t3,[$key,#-8]
+ str $i3,[$key,#-4]
+ teq $i1,$i2
+ bne .Linv
-.Linv: ldr $s0,[$i1]
+ ldr $s0,[$i1]
ldr $s1,[$i1,#4]
ldr $s2,[$i1,#8]
ldr $s3,[$i1,#12]
- ldr $t1,[$i2]
- ldr $t2,[$i2,#4]
- ldr $t3,[$i2,#8]
- ldr $i3,[$i2,#12]
- str $s0,[$i2],#-16
- str $s1,[$i2,#16+4]
- str $s2,[$i2,#16+8]
- str $s3,[$i2,#16+12]
- str $t1,[$i1],#16
- str $t2,[$i1,#-12]
- str $t3,[$i1,#-8]
- str $i3,[$i1,#-4]
- teq $i1,$i2
- bne .Linv
+ str $s0,[$key]
+ str $s1,[$key,#4]
+ str $s2,[$key,#8]
+ str $s3,[$key,#12]
+ sub $key,$key,$rounds,lsl#3
___
$mask80=$i1;
$mask1b=$i2;
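
The new AES_set_enc2dec_key() above builds a decryption schedule from an encryption one: the .Linv loop swaps 16-byte round keys from the two ends inward (arranged so it also works when input and output are the same buffer, as in the private_AES_set_decrypt_key call), and the code that follows, using the $mask80/$mask1b constants, applies the inverse-MixColumns transform to the inner round keys. The reversal step alone, in C terms (a sketch assuming distinct buffers; the InvMixColumns pass is omitted):

    #include <openssl/aes.h>
    #include <string.h>

    /* Reverse the order of the round keys; rd_key holds rounds+1
     * four-word round keys, rounds being 10, 12 or 14. */
    static void enc2dec_reverse(const AES_KEY *in, AES_KEY *out)
    {
        int i, rounds = in->rounds;

        out->rounds = rounds;
        for (i = 0; i <= rounds; i++)
            memcpy(&out->rd_key[4 * i],
                   &in->rd_key[4 * (rounds - i)],
                   4 * sizeof(in->rd_key[0]));
    }
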
@@ -773,7 +839,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
.type AES_Td,%object
.align 5
@@ -883,7 +949,11 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_decrypt
+#else
+ adr r3,AES_decrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -1080,8 +1150,9 @@ _armv4_AES_decrypt:
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8
+ add $s1,$tbl,$s1,lsr#24
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
- ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
+ ldrb $s1,[$s1] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
eor $s0,$i1,$s0,lsl#24
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
@@ -1094,7 +1165,8 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
+ add $s2,$tbl,$s2,lsr#24
+ ldrb $s2,[$s2] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16
@@ -1106,8 +1178,9 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
+ add $s3,$tbl,$s3,lsr#24
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
- ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
+ ldrb $s3,[$s3] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
@@ -1130,5 +1203,15 @@ _armv4_AES_decrypt:
___
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT; # enforce flush
diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
index 537c8d3172b6..4de3ee26bb74 100755
--- a/crypto/aes/asm/aes-mips.pl
+++ b/crypto/aes/asm/aes-mips.pl
@@ -20,6 +20,13 @@
# thing about this module is its endian neutrality, which means that
# it processes data without ever changing byte order...
+# September 2012
+#
+# Add MIPS32R2 (~10% fewer instructions) and SmartMIPS ASE (a further
+# ~25% fewer instructions) code. Note that there is no run-time switch;
+# instead, the code path is chosen at pre-processing time by passing
+# -mips32r2 and/or -msmartmips.
+
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
@@ -47,11 +54,12 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
+ $PTR_INS="dins";
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
@@ -59,6 +67,7 @@ if ($flavour =~ /64|n32/i) {
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
+ $PTR_INS="ins";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
@@ -89,7 +98,11 @@ $code.=<<___;
# include <openssl/fipssyms.h>
#endif
-#if !defined(__vxworks) || defined(__pic__)
+#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
+#define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option pic2
#endif
.set noat
@@ -125,6 +138,89 @@ _mips_AES_encrypt:
xor $s3,$t3
sub $cnt,1
+#if defined(__mips_smartmips)
+ ext $i0,$s1,16,8
+.Loop_enc:
+ ext $i1,$s2,16,8
+ ext $i2,$s3,16,8
+ ext $i3,$s0,16,8
+ lwxs $t0,$i0($Tbl) # Te1[s1>>16]
+ ext $i0,$s2,8,8
+ lwxs $t1,$i1($Tbl) # Te1[s2>>16]
+ ext $i1,$s3,8,8
+ lwxs $t2,$i2($Tbl) # Te1[s3>>16]
+ ext $i2,$s0,8,8
+ lwxs $t3,$i3($Tbl) # Te1[s0>>16]
+ ext $i3,$s1,8,8
+
+ lwxs $t4,$i0($Tbl) # Te2[s2>>8]
+ ext $i0,$s3,0,8
+ lwxs $t5,$i1($Tbl) # Te2[s3>>8]
+ ext $i1,$s0,0,8
+ lwxs $t6,$i2($Tbl) # Te2[s0>>8]
+ ext $i2,$s1,0,8
+ lwxs $t7,$i3($Tbl) # Te2[s1>>8]
+ ext $i3,$s2,0,8
+
+ lwxs $t8,$i0($Tbl) # Te3[s3]
+ ext $i0,$s0,24,8
+ lwxs $t9,$i1($Tbl) # Te3[s0]
+ ext $i1,$s1,24,8
+ lwxs $t10,$i2($Tbl) # Te3[s1]
+ ext $i2,$s2,24,8
+ lwxs $t11,$i3($Tbl) # Te3[s2]
+ ext $i3,$s3,24,8
+
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ xor $t0,$t4
+ lwxs $t4,$i0($Tbl) # Te0[s0>>24]
+ xor $t1,$t5
+ lwxs $t5,$i1($Tbl) # Te0[s1>>24]
+ xor $t2,$t6
+ lwxs $t6,$i2($Tbl) # Te0[s2>>24]
+ xor $t3,$t7
+ lwxs $t7,$i3($Tbl) # Te0[s3>>24]
+
+ rotr $t8,$t8,24
+ lw $s0,0($key0)
+ rotr $t9,$t9,24
+ lw $s1,4($key0)
+ rotr $t10,$t10,24
+ lw $s2,8($key0)
+ rotr $t11,$t11,24
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_enc
+ ext $i0,$s1,16,8
+
+ _xtr $i0,$s1,16-2
+#else
_xtr $i0,$s1,16-2
.Loop_enc:
_xtr $i1,$s2,16-2
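
The SmartMIPS path above uses ext to extract each byte index and lwxs to fold the indexed table load into one instruction, but the round it computes is the standard T-table round named in the comments: one lookup per state byte in each of four 256-entry tables, xored together with the round key. The reference form of one such round (a sketch using textbook T-table conventions, not this module's register or memory layout):

    #include <stdint.h>

    /* One T-table encryption round: t[i] combines one byte from each
     * word of the state, exactly the Te0..Te3 lookups annotated in the
     * assembly. Te0..Te3 and the round key rk are assumed given. */
    static void ttable_round(uint32_t t[4], const uint32_t s[4],
                             const uint32_t Te0[256], const uint32_t Te1[256],
                             const uint32_t Te2[256], const uint32_t Te3[256],
                             const uint32_t rk[4])
    {
        int i;

        for (i = 0; i < 4; i++)
            t[i] = Te0[(s[i] >> 24) & 0xff] ^
                   Te1[(s[(i + 1) & 3] >> 16) & 0xff] ^
                   Te2[(s[(i + 2) & 3] >> 8) & 0xff] ^
                   Te3[s[(i + 3) & 3] & 0xff] ^
                   rk[i];
    }
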
@@ -138,19 +234,29 @@ _mips_AES_encrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ lw $t0,0($i0) # Te1[s1>>16]
+ _xtr $i0,$s2,8-2
+ lw $t1,0($i1) # Te1[s2>>16]
+ _xtr $i1,$s3,8-2
+ lw $t2,0($i2) # Te1[s3>>16]
+ _xtr $i2,$s0,8-2
+ lw $t3,0($i3) # Te1[s0>>16]
+ _xtr $i3,$s1,8-2
+#else
lwl $t0,3($i0) # Te1[s1>>16]
lwl $t1,3($i1) # Te1[s2>>16]
lwl $t2,3($i2) # Te1[s3>>16]
lwl $t3,3($i3) # Te1[s0>>16]
lwr $t0,2($i0) # Te1[s1>>16]
- lwr $t1,2($i1) # Te1[s2>>16]
- lwr $t2,2($i2) # Te1[s3>>16]
- lwr $t3,2($i3) # Te1[s0>>16]
-
_xtr $i0,$s2,8-2
+ lwr $t1,2($i1) # Te1[s2>>16]
_xtr $i1,$s3,8-2
+ lwr $t2,2($i2) # Te1[s3>>16]
_xtr $i2,$s0,8-2
+ lwr $t3,2($i3) # Te1[s0>>16]
_xtr $i3,$s1,8-2
+#endif
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -159,19 +265,88 @@ _mips_AES_encrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+# if defined(_MIPSEL)
+ lw $t4,0($i0) # Te2[s2>>8]
+ _xtr $i0,$s3,0-2
+ lw $t5,0($i1) # Te2[s3>>8]
+ _xtr $i1,$s0,0-2
+ lw $t6,0($i2) # Te2[s0>>8]
+ _xtr $i2,$s1,0-2
+ lw $t7,0($i3) # Te2[s1>>8]
+ _xtr $i3,$s2,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lw $t8,0($i0) # Te3[s3]
+ $PTR_INS $i0,$s0,2,8
+ lw $t9,0($i1) # Te3[s0]
+ $PTR_INS $i1,$s1,2,8
+ lw $t10,0($i2) # Te3[s1]
+ $PTR_INS $i2,$s2,2,8
+ lw $t11,0($i3) # Te3[s2]
+ $PTR_INS $i3,$s3,2,8
+# else
+ lw $t4,0($i0) # Te2[s2>>8]
+ $PTR_INS $i0,$s3,2,8
+ lw $t5,0($i1) # Te2[s3>>8]
+ $PTR_INS $i1,$s0,2,8
+ lw $t6,0($i2) # Te2[s0>>8]
+ $PTR_INS $i2,$s1,2,8
+ lw $t7,0($i3) # Te2[s1>>8]
+ $PTR_INS $i3,$s2,2,8
+
+ lw $t8,0($i0) # Te3[s3]
+ _xtr $i0,$s0,24-2
+ lw $t9,0($i1) # Te3[s0]
+ _xtr $i1,$s1,24-2
+ lw $t10,0($i2) # Te3[s1]
+ _xtr $i2,$s2,24-2
+ lw $t11,0($i3) # Te3[s2]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+# endif
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ rotr $t8,$t8,24
+ rotr $t9,$t9,24
+ rotr $t10,$t10,24
+ rotr $t11,$t11,24
+#else
lwl $t4,2($i0) # Te2[s2>>8]
lwl $t5,2($i1) # Te2[s3>>8]
lwl $t6,2($i2) # Te2[s0>>8]
lwl $t7,2($i3) # Te2[s1>>8]
lwr $t4,1($i0) # Te2[s2>>8]
- lwr $t5,1($i1) # Te2[s3>>8]
- lwr $t6,1($i2) # Te2[s0>>8]
- lwr $t7,1($i3) # Te2[s1>>8]
-
_xtr $i0,$s3,0-2
+ lwr $t5,1($i1) # Te2[s3>>8]
_xtr $i1,$s0,0-2
+ lwr $t6,1($i2) # Te2[s0>>8]
_xtr $i2,$s1,0-2
+ lwr $t7,1($i3) # Te2[s1>>8]
_xtr $i3,$s2,0-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -185,14 +360,14 @@ _mips_AES_encrypt:
lwl $t10,1($i2) # Te3[s1]
lwl $t11,1($i3) # Te3[s2]
lwr $t8,0($i0) # Te3[s3]
- lwr $t9,0($i1) # Te3[s0]
- lwr $t10,0($i2) # Te3[s1]
- lwr $t11,0($i3) # Te3[s2]
-
_xtr $i0,$s0,24-2
+ lwr $t9,0($i1) # Te3[s0]
_xtr $i1,$s1,24-2
+ lwr $t10,0($i2) # Te3[s1]
_xtr $i2,$s2,24-2
+ lwr $t11,0($i3) # Te3[s2]
_xtr $i3,$s3,24-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -201,24 +376,24 @@ _mips_AES_encrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#endif
xor $t0,$t4
- xor $t1,$t5
- xor $t2,$t6
- xor $t3,$t7
lw $t4,0($i0) # Te0[s0>>24]
+ xor $t1,$t5
lw $t5,0($i1) # Te0[s1>>24]
+ xor $t2,$t6
lw $t6,0($i2) # Te0[s2>>24]
+ xor $t3,$t7
lw $t7,0($i3) # Te0[s3>>24]
- lw $s0,0($key0)
- lw $s1,4($key0)
- lw $s2,8($key0)
- lw $s3,12($key0)
-
xor $t0,$t8
+ lw $s0,0($key0)
xor $t1,$t9
+ lw $s1,4($key0)
xor $t2,$t10
+ lw $s2,8($key0)
xor $t3,$t11
+ lw $s3,12($key0)
xor $t0,$t4
xor $t1,$t5
@@ -234,6 +409,7 @@ _mips_AES_encrypt:
.set noreorder
bnez $cnt,.Loop_enc
_xtr $i0,$s1,16-2
+#endif
.set reorder
_xtr $i1,$s2,16-2
@@ -248,14 +424,14 @@ _mips_AES_encrypt:
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
lbu $t0,2($i0) # Te4[s1>>16]
- lbu $t1,2($i1) # Te4[s2>>16]
- lbu $t2,2($i2) # Te4[s3>>16]
- lbu $t3,2($i3) # Te4[s0>>16]
-
_xtr $i0,$s2,8-2
+ lbu $t1,2($i1) # Te4[s2>>16]
_xtr $i1,$s3,8-2
+ lbu $t2,2($i2) # Te4[s3>>16]
_xtr $i2,$s0,8-2
+ lbu $t3,2($i3) # Te4[s0>>16]
_xtr $i3,$s1,8-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -264,15 +440,44 @@ _mips_AES_encrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+# if defined(_MIPSEL)
lbu $t4,2($i0) # Te4[s2>>8]
+ $PTR_INS $i0,$s0,2,8
lbu $t5,2($i1) # Te4[s3>>8]
+ $PTR_INS $i1,$s1,2,8
lbu $t6,2($i2) # Te4[s0>>8]
+ $PTR_INS $i2,$s2,2,8
lbu $t7,2($i3) # Te4[s1>>8]
+ $PTR_INS $i3,$s3,2,8
+ lbu $t8,2($i0) # Te4[s0>>24]
+ _xtr $i0,$s3,0-2
+ lbu $t9,2($i1) # Te4[s1>>24]
+ _xtr $i1,$s0,0-2
+ lbu $t10,2($i2) # Te4[s2>>24]
+ _xtr $i2,$s1,0-2
+ lbu $t11,2($i3) # Te4[s3>>24]
+ _xtr $i3,$s2,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+# else
+ lbu $t4,2($i0) # Te4[s2>>8]
_xtr $i0,$s0,24-2
+ lbu $t5,2($i1) # Te4[s3>>8]
_xtr $i1,$s1,24-2
+ lbu $t6,2($i2) # Te4[s0>>8]
_xtr $i2,$s2,24-2
+ lbu $t7,2($i3) # Te4[s1>>8]
_xtr $i3,$s3,24-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -282,18 +487,76 @@ _mips_AES_encrypt:
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
lbu $t8,2($i0) # Te4[s0>>24]
+ $PTR_INS $i0,$s3,2,8
lbu $t9,2($i1) # Te4[s1>>24]
+ $PTR_INS $i1,$s0,2,8
lbu $t10,2($i2) # Te4[s2>>24]
+ $PTR_INS $i2,$s1,2,8
lbu $t11,2($i3) # Te4[s3>>24]
+ $PTR_INS $i3,$s2,2,8
+# endif
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+ _ins2 $t0,$t4,8
+ lbu $t4,2($i0) # Te4[s3]
+ _ins2 $t1,$t5,8
+ lbu $t5,2($i1) # Te4[s0]
+ _ins2 $t2,$t6,8
+ lbu $t6,2($i2) # Te4[s1]
+ _ins2 $t3,$t7,8
+ lbu $t7,2($i3) # Te4[s2]
+
+ _ins2 $t0,$t8,24
+ lw $s0,0($key0)
+ _ins2 $t1,$t9,24
+ lw $s1,4($key0)
+ _ins2 $t2,$t10,24
+ lw $s2,8($key0)
+ _ins2 $t3,$t11,24
+ lw $s3,12($key0)
+
+ _ins2 $t0,$t4,0
+ _ins2 $t1,$t5,0
+ _ins2 $t2,$t6,0
+ _ins2 $t3,$t7,0
+#else
+ lbu $t4,2($i0) # Te4[s2>>8]
+ _xtr $i0,$s0,24-2
+ lbu $t5,2($i1) # Te4[s3>>8]
+ _xtr $i1,$s1,24-2
+ lbu $t6,2($i2) # Te4[s0>>8]
+ _xtr $i2,$s2,24-2
+ lbu $t7,2($i3) # Te4[s1>>8]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,2($i0) # Te4[s0>>24]
_xtr $i0,$s3,0-2
+ lbu $t9,2($i1) # Te4[s1>>24]
_xtr $i1,$s0,0-2
+ lbu $t10,2($i2) # Te4[s2>>24]
_xtr $i2,$s1,0-2
+ lbu $t11,2($i3) # Te4[s3>>24]
_xtr $i3,$s2,0-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
_ins $t0,16
_ins $t1,16
@@ -306,27 +569,21 @@ _mips_AES_encrypt:
_ins $t7,8
xor $t0,$t4
- xor $t1,$t5
- xor $t2,$t6
- xor $t3,$t7
-
- $PTR_ADD $i0,$Tbl
- $PTR_ADD $i1,$Tbl
- $PTR_ADD $i2,$Tbl
- $PTR_ADD $i3,$Tbl
lbu $t4,2($i0) # Te4[s3]
+ xor $t1,$t5
lbu $t5,2($i1) # Te4[s0]
+ xor $t2,$t6
lbu $t6,2($i2) # Te4[s1]
+ xor $t3,$t7
lbu $t7,2($i3) # Te4[s2]
_ins $t8,24
- _ins $t9,24
- _ins $t10,24
- _ins $t11,24
-
lw $s0,0($key0)
+ _ins $t9,24
lw $s1,4($key0)
+ _ins $t10,24
lw $s2,8($key0)
+ _ins $t11,24
lw $s3,12($key0)
xor $t0,$t8
@@ -343,7 +600,7 @@ _mips_AES_encrypt:
xor $t1,$t5
xor $t2,$t6
xor $t3,$t7
-
+#endif
xor $s0,$t0
xor $s1,$t1
xor $s2,$t2
@@ -455,6 +712,89 @@ _mips_AES_decrypt:
xor $s3,$t3
sub $cnt,1
+#if defined(__mips_smartmips)
+ ext $i0,$s3,16,8
+.Loop_dec:
+ ext $i1,$s0,16,8
+ ext $i2,$s1,16,8
+ ext $i3,$s2,16,8
+ lwxs $t0,$i0($Tbl) # Td1[s3>>16]
+ ext $i0,$s2,8,8
+ lwxs $t1,$i1($Tbl) # Td1[s0>>16]
+ ext $i1,$s3,8,8
+ lwxs $t2,$i2($Tbl) # Td1[s1>>16]
+ ext $i2,$s0,8,8
+ lwxs $t3,$i3($Tbl) # Td1[s2>>16]
+ ext $i3,$s1,8,8
+
+ lwxs $t4,$i0($Tbl) # Td2[s2>>8]
+ ext $i0,$s1,0,8
+ lwxs $t5,$i1($Tbl) # Td2[s3>>8]
+ ext $i1,$s2,0,8
+ lwxs $t6,$i2($Tbl) # Td2[s0>>8]
+ ext $i2,$s3,0,8
+ lwxs $t7,$i3($Tbl) # Td2[s1>>8]
+ ext $i3,$s0,0,8
+
+ lwxs $t8,$i0($Tbl) # Td3[s1]
+ ext $i0,$s0,24,8
+ lwxs $t9,$i1($Tbl) # Td3[s2]
+ ext $i1,$s1,24,8
+ lwxs $t10,$i2($Tbl) # Td3[s3]
+ ext $i2,$s2,24,8
+ lwxs $t11,$i3($Tbl) # Td3[s0]
+ ext $i3,$s3,24,8
+
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ xor $t0,$t4
+ lwxs $t4,$i0($Tbl) # Td0[s0>>24]
+ xor $t1,$t5
+ lwxs $t5,$i1($Tbl) # Td0[s1>>24]
+ xor $t2,$t6
+ lwxs $t6,$i2($Tbl) # Td0[s2>>24]
+ xor $t3,$t7
+ lwxs $t7,$i3($Tbl) # Td0[s3>>24]
+
+ rotr $t8,$t8,24
+ lw $s0,0($key0)
+ rotr $t9,$t9,24
+ lw $s1,4($key0)
+ rotr $t10,$t10,24
+ lw $s2,8($key0)
+ rotr $t11,$t11,24
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_dec
+ ext $i0,$s3,16,8
+
+ _xtr $i0,$s3,16-2
+#else
_xtr $i0,$s3,16-2
.Loop_dec:
_xtr $i1,$s0,16-2
@@ -468,19 +808,88 @@ _mips_AES_decrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ lw $t0,0($i0) # Td1[s3>>16]
+ _xtr $i0,$s2,8-2
+ lw $t1,0($i1) # Td1[s0>>16]
+ _xtr $i1,$s3,8-2
+ lw $t2,0($i2) # Td1[s1>>16]
+ _xtr $i2,$s0,8-2
+ lw $t3,0($i3) # Td1[s2>>16]
+ _xtr $i3,$s1,8-2
+#else
lwl $t0,3($i0) # Td1[s3>>16]
lwl $t1,3($i1) # Td1[s0>>16]
lwl $t2,3($i2) # Td1[s1>>16]
lwl $t3,3($i3) # Td1[s2>>16]
lwr $t0,2($i0) # Td1[s3>>16]
- lwr $t1,2($i1) # Td1[s0>>16]
- lwr $t2,2($i2) # Td1[s1>>16]
- lwr $t3,2($i3) # Td1[s2>>16]
-
_xtr $i0,$s2,8-2
+ lwr $t1,2($i1) # Td1[s0>>16]
_xtr $i1,$s3,8-2
+ lwr $t2,2($i2) # Td1[s1>>16]
_xtr $i2,$s0,8-2
+ lwr $t3,2($i3) # Td1[s2>>16]
_xtr $i3,$s1,8-2
+#endif
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+# if defined(_MIPSEL)
+ lw $t4,0($i0) # Td2[s2>>8]
+ _xtr $i0,$s1,0-2
+ lw $t5,0($i1) # Td2[s3>>8]
+ _xtr $i1,$s2,0-2
+ lw $t6,0($i2) # Td2[s0>>8]
+ _xtr $i2,$s3,0-2
+ lw $t7,0($i3) # Td2[s1>>8]
+ _xtr $i3,$s0,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lw $t8,0($i0) # Td3[s1]
+ $PTR_INS $i0,$s0,2,8
+ lw $t9,0($i1) # Td3[s2]
+ $PTR_INS $i1,$s1,2,8
+ lw $t10,0($i2) # Td3[s3]
+ $PTR_INS $i2,$s2,2,8
+ lw $t11,0($i3) # Td3[s0]
+ $PTR_INS $i3,$s3,2,8
+#else
+ lw $t4,0($i0) # Td2[s2>>8]
+ $PTR_INS $i0,$s1,2,8
+ lw $t5,0($i1) # Td2[s3>>8]
+ $PTR_INS $i1,$s2,2,8
+ lw $t6,0($i2) # Td2[s0>>8]
+ $PTR_INS $i2,$s3,2,8
+ lw $t7,0($i3) # Td2[s1>>8]
+ $PTR_INS $i3,$s0,2,8
+
+ lw $t8,0($i0) # Td3[s1]
+ _xtr $i0,$s0,24-2
+ lw $t9,0($i1) # Td3[s2]
+ _xtr $i1,$s1,24-2
+ lw $t10,0($i2) # Td3[s3]
+ _xtr $i2,$s2,24-2
+ lw $t11,0($i3) # Td3[s0]
+ _xtr $i3,$s3,24-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -489,19 +898,30 @@ _mips_AES_decrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#endif
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ rotr $t8,$t8,24
+ rotr $t9,$t9,24
+ rotr $t10,$t10,24
+ rotr $t11,$t11,24
+#else
lwl $t4,2($i0) # Td2[s2>>8]
lwl $t5,2($i1) # Td2[s3>>8]
lwl $t6,2($i2) # Td2[s0>>8]
lwl $t7,2($i3) # Td2[s1>>8]
lwr $t4,1($i0) # Td2[s2>>8]
- lwr $t5,1($i1) # Td2[s3>>8]
- lwr $t6,1($i2) # Td2[s0>>8]
- lwr $t7,1($i3) # Td2[s1>>8]
-
_xtr $i0,$s1,0-2
+ lwr $t5,1($i1) # Td2[s3>>8]
_xtr $i1,$s2,0-2
+ lwr $t6,1($i2) # Td2[s0>>8]
_xtr $i2,$s3,0-2
+ lwr $t7,1($i3) # Td2[s1>>8]
_xtr $i3,$s0,0-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -515,14 +935,14 @@ _mips_AES_decrypt:
lwl $t10,1($i2) # Td3[s3]
lwl $t11,1($i3) # Td3[s0]
lwr $t8,0($i0) # Td3[s1]
- lwr $t9,0($i1) # Td3[s2]
- lwr $t10,0($i2) # Td3[s3]
- lwr $t11,0($i3) # Td3[s0]
-
_xtr $i0,$s0,24-2
+ lwr $t9,0($i1) # Td3[s2]
_xtr $i1,$s1,24-2
+ lwr $t10,0($i2) # Td3[s3]
_xtr $i2,$s2,24-2
+ lwr $t11,0($i3) # Td3[s0]
_xtr $i3,$s3,24-2
+
and $i0,0x3fc
and $i1,0x3fc
and $i2,0x3fc
@@ -531,27 +951,25 @@ _mips_AES_decrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#endif
xor $t0,$t4
- xor $t1,$t5
- xor $t2,$t6
- xor $t3,$t7
-
-
lw $t4,0($i0) # Td0[s0>>24]
+ xor $t1,$t5
lw $t5,0($i1) # Td0[s1>>24]
+ xor $t2,$t6
lw $t6,0($i2) # Td0[s2>>24]
+ xor $t3,$t7
lw $t7,0($i3) # Td0[s3>>24]
- lw $s0,0($key0)
- lw $s1,4($key0)
- lw $s2,8($key0)
- lw $s3,12($key0)
-
xor $t0,$t8
+ lw $s0,0($key0)
xor $t1,$t9
+ lw $s1,4($key0)
xor $t2,$t10
+ lw $s2,8($key0)
xor $t3,$t11
+ lw $s3,12($key0)
xor $t0,$t4
xor $t1,$t5
@@ -567,38 +985,39 @@ _mips_AES_decrypt:
.set noreorder
bnez $cnt,.Loop_dec
_xtr $i0,$s3,16-2
+#endif
.set reorder
lw $t4,1024($Tbl) # prefetch Td4
- lw $t5,1024+32($Tbl)
- lw $t6,1024+64($Tbl)
- lw $t7,1024+96($Tbl)
- lw $t8,1024+128($Tbl)
- lw $t9,1024+160($Tbl)
- lw $t10,1024+192($Tbl)
- lw $t11,1024+224($Tbl)
-
_xtr $i0,$s3,16
+ lw $t5,1024+32($Tbl)
_xtr $i1,$s0,16
+ lw $t6,1024+64($Tbl)
_xtr $i2,$s1,16
+ lw $t7,1024+96($Tbl)
_xtr $i3,$s2,16
+ lw $t8,1024+128($Tbl)
and $i0,0xff
+ lw $t9,1024+160($Tbl)
and $i1,0xff
+ lw $t10,1024+192($Tbl)
and $i2,0xff
+ lw $t11,1024+224($Tbl)
and $i3,0xff
+
$PTR_ADD $i0,$Tbl
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
lbu $t0,1024($i0) # Td4[s3>>16]
- lbu $t1,1024($i1) # Td4[s0>>16]
- lbu $t2,1024($i2) # Td4[s1>>16]
- lbu $t3,1024($i3) # Td4[s2>>16]
-
_xtr $i0,$s2,8
+ lbu $t1,1024($i1) # Td4[s0>>16]
_xtr $i1,$s3,8
+ lbu $t2,1024($i2) # Td4[s1>>16]
_xtr $i2,$s0,8
+ lbu $t3,1024($i3) # Td4[s2>>16]
_xtr $i3,$s1,8
+
and $i0,0xff
and $i1,0xff
and $i2,0xff
@@ -607,29 +1026,108 @@ _mips_AES_decrypt:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+# if defined(_MIPSEL)
lbu $t4,1024($i0) # Td4[s2>>8]
+ $PTR_INS $i0,$s0,0,8
lbu $t5,1024($i1) # Td4[s3>>8]
+ $PTR_INS $i1,$s1,0,8
lbu $t6,1024($i2) # Td4[s0>>8]
+ $PTR_INS $i2,$s2,0,8
lbu $t7,1024($i3) # Td4[s1>>8]
+ $PTR_INS $i3,$s3,0,8
+ lbu $t8,1024($i0) # Td4[s0>>24]
+ _xtr $i0,$s1,0
+ lbu $t9,1024($i1) # Td4[s1>>24]
+ _xtr $i1,$s2,0
+ lbu $t10,1024($i2) # Td4[s2>>24]
+ _xtr $i2,$s3,0
+ lbu $t11,1024($i3) # Td4[s3>>24]
+ _xtr $i3,$s0,0
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+# else
+ lbu $t4,1024($i0) # Td4[s2>>8]
_xtr $i0,$s0,24
+ lbu $t5,1024($i1) # Td4[s3>>8]
_xtr $i1,$s1,24
+ lbu $t6,1024($i2) # Td4[s0>>8]
_xtr $i2,$s2,24
+ lbu $t7,1024($i3) # Td4[s1>>8]
_xtr $i3,$s3,24
+
$PTR_ADD $i0,$Tbl
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
lbu $t8,1024($i0) # Td4[s0>>24]
+ $PTR_INS $i0,$s1,0,8
lbu $t9,1024($i1) # Td4[s1>>24]
+ $PTR_INS $i1,$s2,0,8
lbu $t10,1024($i2) # Td4[s2>>24]
+ $PTR_INS $i2,$s3,0,8
lbu $t11,1024($i3) # Td4[s3>>24]
+ $PTR_INS $i3,$s0,0,8
+# endif
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins2 $t0,$t4,8
+ lbu $t4,1024($i0) # Td4[s1]
+ _ins2 $t1,$t5,8
+ lbu $t5,1024($i1) # Td4[s2]
+ _ins2 $t2,$t6,8
+ lbu $t6,1024($i2) # Td4[s3]
+ _ins2 $t3,$t7,8
+ lbu $t7,1024($i3) # Td4[s0]
+
+ _ins2 $t0,$t8,24
+ lw $s0,0($key0)
+ _ins2 $t1,$t9,24
+ lw $s1,4($key0)
+ _ins2 $t2,$t10,24
+ lw $s2,8($key0)
+ _ins2 $t3,$t11,24
+ lw $s3,12($key0)
+ _ins2 $t0,$t4,0
+ _ins2 $t1,$t5,0
+ _ins2 $t2,$t6,0
+ _ins2 $t3,$t7,0
+#else
+ lbu $t4,1024($i0) # Td4[s2>>8]
+ _xtr $i0,$s0,24
+ lbu $t5,1024($i1) # Td4[s3>>8]
+ _xtr $i1,$s1,24
+ lbu $t6,1024($i2) # Td4[s0>>8]
+ _xtr $i2,$s2,24
+ lbu $t7,1024($i3) # Td4[s1>>8]
+ _xtr $i3,$s3,24
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,1024($i0) # Td4[s0>>24]
_xtr $i0,$s1,0
+ lbu $t9,1024($i1) # Td4[s1>>24]
_xtr $i1,$s2,0
+ lbu $t10,1024($i2) # Td4[s2>>24]
_xtr $i2,$s3,0
+ lbu $t11,1024($i3) # Td4[s3>>24]
_xtr $i3,$s0,0
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+
_ins $t0,16
_ins $t1,16
_ins $t2,16
@@ -641,44 +1139,38 @@ _mips_AES_decrypt:
_ins $t7,8
xor $t0,$t4
- xor $t1,$t5
- xor $t2,$t6
- xor $t3,$t7
-
- $PTR_ADD $i0,$Tbl
- $PTR_ADD $i1,$Tbl
- $PTR_ADD $i2,$Tbl
- $PTR_ADD $i3,$Tbl
lbu $t4,1024($i0) # Td4[s1]
+ xor $t1,$t5
lbu $t5,1024($i1) # Td4[s2]
+ xor $t2,$t6
lbu $t6,1024($i2) # Td4[s3]
+ xor $t3,$t7
lbu $t7,1024($i3) # Td4[s0]
_ins $t8,24
- _ins $t9,24
- _ins $t10,24
- _ins $t11,24
-
lw $s0,0($key0)
+ _ins $t9,24
lw $s1,4($key0)
+ _ins $t10,24
lw $s2,8($key0)
+ _ins $t11,24
lw $s3,12($key0)
- _ins $t4,0
- _ins $t5,0
- _ins $t6,0
- _ins $t7,0
-
-
xor $t0,$t8
xor $t1,$t9
xor $t2,$t10
xor $t3,$t11
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
xor $t0,$t4
xor $t1,$t5
xor $t2,$t6
xor $t3,$t7
+#endif
xor $s0,$t0
xor $s1,$t1
@@ -791,7 +1283,7 @@ _mips_AES_set_encrypt_key:
beqz $inp,.Lekey_done
li $t0,-1
beqz $key,.Lekey_done
- $PTR_ADD $rcon,$Tbl,1024+256
+ $PTR_ADD $rcon,$Tbl,256
.set reorder
lwl $rk0,0+$MSB($inp) # load 128 bits
@@ -843,10 +1335,10 @@ _mips_AES_set_encrypt_key:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
- lbu $i0,1024($i0)
- lbu $i1,1024($i1)
- lbu $i2,1024($i2)
- lbu $i3,1024($i3)
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
sw $rk0,0($key)
sw $rk1,4($key)
@@ -898,10 +1390,10 @@ _mips_AES_set_encrypt_key:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
- lbu $i0,1024($i0)
- lbu $i1,1024($i1)
- lbu $i2,1024($i2)
- lbu $i3,1024($i3)
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
sw $rk0,0($key)
sw $rk1,4($key)
@@ -957,10 +1449,10 @@ _mips_AES_set_encrypt_key:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
- lbu $i0,1024($i0)
- lbu $i1,1024($i1)
- lbu $i2,1024($i2)
- lbu $i3,1024($i3)
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
sw $rk0,0($key)
sw $rk1,4($key)
@@ -999,10 +1491,10 @@ _mips_AES_set_encrypt_key:
$PTR_ADD $i1,$Tbl
$PTR_ADD $i2,$Tbl
$PTR_ADD $i3,$Tbl
- lbu $i0,1024($i0)
- lbu $i1,1024($i1)
- lbu $i2,1024($i2)
- lbu $i3,1024($i3)
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
sll $i0,24
sll $i1,16
sll $i2,8
@@ -1064,7 +1556,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
___
$code.=<<___;
.set reorder
- la $Tbl,AES_Te # PIC-ified 'load address'
+ la $Tbl,AES_Te4 # PIC-ified 'load address'
bal _mips_AES_set_encrypt_key
@@ -1119,7 +1611,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
___
$code.=<<___;
.set reorder
- la $Tbl,AES_Te # PIC-ified 'load address'
+ la $Tbl,AES_Te4 # PIC-ified 'load address'
bal _mips_AES_set_encrypt_key
@@ -1190,6 +1682,16 @@ $code.=<<___;
xor $tpb,$tp9,$tp2
xor $tpd,$tp9,$tp4
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ rotr $tp1,$tpd,16
+ xor $tpe,$tp2
+ rotr $tp2,$tp9,8
+ xor $tpe,$tp1
+ rotr $tp4,$tpb,24
+ xor $tpe,$tp2
+ lw $tp1,4($key) # modulo-scheduled
+ xor $tpe,$tp4
+#else
_ror $tp1,$tpd,16
xor $tpe,$tp2
_ror $tp2,$tpd,-16
@@ -1204,6 +1706,7 @@ $code.=<<___;
xor $tpe,$tp1
lw $tp1,4($key) # modulo-scheduled
xor $tpe,$tp2
+#endif
sub $cnt,1
sw $tpe,0($key)
$PTR_ADD $key,4
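
The tp1..tpe chain in this hunk is InvMixColumns applied to a round-key word, the same formula OpenSSL's aes_core.c uses when converting an encryption schedule for decryption. A reference sketch in C (helper names ours):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
    n &= 31;
    return n ? (x << n) | (x >> (32 - n)) : x;
}

/* Four GF(2^8) doublings at once: xtime on each byte of the word. */
static uint32_t xtime4(uint32_t x)
{
    uint32_t hi = x & 0x80808080u;
    return ((x << 1) & 0xfefefefeu) ^ ((hi - (hi >> 7)) & 0x1b1b1b1bu);
}

/* InvMixColumn of one round-key word (tp1), as in the chain above. */
static uint32_t inv_mix_column(uint32_t tp1)
{
    uint32_t tp2 = xtime4(tp1), tp4 = xtime4(tp2), tp8 = xtime4(tp4);
    uint32_t tp9 = tp8 ^ tp1;
    uint32_t tpb = tp9 ^ tp2, tpd = tp9 ^ tp4, tpe = tp8 ^ tp4 ^ tp2;
    return tpe ^ rotl32(tpd, 16) ^ rotl32(tp9, 24) ^ rotl32(tpb, 8);
}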
@@ -1234,7 +1737,7 @@ ___
# Tables are kept in endian-neutral manner
$code.=<<___;
.rdata
-.align 6
+.align 10
AES_Te:
.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
@@ -1365,46 +1868,6 @@ AES_Te:
.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
-.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
-.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
-.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
-.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
-.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
-.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
-.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
-.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
-.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
-.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
-.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
-.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
-.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
-.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
-.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
-.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
-.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
-.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
-.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
-.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
-.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
-.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
-.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
-.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
-.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
-.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
-.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
-.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
-.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
-.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
-.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
-.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-
-.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
-.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
-.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
-.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
-.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
-
-.align 6
AES_Td:
.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
@@ -1567,6 +2030,46 @@ AES_Td:
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+AES_Te4:
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
___
foreach (split("\n",$code)) {
@@ -1583,6 +2086,9 @@ foreach (split("\n",$code)) {
s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
: eval("24-$3"))/e or
+ s/_ins2\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("ins\t$1,$2,%d,8",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
: eval("$3*-1"))/e or
@@ -1605,6 +2111,11 @@ foreach (split("\n",$code)) {
sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
}
+ if (!$big_endian) {
+ s/(rotr\s+\$[0-9]+,\$[0-9]+),([0-9]+)/sprintf("$1,%d",32-$2)/e;
+ s/(ext\s+\$[0-9]+,\$[0-9]+),([0-9]+),8/sprintf("$1,%d,8",24-$2)/e;
+ }
+
print $_,"\n";
}
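
The foreach pass above rewrites the endian-neutral pseudo-ops into real MIPS instructions; `_ins2`, for instance, becomes `ins rd,rs,pos,8` with the position mirrored to 24-pos on little-endian. A minimal C model of the MIPS32R2 `ins` it expands to (names ours):

#include <stdint.h>

/* ins rd, rs, pos, 8: insert the low byte of rs into rd at bit pos. */
static uint32_t mips_ins8(uint32_t rd, uint32_t rs, unsigned pos)
{
    uint32_t mask = 0xffu << pos;
    return (rd & ~mask) | ((rs << pos) & mask);
}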
diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
index 7c52cbe5f9fa..7a99fc3d0452 100755
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -45,6 +45,8 @@ if ($flavour =~ /64/) {
$PUSH ="stw";
} else { die "nonsense $flavour"; }
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -68,7 +70,7 @@ $key="r5";
$Tbl0="r3";
$Tbl1="r6";
$Tbl2="r7";
-$Tbl3="r2";
+$Tbl3=$out; # stay away from "r2"; $out is offloaded to stack
$s0="r8";
$s1="r9";
@@ -76,7 +78,7 @@ $s2="r10";
$s3="r11";
$t0="r12";
-$t1="r13";
+$t1="r0"; # stay away from "r13";
$t2="r14";
$t3="r15";
@@ -100,9 +102,6 @@ $acc13="r29";
$acc14="r30";
$acc15="r31";
-# stay away from TLS pointer
-if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; }
-else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; }
$mask80=$Tbl2;
$mask1b=$Tbl3;
@@ -337,8 +336,7 @@ $code.=<<___;
$STU $sp,-$FRAME($sp)
mflr r0
- $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
- $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
+ $PUSH $out,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
@@ -365,16 +363,61 @@ $code.=<<___;
bne Lenc_unaligned
Lenc_unaligned_ok:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
lwz $s3,12($inp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $t0,0($inp)
+ lwz $t1,4($inp)
+ lwz $t2,8($inp)
+ lwz $t3,12($inp)
+ rotlwi $s0,$t0,8
+ rotlwi $s1,$t1,8
+ rotlwi $s2,$t2,8
+ rotlwi $s3,$t3,8
+ rlwimi $s0,$t0,24,0,7
+ rlwimi $s1,$t1,24,0,7
+ rlwimi $s2,$t2,24,0,7
+ rlwimi $s3,$t3,24,0,7
+ rlwimi $s0,$t0,24,16,23
+ rlwimi $s1,$t1,24,16,23
+ rlwimi $s2,$t2,24,16,23
+ rlwimi $s3,$t3,24,16,23
+___
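
On little-endian PPC the block is loaded as four LE words and massaged into the byte order the tables expect; each rotlwi/rlwimi triple above is, in effect, a 32-bit byte swap (rlwimi's 0-7 and 16-23 masks select bytes of rotl(x,24), the rest comes from rotl(x,8)). A C sketch of the same computation (names ours):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
    n &= 31;
    return n ? (x << n) | (x >> (32 - n)) : x;
}

/* What the rotlwi + two rlwimi compute; equals __builtin_bswap32(x). */
static uint32_t ppc_bswap32(uint32_t x)
{
    uint32_t r = rotl32(x, 8);                              /* rotlwi            */
    r = (r & ~0xff000000u) | (rotl32(x, 24) & 0xff000000u); /* rlwimi ..,24,0,7  */
    r = (r & ~0x0000ff00u) | (rotl32(x, 24) & 0x0000ff00u); /* rlwimi ..,24,16,23 */
    return r;
}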
+$code.=<<___;
bl LAES_Te
bl Lppc_AES_encrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ rotlwi $t0,$s0,8
+ rotlwi $t1,$s1,8
+ rotlwi $t2,$s2,8
+ rotlwi $t3,$s3,8
+ rlwimi $t0,$s0,24,0,7
+ rlwimi $t1,$s1,24,0,7
+ rlwimi $t2,$s2,24,0,7
+ rlwimi $t3,$s3,24,0,7
+ rlwimi $t0,$s0,24,16,23
+ rlwimi $t1,$s1,24,16,23
+ rlwimi $t2,$s2,24,16,23
+ rlwimi $t3,$s3,24,16,23
+ stw $t0,0($out)
+ stw $t1,4($out)
+ stw $t2,8($out)
+ stw $t3,12($out)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
stw $s0,0($out)
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+___
+$code.=<<___;
b Lenc_done
Lenc_unaligned:
@@ -417,6 +460,7 @@ Lenc_xpage:
bl LAES_Te
bl Lppc_AES_encrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
extrwi $acc00,$s0,8,0
extrwi $acc01,$s0,8,8
@@ -449,8 +493,6 @@ Lenc_xpage:
Lenc_done:
$POP r0,`$FRAME+$LRSAVE`($sp)
- $POP $toc,`$FRAME-$SIZE_T*20`($sp)
- $POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
@@ -764,6 +806,7 @@ Lenc_compact_done:
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .AES_encrypt,.-.AES_encrypt
.globl .AES_decrypt
.align 7
@@ -771,8 +814,7 @@ Lenc_compact_done:
$STU $sp,-$FRAME($sp)
mflr r0
- $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
- $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
+ $PUSH $out,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
@@ -799,16 +841,61 @@ Lenc_compact_done:
bne Ldec_unaligned
Ldec_unaligned_ok:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
lwz $s3,12($inp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $t0,0($inp)
+ lwz $t1,4($inp)
+ lwz $t2,8($inp)
+ lwz $t3,12($inp)
+ rotlwi $s0,$t0,8
+ rotlwi $s1,$t1,8
+ rotlwi $s2,$t2,8
+ rotlwi $s3,$t3,8
+ rlwimi $s0,$t0,24,0,7
+ rlwimi $s1,$t1,24,0,7
+ rlwimi $s2,$t2,24,0,7
+ rlwimi $s3,$t3,24,0,7
+ rlwimi $s0,$t0,24,16,23
+ rlwimi $s1,$t1,24,16,23
+ rlwimi $s2,$t2,24,16,23
+ rlwimi $s3,$t3,24,16,23
+___
+$code.=<<___;
bl LAES_Td
bl Lppc_AES_decrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ rotlwi $t0,$s0,8
+ rotlwi $t1,$s1,8
+ rotlwi $t2,$s2,8
+ rotlwi $t3,$s3,8
+ rlwimi $t0,$s0,24,0,7
+ rlwimi $t1,$s1,24,0,7
+ rlwimi $t2,$s2,24,0,7
+ rlwimi $t3,$s3,24,0,7
+ rlwimi $t0,$s0,24,16,23
+ rlwimi $t1,$s1,24,16,23
+ rlwimi $t2,$s2,24,16,23
+ rlwimi $t3,$s3,24,16,23
+ stw $t0,0($out)
+ stw $t1,4($out)
+ stw $t2,8($out)
+ stw $t3,12($out)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
stw $s0,0($out)
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+___
+$code.=<<___;
b Ldec_done
Ldec_unaligned:
@@ -851,6 +938,7 @@ Ldec_xpage:
bl LAES_Td
bl Lppc_AES_decrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
extrwi $acc00,$s0,8,0
extrwi $acc01,$s0,8,8
@@ -883,8 +971,6 @@ Ldec_xpage:
Ldec_done:
$POP r0,`$FRAME+$LRSAVE`($sp)
- $POP $toc,`$FRAME-$SIZE_T*20`($sp)
- $POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
@@ -1355,6 +1441,7 @@ Ldec_compact_done:
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .AES_decrypt,.-.AES_decrypt
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index 34cbb5d84429..47f416375d1e 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -19,9 +19,10 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
# ECB encrypt ECB decrypt CBC large chunk
-# AMD64 33 41 13.0
-# EM64T 38 59 18.6(*)
-# Core 2 30 43 14.5(*)
+# AMD64 33 43 13.0
+# EM64T 38 56 18.6(*)
+# Core 2 30 42 14.5(*)
+# Atom 65 86 32.1(*)
#
# (*) with hyper-threading off
@@ -366,68 +367,66 @@ $code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
- movzb ($sbox,$t0,1),$t0
- movzb ($sbox,$t1,1),$t1
- movzb ($sbox,$t2,1),$t2
-
movzb `&lo("$s3")`,$t3
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
+ shr \$16,$s2
+ movzb `&hi("$s3")`,$acc2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
- movzb ($sbox,$acc0,1),$t4 #$t0
- movzb ($sbox,$acc1,1),$t5 #$t1
- movzb `&hi("$s3")`,$acc2
+ movzb ($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s0")`,$acc0
- shr \$16,$s2
+ movzb ($sbox,$acc1,1),$t5 #$t1
+ movzb `&lo("$s2")`,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
- shr \$16,$s3
- movzb `&lo("$s2")`,$acc1
shl \$8,$t4
+ shr \$16,$s3
shl \$8,$t5
- movzb ($sbox,$acc1,1),$acc1 #$t0
xor $t4,$t0
- xor $t5,$t1
-
- movzb `&lo("$s3")`,$t4
shr \$16,$s0
+ movzb `&lo("$s3")`,$t4
shr \$16,$s1
- movzb `&lo("$s0")`,$t5
+ xor $t5,$t1
shl \$8,$acc2
- shl \$8,$acc0
- movzb ($sbox,$t4,1),$t4 #$t1
- movzb ($sbox,$t5,1),$t5 #$t2
+ movzb `&lo("$s0")`,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
- xor $acc0,$t3
+ shl \$8,$acc0
movzb `&lo("$s1")`,$acc2
- movzb `&hi("$s3")`,$acc0
shl \$16,$acc1
- movzb ($sbox,$acc2,1),$acc2 #$t3
- movzb ($sbox,$acc0,1),$acc0 #$t0
+ xor $acc0,$t3
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb `&hi("$s3")`,$acc0
+ movzb ($sbox,$t5,1),$t5 #$t2
xor $acc1,$t0
- movzb `&hi("$s0")`,$acc1
shr \$8,$s2
+ movzb `&hi("$s0")`,$acc1
+ shl \$16,$t4
shr \$8,$s1
+ shl \$16,$t5
+ xor $t4,$t1
+ movzb ($sbox,$acc2,1),$acc2 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$acc1 #$t1
movzb ($sbox,$s2,1),$s3 #$t3
movzb ($sbox,$s1,1),$s2 #$t2
- shl \$16,$t4
- shl \$16,$t5
+
shl \$16,$acc2
- xor $t4,$t1
xor $t5,$t2
- xor $acc2,$t3
-
shl \$24,$acc0
+ xor $acc2,$t3
shl \$24,$acc1
- shl \$24,$s3
xor $acc0,$t0
- shl \$24,$s2
+ shl \$24,$s3
xor $acc1,$t1
+ shl \$24,$s2
mov $t0,$s0
mov $t1,$s1
xor $t2,$s2
@@ -466,12 +465,12 @@ sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
$code.=<<___;
- mov $s0,$acc0
- mov $s1,$acc1
- and \$0x80808080,$acc0
- and \$0x80808080,$acc1
- mov $acc0,$t0
- mov $acc1,$t1
+ mov \$0x80808080,$t0
+ mov \$0x80808080,$t1
+ and $s0,$t0
+ and $s1,$t1
+ mov $t0,$acc0
+ mov $t1,$acc1
shr \$7,$t0
lea ($s0,$s0),$r20
shr \$7,$t1
@@ -489,25 +488,25 @@ $code.=<<___;
xor $r20,$s0
xor $r21,$s1
- mov $s2,$acc0
- mov $s3,$acc1
+ mov \$0x80808080,$t2
rol \$24,$s0
+ mov \$0x80808080,$t3
rol \$24,$s1
- and \$0x80808080,$acc0
- and \$0x80808080,$acc1
+ and $s2,$t2
+ and $s3,$t3
xor $r20,$s0
xor $r21,$s1
- mov $acc0,$t2
- mov $acc1,$t3
+ mov $t2,$acc0
ror \$16,$t0
+ mov $t3,$acc1
ror \$16,$t1
- shr \$7,$t2
lea ($s2,$s2),$r20
+ shr \$7,$t2
xor $t0,$s0
- xor $t1,$s1
shr \$7,$t3
- lea ($s3,$s3),$r21
+ xor $t1,$s1
ror \$8,$t0
+ lea ($s3,$s3),$r21
ror \$8,$t1
sub $t2,$acc0
sub $t3,$acc1
@@ -523,23 +522,23 @@ $code.=<<___;
xor $acc0,$r20
xor $acc1,$r21
+ ror \$16,$t2
xor $r20,$s2
+ ror \$16,$t3
xor $r21,$s3
rol \$24,$s2
+ mov 0($sbox),$acc0 # prefetch Te4
rol \$24,$s3
xor $r20,$s2
- xor $r21,$s3
- mov 0($sbox),$acc0 # prefetch Te4
- ror \$16,$t2
- ror \$16,$t3
mov 64($sbox),$acc1
- xor $t2,$s2
- xor $t3,$s3
+ xor $r21,$s3
mov 128($sbox),$r20
+ xor $t2,$s2
ror \$8,$t2
+ xor $t3,$s3
ror \$8,$t3
- mov 192($sbox),$r21
xor $t2,$s2
+ mov 192($sbox),$r21
xor $t3,$s3
___
}
@@ -936,70 +935,69 @@ $code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
- movzb ($sbox,$t0,1),$t0
- movzb ($sbox,$t1,1),$t1
- movzb ($sbox,$t2,1),$t2
-
movzb `&lo("$s3")`,$t3
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
+ shr \$16,$s3
+ movzb `&hi("$s1")`,$acc2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
- movzb ($sbox,$acc0,1),$t4 #$t0
- movzb ($sbox,$acc1,1),$t5 #$t1
- movzb `&hi("$s1")`,$acc2
+ movzb ($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s2")`,$acc0
- shr \$16,$s2
+ movzb ($sbox,$acc1,1),$t5 #$t1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
- shr \$16,$s3
- movzb `&lo("$s2")`,$acc1
- shl \$8,$t4
+ shr \$16,$s2
shl \$8,$t5
- movzb ($sbox,$acc1,1),$acc1 #$t0
- xor $t4,$t0
- xor $t5,$t1
-
- movzb `&lo("$s3")`,$t4
+ shl \$8,$t4
+ movzb `&lo("$s2")`,$acc1
shr \$16,$s0
+ xor $t4,$t0
shr \$16,$s1
- movzb `&lo("$s0")`,$t5
+ movzb `&lo("$s3")`,$t4
+
shl \$8,$acc2
+ xor $t5,$t1
shl \$8,$acc0
- movzb ($sbox,$t4,1),$t4 #$t1
- movzb ($sbox,$t5,1),$t5 #$t2
+ movzb `&lo("$s0")`,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
- xor $acc0,$t3
-
movzb `&lo("$s1")`,$acc2
- movzb `&hi("$s1")`,$acc0
+
shl \$16,$acc1
+ xor $acc0,$t3
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb `&hi("$s1")`,$acc0
movzb ($sbox,$acc2,1),$acc2 #$t3
- movzb ($sbox,$acc0,1),$acc0 #$t0
xor $acc1,$t0
-
+ movzb ($sbox,$t5,1),$t5 #$t2
movzb `&hi("$s2")`,$acc1
+
+ shl \$16,$acc2
shl \$16,$t4
shl \$16,$t5
- movzb ($sbox,$acc1,1),$s1 #$t1
+ xor $acc2,$t3
+ movzb `&hi("$s3")`,$acc2
xor $t4,$t1
+ shr \$8,$s0
xor $t5,$t2
- movzb `&hi("$s3")`,$acc1
- shr \$8,$s0
- shl \$16,$acc2
- movzb ($sbox,$acc1,1),$s2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ movzb ($sbox,$acc1,1),$s1 #$t1
+ movzb ($sbox,$acc2,1),$s2 #$t2
movzb ($sbox,$s0,1),$s3 #$t3
- xor $acc2,$t3
+ mov $t0,$s0
shl \$24,$acc0
shl \$24,$s1
shl \$24,$s2
- xor $acc0,$t0
+ xor $acc0,$s0
shl \$24,$s3
xor $t1,$s1
- mov $t0,$s0
xor $t2,$s2
xor $t3,$s3
___
@@ -1014,12 +1012,12 @@ sub dectransform()
my $prefetch = shift;
$code.=<<___;
- mov $tp10,$acc0
- mov $tp18,$acc8
- and $mask80,$acc0
- and $mask80,$acc8
- mov $acc0,$tp40
- mov $acc8,$tp48
+ mov $mask80,$tp40
+ mov $mask80,$tp48
+ and $tp10,$tp40
+ and $tp18,$tp48
+ mov $tp40,$acc0
+ mov $tp48,$acc8
shr \$7,$tp40
lea ($tp10,$tp10),$tp20
shr \$7,$tp48
@@ -1030,15 +1028,15 @@ $code.=<<___;
and $maskfe,$tp28
and $mask1b,$acc0
and $mask1b,$acc8
- xor $tp20,$acc0
- xor $tp28,$acc8
- mov $acc0,$tp20
- mov $acc8,$tp28
-
- and $mask80,$acc0
- and $mask80,$acc8
- mov $acc0,$tp80
- mov $acc8,$tp88
+ xor $acc0,$tp20
+ xor $acc8,$tp28
+ mov $mask80,$tp80
+ mov $mask80,$tp88
+
+ and $tp20,$tp80
+ and $tp28,$tp88
+ mov $tp80,$acc0
+ mov $tp88,$acc8
shr \$7,$tp80
lea ($tp20,$tp20),$tp40
shr \$7,$tp88
@@ -1049,15 +1047,15 @@ $code.=<<___;
and $maskfe,$tp48
and $mask1b,$acc0
and $mask1b,$acc8
- xor $tp40,$acc0
- xor $tp48,$acc8
- mov $acc0,$tp40
- mov $acc8,$tp48
-
- and $mask80,$acc0
- and $mask80,$acc8
- mov $acc0,$tp80
- mov $acc8,$tp88
+ xor $acc0,$tp40
+ xor $acc8,$tp48
+ mov $mask80,$tp80
+ mov $mask80,$tp88
+
+ and $tp40,$tp80
+ and $tp48,$tp88
+ mov $tp80,$acc0
+ mov $tp88,$acc8
shr \$7,$tp80
xor $tp10,$tp20 # tp2^=tp1
shr \$7,$tp88
@@ -1082,51 +1080,51 @@ $code.=<<___;
mov $tp10,$acc0
mov $tp18,$acc8
xor $tp80,$tp40 # tp4^tp1^=tp8
- xor $tp88,$tp48 # tp4^tp1^=tp8
shr \$32,$acc0
+ xor $tp88,$tp48 # tp4^tp1^=tp8
shr \$32,$acc8
xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
- xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
+ xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
+ rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
- rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
xor `&LO("$tp80")`,`&LO("$tp10")`
- xor `&LO("$tp88")`,`&LO("$tp18")`
shr \$32,$tp80
+ xor `&LO("$tp88")`,`&LO("$tp18")`
shr \$32,$tp88
xor `&LO("$tp80")`,`&LO("$acc0")`
xor `&LO("$tp88")`,`&LO("$acc8")`
mov $tp20,$tp80
- mov $tp28,$tp88
- shr \$32,$tp80
- shr \$32,$tp88
rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
+ mov $tp28,$tp88
rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
- rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
- rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
+ shr \$32,$tp80
xor `&LO("$tp20")`,`&LO("$tp10")`
+ shr \$32,$tp88
xor `&LO("$tp28")`,`&LO("$tp18")`
+ rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
mov $tp40,$tp20
+ rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
mov $tp48,$tp28
+ shr \$32,$tp20
xor `&LO("$tp80")`,`&LO("$acc0")`
+ shr \$32,$tp28
xor `&LO("$tp88")`,`&LO("$acc8")`
`"mov 0($sbox),$mask80" if ($prefetch)`
- shr \$32,$tp20
- shr \$32,$tp28
- `"mov 64($sbox),$maskfe" if ($prefetch)`
rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
+ `"mov 64($sbox),$maskfe" if ($prefetch)`
rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
`"mov 128($sbox),$mask1b" if ($prefetch)`
rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
- rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
`"mov 192($sbox),$tp80" if ($prefetch)`
xor `&LO("$tp40")`,`&LO("$tp10")`
+ rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
xor `&LO("$tp48")`,`&LO("$tp18")`
`"mov 256($sbox),$tp88" if ($prefetch)`
xor `&LO("$tp20")`,`&LO("$acc0")`
@@ -1302,10 +1300,6 @@ private_AES_set_encrypt_key:
call _x86_64_AES_set_encrypt_key
- mov 8(%rsp),%r15
- mov 16(%rsp),%r14
- mov 24(%rsp),%r13
- mov 32(%rsp),%r12
mov 40(%rsp),%rbp
mov 48(%rsp),%rbx
add \$56,%rsp
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
new file mode 100755
index 000000000000..33b1aed3c0b4
--- /dev/null
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -0,0 +1,1395 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Multi-buffer AES-NI procedures process several independent buffers
+# in parallel by interleaving independent instructions.
+#
+# Cycles per byte for interleave factor 4:
+#
+# asymptotic measured
+# ---------------------------
+# Westmere 5.00/4=1.25 5.13/4=1.28
+# Atom 15.0/4=3.75 ?15.7/4=3.93
+# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
+# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
+# Haswell 4.44/4=1.11 4.44/4=1.11
+# Bulldozer 5.75/4=1.44 5.76/4=1.44
+#
+# Cycles per byte for interleave factor 8 (not implemented for
+# pre-AVX processors, where higher interleave factor incidentally
+# doesn't result in improvement):
+#
+# asymptotic measured
+# ---------------------------
+# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
+# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
+# Haswell 5.00/8=0.63 5.00/8=0.63
+# Bulldozer 5.75/8=0.72 5.77/8=0.72
+#
+# (*) Sandy/Ivy Bridge are known to handle high interleave factors
+# suboptimally;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=0;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_multi_cbc_encrypt (
+# struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
+# const AES_KEY *key,
+# int num); /* 1 or 2 */
+#
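
In C terms the descriptor array looks roughly like the sketch below; the struct mirrors the comment above (double iv[2] only forces the IV to offset 24, giving the 40-byte stride the code assumes), and the entry point is libcrypto-internal rather than public API. Field and type names are illustrative:

#include <openssl/aes.h>

typedef struct {
    const void *inp;  /* +0  source */
    void *out;        /* +8  destination */
    int blocks;       /* +16 number of 16-byte blocks in this stream */
    double iv[2];     /* +24 16-byte IV */
} CBC_STREAM;         /* sizeof == 40, matching the 40*$i strides */

void aesni_multi_cbc_encrypt(CBC_STREAM inp[8], const AES_KEY *key, int num);

/* num=1 processes descriptors 0..3; num=2 engages all eight slots
   (serviced in two groups of four, or in one pass of eight on AVX). */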
+$inp="%rdi"; # 1st arg
+$key="%rsi"; # 2nd arg
+$num="%edx";
+
+@inptr=map("%r$_",(8..11));
+@outptr=map("%r$_",(12..15));
+
+($rndkey0,$rndkey1)=("%xmm0","%xmm1");
+@out=map("%xmm$_",(2..5));
+@inp=map("%xmm$_",(6..9));
+($counters,$mask,$zero)=map("%xmm$_",(10..12));
+
+($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_multi_cbc_encrypt
+.type aesni_multi_cbc_encrypt,\@function,3
+.align 32
+aesni_multi_cbc_encrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Lenc_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Lenc4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ pxor $zero,@out[0]
+ movups 0x20-0x78($key),$rndkey0
+ pxor $zero,@out[1]
+ mov 0xf0-0x78($key),$rounds
+ pxor $zero,@out[2]
+ movdqu (@inptr[0]),@inp[0] # load inputs
+ pxor $zero,@out[3]
+ movdqu (@inptr[1]),@inp[1]
+ pxor @inp[0],@out[0]
+ movdqu (@inptr[2]),@inp[2]
+ pxor @inp[1],@out[1]
+ movdqu (@inptr[3]),@inp[3]
+ pxor @inp[2],@out[2]
+ pxor @inp[3],@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_enc4x
+
+.align 32
+.Loop_enc4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesenc $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesenc $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[3],$offset)
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesenc $rndkey,@out[0]
+ aesenc $rndkey,@out[1]
+ aesenc $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesenc $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesenc $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesenc $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesenc $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesenc $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Lenc4x_tail
+
+.align 32
+.Lenc4x_tail:
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movdqu (@inptr[0],$offset),@inp[0]
+ movdqu 0x10-0x78($key),$rndkey1
+
+ aesenclast $rndkey0,@out[0]
+ movdqu (@inptr[1],$offset),@inp[1]
+ pxor $zero,@inp[0]
+ aesenclast $rndkey0,@out[1]
+ movdqu (@inptr[2],$offset),@inp[2]
+ pxor $zero,@inp[1]
+ aesenclast $rndkey0,@out[2]
+ movdqu (@inptr[3],$offset),@inp[3]
+ pxor $zero,@inp[2]
+ aesenclast $rndkey0,@out[3]
+ movdqu 0x20-0x78($key),$rndkey0
+ pxor $zero,@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ pxor @inp[0],@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ pxor @inp[1],@out[1]
+ movups @out[2],-16(@outptr[2],$offset)
+ pxor @inp[2],@out[2]
+ movups @out[3],-16(@outptr[3],$offset)
+ pxor @inp[3],@out[3]
+
+ dec $num
+ jnz .Loop_enc4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ #pxor @inp[0],@out[0]
+ #pxor @inp[1],@out[1]
+ #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
+ #pxor @inp[2],@out[2]
+ #movdqu @out[1],`40*1+24-40*2`($inp)
+ #pxor @inp[3],@out[3]
+ #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
+ #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Lenc4x_loop_grande
+
+.Lenc4x_done:
+___
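
Per-lane termination in .Loop_enc4x rides on a small SIMD trick: pcmpgtd turns each still-positive counter into an all-ones mask and paddd adds the masks, so only active lanes are decremented; exhausted lanes have their pointers redirected to the stack sink by the cmovge/cmovg pair. A C model:

#include <stdint.h>

static void decrement_active(int32_t counters[4])
{
    for (int i = 0; i < 4; i++) {
        int32_t m = -(counters[i] > 0);  /* pcmpgtd: all-ones while active */
        counters[i] += m;                /* paddd: -1 for active lanes only */
    }
}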
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ #movaps -0x68(%rax),%xmm13
+ #movaps -0x58(%rax),%xmm14
+ #movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lenc4x_epilogue:
+ ret
+.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_decrypt,\@function,3
+.align 32
+aesni_multi_cbc_decrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Ldec_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Ldec4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ movups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ movdqu (@inptr[0]),@out[0] # load inputs
+ movdqu (@inptr[1]),@out[1]
+ pxor $zero,@out[0]
+ movdqu (@inptr[2]),@out[2]
+ pxor $zero,@out[1]
+ movdqu (@inptr[3]),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_dec4x
+
+.align 32
+.Loop_dec4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesdec $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesdec $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[3],$offset)
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesdec $rndkey,@out[0]
+ aesdec $rndkey,@out[1]
+ aesdec $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesdec $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesdec $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesdec $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesdec $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesdec $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Ldec4x_tail
+
+.align 32
+.Ldec4x_tail:
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ pxor $rndkey0,@inp[0]
+ pxor $rndkey0,@inp[1]
+ aesdec $rndkey1,@out[3]
+ movdqu 0x10-0x78($key),$rndkey1
+ pxor $rndkey0,@inp[2]
+ pxor $rndkey0,@inp[3]
+ movdqu 0x20-0x78($key),$rndkey0
+
+ aesdeclast @inp[0],@out[0]
+ aesdeclast @inp[1],@out[1]
+ movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
+ movdqu -16(@inptr[1],$offset),@inp[1]
+ aesdeclast @inp[2],@out[2]
+ aesdeclast @inp[3],@out[3]
+ movdqu -16(@inptr[2],$offset),@inp[2]
+ movdqu -16(@inptr[3],$offset),@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ movdqu (@inptr[0],$offset),@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ movdqu (@inptr[1],$offset),@out[1]
+ pxor $zero,@out[0]
+ movups @out[2],-16(@outptr[2],$offset)
+ movdqu (@inptr[2],$offset),@out[2]
+ pxor $zero,@out[1]
+ movups @out[3],-16(@outptr[3],$offset)
+ movdqu (@inptr[3],$offset),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+
+ dec $num
+ jnz .Loop_dec4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Ldec4x_loop_grande
+
+.Ldec4x_done:
+___
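
Note how .Ldec4x_tail xors the last round key into the saved ciphertext blocks before aesdeclast: the instruction xors its key operand in at the very end, so feeding it (last round key ^ previous ciphertext) folds the CBC chaining xor into the final AES round. A sketch with AES-NI intrinsics (compile with -maes):

#include <wmmintrin.h>

static __m128i cbc_dec_last(__m128i state, __m128i last_rk, __m128i prev_ct)
{
    /* aesdeclast(state, k) = InvShiftRows/InvSubBytes(state) ^ k */
    return _mm_aesdeclast_si128(state, _mm_xor_si128(last_rk, prev_ct));
}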
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ #movaps -0x68(%rax),%xmm13
+ #movaps -0x58(%rax),%xmm14
+ #movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Ldec4x_epilogue:
+ ret
+.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+___
+
+ if ($avx) {{{
+my @ptr=map("%r$_",(8..15));
+my $offload=$sink;
+
+my @out=map("%xmm$_",(2..9));
+my @inp=map("%xmm$_",(10..13));
+my ($counters,$zero)=("%xmm14","%xmm15");
+
+$code.=<<___;
+.type aesni_multi_cbc_encrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_encrypt_avx:
+_avx_cbc_enc_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+
+ sub \$192,%rsp
+ and \$-128,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Lenc8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+___
+}
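
Eight lanes would otherwise need sixteen live pointers; instead each lane keeps a single pointer plus the fixed out-in distance stored at 64+8*i(%rsp), and the lea/sub pairs in the main loop flip the pointer between its input and output views. Illustrative helpers (names ours):

#include <stddef.h>

typedef struct {
    unsigned char *ptr;  /* alternates between input and output views */
    ptrdiff_t dist;      /* output - input, fixed per lane */
} lane_t;

static unsigned char *lane_output(lane_t *l)
{
    unsigned char *out = l->ptr + l->dist;  /* lea (ptr,offset)              */
    l->ptr = out + 16;                      /* lea 16(ptr,offset): one past
                                               the output block              */
    return out;
}

static void lane_next_input(lane_t *l)
{
    l->ptr -= l->dist;                      /* sub offset,ptr                */
}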
+$code.=<<___;
+ test $num,$num
+ jz .Lenc8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+
+ vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
+ lea 128(%rsp),$offload # offload area
+ vpxor (@ptr[1]),$zero,@inp[1]
+ vpxor (@ptr[2]),$zero,@inp[2]
+ vpxor (@ptr[3]),$zero,@inp[3]
+ vpxor @inp[0],@out[0],@out[0]
+ vpxor (@ptr[4]),$zero,@inp[0]
+ vpxor @inp[1],@out[1],@out[1]
+ vpxor (@ptr[5]),$zero,@inp[1]
+ vpxor @inp[2],@out[2],@out[2]
+ vpxor (@ptr[6]),$zero,@inp[2]
+ vpxor @inp[3],@out[3],@out[3]
+ vpxor (@ptr[7]),$zero,@inp[3]
+ vpxor @inp[0],@out[4],@out[4]
+ mov \$1,$one # constant of 1
+ vpxor @inp[1],@out[5],@out[5]
+ vpxor @inp[2],@out[6],@out[6]
+ vpxor @inp[3],@out[7],@out[7]
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ vaesenc $rndkey,@out[0],@out[0]
+ cmp 32+4*$i(%rsp),$one
+___
+$code.=<<___ if ($i);
+ mov 64+8*$i(%rsp),$offset
+___
+$code.=<<___;
+ vaesenc $rndkey,@out[1],@out[1]
+ prefetcht0 31(@ptr[$i]) # prefetch input
+ vaesenc $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+___
+$code.=<<___;
+ vaesenc $rndkey,@out[3],@out[3]
+ lea (@ptr[$i],$offset),$offset
+ cmovge %rsp,@ptr[$i] # cancel input
+ vaesenc $rndkey,@out[4],@out[4]
+ cmovg %rsp,$offset # sink output
+ vaesenc $rndkey,@out[5],@out[5]
+ sub @ptr[$i],$offset
+ vaesenc $rndkey,@out[6],@out[6]
+ vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
+ mov $offset,64+8*$i(%rsp)
+ vaesenc $rndkey,@out[7],@out[7]
+ vmovups `16*(3+$i)-0x78`($key),$rndkey
+ lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
+___
+$code.=<<___ if ($i<4);
+ vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
+___
+}
+$code.=<<___;
+ vmovdqu 32(%rsp),$counters
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+ prefetcht0 15(@ptr[$i-1])
+ cmp \$11,$rounds
+ jb .Lenc8x_tail
+
+ vaesenc $rndkey1,@out[0],@out[0]
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vaesenc $rndkey1,@out[5],@out[5]
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0xb0-0x78($key),$rndkey1
+
+ vaesenc $rndkey0,@out[0],@out[0]
+ vaesenc $rndkey0,@out[1],@out[1]
+ vaesenc $rndkey0,@out[2],@out[2]
+ vaesenc $rndkey0,@out[3],@out[3]
+ vaesenc $rndkey0,@out[4],@out[4]
+ vaesenc $rndkey0,@out[5],@out[5]
+ vaesenc $rndkey0,@out[6],@out[6]
+ vaesenc $rndkey0,@out[7],@out[7]
+ vmovups 0xc0-0x78($key),$rndkey0
+ je .Lenc8x_tail
+
+ vaesenc $rndkey1,@out[0],@out[0]
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vaesenc $rndkey1,@out[5],@out[5]
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0xd0-0x78($key),$rndkey1
+
+ vaesenc $rndkey0,@out[0],@out[0]
+ vaesenc $rndkey0,@out[1],@out[1]
+ vaesenc $rndkey0,@out[2],@out[2]
+ vaesenc $rndkey0,@out[3],@out[3]
+ vaesenc $rndkey0,@out[4],@out[4]
+ vaesenc $rndkey0,@out[5],@out[5]
+ vaesenc $rndkey0,@out[6],@out[6]
+ vaesenc $rndkey0,@out[7],@out[7]
+ vmovups 0xe0-0x78($key),$rndkey0
+
+.Lenc8x_tail:
+ vaesenc $rndkey1,@out[0],@out[0]
+ vpxor $zero,$zero,$zero
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vpaddd $counters,$zero,$zero # decrement counters
+ vmovdqu 48(%rsp),$counters
+ vaesenc $rndkey1,@out[5],@out[5]
+ mov 64(%rsp),$offset # pre-load 1st offset
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0x10-0x78($key),$rndkey1
+
+ vaesenclast $rndkey0,@out[0],@out[0]
+ vmovdqa $zero,32(%rsp) # update counters
+ vpxor $zero,$zero,$zero
+ vaesenclast $rndkey0,@out[1],@out[1]
+ vaesenclast $rndkey0,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesenclast $rndkey0,@out[3],@out[3]
+ vaesenclast $rndkey0,@out[4],@out[4]
+ vpaddd $zero,$counters,$counters # decrement counters
+ vmovdqu -0x78($key),$zero # 0-round
+ vaesenclast $rndkey0,@out[5],@out[5]
+ vaesenclast $rndkey0,@out[6],@out[6]
+ vmovdqa $counters,48(%rsp) # update counters
+ vaesenclast $rndkey0,@out[7],@out[7]
+ vmovups 0x20-0x78($key),$rndkey0
+
+ vmovups @out[0],-16(@ptr[0]) # write output
+ sub $offset,@ptr[0] # switch to input
+ vpxor 0x00($offload),@out[0],@out[0]
+ vmovups @out[1],-16(@ptr[1])
+ sub `64+1*8`(%rsp),@ptr[1]
+ vpxor 0x10($offload),@out[1],@out[1]
+ vmovups @out[2],-16(@ptr[2])
+ sub `64+2*8`(%rsp),@ptr[2]
+ vpxor 0x20($offload),@out[2],@out[2]
+ vmovups @out[3],-16(@ptr[3])
+ sub `64+3*8`(%rsp),@ptr[3]
+ vpxor 0x30($offload),@out[3],@out[3]
+ vmovups @out[4],-16(@ptr[4])
+ sub `64+4*8`(%rsp),@ptr[4]
+ vpxor @inp[0],@out[4],@out[4]
+ vmovups @out[5],-16(@ptr[5])
+ sub `64+5*8`(%rsp),@ptr[5]
+ vpxor @inp[1],@out[5],@out[5]
+ vmovups @out[6],-16(@ptr[6])
+ sub `64+6*8`(%rsp),@ptr[6]
+ vpxor @inp[2],@out[6],@out[6]
+ vmovups @out[7],-16(@ptr[7])
+ sub `64+7*8`(%rsp),@ptr[7]
+ vpxor @inp[3],@out[7],@out[7]
+
+ dec $num
+ jnz .Loop_enc8x
+
+ mov 16(%rsp),%rax # original %rsp
+ #mov 24(%rsp),$num
+ #lea `40*8`($inp),$inp
+ #dec $num
+ #jnz .Lenc8x_loop_grande
+
+.Lenc8x_done:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lenc8x_epilogue:
+ ret
+.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type aesni_multi_cbc_decrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_decrypt_avx:
+_avx_cbc_dec_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+ # +192 IV/input offload
+
+ sub \$256,%rsp
+ and \$-256,%rsp
+ sub \$192,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Ldec8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+ vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ lea 192+128(%rsp),$offload # offload area
+
+ vmovdqu (@ptr[0]),@out[0] # load inputs
+ vmovdqu (@ptr[1]),@out[1]
+ vmovdqu (@ptr[2]),@out[2]
+ vmovdqu (@ptr[3]),@out[3]
+ vmovdqu (@ptr[4]),@out[4]
+ vmovdqu (@ptr[5]),@out[5]
+ vmovdqu (@ptr[6]),@out[6]
+ vmovdqu (@ptr[7]),@out[7]
+ vmovdqu @out[0],0x00($offload) # offload inputs
+ vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
+ vmovdqu @out[1],0x10($offload)
+ vpxor $zero,@out[1],@out[1]
+ vmovdqu @out[2],0x20($offload)
+ vpxor $zero,@out[2],@out[2]
+ vmovdqu @out[3],0x30($offload)
+ vpxor $zero,@out[3],@out[3]
+ vmovdqu @out[4],0x40($offload)
+ vpxor $zero,@out[4],@out[4]
+ vmovdqu @out[5],0x50($offload)
+ vpxor $zero,@out[5],@out[5]
+ vmovdqu @out[6],0x60($offload)
+ vpxor $zero,@out[6],@out[6]
+ vmovdqu @out[7],0x70($offload)
+ vpxor $zero,@out[7],@out[7]
+ xor \$0x80,$offload
+ mov \$1,$one # constant of 1
+ jmp .Loop_dec8x
+
+.align 32
+.Loop_dec8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ vaesdec $rndkey,@out[0],@out[0]
+ cmp 32+4*$i(%rsp),$one
+___
+$code.=<<___ if ($i);
+ mov 64+8*$i(%rsp),$offset
+___
+$code.=<<___;
+ vaesdec $rndkey,@out[1],@out[1]
+ prefetcht0 31(@ptr[$i]) # prefetch input
+ vaesdec $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+___
+$code.=<<___;
+ vaesdec $rndkey,@out[3],@out[3]
+ lea (@ptr[$i],$offset),$offset
+ cmovge %rsp,@ptr[$i] # cancel input
+ vaesdec $rndkey,@out[4],@out[4]
+ cmovg %rsp,$offset # sink output
+ vaesdec $rndkey,@out[5],@out[5]
+ sub @ptr[$i],$offset
+ vaesdec $rndkey,@out[6],@out[6]
+ vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
+ mov $offset,64+8*$i(%rsp)
+ vaesdec $rndkey,@out[7],@out[7]
+ vmovups `16*(3+$i)-0x78`($key),$rndkey
+ lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
+___
+$code.=<<___ if ($i<4);
+ vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
+___
+}
+$code.=<<___;
+ vmovdqu 32(%rsp),$counters
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+ prefetcht0 15(@ptr[$i-1])
+ cmp \$11,$rounds
+ jb .Ldec8x_tail
+
+ vaesdec $rndkey1,@out[0],@out[0]
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vaesdec $rndkey1,@out[5],@out[5]
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0xb0-0x78($key),$rndkey1
+
+ vaesdec $rndkey0,@out[0],@out[0]
+ vaesdec $rndkey0,@out[1],@out[1]
+ vaesdec $rndkey0,@out[2],@out[2]
+ vaesdec $rndkey0,@out[3],@out[3]
+ vaesdec $rndkey0,@out[4],@out[4]
+ vaesdec $rndkey0,@out[5],@out[5]
+ vaesdec $rndkey0,@out[6],@out[6]
+ vaesdec $rndkey0,@out[7],@out[7]
+ vmovups 0xc0-0x78($key),$rndkey0
+ je .Ldec8x_tail
+
+ vaesdec $rndkey1,@out[0],@out[0]
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vaesdec $rndkey1,@out[5],@out[5]
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0xd0-0x78($key),$rndkey1
+
+ vaesdec $rndkey0,@out[0],@out[0]
+ vaesdec $rndkey0,@out[1],@out[1]
+ vaesdec $rndkey0,@out[2],@out[2]
+ vaesdec $rndkey0,@out[3],@out[3]
+ vaesdec $rndkey0,@out[4],@out[4]
+ vaesdec $rndkey0,@out[5],@out[5]
+ vaesdec $rndkey0,@out[6],@out[6]
+ vaesdec $rndkey0,@out[7],@out[7]
+ vmovups 0xe0-0x78($key),$rndkey0
+
+.Ldec8x_tail:
+ vaesdec $rndkey1,@out[0],@out[0]
+ vpxor $zero,$zero,$zero
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vpaddd $counters,$zero,$zero # decrement counters
+ vmovdqu 48(%rsp),$counters
+ vaesdec $rndkey1,@out[5],@out[5]
+ mov 64(%rsp),$offset # pre-load 1st offset
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0x10-0x78($key),$rndkey1
+
+ vaesdeclast $rndkey0,@out[0],@out[0]
+ vmovdqa $zero,32(%rsp) # update counters
+ vpxor $zero,$zero,$zero
+ vaesdeclast $rndkey0,@out[1],@out[1]
+ vpxor 0x00($offload),@out[0],@out[0] # xor with IV
+ vaesdeclast $rndkey0,@out[2],@out[2]
+ vpxor 0x10($offload),@out[1],@out[1]
+ vpcmpgtd $zero,$counters,$zero
+ vaesdeclast $rndkey0,@out[3],@out[3]
+ vpxor 0x20($offload),@out[2],@out[2]
+ vaesdeclast $rndkey0,@out[4],@out[4]
+ vpxor 0x30($offload),@out[3],@out[3]
+ vpaddd $zero,$counters,$counters # decrement counters
+ vmovdqu -0x78($key),$zero # 0-round
+ vaesdeclast $rndkey0,@out[5],@out[5]
+ vpxor 0x40($offload),@out[4],@out[4]
+ vaesdeclast $rndkey0,@out[6],@out[6]
+ vpxor 0x50($offload),@out[5],@out[5]
+ vmovdqa $counters,48(%rsp) # update counters
+ vaesdeclast $rndkey0,@out[7],@out[7]
+ vpxor 0x60($offload),@out[6],@out[6]
+ vmovups 0x20-0x78($key),$rndkey0
+
+ vmovups @out[0],-16(@ptr[0]) # write output
+ sub $offset,@ptr[0] # switch to input
+ vmovdqu 128+0(%rsp),@out[0]
+ vpxor 0x70($offload),@out[7],@out[7]
+ vmovups @out[1],-16(@ptr[1])
+ sub `64+1*8`(%rsp),@ptr[1]
+ vmovdqu @out[0],0x00($offload)
+ vpxor $zero,@out[0],@out[0]
+ vmovdqu 128+16(%rsp),@out[1]
+ vmovups @out[2],-16(@ptr[2])
+ sub `64+2*8`(%rsp),@ptr[2]
+ vmovdqu @out[1],0x10($offload)
+ vpxor $zero,@out[1],@out[1]
+ vmovdqu 128+32(%rsp),@out[2]
+ vmovups @out[3],-16(@ptr[3])
+ sub `64+3*8`(%rsp),@ptr[3]
+ vmovdqu @out[2],0x20($offload)
+ vpxor $zero,@out[2],@out[2]
+ vmovdqu 128+48(%rsp),@out[3]
+ vmovups @out[4],-16(@ptr[4])
+ sub `64+4*8`(%rsp),@ptr[4]
+ vmovdqu @out[3],0x30($offload)
+ vpxor $zero,@out[3],@out[3]
+ vmovdqu @inp[0],0x40($offload)
+ vpxor @inp[0],$zero,@out[4]
+ vmovups @out[5],-16(@ptr[5])
+ sub `64+5*8`(%rsp),@ptr[5]
+ vmovdqu @inp[1],0x50($offload)
+ vpxor @inp[1],$zero,@out[5]
+ vmovups @out[6],-16(@ptr[6])
+ sub `64+6*8`(%rsp),@ptr[6]
+ vmovdqu @inp[2],0x60($offload)
+ vpxor @inp[2],$zero,@out[6]
+ vmovups @out[7],-16(@ptr[7])
+ sub `64+7*8`(%rsp),@ptr[7]
+ vmovdqu @inp[3],0x70($offload)
+ vpxor @inp[3],$zero,@out[7]
+
+ xor \$128,$offload
+ dec $num
+ jnz .Loop_dec8x
+
+ mov 16(%rsp),%rax # original %rsp
+ #mov 24(%rsp),$num
+ #lea `40*8`($inp),$inp
+ #dec $num
+ #jnz .Ldec8x_loop_grande
+
+.Ldec8x_done:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Ldec8x_epilogue:
+ ret
+.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
+___
+ }}}
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 16(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+ lea -56-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)/8, qword count for rep movsq
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_multi_cbc_encrypt
+ .rva .LSEH_end_aesni_multi_cbc_encrypt
+ .rva .LSEH_info_aesni_multi_cbc_encrypt
+ .rva .LSEH_begin_aesni_multi_cbc_decrypt
+ .rva .LSEH_end_aesni_multi_cbc_decrypt
+ .rva .LSEH_info_aesni_multi_cbc_decrypt
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
+ .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
+ .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_multi_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
+.LSEH_info_aesni_multi_cbc_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_multi_cbc_encrypt_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
+.LSEH_info_aesni_multi_cbc_decrypt_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
+___
+}
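+# A gloss on the .xdata entries above: ".byte 9,0,0,0" is an
+# UNWIND_INFO header -- version 1 with UNW_FLAG_EHANDLER, i.e.
+# 1|(1<<3), and zero prologue size and unwind-code count -- while the
+# pair of .rva values is the HandlerData[] that se_handler reads back
+# as the prologue/epilogue labels bounding the region it has to
+# unwind manually.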
+####################################################################
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8); # REX.R: ModR/M reg field names xmm8-15
+ $rex|=0x01 if($src>=8); # REX.B: ModR/M r/m field names xmm8-15
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
+
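+# A worked example of the translation performed by aesni() above,
+# derived by hand from the sub rather than captured from module output:
+#
+#	aesenc %xmm1,%xmm2 -> .byte 0x66,0x0f,0x38,0xdc,0xd1
+#	  where 0xd1 = 0xc0|(1&7)|((2&7)<<3) is the ModR/M byte;
+#	aesenc %xmm9,%xmm2 -> .byte 0x66,0x41,0x0f,0x38,0xdc,0xd1
+#	  where rex() contributes 0x41, REX.B, for the extended xmm9.
+#
+# Emitting such .byte sequences keeps the module usable with
+# assemblers that predate the AES-NI mnemonics.
+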
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index 3c8f6c19e7bd..97992adca7c3 100755
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -21,16 +21,25 @@
# subroutine:
#
# AES-128-CBC +SHA1 stitch gain
-# Westmere 3.77[+5.6] 9.37 6.65 +41%
-# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+# Westmere 3.77[+5.3] 9.07 6.55 +38%
+# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
+# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
+# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
+# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
#
# AES-192-CBC
-# Westmere 4.51 10.11 6.97 +45%
-# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+# Westmere 4.51 9.81 6.80 +44%
+# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
+# Ivy Bridge 6.05 10.65 6.07 +75%
+# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
+# Bulldozer 6.89 12.84 6.96 +84%
#
# AES-256-CBC
-# Westmere 5.25 10.85 7.25 +50%
-# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+# Westmere 5.25 10.55 7.21 +46%
+# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
+# Ivy Bridge 7.05 11.65 7.12 +64%
+# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
+# Bulldozer 8.00 13.95 8.25 +69%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
# background information. Above numbers in parentheses are SSSE3
@@ -45,8 +54,25 @@
# standalone AESNI-CBC decrypt:
#
# AES-128-CBC AES-192-CBC AES-256-CBC
-# Westmere 1.31 1.55 1.80
-# Sandy Bridge 0.93 1.06 1.22
+# Westmere 1.25 1.50 1.75
+# Sandy Bridge 0.74 0.91 1.09
+# Ivy Bridge 0.74 0.90 1.11
+# Haswell 0.63 0.76 0.88
+# Bulldozer 0.70 0.85 0.99
+
+# And indeed:
+#
+# AES-256-CBC +SHA1 stitch gain
+# Westmere 1.75 7.20 6.68 +7.8%
+# Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%)
+# Ivy Bridge 1.11 5.70 5.45 +4.6%
+# Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%)
+# Bulldozer 0.99 6.95 5.95 +17%(**)
+#
+# (*) The improvement coefficient on Haswell is tiny because the
+# AVX1 stitch is compared against a sum that includes the faster
+# AVX2 SHA1.
+# (**) Execution is fully dominated by the integer code sequence and
+# SIMD still hardly shows [in a single-process benchmark;-]
$flavour = shift;
$output = shift;
@@ -68,6 +94,11 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+
+$stitched_decrypt=0;
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -86,11 +117,15 @@ $code.=<<___;
.globl aesni_cbc_sha1_enc
.type aesni_cbc_sha1_enc,\@abi-omnipotent
-.align 16
+.align 32
aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
- mov OPENSSL_ia32cap_P+4(%rip),%r11d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11
+___
+$code.=<<___ if ($shaext);
+ bt \$61,%r11 # check SHA bit
+ jc aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
and \$`1<<28`,%r11d # mask AVX bit
@@ -112,10 +147,21 @@ my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
-my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
my $K_XX_XX="%r11";
-my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
-my @rndkey=("%xmm14","%xmm15");
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc
+my @rndkey=("%xmm14","%xmm15"); # for enc
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
+
+if (1) { # reassign for Atom Silvermont
+ # The goal is to minimize amount of instructions with more than
+ # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
+ # SSSE3 instructions to upper half of the register bank.
+ @X=map("%xmm$_",(8..11,4..7));
+ @Tx=map("%xmm$_",(12,13,3));
+ ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+ @rndkey=("%xmm0","%xmm1");
+}
sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
@@ -129,7 +175,7 @@ my $_ror=sub { &ror(@_) };
$code.=<<___;
.type aesni_cbc_sha1_enc_ssse3,\@function,6
-.align 16
+.align 32
aesni_cbc_sha1_enc_ssse3:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
#shr \$6,$len # debugging artefact
@@ -161,16 +207,16 @@ $code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
- mov $key,%r15
+ lea 112($key),%r15 # size optimization
movdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___
-my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
shl \$6,$len
sub $in0,$out
- mov 240($key),$rounds
+ mov 240-112($key),$rounds
add $inp,$len # end of input
lea K_XX_XX(%rip),$K_XX_XX
@@ -180,19 +226,22 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
- movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
movdqa 0($K_XX_XX),@Tx[1] # K_00_19
movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
movdqu 16($inp),@X[-3&7]
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
- pshufb @X[2],@X[-4&7] # byte swap
+ pshufb @Tx[2],@X[-4&7] # byte swap
+ pshufb @Tx[2],@X[-3&7]
+ pshufb @Tx[2],@X[-2&7]
add \$64,$inp
- pshufb @X[2],@X[-3&7]
- pshufb @X[2],@X[-2&7]
- pshufb @X[2],@X[-1&7]
paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @Tx[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@@ -201,8 +250,8 @@ $code.=<<___;
psubd @Tx[1],@X[-3&7]
movdqa @X[-2&7],32(%rsp)
psubd @Tx[1],@X[-2&7]
- movups ($key),$rndkey0 # $key[0]
- movups 16($key),$rndkey[0] # forward reference
+ movups -112($key),$rndkey0 # $key[0]
+ movups 16-112($key),$rndkey[0] # forward reference
jmp .Loop_ssse3
___
@@ -219,31 +268,31 @@ ___
___
$code.=<<___;
xorps $in,$iv
+ movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
- movups `32+16*$k`($key),$rndkey[1]
___
} elsif ($k==9) {
$sn++;
$code.=<<___;
cmp \$11,$rounds
jb .Laesenclast$sn
- movups `32+16*($k+0)`($key),$rndkey[1]
+ movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
- movups `32+16*($k+1)`($key),$rndkey[0]
+ movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
- movups `32+16*($k+2)`($key),$rndkey[1]
+ movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
- movups `32+16*($k+3)`($key),$rndkey[0]
+ movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
- movups 16($key),$rndkey[1] # forward reference
+ movups 16-112($key),$rndkey[1] # forward reference
___
} else {
$code.=<<___;
+ movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
- movups `32+16*$k`($key),$rndkey[1]
___
}
$r++; unshift(@rndkey,pop(@rndkey));
@@ -255,61 +304,61 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[0],@X[-3&7]);
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
+
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
-
+ eval(shift(@insns)); # ror
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
- &movdqa (@Tx[0],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
- &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -317,12 +366,13 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
- eval(shift(@insns));
&movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@@ -333,27 +383,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
- eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if ($Xi==8);
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
- eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ eval(shift(@insns));
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
}
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@@ -361,29 +414,31 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &psrld (@Tx[0],30);
eval(shift(@insns));
- eval(shift(@insns)); # rol
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -404,10 +459,11 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@@ -415,17 +471,17 @@ sub Xuplast_ssse3_80()
foreach (@insns) { eval; } # remaining instructions
&cmp ($inp,$len);
- &je (".Ldone_ssse3");
+ &je (shift);
unshift(@Tx,pop(@Tx));
- &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask
&movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
&movdqu (@X[-4&7],"0($inp)"); # load input
&movdqu (@X[-3&7],"16($inp)");
&movdqu (@X[-2&7],"32($inp)");
&movdqu (@X[-1&7],"48($inp)");
- &pshufb (@X[-4&7],@X[2]); # byte swap
+ &pshufb (@X[-4&7],@Tx[2]); # byte swap
&add ($inp,64);
$Xi=0;
@@ -439,7 +495,10 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
- &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
@@ -450,6 +509,8 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }
@@ -465,76 +526,106 @@ sub Xtail_ssse3()
foreach (@insns) { eval; }
}
-sub body_00_19 () {
- use integer;
- my ($k,$n);
- my @r=(
+my @body_00_19 = (
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
- '&xor ($c,$d);',
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&xor ($c,$d);', # restore $c
- '&xor (@T[0],$d);',
- '&add ($e,$a);',
'&$_ror ($b,$j?7:2);', # $b>>>2
- '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&xor (@T[0],$d);',
+ '&mov (@T[1],$a);', # $b for next round
+
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&xor ($b,$c);', # $c^$d for next round
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&and (@T[1],$b);', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c);', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
+
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_00_19;
+
$n = scalar(@r);
$k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
@r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
$jj++;
+
return @r;
}
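
To make the scheduling concrete, here is a standalone sketch
(illustrative only, assuming the ten snippet strings that @body_00_19
holds above) that replays which of the first twenty round bodies absorb
an aesenc. In the module $jj keeps running across all four groups,
giving 12+8+12+8 = 40 insertion points over 80 rounds -- exactly four
16-byte CBC blocks at ten $aesenc() calls each per 64-byte SHA1 block:

	#!/usr/bin/env perl
	# illustrative replay of the "12 aesencs per these 20 rounds" math
	use integer;
	my $n = 10;			# snippet strings per round body
	for my $jj (0 .. 19) {
		my $k = (($jj+1)*12/20)*20*$n/12;
		print "round $jj absorbs an aesenc at slot ", $k % $n, "\n"
		    if ($jj == $k/$n);	# true for 12 of the 20 rounds
	}
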
-sub body_20_39 () {
- use integer;
- my ($k,$n);
- my @r=(
+my @body_20_39 = (
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
- '&xor (@T[0],$d);', # ($b^$d)
- '&mov (@T[1],$a);', # $b in next round
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
+ '&mov (@T[1],$a);', # $b for next round
+
'&$_rol ($a,5);',
- '&xor (@T[0],$c);', # ($b^$d^$c)
- '&add ($e,$a);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round
+
'&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
+
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_20_39;
+
$n = scalar(@r);
$k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
- @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20);
$jj++;
+
return @r;
}
-sub body_40_59 () {
- use integer;
- my ($k,$n);
- my @r=(
+my @body_40_59 = (
'($a,$b,$c,$d,$e)=@V;'.
- '&mov (@T[1],$c);',
- '&xor ($c,$d);',
- '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
- '&and (@T[1],$d);',
- '&and (@T[0],$c);', # ($b&($c^$d))
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40);', # restore $c
+
'&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[1]);',
- '&mov (@T[1],$a);', # $b in next round
+ '&mov (@T[1],$a);', # $b for next round
+ '&xor (@T[0],$c);',
+
'&$_rol ($a,5);',
'&add ($e,@T[0]);',
- '&xor ($c,$d);', # restore $c
- '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59);', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59);', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
+
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_40_59;
+
$n = scalar(@r);
$k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
- @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40);
$jj++;
+
return @r;
}
$code.=<<___;
-.align 16
+.align 32
.Loop_ssse3:
___
&Xupdate_ssse3_16_31(\&body_00_19);
@@ -553,7 +644,7 @@ ___
&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_20_39);
- &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+ &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
$saved_j=$j; @saved_V=@V;
$saved_r=$r; @saved_rndkey=@rndkey;
@@ -575,11 +666,13 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
+ mov $C,@T[1]
mov $D,12($ctx)
+ xor $D,@T[1]
mov $E,16($ctx)
+ and @T[1],@T[0]
jmp .Loop_ssse3
-.align 16
.Ldone_ssse3:
___
$jj=$j=$saved_j; @V=@saved_V;
@@ -631,7 +724,278 @@ $code.=<<___;
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___
-$j=$jj=$r=$sn=0;
+ if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+# reassign for Atom Silvermont (see above)
+($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
+@X=map("%xmm$_",(8..13,6,7));
+@Tx=map("%xmm$_",(14,15,5));
+
+my @aes256_dec = (
+ '&movdqu($inout0,"0x00($in0)");',
+ '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);',
+ '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);',
+ '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);',
+
+ '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
+ '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3]
+ undef,undef
+ );
+for ($i=0;$i<13;$i++) {
+ push (@aes256_dec,(
+ '&aesdec ($inout0,$rndkey0);',
+ '&aesdec ($inout1,$rndkey0);',
+ '&aesdec ($inout2,$rndkey0);',
+ '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+ ));
+ push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
+ push (@aes256_dec,(undef,undef)) if ($i==5);
+}
+push(@aes256_dec,(
+ '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
+ '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
+ '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
+ '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',
+
+ '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");',
+ '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);',
+ '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);',
+ '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);',
+
+ '&movups ("0x30($out,$in0)",$inout3);'
+ ));
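+# (Gloss: the undef entries are pacing stalls -- by this count the list
+# holds 81 elements, one per SHA1 round plus the final store; the
+# body_*_dec() subs below pull one element per round via $rx, and the
+# final element is eval'ed separately after the round loops.)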
+
+sub body_00_19_dec () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39_dec() if ($rx==19);
+
+ my @r=@body_00_19;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+sub body_20_39_dec () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59_dec() if ($rx==39);
+
+ my @r=@body_20_39;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+sub body_40_59_dec () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+
+ my @r=@body_40_59;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+$code.=<<___;
+.globl aesni256_cbc_sha1_dec
+.type aesni256_cbc_sha1_dec,\@abi-omnipotent
+.align 32
+aesni256_cbc_sha1_dec:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni256_cbc_sha1_dec_avx
+___
+$code.=<<___;
+ jmp aesni256_cbc_sha1_dec_ssse3
+ ret
+.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
+
+.type aesni256_cbc_sha1_dec_ssse3,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_dec_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ movdqu ($ivp),@X[3] # load IV
+ #mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @Tx[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @Tx[2],@X[-3&7]
+ pshufb @Tx[2],@X[-2&7]
+ pshufb @Tx[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movdqu -112($key),$rndkey0 # $key[0]
+ jmp .Loop_dec_ssse3
+
+.align 32
+.Loop_dec_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_32_79(\&body_00_19_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_rx=$rx;
+
+ &Xloop_ssse3(\&body_20_39_dec);
+ &Xloop_ssse3(\&body_20_39_dec);
+ &Xloop_ssse3(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_dec_ssse3
+
+.Ldone_dec_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $rx=$saved_rx;
+
+ &Xtail_ssse3(\&body_20_39_dec);
+ &Xtail_ssse3(\&body_20_39_dec);
+ &Xtail_ssse3(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups @X[3],($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_dec_ssse3:
+ ret
+.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
+___
+ }}}
+$j=$jj=$r=$rx=0;
if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
@@ -641,13 +1005,17 @@ my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
+my $Kx=@Tx[2];
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
$code.=<<___;
.type aesni_cbc_sha1_enc_avx,\@function,6
-.align 16
+.align 32
aesni_cbc_sha1_enc_avx:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
#shr \$6,$len # debugging artefact
@@ -680,17 +1048,16 @@ $code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
- mov $key,%r15
+ lea 112($key),%r15 # size optimization
vmovdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___
-my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
shl \$6,$len
sub $in0,$out
- mov 240($key),$rounds
- add \$112,$key # size optimization
+ mov 240-112($key),$rounds
add $inp,$len # end of input
lea K_XX_XX(%rip),$K_XX_XX
@@ -700,9 +1067,12 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
- vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqa 0($K_XX_XX),$Kx # K_00_19
vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
vmovdqu 16($inp),@X[-3&7]
vmovdqu 32($inp),@X[-2&7]
@@ -712,13 +1082,13 @@ $code.=<<___;
vpshufb @X[2],@X[-3&7],@X[-3&7]
vpshufb @X[2],@X[-2&7],@X[-2&7]
vpshufb @X[2],@X[-1&7],@X[-1&7]
- vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
- vpaddd @Tx[1],@X[-3&7],@X[1]
- vpaddd @Tx[1],@X[-2&7],@X[2]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
vmovdqa @X[1],16(%rsp)
vmovdqa @X[2],32(%rsp)
- vmovups -112($key),$rndkey0 # $key[0]
+ vmovups -112($key),$rndkey[1] # $key[0]
vmovups 16-112($key),$rndkey[0] # forward reference
jmp .Loop_avx
___
@@ -728,14 +1098,14 @@ my $aesenc=sub {
my ($n,$k)=($r/10,$r%10);
if ($k==0) {
$code.=<<___;
- vmovups `16*$n`($in0),$in # load input
- vxorps $rndkey0,$in,$in
+ vmovdqu `16*$n`($in0),$in # load input
+ vpxor $rndkey[1],$in,$in
___
$code.=<<___ if ($n);
vmovups $iv,`16*($n-1)`($out,$in0) # write output
___
$code.=<<___;
- vxorps $in,$iv,$iv
+ vpxor $in,$iv,$iv
vaesenc $rndkey[0],$iv,$iv
vmovups `32+16*$k-112`($key),$rndkey[1]
___
@@ -755,6 +1125,7 @@ ___
vmovups `32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
vaesenclast $rndkey[0],$iv,$iv
+ vmovups -112($key),$rndkey[0]
vmovups 16-112($key),$rndkey[1] # forward reference
___
} else {
@@ -778,10 +1149,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
@@ -807,31 +1178,31 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
eval(shift(@insns));
eval(shift(@insns));
- &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword
&vpaddd (@X[0],@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &vpsrld (@Tx[1],@Tx[2],30);
&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ &vpsrld (@Tx[0],@Tx[1],30);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &vpslld (@Tx[2],@Tx[2],2);
- &vpxor (@X[0],@X[0],@Tx[1]);
+ &vpslld (@Tx[1],@Tx[1],2);
+ &vpxor (@X[0],@X[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
eval(shift(@insns));
eval(shift(@insns));
- &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
@@ -839,7 +1210,6 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
- push(@Tx,shift(@Tx));
}
sub Xupdate_avx_32_79()
@@ -858,12 +1228,8 @@ sub Xupdate_avx_32_79()
&vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
- if ($Xi%5) {
- &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
- } else { # ... or load next one
- &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
- }
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0);
eval(shift(@insns)); # ror
eval(shift(@insns));
@@ -893,7 +1259,6 @@ sub Xupdate_avx_32_79()
&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -904,7 +1269,6 @@ sub Xupdate_avx_32_79()
foreach (@insns) { eval; } # remaining instructions
$Xi++; push(@X,shift(@X)); # "rotate" X[]
- push(@Tx,shift(@Tx));
}
sub Xuplast_avx_80()
@@ -914,28 +1278,26 @@ sub Xuplast_avx_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
foreach (@insns) { eval; } # remaining instructions
&cmp ($inp,$len);
- &je (".Ldone_avx");
+ &je (shift);
- unshift(@Tx,pop(@Tx));
-
- &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
- &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19
&vmovdqu(@X[-4&7],"0($inp)"); # load input
&vmovdqu(@X[-3&7],"16($inp)");
&vmovdqu(@X[-2&7],"32($inp)");
&vmovdqu(@X[-1&7],"48($inp)");
- &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap
&add ($inp,64);
$Xi=0;
@@ -949,15 +1311,15 @@ sub Xloop_avx()
eval(shift(@insns));
eval(shift(@insns));
- &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
@@ -975,7 +1337,7 @@ sub Xtail_avx()
}
$code.=<<___;
-.align 16
+.align 32
.Loop_avx:
___
&Xupdate_avx_16_31(\&body_00_19);
@@ -994,7 +1356,7 @@ ___
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_20_39);
- &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+ &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"
$saved_j=$j; @saved_V=@V;
$saved_r=$r; @saved_rndkey=@rndkey;
@@ -1016,11 +1378,13 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
+ mov $C,@T[1]
mov $D,12($ctx)
+ xor $D,@T[1]
mov $E,16($ctx)
+ and @T[1],@T[0]
jmp .Loop_avx
-.align 16
.Ldone_avx:
___
$jj=$j=$saved_j; @V=@saved_V;
@@ -1072,6 +1436,218 @@ $code.=<<___;
ret
.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___
+
+ if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+@aes256_dec = (
+ '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
+ '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
+ '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
+ '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
+
+ '&vmovups($rndkey0,"16-112($key)");',
+ '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3]
+ undef,undef
+ );
+for ($i=0;$i<13;$i++) {
+ push (@aes256_dec,(
+ '&vaesdec ($inout0,$inout0,$rndkey0);',
+ '&vaesdec ($inout1,$inout1,$rndkey0);',
+ '&vaesdec ($inout2,$inout2,$rndkey0);',
+ '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+ ));
+ push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
+ push (@aes256_dec,(undef,undef)) if ($i==5);
+}
+push(@aes256_dec,(
+ '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
+ '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
+ '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
+ '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',
+
+ '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
+ '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
+ '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
+ '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',
+
+ '&vmovups ("0x30($out,$in0)",$inout3);'
+ ));
+
+$code.=<<___;
+.type aesni256_cbc_sha1_dec_avx,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_dec_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ vmovdqu ($ivp),@X[3] # load IV
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),$Kx # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ jmp .Loop_dec_avx
+
+.align 32
+.Loop_dec_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_32_79(\&body_00_19_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_rx=$rx;
+
+ &Xloop_avx(\&body_20_39_dec);
+ &Xloop_avx(\&body_20_39_dec);
+ &Xloop_avx(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_dec_avx
+
+.Ldone_dec_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $rx=$saved_rx;
+
+ &Xtail_avx(\&body_20_39_dec);
+ &Xtail_avx(\&body_20_39_dec);
+ &Xtail_avx(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups @X[3],($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_dec_avx:
+ ret
+.size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
+___
+ }}}
}
$code.=<<___;
.align 64
@@ -1081,11 +1657,180 @@ K_XX_XX:
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
+ if ($shaext) {{{
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$rounds="%r11d";
+
+($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+@rndkey=("%xmm0","%xmm1");
+$r=0;
+
+my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
+my @MSG=map("%xmm$_",(3..6));
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_shaext,\@function,6
+.align 32
+aesni_cbc_sha1_enc_shaext:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+___
+$code.=<<___ if ($win64);
+ lea `-8-10*16`(%rsp),%rsp
+ movaps %xmm6,-8-10*16(%rax)
+ movaps %xmm7,-8-9*16(%rax)
+ movaps %xmm8,-8-8*16(%rax)
+ movaps %xmm9,-8-7*16(%rax)
+ movaps %xmm10,-8-6*16(%rax)
+ movaps %xmm11,-8-5*16(%rax)
+ movaps %xmm12,-8-4*16(%rax)
+ movaps %xmm13,-8-3*16(%rax)
+ movaps %xmm14,-8-2*16(%rax)
+ movaps %xmm15,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap
+
+ mov 240($key),$rounds
+ sub $in0,$out
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ lea 112($key),$key # size optimization
+
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ pshufd \$0b00011011,$E,$E # flip word order
+ jmp .Loop_shaext
+.align 16
+.Loop_shaext:
+___
+ &$aesenc();
+$code.=<<___;
+ movdqu ($inp),@MSG[0]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x10($inp),@MSG[1]
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+ &$aesenc();
+$code.=<<___;
+ pshufb $BSWAP,@MSG[1]
+
+ paddd @MSG[0],$E
+ movdqu 0x20($inp),@MSG[2]
+ lea 0x40($inp),$inp
+ pxor $E_SAVE,@MSG[0] # black magic
+___
+ &$aesenc();
+$code.=<<___;
+ pxor $E_SAVE,@MSG[0] # black magic
+ movdqa $ABCD,$E_
+ pshufb $BSWAP,@MSG[2]
+ sha1rnds4 \$0,$E,$ABCD # 0-3
+ sha1nexte @MSG[1],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqu -0x10($inp),@MSG[3]
+ movdqa $ABCD,$E
+ pshufb $BSWAP,@MSG[3]
+___
+ &$aesenc();
+$code.=<<___;
+ sha1rnds4 \$0,$E_,$ABCD # 4-7
+ sha1nexte @MSG[2],$E
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+___
+ &$aesenc();
+
+for($i=2;$i<20-4;$i++) {
+$code.=<<___;
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
+ sha1nexte @MSG[3],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ sha1msg2 @MSG[3],@MSG[0]
+ pxor @MSG[3],@MSG[1]
+ sha1msg1 @MSG[3],@MSG[2]
+___
+ ($E,$E_)=($E_,$E);
+ push(@MSG,shift(@MSG));
+
+ &$aesenc();
+}
+$code.=<<___;
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[3],$E_
+ sha1msg2 @MSG[3],@MSG[0]
+ pxor @MSG[3],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[0],$E
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $E_SAVE,@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[1],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $MSG[0],$E
+___
+ while($r<40) { &$aesenc(); } # remaining aesenc's
+$code.=<<___;
+ dec $len
+
+ paddd $ABCD_SAVE,$ABCD
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movups $iv,($ivp) # write IV
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-10*16(%rax),%xmm6
+ movaps -8-9*16(%rax),%xmm7
+ movaps -8-8*16(%rax),%xmm8
+ movaps -8-7*16(%rax),%xmm9
+ movaps -8-6*16(%rax),%xmm10
+ movaps -8-5*16(%rax),%xmm11
+ movaps -8-4*16(%rax),%xmm12
+ movaps -8-3*16(%rax),%xmm13
+ movaps -8-2*16(%rax),%xmm14
+ movaps -8-1*16(%rax),%xmm15
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
+___
+ }}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
@@ -1127,7 +1872,21 @@ ssse3_handler:
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha1_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lseh_no_shaext
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+.Lseh_no_shaext:
+___
+$code.=<<___;
lea 96(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
@@ -1199,6 +1958,11 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_aesni_cbc_sha1_enc_avx
.rva .LSEH_info_aesni_cbc_sha1_enc_avx
___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_end_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_info_aesni_cbc_sha1_enc_shaext
+___
$code.=<<___;
.section .xdata
.align 8
@@ -1213,6 +1977,12 @@ $code.=<<___ if ($avx);
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
+$code.=<<___ if ($shaext);
+.LSEH_info_aesni_cbc_sha1_enc_shaext:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
}
####################################################################
@@ -1223,28 +1993,65 @@ sub rex {
$rex|=0x04 if($dst>=8);
$rex|=0x01 if($src>=8);
- push @opcode,$rex|0x40 if($rex);
+ unshift @opcode,$rex|0x40 if($rex);
+}
+
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ rex(\@opcode,$3,$2);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
}
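
A spot check of the two encoders above, derived by hand rather than
captured from the module: sha1rnds4 $3,%xmm5,%xmm7 takes the .byte
branch and emits

	.byte	0x0f,0x3a,0xcc,0xfd,3

with 0xfd = 0xc0|(5&7)|((7&7)<<3), while sha1nexte %xmm4,%xmm10 goes
through sha1op38() and emits

	.byte	0x44,0x0f,0x38,0xc8,0xd4

where 0x44 is REX.R for xmm10. The latter is also why rex() now
unshifts instead of pushes: @opcode arrives pre-seeded with the
0x0f,0x38 escape, so the REX byte has to land in front of it.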
sub aesni {
my $line=shift;
- my @opcode=(0x66);
+ my @opcode=(0x0f,0x38);
if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
my %opcodelet = (
- "aesenc" => 0xdc, "aesenclast" => 0xdd
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
);
return undef if (!defined($opcodelet{$1}));
rex(\@opcode,$3,$2);
- push @opcode,0x0f,0x38,$opcodelet{$1};
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ unshift @opcode,0x66;
return ".byte\t".join(',',@opcode);
}
return $line;
}
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
+ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;
-print $code;
+ print $_,"\n";
+}
close STDOUT;
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
new file mode 100755
index 000000000000..19b0433b3b1b
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -0,0 +1,1708 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# January 2013
+#
+# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
+# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
+# is that since AESNI-CBC encryption exhibits *very* low instruction-
+# level parallelism, interleaving it with another algorithm allows
+# better utilization of processor resources and hence better
+# performance. The SHA256 instruction sequences(*) are taken from
+# sha512-x86_64.pl and the AESNI code is woven into them. As SHA256
+# dominates execution time, stitch performance does not depend on AES
+# key length. Below are performance numbers in cycles per processed
+# byte, less is better, for standalone AESNI-CBC encrypt, standalone
+# SHA256, and the stitched subroutine:
+#
+# AES-128/-192/-256+SHA256 this(**)gain
+# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
+# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
+# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
+# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
+#
+# (*) there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
+# because the estimated gain was not high enough to justify the
+# effort;
+# (**) these are EVP-free results; results obtained with 'speed
+# -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
+
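+# A rough sketch of the mechanism, for orientation: each SHA256 round
+# body below is kept as a list of Perl snippet strings, and the
+# generator threads roughly one element of @aesni_cbc_block -- a
+# 16-step recipe for one CBC-encrypted 16-byte block -- into each
+# round's snippets, so the 64 rounds of one SHA256 block carry four
+# AES blocks (64 bytes of ciphertext per 64-byte SHA256 block) while
+# the AES unit's latency hides behind the SHA integer work.
+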
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=12);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=$avx; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$func="aesni_cbc_sha256_enc";
+$TABLE="K256";
+$SZ=4;
+@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
+ "%r8d","%r9d","%r10d","%r11d");
+($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+$rounds=64;
+
+########################################################################
+# void aesni_cbc_sha256_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA256_CTX *ctx,
+# const void *in0);
+($inp, $out, $len, $key, $ivp, $ctx, $in0) =
+("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$Tbl="%rbp";
+
+$_inp="16*$SZ+0*8(%rsp)";
+$_out="16*$SZ+1*8(%rsp)";
+$_end="16*$SZ+2*8(%rsp)";
+$_key="16*$SZ+3*8(%rsp)";
+$_ivp="16*$SZ+4*8(%rsp)";
+$_ctx="16*$SZ+5*8(%rsp)";
+$_in0="16*$SZ+6*8(%rsp)";
+$_rsp="16*$SZ+7*8(%rsp)";
+$framesz=16*$SZ+8*8;
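+# ($framesz: 16*$SZ = 64 bytes of X[]+K transfer area for the message
+# schedule, plus the eight 8-byte save slots laid out above = 128.)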
+
+$code=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+.globl $func
+.type $func,\@abi-omnipotent
+.align 16
+$func:
+___
+ if ($avx) {
+$code.=<<___;
+ lea OPENSSL_ia32cap_P(%rip),%r11
+ mov \$1,%eax
+ cmp \$0,`$win64?"%rcx":"%rdi"`
+ je .Lprobe
+ mov 0(%r11),%eax
+ mov 4(%r11),%r10
+___
+$code.=<<___ if ($shaext);
+ bt \$61,%r10 # check for SHA
+ jc ${func}_shaext
+___
+$code.=<<___;
+ mov %r10,%r11
+ shr \$32,%r11
+
+ test \$`1<<11`,%r10d # check for XOP
+ jnz ${func}_xop
+___
+$code.=<<___ if ($avx>1);
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
+ je ${func}_avx2
+___
+$code.=<<___;
+ and \$`1<<30`,%eax # mask "Intel CPU" bit
+ and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
+ or %eax,%r10d
+ cmp \$`1<<28|1<<9|1<<30`,%r10d
+ je ${func}_avx
+ ud2
+___
+ }
+$code.=<<___;
+ xor %eax,%eax
+ cmp \$0,`$win64?"%rcx":"%rdi"`
+ je .Lprobe
+ ud2
+.Lprobe:
+ ret
+.size $func,.-$func
+
+.align 64
+.type $TABLE,\@object
+$TABLE:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
+ .long 0,0,0,0, 0,0,0,0
+ .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+######################################################################
+# SIMD code paths
+#
+{{{
+($iv,$inout,$roundkey,$temp,
+ $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
+
+$aesni_cbc_idx=0;
+@aesni_cbc_block = (
+## &vmovdqu	($roundkey,"0x00-0x80($inp)");
+## &vmovdqu ($inout,($inp));
+## &mov ($_inp,$inp);
+
+ '&vpxor ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
+
+ '&vpxor ($inout,$inout,$iv);',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
+
+ '&vaesenclast ($temp,$inout,$roundkey);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
+
+ '&vpand ($iv,$temp,$mask10);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
+
+ '&vaesenclast ($temp,$inout,$roundkey);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
+
+ '&vpand ($temp,$temp,$mask12);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
+
+ '&vpor ($iv,$iv,$temp);'.
+ ' &vaesenclast ($temp,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
+
+## &mov ($inp,$_inp);
+## &mov ($out,$_out);
+## &vpand ($temp,$temp,$mask14);
+## &vpor ($iv,$iv,$temp);
+## &vmovdqu	("($out,$inp)",$iv);
+## &lea	($inp,"16($inp)");
+);
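+# One CBC-encrypted block is sliced into the 16 snippets above;
+# body_00_15() consumes one snippet per SHA-256 round through
+# $aesni_cbc_idx, so a whole AES block is retired every 16 rounds of
+# hashing. The vaesenclast/vpand/vpor dance keeps candidate final
+# results for 10-, 12- and 14-round schedules, and the $mask10/
+# $mask12/$mask14 constants (selected by key length at setup) pick
+# the right one without a branch. The commented-out fragments mark
+# work that is hoisted outside the round bodies.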
+
+my $a4=$T1;
+my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
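+# AUTOLOAD turns any otherwise-undefined &mnemonic(@args) call into a
+# line of assembly appended to $code, reversing the argument order to
+# AT&T convention and prefixing bare numbers with '$'. Hypothetical
+# example (shown for illustration only):
+#
+#	&ror($a0,2);	# appends "ror	$2,%r13d" to $code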
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+ '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
+ '&mov ($a,$a1)',
+ '&mov ($a4,$f)',
+
+ '&xor ($a0,$e)',
+ '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
+ '&xor ($a4,$g)', # f^g
+
+ '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
+ '&xor ($a1,$a)',
+ '&and ($a4,$e)', # (f^g)&e
+
+ @aesni_cbc_block[$aesni_cbc_idx++].
+ '&xor ($a0,$e)',
+ '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
+ '&mov ($a2,$a)',
+
+ '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
+ '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
+ '&xor ($a2,$b)', # a^b, b^c in next round
+
+ '&ror ($a0,$Sigma1[0])', # Sigma1(e)
+ '&add ($h,$a4)', # h+=Ch(e,f,g)
+ '&and ($a3,$a2)', # (b^c)&(a^b)
+
+ '&xor ($a1,$a)',
+ '&add ($h,$a0)', # h+=Sigma1(e)
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
+
+ '&add ($d,$h)', # d+=h
+ '&ror ($a1,$Sigma0[0])', # Sigma0(a)
+ '&add ($h,$a3)', # h+=Maj(a,b,c)
+
+ '&mov ($a0,$d)',
+ '&add ($a1,$h);'. # h+=Sigma0(a)
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+ );
+}
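+# Net effect of one body_00_15() pass is the textbook SHA-256 round:
+#
+#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
+#	d += T1
+#	h  = T1 + Sigma0(a) + Maj(a,b,c)
+#
+# with Maj(a,b,c) computed as Ch(a^b,c,b) so that this round's a^b can
+# be recycled as next round's b^c, and with one AES snippet from
+# @aesni_cbc_block spliced into the middle of every round.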
+
+if ($avx) {{
+######################################################################
+# XOP code path
+#
+$code.=<<___;
+.type ${func}_xop,\@function,6
+.align 64
+${func}_xop:
+.Lxop_shortcut:
+ mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`$framesz+$win64*16*10`,%rsp
+ and \$-64,%rsp # align stack frame
+
+ shl \$6,$len
+ sub $inp,$out # re-bias
+ sub $inp,$in0
+ add $inp,$len # end of input
+
+ #mov $inp,$_inp # saved later
+ mov $out,$_out
+ mov $len,$_end
+ #mov $key,$_key # remains resident in $inp register
+ mov $ivp,$_ivp
+ mov $ctx,$_ctx
+ mov $in0,$_in0
+ mov %r11,$_rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,`$framesz+16*0`(%rsp)
+ movaps %xmm7,`$framesz+16*1`(%rsp)
+ movaps %xmm8,`$framesz+16*2`(%rsp)
+ movaps %xmm9,`$framesz+16*3`(%rsp)
+ movaps %xmm10,`$framesz+16*4`(%rsp)
+ movaps %xmm11,`$framesz+16*5`(%rsp)
+ movaps %xmm12,`$framesz+16*6`(%rsp)
+ movaps %xmm13,`$framesz+16*7`(%rsp)
+ movaps %xmm14,`$framesz+16*8`(%rsp)
+ movaps %xmm15,`$framesz+16*9`(%rsp)
+___
+$code.=<<___;
+.Lprologue_xop:
+ vzeroall
+
+ mov $inp,%r12 # borrow $a4
+ lea 0x80($key),$inp # size optimization, reassign
+ lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
+ mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
+ mov $ctx,%r15 # borrow $a2
+ mov $in0,%rsi # borrow $a3
+ vmovdqu ($ivp),$iv # load IV
+ sub \$9,%r14
+
+ mov $SZ*0(%r15),$A
+ mov $SZ*1(%r15),$B
+ mov $SZ*2(%r15),$C
+ mov $SZ*3(%r15),$D
+ mov $SZ*4(%r15),$E
+ mov $SZ*5(%r15),$F
+ mov $SZ*6(%r15),$G
+ mov $SZ*7(%r15),$H
+
+ vmovdqa 0x00(%r13,%r14,8),$mask14
+ vmovdqa 0x10(%r13,%r14,8),$mask12
+ vmovdqa 0x20(%r13,%r14,8),$mask10
+ vmovdqu 0x00-0x80($inp),$roundkey
+ jmp .Lloop_xop
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+$code.=<<___;
+.align 16
+.Lloop_xop:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00(%rsi,%r12),@X[0]
+ vmovdqu 0x10(%rsi,%r12),@X[1]
+ vmovdqu 0x20(%rsi,%r12),@X[2]
+ vmovdqu 0x30(%rsi,%r12),@X[3]
+ vpshufb $t3,@X[0],@X[0]
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[1],@X[1]
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ vmovdqu (%r12),$inout # $a4
+ mov %r12,$_inp # $a4
+___
+sub XOP_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t0,$t0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t2,@X[3],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq ($t3,$t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t2,@X[0],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpslldq ($t3,$t3,8); # 22 instructions
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
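+# Each XOP_256_00_47() call schedules 4 new X[] values: the ~22 vector
+# instructions above are threaded through 4*26=104 scalar round-body
+# slots via the eval(shift(@insns)) pattern, so SIMD message expansion
+# and the scalar rounds (with the AES snippets inside them) issue side
+# by side.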
+
+ $aesni_cbc_idx=0;
+ for ($i=0,$j=0; $j<4; $j++) {
+ &XOP_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &mov ("%r12",$_inp); # borrow $a4
+ &vpand ($temp,$temp,$mask14);
+ &mov ("%r15",$_out); # borrow $a2
+ &vpor ($iv,$iv,$temp);
+ &vmovdqu ("(%r15,%r12)",$iv); # write output
+ &lea ("%r12","16(%r12)"); # inp++
+
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lxop_00_47");
+
+ &vmovdqu ($inout,"(%r12)");
+ &mov ($_inp,"%r12");
+
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+ }
+$code.=<<___;
+ mov $_inp,%r12 # borrow $a4
+ mov $_out,%r13 # borrow $a0
+ mov $_ctx,%r15 # borrow $a2
+ mov $_in0,%rsi # borrow $a3
+
+ vpand $mask14,$temp,$temp
+ mov $a1,$A
+ vpor $temp,$iv,$iv
+ vmovdqu $iv,(%r13,%r12) # write output
+ lea 16(%r12),%r12 # inp++
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ add $SZ*7(%r15),$H
+
+ cmp $_end,%r12
+
+ mov $A,$SZ*0(%r15)
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+
+ jb .Lloop_xop
+
+ mov $_ivp,$ivp
+ mov $_rsp,%rsi
+ vmovdqu $iv,($ivp) # output IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps `$framesz+16*0`(%rsp),%xmm6
+ movaps `$framesz+16*1`(%rsp),%xmm7
+ movaps `$framesz+16*2`(%rsp),%xmm8
+ movaps `$framesz+16*3`(%rsp),%xmm9
+ movaps `$framesz+16*4`(%rsp),%xmm10
+ movaps `$framesz+16*5`(%rsp),%xmm11
+ movaps `$framesz+16*6`(%rsp),%xmm12
+ movaps `$framesz+16*7`(%rsp),%xmm13
+ movaps `$framesz+16*8`(%rsp),%xmm14
+ movaps `$framesz+16*9`(%rsp),%xmm15
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_xop:
+ ret
+.size ${func}_xop,.-${func}_xop
+___
+######################################################################
+# AVX+shrd code path
+#
+local *ror = sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type ${func}_avx,\@function,6
+.align 64
+${func}_avx:
+.Lavx_shortcut:
+ mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`$framesz+$win64*16*10`,%rsp
+ and \$-64,%rsp # align stack frame
+
+ shl \$6,$len
+ sub $inp,$out # re-bias
+ sub $inp,$in0
+ add $inp,$len # end of input
+
+ #mov $inp,$_inp # saved later
+ mov $out,$_out
+ mov $len,$_end
+ #mov $key,$_key # remains resident in $inp register
+ mov $ivp,$_ivp
+ mov $ctx,$_ctx
+ mov $in0,$_in0
+ mov %r11,$_rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,`$framesz+16*0`(%rsp)
+ movaps %xmm7,`$framesz+16*1`(%rsp)
+ movaps %xmm8,`$framesz+16*2`(%rsp)
+ movaps %xmm9,`$framesz+16*3`(%rsp)
+ movaps %xmm10,`$framesz+16*4`(%rsp)
+ movaps %xmm11,`$framesz+16*5`(%rsp)
+ movaps %xmm12,`$framesz+16*6`(%rsp)
+ movaps %xmm13,`$framesz+16*7`(%rsp)
+ movaps %xmm14,`$framesz+16*8`(%rsp)
+ movaps %xmm15,`$framesz+16*9`(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx:
+ vzeroall
+
+ mov $inp,%r12 # borrow $a4
+ lea 0x80($key),$inp # size optimization, reassign
+ lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
+ mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
+ mov $ctx,%r15 # borrow $a2
+ mov $in0,%rsi # borrow $a3
+ vmovdqu ($ivp),$iv # load IV
+ sub \$9,%r14
+
+ mov $SZ*0(%r15),$A
+ mov $SZ*1(%r15),$B
+ mov $SZ*2(%r15),$C
+ mov $SZ*3(%r15),$D
+ mov $SZ*4(%r15),$E
+ mov $SZ*5(%r15),$F
+ mov $SZ*6(%r15),$G
+ mov $SZ*7(%r15),$H
+
+ vmovdqa 0x00(%r13,%r14,8),$mask14
+ vmovdqa 0x10(%r13,%r14,8),$mask12
+ vmovdqa 0x20(%r13,%r14,8),$mask10
+ vmovdqu 0x00-0x80($inp),$roundkey
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+$code.=<<___;
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00(%rsi,%r12),@X[0]
+ vmovdqu 0x10(%rsi,%r12),@X[1]
+ vmovdqu 0x20(%rsi,%r12),@X[2]
+ vmovdqu 0x30(%rsi,%r12),@X[3]
+ vpshufb $t3,@X[0],@X[0]
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[1],@X[1]
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ vmovdqu (%r12),$inout # $a4
+ mov %r12,$_inp # $a4
+___
+sub Xupdate_256_AVX () {
+ (
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
+ '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
+ '&vpsrld ($t2,$t0,$sigma0[0]);',
+ '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
+ '&vpsrld ($t3,$t0,$sigma0[2])',
+ '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
+ '&vpxor ($t0,$t3,$t2)',
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
+ '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t1)',
+ '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t2)',
+ '&vpsrld ($t2,$t3,$sigma1[2]);',
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
+ '&vpsrlq ($t3,$t3,$sigma1[0]);',
+ '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
+ '&vpxor ($t2,$t2,$t3);',
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
+ '&vpshufd ($t2,$t2,0b10000100)',
+ '&vpsrldq ($t2,$t2,8)',
+ '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
+ '&vpsrld ($t2,$t3,$sigma1[2])',
+ '&vpsrlq ($t3,$t3,$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3);',
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3)',
+ '&vpshufd ($t2,$t2,0b11101000)',
+ '&vpslldq ($t2,$t2,8)',
+ '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
+ );
+}
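+# Unlike the XOP path, plain AVX has no vector rotate, so Xupdate
+# synthesizes each sigma rotation from a shift pair plus XOR,
+# conceptually:
+#
+#	ROTR(x,n) = (x>>n) | (x<<(32-n))	# vpsrld/vpslld/vpxor
+#
+# sigma1() additionally goes through vpshufd/vpsrlq because it is only
+# needed on two lanes at a time (X[14..15], then X[16..17]).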
+
+sub AVX_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ foreach (Xupdate_256_AVX()) { # 29 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ $aesni_cbc_idx=0;
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &mov ("%r12",$_inp); # borrow $a4
+ &vpand ($temp,$temp,$mask14);
+ &mov ("%r15",$_out); # borrow $a2
+ &vpor ($iv,$iv,$temp);
+ &vmovdqu ("(%r15,%r12)",$iv); # write output
+ &lea ("%r12","16(%r12)"); # inp++
+
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lavx_00_47");
+
+ &vmovdqu ($inout,"(%r12)");
+ &mov ($_inp,"%r12");
+
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ }
+$code.=<<___;
+ mov $_inp,%r12 # borrow $a4
+ mov $_out,%r13 # borrow $a0
+ mov $_ctx,%r15 # borrow $a2
+ mov $_in0,%rsi # borrow $a3
+
+ vpand $mask14,$temp,$temp
+ mov $a1,$A
+ vpor $temp,$iv,$iv
+ vmovdqu $iv,(%r13,%r12) # write output
+ lea 16(%r12),%r12 # inp++
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ add $SZ*7(%r15),$H
+
+ cmp $_end,%r12
+
+ mov $A,$SZ*0(%r15)
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+ jb .Lloop_avx
+
+ mov $_ivp,$ivp
+ mov $_rsp,%rsi
+ vmovdqu $iv,($ivp) # output IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps `$framesz+16*0`(%rsp),%xmm6
+ movaps `$framesz+16*1`(%rsp),%xmm7
+ movaps `$framesz+16*2`(%rsp),%xmm8
+ movaps `$framesz+16*3`(%rsp),%xmm9
+ movaps `$framesz+16*4`(%rsp),%xmm10
+ movaps `$framesz+16*5`(%rsp),%xmm11
+ movaps `$framesz+16*6`(%rsp),%xmm12
+ movaps `$framesz+16*7`(%rsp),%xmm13
+ movaps `$framesz+16*8`(%rsp),%xmm14
+ movaps `$framesz+16*9`(%rsp),%xmm15
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size ${func}_avx,.-${func}_avx
+___
+
+if ($avx>1) {{
+######################################################################
+# AVX2+BMI code path
+#
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $PUSH8=8*2*$SZ;
+use integer;
+
+sub bodyx_00_15 () {
+ # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+ '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
+ '&and ($a4,$e)', # f&e
+ '&rorx ($a0,$e,$Sigma1[2])',
+ '&rorx ($a2,$e,$Sigma1[1])',
+
+ '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
+ '&lea ($h,"($h,$a4)")',
+ '&andn ($a4,$e,$g)', # ~e&g
+ '&xor ($a0,$a2)',
+
+ '&rorx ($a1,$e,$Sigma1[0])',
+ '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
+ '&xor ($a0,$a1)', # Sigma1(e)
+ '&mov ($a2,$a)',
+
+ '&rorx ($a4,$a,$Sigma0[2])',
+ '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
+ '&xor ($a2,$b)', # a^b, b^c in next round
+ '&rorx ($a1,$a,$Sigma0[1])',
+
+ '&rorx ($a0,$a,$Sigma0[0])',
+ '&lea ($d,"($d,$h)")', # d+=h
+ '&and ($a3,$a2)', # (b^c)&(a^b)
+ @aesni_cbc_block[$aesni_cbc_idx++].
+ '&xor ($a1,$a4)',
+
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
+ '&xor ($a1,$a0)', # Sigma0(a)
+ '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
+ '&mov ($a4,$e)', # copy of f in future
+
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+ );
+ # and at the finish one has to $a+=$a1
+}
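+# A sketch of what BMI2 buys here: rorx is a non-destructive rotate
+# with no flag update, so Sigma0/Sigma1 no longer need mov+ror pairs,
+# and Ch(e,f,g) is rebuilt as (e&f)+(~e&g) with andn. The "from the
+# past" trick defers h+=Sigma0(a) by one round (hence the $a+=$a1
+# fix-up noted above), shortening the dependency chain.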
+
+$code.=<<___;
+.type ${func}_avx2,\@function,6
+.align 64
+${func}_avx2:
+.Lavx2_shortcut:
+ mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
+ and \$-256*$SZ,%rsp # align stack frame
+ add \$`2*$SZ*($rounds-8)`,%rsp
+
+ shl \$6,$len
+ sub $inp,$out # re-bias
+ sub $inp,$in0
+ add $inp,$len # end of input
+
+ #mov $inp,$_inp # saved later
+ #mov $out,$_out # kept in $offload
+ mov $len,$_end
+ #mov $key,$_key # remains resident in $inp register
+ mov $ivp,$_ivp
+ mov $ctx,$_ctx
+ mov $in0,$_in0
+ mov %r11,$_rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,`$framesz+16*0`(%rsp)
+ movaps %xmm7,`$framesz+16*1`(%rsp)
+ movaps %xmm8,`$framesz+16*2`(%rsp)
+ movaps %xmm9,`$framesz+16*3`(%rsp)
+ movaps %xmm10,`$framesz+16*4`(%rsp)
+ movaps %xmm11,`$framesz+16*5`(%rsp)
+ movaps %xmm12,`$framesz+16*6`(%rsp)
+ movaps %xmm13,`$framesz+16*7`(%rsp)
+ movaps %xmm14,`$framesz+16*8`(%rsp)
+ movaps %xmm15,`$framesz+16*9`(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx2:
+ vzeroall
+
+ mov $inp,%r13 # borrow $a0
+ vpinsrq \$1,$out,$offload,$offload
+ lea 0x80($key),$inp # size optimization, reassign
+ lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
+ mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
+ mov $ctx,%r15 # borrow $a2
+ mov $in0,%rsi # borrow $a3
+ vmovdqu ($ivp),$iv # load IV
+ lea -9(%r14),%r14
+
+ vmovdqa 0x00(%r12,%r14,8),$mask14
+ vmovdqa 0x10(%r12,%r14,8),$mask12
+ vmovdqa 0x20(%r12,%r14,8),$mask10
+
+ sub \$-16*$SZ,%r13 # inp++, size optimization
+ mov $SZ*0(%r15),$A
+ lea (%rsi,%r13),%r12 # borrow $a0
+ mov $SZ*1(%r15),$B
+ cmp $len,%r13 # $_end
+ mov $SZ*2(%r15),$C
+ cmove %rsp,%r12 # next block or random data
+ mov $SZ*3(%r15),$D
+ mov $SZ*4(%r15),$E
+ mov $SZ*5(%r15),$F
+ mov $SZ*6(%r15),$G
+ mov $SZ*7(%r15),$H
+ vmovdqu 0x00-0x80($inp),$roundkey
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%ymm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
+
+$code.=<<___;
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
+ vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
+ vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
+ vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
+
+ vinserti128 \$1,(%r12),@X[0],@X[0]
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
+ vpshufb $t3,@X[0],@X[0]
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
+ vpshufb $t3,@X[1],@X[1]
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
+
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[2],@X[2]
+ lea -16*$SZ(%r13),%r13
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ xor $a1,$a1
+ vmovdqa $t1,0x20(%rsp)
+ lea -$PUSH8(%rsp),%rsp
+ mov $B,$a3
+ vmovdqa $t2,0x00(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x20(%rsp)
+ mov $F,$a4
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ vmovdqu (%r13),$inout
+ vpinsrq \$0,%r13,$offload,$offload
+___
+
+sub AVX2_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
+my $base = "+2*$PUSH8(%rsp)";
+
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
+ foreach (Xupdate_256_AVX()) { # 29 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
+}
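+# In the AVX2 path every %ymm register holds the schedule for two
+# 64-byte blocks: the vinserti128 loads above pair the current block
+# (low halves) with the following one (high halves). The main pass
+# consumes the low-lane X[]+K[i] sums; the high-lane sums stay on the
+# stack and are replayed by the .Lower_avx2 loop to finish the second
+# block.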
+ $aesni_cbc_idx=0;
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX2_256_00_47($j,\&bodyx_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &vmovq ("%r13",$offload); # borrow $a0
+ &vpextrq ("%r15",$offload,1); # borrow $a2
+ &vpand ($temp,$temp,$mask14);
+ &vpor ($iv,$iv,$temp);
+ &vmovdqu ("(%r15,%r13)",$iv); # write output
+ &lea ("%r13","16(%r13)"); # inp++
+
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
+ &cmpb (($SZ-1)."($Tbl)",0);
+ &jne (".Lavx2_00_47");
+
+ &vmovdqu ($inout,"(%r13)");
+ &vpinsrq ($offload,$offload,"%r13",0);
+
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
+ foreach(bodyx_00_15()) { eval; }
+ }
+ }
+$code.=<<___;
+ vpextrq \$1,$offload,%r12 # $_out, borrow $a4
+ vmovq $offload,%r13 # $_inp, borrow $a0
+ mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
+ add $a1,$A
+ lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
+
+ vpand $mask14,$temp,$temp
+ vpor $temp,$iv,$iv
+ vmovdqu $iv,(%r12,%r13) # write output
+ lea 16(%r13),%r13
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ add $SZ*7(%r15),$H
+
+ mov $A,$SZ*0(%r15)
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+
+ cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
+ je .Ldone_avx2
+
+ xor $a1,$a1
+ mov $B,$a3
+ mov $F,$a4
+ xor $C,$a3 # magic
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ vmovdqu (%r13),$inout
+ vpinsrq \$0,%r13,$offload,$offload
+___
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ my $base="+16($Tbl)";
+ foreach(bodyx_00_15()) { eval; }
+ &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
+ }
+$code.=<<___;
+ vmovq $offload,%r13 # borrow $a0
+ vpextrq \$1,$offload,%r15 # borrow $a2
+ vpand $mask14,$temp,$temp
+ vpor $temp,$iv,$iv
+ lea -$PUSH8($Tbl),$Tbl
+ vmovdqu $iv,(%r15,%r13) # write output
+ lea 16(%r13),%r13 # inp++
+ cmp %rsp,$Tbl
+ jae .Lower_avx2
+
+ mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
+ lea 16*$SZ(%r13),%r13
+ mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
+ add $a1,$A
+ lea `2*$SZ*($rounds-8)`(%rsp),%rsp
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ lea (%rsi,%r13),%r12
+ add $SZ*7(%r15),$H
+
+ cmp $_end,%r13
+
+ mov $A,$SZ*0(%r15)
+ cmove %rsp,%r12 # next block or stale data
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+
+ jbe .Loop_avx2
+ lea (%rsp),$Tbl
+
+.Ldone_avx2:
+ lea ($Tbl),%rsp
+ mov $_ivp,$ivp
+ mov $_rsp,%rsi
+ vmovdqu $iv,($ivp) # output IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps `$framesz+16*0`(%rsp),%xmm6
+ movaps `$framesz+16*1`(%rsp),%xmm7
+ movaps `$framesz+16*2`(%rsp),%xmm8
+ movaps `$framesz+16*3`(%rsp),%xmm9
+ movaps `$framesz+16*4`(%rsp),%xmm10
+ movaps `$framesz+16*5`(%rsp),%xmm11
+ movaps `$framesz+16*6`(%rsp),%xmm12
+ movaps `$framesz+16*7`(%rsp),%xmm13
+ movaps `$framesz+16*8`(%rsp),%xmm14
+ movaps `$framesz+16*9`(%rsp),%xmm15
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx2:
+ ret
+.size ${func}_avx2,.-${func}_avx2
+___
+}}
+}}
+{{
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my ($rounds,$Tbl)=("%r11d","%rbx");
+
+my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
+my @rndkey=("%xmm4","%xmm5");
+my $r=0;
+my $sn=0;
+
+my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
+my @MSG=map("%xmm$_",(10..13));
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16-112($key),$rndkey[1] # forward reference
+ nop
+___
+ } else {
+ $code.=<<___;
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
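+# The closure above doles out one aesenc per call: $r counts AES
+# rounds globally, $k==0 loads and xors the next input block, and
+# $k==9 handles the 128/192/256-bit tails (the cmp \$11,$rounds
+# branch), so ten calls retire exactly one CBC block. The -112 bias in
+# the key offsets matches the "lea 112($key),$key" size optimization
+# performed at setup. It is invoked between sha256rnds2 instructions
+# below to keep the AES and SHA units busy simultaneously.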
+
+if ($shaext) {
+my $Tbl="%rax";
+
+$code.=<<___;
+.type ${func}_shaext,\@function,6
+.align 32
+${func}_shaext:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+___
+$code.=<<___ if ($win64);
+ lea `-8-10*16`(%rsp),%rsp
+ movaps %xmm6,-8-10*16(%rax)
+ movaps %xmm7,-8-9*16(%rax)
+ movaps %xmm8,-8-8*16(%rax)
+ movaps %xmm9,-8-7*16(%rax)
+ movaps %xmm10,-8-6*16(%rax)
+ movaps %xmm11,-8-5*16(%rax)
+ movaps %xmm12,-8-4*16(%rax)
+ movaps %xmm13,-8-3*16(%rax)
+ movaps %xmm14,-8-2*16(%rax)
+ movaps %xmm15,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ lea K256+0x80(%rip),$Tbl
+ movdqu ($ctx),$ABEF # DCBA
+ movdqu 16($ctx),$CDGH # HGFE
+ movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
+
+ mov 240($key),$rounds
+ sub $in0,$out
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ lea 112($key),$key # size optimization
+
+ pshufd \$0x1b,$ABEF,$Wi # ABCD
+ pshufd \$0xb1,$ABEF,$ABEF # CDAB
+ pshufd \$0x1b,$CDGH,$CDGH # EFGH
+ movdqa $TMP,$BSWAP # offload
+ palignr \$8,$CDGH,$ABEF # ABEF
+ punpcklqdq $Wi,$CDGH # CDGH
+
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu ($inp),@MSG[0]
+ movdqu 0x10($inp),@MSG[1]
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $TMP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+
+ movdqa 0*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ pshufb $TMP,@MSG[1]
+ movdqa $CDGH,$CDGH_SAVE # offload
+ movdqa $ABEF,$ABEF_SAVE # offload
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 0-3
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 1*32-0x80($Tbl),$Wi
+ paddd @MSG[1],$Wi
+ pshufb $TMP,@MSG[2]
+ lea 0x40($inp),$inp
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 4-7
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 2*32-0x80($Tbl),$Wi
+ paddd @MSG[2],$Wi
+ pshufb $TMP,@MSG[3]
+ sha256msg1 @MSG[1],@MSG[0]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 8-11
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[3],$TMP
+ palignr \$4,@MSG[2],$TMP
+ paddd $TMP,@MSG[0]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 3*32-0x80($Tbl),$Wi
+ paddd @MSG[3],$Wi
+ sha256msg2 @MSG[3],@MSG[0]
+ sha256msg1 @MSG[2],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 12-15
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa @MSG[0],$TMP
+ palignr \$4,@MSG[3],$TMP
+ paddd $TMP,@MSG[1]
+ sha256rnds2 $CDGH,$ABEF
+___
+for($i=4;$i<16-3;$i++) {
+ &$aesenc() if (($r%10)==0);
+$code.=<<___;
+ movdqa $i*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ sha256msg2 @MSG[0],@MSG[1]
+ sha256msg1 @MSG[3],@MSG[2]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 16-19...
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[1],$TMP
+ palignr \$4,@MSG[0],$TMP
+ paddd $TMP,@MSG[2]
+___
+ &$aesenc();
+ &$aesenc() if ($r==19);
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+___
+ push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ movdqa 13*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ sha256msg2 @MSG[0],@MSG[1]
+ sha256msg1 @MSG[3],@MSG[2]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 52-55
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[1],$TMP
+ palignr \$4,@MSG[0],$TMP
+ paddd $TMP,@MSG[2]
+___
+ &$aesenc();
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 14*32-0x80($Tbl),$Wi
+ paddd @MSG[1],$Wi
+ sha256msg2 @MSG[1],@MSG[2]
+ movdqa $BSWAP,$TMP
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 56-59
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 15*32-0x80($Tbl),$Wi
+ paddd @MSG[2],$Wi
+___
+ &$aesenc();
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 60-63
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+ #pxor $CDGH,$rndkey0 # black magic
+___
+ while ($r<40) { &$aesenc(); } # remaining aesenc's
+$code.=<<___;
+ #xorps $CDGH,$rndkey0 # black magic
+ paddd $CDGH_SAVE,$CDGH
+ paddd $ABEF_SAVE,$ABEF
+
+ dec $len
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+ jnz .Loop_shaext
+
+ pshufd \$0xb1,$CDGH,$CDGH # DCHG
+ pshufd \$0x1b,$ABEF,$TMP # FEBA
+ pshufd \$0xb1,$ABEF,$ABEF # BAFE
+ punpckhqdq $CDGH,$ABEF # DCBA
+ palignr \$8,$TMP,$CDGH # HGFE
+
+ movups $iv,($ivp) # write IV
+ movdqu $ABEF,($ctx)
+ movdqu $CDGH,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps 0*16(%rsp),%xmm6
+ movaps 1*16(%rsp),%xmm7
+ movaps 2*16(%rsp),%xmm8
+ movaps 3*16(%rsp),%xmm9
+ movaps 4*16(%rsp),%xmm10
+ movaps 5*16(%rsp),%xmm11
+ movaps 6*16(%rsp),%xmm12
+ movaps 7*16(%rsp),%xmm13
+ movaps 8*16(%rsp),%xmm14
+ movaps 9*16(%rsp),%xmm15
+ lea 8+10*16(%rsp),%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size ${func}_shaext,.-${func}_shaext
+___
+}
+}}}}}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
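+#
+# se_handler makes the custom frames above unwindable on Win64: for a
+# fault inside a function body it recovers the original %rsp (from the
+# saved $_rsp slot, or by re-deriving the AVX2/shaext layouts), copies
+# %xmm6-%xmm15 and the callee-saved GPRs back into the CONTEXT record,
+# and then hands off to RtlVirtualUnwind.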
+if ($win64 && $avx) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha256_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lnot_in_shaext
+
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lin_prologue
+.Lnot_in_shaext:
+___
+$code.=<<___ if ($avx>1);
+ lea .Lavx2_shortcut(%rip),%r10
+ cmp %r10,%rbx # context->Rip<avx2_shortcut
+ jb .Lnot_in_avx2
+
+ and \$-256*$SZ,%rax
+ add \$`2*$SZ*($rounds-8)`,%rax
+.Lnot_in_avx2:
+___
+$code.=<<___;
+ mov %rax,%rsi # put aside Rsp
+ mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+ lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+ .rva .LSEH_begin_${func}_xop
+ .rva .LSEH_end_${func}_xop
+ .rva .LSEH_info_${func}_xop
+
+ .rva .LSEH_begin_${func}_avx
+ .rva .LSEH_end_${func}_avx
+ .rva .LSEH_info_${func}_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_${func}_avx2
+ .rva .LSEH_end_${func}_avx2
+ .rva .LSEH_info_${func}_avx2
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_${func}_shaext
+ .rva .LSEH_end_${func}_shaext
+ .rva .LSEH_info_${func}_shaext
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_${func}_xop:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
+
+.LSEH_info_${func}_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_${func}_avx2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_${func}_shaext:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ unshift @opcode,$rex|0x40 if($rex);
+}
+
+{
+ my %opcodelet = (
+ "sha256rnds2" => 0xcb,
+ "sha256msg1" => 0xcc,
+ "sha256msg2" => 0xcd );
+
+ sub sha256op38 {
+ my $instr = shift;
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+ }
+}
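+# sha256op38() lets the module assemble on toolchains that predate the
+# SHA extension mnemonics by hand-encoding them as NP 0F 38 /r byte
+# sequences. Worked example (register numbers per the regex captures):
+#
+#	sha256rnds2 %xmm0,%xmm2	->	.byte 0x0f,0x38,0xcb,0xd0
+#
+# where 0xd0 is ModR/M 11 010 000, i.e. dst=%xmm2, src=%xmm0.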
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 3dc345b585f6..f67df8cf13da 100755
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -43,6 +43,17 @@
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS ECB
+# Westmere 3.77/1.37 1.37 1.52 1.27
+# * Bridge 5.07/0.98 0.99 1.09 0.91
+# Haswell 4.44/0.80 0.97 1.03 0.72
+# Silvermont 5.77/3.56 3.67 4.03 3.46
+# Bulldozer 5.80/0.98 1.05 1.24 0.93
+
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
@@ -54,8 +65,11 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
-if ($PREFIX eq "aesni") { $movekey=*movups; }
-else { $movekey=*movups; }
+&external_label("OPENSSL_ia32cap_P");
+&static_label("key_const");
+
+if ($PREFIX eq "aesni") { $movekey=\&movups; }
+else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
@@ -170,7 +184,10 @@ sub aesni_generate1 # fully unrolled loop
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
+ &pxor ($rndkey0,$rndkey0); # clear register bank
+ &pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
+ &pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_encrypt");
@@ -186,7 +203,10 @@ sub aesni_generate1 # fully unrolled loop
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
+ &pxor ($rndkey0,$rndkey0); # clear register bank
+ &pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
+ &pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_decrypt");
@@ -196,37 +216,71 @@ sub aesni_generate1 # fully unrolled loop
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x, but it's unfeasible to accommodate it
-# in XMM registers addreassable in 32-bit mode and therefore 6x is
-# used instead...
+# This is why it originally made no sense to implement a 2x subroutine.
+# But times change, and it became worthwhile to spend an extra 192
+# bytes on a 2x subroutine on account of Atom Silvermont. For
+# processors that can schedule aes[enc|dec] every cycle, the optimal
+# interleave factor equals the corresponding instruction latency. 8x
+# is optimal for * Bridge, but it's unfeasible to accommodate such an
+# implementation in the XMM registers addressable in 32-bit mode, and
+# therefore a maximum of 6x is used instead...
+
+sub aesni_generate2
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt2");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shl ($rounds,4);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &add ($rounds,16);
+
+ &set_label("${p}2_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("${p}2_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt2");
+}
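+# The rewritten _aesni_*rypt{2,3,4,6} all share one addressing idiom
+# (sketched here; $rounds is the AESNI schedule's stored count, which
+# in this code is an odd value, e.g. 9 for AES-128):
+#
+#	&shl	($rounds,4);			# 16 bytes per round key
+#	&lea	($key,&DWP(32,$key,$rounds));	# bias to end of schedule
+#	&neg	($rounds);
+#	&add	($rounds,16);			# negative index, step 32
+#
+# Round keys are then fetched through ($key,$rounds), and the add that
+# advances the index doubles as the loop-termination test (jnz),
+# replacing the old dec/lea pair in the inner loop.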
sub aesni_generate3
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt3");
&$movekey ($rndkey0,&QWP(0,$key));
- &shr ($rounds,1);
+ &shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
- &lea ($key,&DWP(32,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &add ($rounds,16);
&set_label("${p}3_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
- &dec ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
- &$movekey ($rndkey1,&QWP(16,$key));
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
- &lea ($key,&DWP(32,$key));
eval"&aes${p} ($inout2,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}3_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
@@ -248,27 +302,29 @@ sub aesni_generate4
&function_begin_B("_aesni_${p}rypt4");
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
- &shr ($rounds,1);
- &lea ($key,&DWP(32,$key));
+ &shl ($rounds,4);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &data_byte (0x0f,0x1f,0x40,0x00);
+ &add ($rounds,16);
&set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
- &dec ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
- &$movekey ($rndkey1,&QWP(16,$key));
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
- &lea ($key,&DWP(32,$key));
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
@@ -289,43 +345,41 @@ sub aesni_generate6
&function_begin_B("_aesni_${p}rypt6");
&static_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey0,&QWP(0,$key));
- &shr ($rounds,1);
+ &shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
- &lea ($key,&DWP(32,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0); # pxor does better here
- eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout2,$rndkey0);
- eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout3,$rndkey0);
- &dec ($rounds);
- eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout4,$rndkey0);
- eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
- eval"&aes${p} ($inout4,$rndkey1)";
- &$movekey ($rndkey0,&QWP(0,$key));
- eval"&aes${p} ($inout5,$rndkey1)";
- &jmp (&label("_aesni_${p}rypt6_enter"));
+ &$movekey ($rndkey0,&QWP(0,$key,$rounds));
+ &add ($rounds,16);
+ &jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
- &dec ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
- &set_label("_aesni_${p}rypt6_enter",16);
- &$movekey ($rndkey1,&QWP(16,$key));
+ &set_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
- &lea ($key,&DWP(32,$key));
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
eval"&aes${p} ($inout4,$rndkey0)";
eval"&aes${p} ($inout5,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}6_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
@@ -343,6 +397,8 @@ sub aesni_generate6
&ret();
&function_end_B("_aesni_${p}rypt6");
}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -446,8 +502,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_two",16);
- &xorps ($inout2,$inout2);
- &call ("_aesni_encrypt3");
+ &call ("_aesni_encrypt2");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ecb_ret"));
@@ -547,8 +602,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_two",16);
- &xorps ($inout2,$inout2);
- &call ("_aesni_decrypt3");
+ &call ("_aesni_decrypt2");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ecb_ret"));
@@ -568,6 +622,14 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x30,$out),$inout3);
&set_label("ecb_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");
######################################################################
@@ -610,11 +672,13 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(24,"esp"),$key_);
&mov (&DWP(28,"esp"),$key_);
- &shr ($rounds,1);
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
&lea ($key_,&DWP(0,$key));
&movdqa ($inout3,&QWP(0,"esp"));
&movdqa ($inout0,$ivec);
- &mov ($rounds_,$rounds);
+ &lea ($key,&DWP(32,$key,$rounds));
+ &sub ($rounds_,$rounds);
&pshufb ($ivec,$inout3);
&set_label("ccm64_enc_outer");
@@ -625,38 +689,45 @@ if ($PREFIX eq "aesni") {
&xorps ($inout0,$rndkey0);
&$movekey ($rndkey1,&QWP(16,$key_));
&xorps ($rndkey0,$in0);
- &lea ($key,&DWP(32,$key_));
&xorps ($cmac,$rndkey0); # cmac^=inp
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(32,$key_));
&set_label("ccm64_enc2_loop");
&aesenc ($inout0,$rndkey1);
- &dec ($rounds);
&aesenc ($cmac,$rndkey1);
- &$movekey ($rndkey1,&QWP(16,$key));
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
&aesenc ($inout0,$rndkey0);
- &lea ($key,&DWP(32,$key));
&aesenc ($cmac,$rndkey0);
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("ccm64_enc2_loop"));
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&paddq ($ivec,&QWP(16,"esp"));
+ &dec ($len);
&aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0);
- &dec ($len);
&lea ($inp,&DWP(16,$inp));
&xorps ($in0,$inout0); # inp^=E(ivec)
&movdqa ($inout0,$ivec);
&movups (&QWP(0,$out),$in0); # save output
- &lea ($out,&DWP(16,$out));
&pshufb ($inout0,$inout3);
+ &lea ($out,&DWP(16,$out));
&jnz (&label("ccm64_enc_outer"));
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
+
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");
&function_begin("aesni_ccm64_decrypt_blocks");
@@ -700,15 +771,19 @@ if ($PREFIX eq "aesni") {
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
+ &shl ($rounds_,4);
+ &mov ($rounds,16);
&movups ($in0,&QWP(0,$inp)); # load inp
&paddq ($ivec,&QWP(16,"esp"));
&lea ($inp,&QWP(16,$inp));
+ &sub ($rounds,$rounds_);
+ &lea ($key,&DWP(32,$key_,$rounds_));
+ &mov ($rounds_,$rounds);
&jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_outer",16);
&xorps ($in0,$inout0); # inp ^= E(ivec)
&movdqa ($inout0,$ivec);
- &mov ($rounds,$rounds_);
&movups (&QWP(0,$out),$in0); # save output
&lea ($out,&DWP(16,$out));
&pshufb ($inout0,$inout3);
@@ -717,34 +792,33 @@ if ($PREFIX eq "aesni") {
&jz (&label("ccm64_dec_break"));
&$movekey ($rndkey0,&QWP(0,$key_));
- &shr ($rounds,1);
+ &mov ($rounds,$rounds_);
&$movekey ($rndkey1,&QWP(16,$key_));
&xorps ($in0,$rndkey0);
- &lea ($key,&DWP(32,$key_));
&xorps ($inout0,$rndkey0);
&xorps ($cmac,$in0); # cmac^=out
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(32,$key_));
&set_label("ccm64_dec2_loop");
&aesenc ($inout0,$rndkey1);
- &dec ($rounds);
&aesenc ($cmac,$rndkey1);
- &$movekey ($rndkey1,&QWP(16,$key));
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
&aesenc ($inout0,$rndkey0);
- &lea ($key,&DWP(32,$key));
&aesenc ($cmac,$rndkey0);
- &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("ccm64_dec2_loop"));
&movups ($in0,&QWP(0,$inp)); # load inp
&paddq ($ivec,&QWP(16,"esp"));
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
- &lea ($inp,&QWP(16,$inp));
&aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0);
+ &lea ($inp,&QWP(16,$inp));
&jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_break",16);
+ &mov ($rounds,&DWP(240,$key_));
&mov ($key,$key_);
if ($inline)
{ &aesni_inline_generate1("enc",$cmac,$in0); }
@@ -754,6 +828,15 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
+
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}
@@ -763,7 +846,7 @@ if ($PREFIX eq "aesni") {
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
-# does not update *ivec! (see engine/eng_aesni.c for details)
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
# 0 pshufb mask
@@ -810,66 +893,61 @@ if ($PREFIX eq "aesni") {
# compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
- &pxor ($rndkey1,$rndkey1);
&pxor ($rndkey0,$rndkey0);
+ &pxor ($rndkey1,$rndkey1);
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
- &pinsrd ($rndkey1,$rounds_,0);
+ &pinsrd ($rndkey0,$rounds_,0);
&lea ($key_,&DWP(3,$rounds_));
- &pinsrd ($rndkey0,$key_,0);
+ &pinsrd ($rndkey1,$key_,0);
&inc ($rounds_);
- &pinsrd ($rndkey1,$rounds_,1);
+ &pinsrd ($rndkey0,$rounds_,1);
&inc ($key_);
- &pinsrd ($rndkey0,$key_,1);
+ &pinsrd ($rndkey1,$key_,1);
&inc ($rounds_);
- &pinsrd ($rndkey1,$rounds_,2);
+ &pinsrd ($rndkey0,$rounds_,2);
&inc ($key_);
- &pinsrd ($rndkey0,$key_,2);
- &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
- &pshufb ($rndkey1,$inout0); # byte swap
- &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pinsrd ($rndkey1,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
+ &movdqu ($inout4,&QWP(0,$key)); # key[0]
+ &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
- &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
- &pshufd ($inout1,$rndkey1,2<<6);
+ &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$rndkey0,2<<6);
&cmp ($len,6);
&jb (&label("ctr32_tail"));
- &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
- &shr ($rounds,1);
+ &pxor ($inout5,$inout4); # counter-less ivec^key[0]
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
&mov ($key_,$key); # backup $key
- &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($rounds_,$rounds); # backup twisted $rounds
+ &lea ($key,&DWP(32,$key,$rounds));
&sub ($len,6);
&jmp (&label("ctr32_loop6"));
&set_label("ctr32_loop6",16);
- &pshufd ($inout2,$rndkey1,1<<6);
- &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
- &pshufd ($inout3,$rndkey0,3<<6);
- &por ($inout0,$rndkey1); # merge counter-less ivec
- &pshufd ($inout4,$rndkey0,2<<6);
- &por ($inout1,$rndkey1);
- &pshufd ($inout5,$rndkey0,1<<6);
- &por ($inout2,$rndkey1);
- &por ($inout3,$rndkey1);
- &por ($inout4,$rndkey1);
- &por ($inout5,$rndkey1);
-
- # inlining _aesni_encrypt6's prologue gives ~4% improvement...
- &$movekey ($rndkey0,&QWP(0,$key_));
- &$movekey ($rndkey1,&QWP(16,$key_));
- &lea ($key,&DWP(32,$key_));
- &dec ($rounds);
- &pxor ($inout0,$rndkey0);
+ # inlining _aesni_encrypt6's prologue gives ~6% improvement...
+ &pshufd ($inout2,$rndkey0,1<<6);
+ &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey1,3<<6);
+ &pxor ($inout0,$rndkey0); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey1,2<<6);
&pxor ($inout1,$rndkey0);
- &aesenc ($inout0,$rndkey1);
+ &pshufd ($inout5,$rndkey1,1<<6);
+ &$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout2,$rndkey0);
- &aesenc ($inout1,$rndkey1);
&pxor ($inout3,$rndkey0);
- &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout0,$rndkey1);
&pxor ($inout4,$rndkey0);
- &aesenc ($inout3,$rndkey1);
&pxor ($inout5,$rndkey0);
+ &aesenc ($inout1,$rndkey1);
+ &$movekey ($rndkey0,&QWP(32,$key_));
+ &mov ($rounds,$rounds_);
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
- &$movekey ($rndkey0,&QWP(0,$key));
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
@@ -882,12 +960,12 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0);
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
&xorps ($inout2,$rndkey1);
- &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
+ &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
- &paddd ($rndkey1,$rndkey0); # 1st triplet increment
- &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
+ &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
+ &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
&movups ($inout1,&QWP(0x30,$inp));
@@ -895,44 +973,44 @@ if ($PREFIX eq "aesni") {
&xorps ($inout3,$inout1);
&movups ($inout1,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
- &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
- &pshufb ($rndkey1,$inout0); # byte swap
+ &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
&xorps ($inout4,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&xorps ($inout5,$inout1);
- &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
- &pshufb ($rndkey0,$inout0); # byte swap
+ &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
&movups (&QWP(0x40,$out),$inout4);
- &pshufd ($inout0,$rndkey1,3<<6);
+ &pshufd ($inout0,$rndkey0,3<<6);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
- &mov ($rounds,$rounds_);
- &pshufd ($inout1,$rndkey1,2<<6);
+ &pshufd ($inout1,$rndkey0,2<<6);
&sub ($len,6);
&jnc (&label("ctr32_loop6"));
&add ($len,6);
&jz (&label("ctr32_ret"));
+ &movdqu ($inout5,&QWP(0,$key_));
&mov ($key,$key_);
- &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
- &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
+ &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
+ &mov ($rounds,&DWP(240,$key_)); # restore $rounds
&set_label("ctr32_tail");
&por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
- &pshufd ($inout2,$rndkey1,1<<6);
+ &pshufd ($inout2,$rndkey0,1<<6);
&por ($inout1,$inout5);
&je (&label("ctr32_two"));
- &pshufd ($inout3,$rndkey0,3<<6);
+ &pshufd ($inout3,$rndkey1,3<<6);
&por ($inout2,$inout5);
&cmp ($len,4);
&jb (&label("ctr32_three"));
- &pshufd ($inout4,$rndkey0,2<<6);
+ &pshufd ($inout4,$rndkey1,2<<6);
&por ($inout3,$inout5);
&je (&label("ctr32_four"));
@@ -970,7 +1048,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
- &call ("_aesni_encrypt3");
+ &call ("_aesni_encrypt2");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
@@ -1008,6 +1086,17 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(48,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(64,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
@@ -1057,8 +1146,10 @@ if ($PREFIX eq "aesni") {
&sub ($len,16*6);
&jc (&label("xts_enc_short"));
- &shr ($rounds,1);
- &mov ($rounds_,$rounds);
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &sub ($rounds_,$rounds);
+ &lea ($key,&DWP(32,$key,$rounds));
&jmp (&label("xts_enc_loop6"));
&set_label("xts_enc_loop6",16);
@@ -1080,6 +1171,7 @@ if ($PREFIX eq "aesni") {
&pxor ($inout5,$tweak);
# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &mov ($rounds,$rounds_); # restore $rounds
&movdqu ($inout1,&QWP(16*1,$inp));
&xorps ($inout0,$rndkey0); # input^=rndkey[0]
&movdqu ($inout2,&QWP(16*2,$inp));
@@ -1096,19 +1188,17 @@ if ($PREFIX eq "aesni") {
&pxor ($inout5,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key_));
- &lea ($key,&DWP(32,$key_));
&pxor ($inout1,&QWP(16*1,"esp"));
- &aesenc ($inout0,$rndkey1);
&pxor ($inout2,&QWP(16*2,"esp"));
- &aesenc ($inout1,$rndkey1);
+ &aesenc ($inout0,$rndkey1);
&pxor ($inout3,&QWP(16*3,"esp"));
- &dec ($rounds);
- &aesenc ($inout2,$rndkey1);
&pxor ($inout4,&QWP(16*4,"esp"));
- &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout1,$rndkey1);
&pxor ($inout5,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key_));
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
- &$movekey ($rndkey0,&QWP(0,$key));
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
@@ -1135,13 +1225,12 @@ if ($PREFIX eq "aesni") {
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
- &mov ($rounds,$rounds_); # restore $rounds
&pxor ($tweak,$twres);
&sub ($len,16*6);
&jnc (&label("xts_enc_loop6"));
- &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($rounds,&DWP(240,$key_)); # restore $rounds
&mov ($key,$key_); # restore $key
&mov ($rounds_,$rounds);
@@ -1241,9 +1330,8 @@ if ($PREFIX eq "aesni") {
&lea ($inp,&DWP(16*2,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
- &xorps ($inout2,$inout2);
- &call ("_aesni_encrypt3");
+ &call ("_aesni_encrypt2");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
@@ -1350,6 +1438,20 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-16,$out),$inout0); # write output
&set_label("xts_enc_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm3","xmm3");
+ &movdqa (&QWP(16*1,"esp"),"xmm0");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(16*2,"esp"),"xmm0");
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(16*3,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(16*4,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");
@@ -1399,8 +1501,10 @@ if ($PREFIX eq "aesni") {
&sub ($len,16*6);
&jc (&label("xts_dec_short"));
- &shr ($rounds,1);
- &mov ($rounds_,$rounds);
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &sub ($rounds_,$rounds);
+ &lea ($key,&DWP(32,$key,$rounds));
&jmp (&label("xts_dec_loop6"));
&set_label("xts_dec_loop6",16);
@@ -1422,6 +1526,7 @@ if ($PREFIX eq "aesni") {
&pxor ($inout5,$tweak);
# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &mov ($rounds,$rounds_);
&movdqu ($inout1,&QWP(16*1,$inp));
&xorps ($inout0,$rndkey0); # input^=rndkey[0]
&movdqu ($inout2,&QWP(16*2,$inp));
@@ -1438,19 +1543,17 @@ if ($PREFIX eq "aesni") {
&pxor ($inout5,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key_));
- &lea ($key,&DWP(32,$key_));
&pxor ($inout1,&QWP(16*1,"esp"));
- &aesdec ($inout0,$rndkey1);
&pxor ($inout2,&QWP(16*2,"esp"));
- &aesdec ($inout1,$rndkey1);
+ &aesdec ($inout0,$rndkey1);
&pxor ($inout3,&QWP(16*3,"esp"));
- &dec ($rounds);
- &aesdec ($inout2,$rndkey1);
&pxor ($inout4,&QWP(16*4,"esp"));
- &aesdec ($inout3,$rndkey1);
+ &aesdec ($inout1,$rndkey1);
&pxor ($inout5,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key_));
+ &aesdec ($inout2,$rndkey1);
+ &aesdec ($inout3,$rndkey1);
&aesdec ($inout4,$rndkey1);
- &$movekey ($rndkey0,&QWP(0,$key));
&aesdec ($inout5,$rndkey1);
&call (&label("_aesni_decrypt6_enter"));
@@ -1477,13 +1580,12 @@ if ($PREFIX eq "aesni") {
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
- &mov ($rounds,$rounds_); # restore $rounds
&pxor ($tweak,$twres);
&sub ($len,16*6);
&jnc (&label("xts_dec_loop6"));
- &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($rounds,&DWP(240,$key_)); # restore $rounds
&mov ($key,$key_); # restore $key
&mov ($rounds_,$rounds);
@@ -1584,7 +1686,7 @@ if ($PREFIX eq "aesni") {
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
- &call ("_aesni_decrypt3");
+ &call ("_aesni_decrypt2");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
@@ -1712,6 +1814,20 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm3","xmm3");
+ &movdqa (&QWP(16*1,"esp"),"xmm0");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(16*2,"esp"),"xmm0");
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(16*3,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(16*4,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
@@ -1764,6 +1880,7 @@ if ($PREFIX eq "aesni") {
&add ($len,16);
&jnz (&label("cbc_enc_tail"));
&movaps ($ivec,$inout0);
+ &pxor ($inout0,$inout0);
&jmp (&label("cbc_ret"));
&set_label("cbc_enc_tail");
@@ -1816,7 +1933,7 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&lea ($inp,&DWP(0x60,$inp));
&movups (&QWP(0x20,$out),$inout2);
- &mov ($rounds,$rounds_) # restore $rounds
+ &mov ($rounds,$rounds_); # restore $rounds
&movups (&QWP(0x30,$out),$inout3);
&mov ($key,$key_); # restore $key
&movups (&QWP(0x40,$out),$inout4);
@@ -1827,7 +1944,7 @@ if ($PREFIX eq "aesni") {
&movaps ($inout0,$inout5);
&movaps ($ivec,$rndkey0);
&add ($len,0x50);
- &jle (&label("cbc_dec_tail_collected"));
+ &jle (&label("cbc_dec_clear_tail_collected"));
&movups (&QWP(0,$out),$inout0);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
@@ -1866,10 +1983,14 @@ if ($PREFIX eq "aesni") {
&xorps ($inout4,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
&movups (&QWP(0x20,$out),$inout2);
+ &pxor ($inout2,$inout2);
&movups (&QWP(0x30,$out),$inout3);
+ &pxor ($inout3,$inout3);
&lea ($out,&DWP(0x40,$out));
&movaps ($inout0,$inout4);
+ &pxor ($inout4,$inout4);
&sub ($len,0x50);
&jmp (&label("cbc_dec_tail_collected"));
@@ -1884,12 +2005,12 @@ if ($PREFIX eq "aesni") {
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_two",16);
- &xorps ($inout2,$inout2);
- &call ("_aesni_decrypt3");
+ &call ("_aesni_decrypt2");
&xorps ($inout0,$ivec);
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout1);
+ &pxor ($inout1,$inout1);
&lea ($out,&DWP(0x10,$out));
&movaps ($ivec,$in1);
&sub ($len,0x20);
@@ -1902,7 +2023,9 @@ if ($PREFIX eq "aesni") {
&xorps ($inout2,$in1);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout2);
+ &pxor ($inout2,$inout2);
&movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
&lea ($out,&DWP(0x20,$out));
&movups ($ivec,&QWP(0x20,$inp));
&sub ($len,0x30);
@@ -1918,29 +2041,44 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
+ &pxor ($inout2,$inout2);
&lea ($out,&DWP(0x30,$out));
&movaps ($inout0,$inout3);
+ &pxor ($inout3,$inout3);
&sub ($len,0x40);
+ &jmp (&label("cbc_dec_tail_collected"));
+&set_label("cbc_dec_clear_tail_collected",16);
+ &pxor ($inout1,$inout1);
+ &pxor ($inout2,$inout2);
+ &pxor ($inout3,$inout3);
+ &pxor ($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
&and ($len,15);
&jnz (&label("cbc_dec_tail_partial"));
&movups (&QWP(0,$out),$inout0);
+ &pxor ($rndkey0,$rndkey0);
&jmp (&label("cbc_ret"));
&set_label("cbc_dec_tail_partial",16);
&movaps (&QWP(0,"esp"),$inout0);
+ &pxor ($rndkey0,$rndkey0);
&mov ("ecx",16);
&mov ($inp,"esp");
&sub ("ecx",$len);
&data_word(0xA4F3F689); # rep movsb
+ &movdqa (&QWP(0,"esp"),$inout0);
&set_label("cbc_ret");
&mov ("esp",&DWP(16,"esp")); # pull original %esp
&mov ($key_,&wparam(4));
+ &pxor ($inout0,$inout0);
+ &pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,$key_),$ivec); # output IV
+ &pxor ($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
@@ -1957,14 +2095,24 @@ if ($PREFIX eq "aesni") {
# $round rounds
&function_begin_B("_aesni_set_encrypt_key");
+ &push ("ebp");
+ &push ("ebx");
&test ("eax","eax");
&jz (&label("bad_pointer"));
&test ($key,$key);
&jz (&label("bad_pointer"));
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
+
+ &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
+ &and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
&cmp ($rounds,192);
@@ -1973,6 +2121,9 @@ if ($PREFIX eq "aesni") {
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
+ &cmp ("ebp",1<<28);
+ &je (&label("10rounds_alt"));
+
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
@@ -1997,8 +2148,8 @@ if ($PREFIX eq "aesni") {
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
- &xor ("eax","eax");
- &ret();
+
+ &jmp (&label("good_key"));
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
@@ -2012,10 +2163,78 @@ if ($PREFIX eq "aesni") {
&xorps ("xmm0","xmm1");
&ret();
+&set_label("10rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &mov ($rounds,8);
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &movdqa ("xmm2","xmm0");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key128");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(16,$key));
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm0");
+ &movdqa ("xmm2","xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key128"));
+
+ &movdqa ("xmm4",&QWP(0x30,"ebx"));
+
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &movdqa ("xmm2","xmm0");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(16,$key),"xmm0");
+
+ &mov ($rounds,9);
+ &mov (&DWP(96,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &cmp ("ebp",1<<28);
+ &je (&label("12rounds_alt"));
+
&mov ($rounds,11);
- &$movekey (&QWP(-16,$key),"xmm0") # round 0
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
&call (&label("key_192a_cold"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
@@ -2034,8 +2253,8 @@ if ($PREFIX eq "aesni") {
&call (&label("key_192b"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(48,$key),$rounds);
- &xor ("eax","eax");
- &ret();
+
+ &jmp (&label("good_key"));
&set_label("key_192a",16);
&$movekey (&QWP(0,$key),"xmm0");
@@ -2065,10 +2284,52 @@ if ($PREFIX eq "aesni") {
&lea ($key,&DWP(32,$key));
&jmp (&label("key_192b_warm"));
+&set_label("12rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x10,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,8);
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key192");
+ &movq (&QWP(0,$key),"xmm2");
+ &movdqa ("xmm1","xmm2");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(24,$key));
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+
+ &pshufd ("xmm3","xmm0",0xff);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+
+ &pxor ("xmm0","xmm2");
+ &pxor ("xmm2","xmm3");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key192"));
+
+ &mov ($rounds,11);
+ &mov (&DWP(32,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
- &mov ($rounds,13);
&lea ($key,&DWP(16,$key));
+ &cmp ("ebp",1<<28);
+ &je (&label("14rounds_alt"));
+
+ &mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
@@ -2100,7 +2361,8 @@ if ($PREFIX eq "aesni") {
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
- &ret();
+
+ &jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
@@ -2126,11 +2388,77 @@ if ($PREFIX eq "aesni") {
&xorps ("xmm2","xmm1");
&ret();
+&set_label("14rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,7);
+ &movdqu (&QWP(-32,$key),"xmm0");
+ &movdqa ("xmm1","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm2");
+
+&set_label("loop_key256");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+ &pslld ("xmm4",1);
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &dec ($rounds);
+ &jz (&label("done_key256"));
+
+ &pshufd ("xmm2","xmm0",0xff);
+ &pxor ("xmm3","xmm3");
+ &aesenclast ("xmm2","xmm3");
+
+ &movdqa ("xmm3","xmm1")
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm1","xmm3");
+
+ &pxor ("xmm2","xmm1");
+ &movdqu (&QWP(16,$key),"xmm2");
+ &lea ($key,&DWP(32,$key));
+ &movdqa ("xmm1","xmm2");
+ &jmp (&label("loop_key256"));
+
+&set_label("done_key256");
+ &mov ($rounds,13);
+ &mov (&DWP(16,$key),$rounds);
+
+&set_label("good_key");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &xor ("eax","eax");
+ &pop ("ebx");
+ &pop ("ebp");
+ &ret ();
+
&set_label("bad_pointer",4);
&mov ("eax",-1);
+ &pop ("ebx");
+ &pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
+ &pxor ("xmm0","xmm0");
&mov ("eax",-2);
+ &pop ("ebx");
+ &pop ("ebp");
&ret ();
&function_end_B("_aesni_set_encrypt_key");
@@ -2152,7 +2480,7 @@ if ($PREFIX eq "aesni") {
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&mov ($key,&wparam(2));
- &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key
+ &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
&test ("eax","eax");
&jnz (&label("dec_key_ret"));
&lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
@@ -2180,10 +2508,18 @@ if ($PREFIX eq "aesni") {
&aesimc ("xmm0","xmm0");
&$movekey (&QWP(0,$key),"xmm0");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
&xor ("eax","eax"); # return success
&set_label("dec_key_ret");
&ret ();
&function_end_B("${PREFIX}_set_decrypt_key");
+
+&set_label("key_const",64);
+&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
+&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
+&data_word(1,1,1,1);
+&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
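For reference, each pslldq/pxor chain in the loop_key128/192/256 paths above computes the running xor that folds one expanded word forward across the other three; the *_alt paths differ only in deriving SubWord(RotWord(w))^rcon with pshufb+aesenclast (driven by the key_const masks) instead of aeskeygenassist, apparently because the latter is comparatively slow on the AVX-capable, non-XOP processors the 1<<28|1<<11 test selects. A minimal sketch of one AES-128 expansion step in the conventional aeskeygenassist formulation (expand_step128 is a hypothetical helper, not part of the module):

    #include <wmmintrin.h>   /* AES-NI intrinsics; compile with -maes */

    static __m128i expand_step128(__m128i key, __m128i t)
    {
        t = _mm_shuffle_epi32(t, 0xff);                   /* broadcast SubWord(RotWord(w3))^rcon */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* the pslldq/pxor chain: */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* fold the xor forward   */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* across all four words  */
        return _mm_xor_si128(key, t);
    }

    /* usage: rk1 = expand_step128(rk0, _mm_aeskeygenassist_si128(rk0, 0x01)); */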
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index c9270dfddc18..25ca574f6a2f 100755
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -129,8 +129,8 @@
#
# Further data for other parallelizable modes:
#
-# CBC decrypt 1.16 0.93 0.93
-# CTR 1.14 0.91 n/a
+# CBC decrypt 1.16 0.93 0.74
+# CTR 1.14 0.91 0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -153,10 +153,25 @@
# April 2011
#
-# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
-# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS ECB
+# Westmere 3.77/1.25 1.25 1.25 1.26
+# * Bridge 5.07/0.74 0.75 0.90 0.85
+# Haswell 4.44/0.63 0.63 0.73 0.63
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*)
+# Bulldozer 5.77/0.70 0.72 0.90 0.70
+#
+# (*) Atom Silvermont ECB result is suboptimal because of penalties
+# incurred by operations on %xmm8-15. As ECB is not considered
+# critical, nothing was done to mitigate the problem.
+
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)
@@ -180,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
+$code.=".extern OPENSSL_ia32cap_P\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -247,7 +263,10 @@ ${PREFIX}_encrypt:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($out) # output
+ pxor $inout0,$inout0
ret
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
@@ -260,7 +279,10 @@ ${PREFIX}_decrypt:
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($out) # output
+ pxor $inout0,$inout0
ret
.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
@@ -272,10 +294,49 @@ ___
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x...
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont's account. For processors that
+# can schedule aes[enc|dec] every cycle the optimal interleave factor
+# equals the corresponding instruction latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs...
+
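As an illustration of that arithmetic, a minimal hypothetical C sketch (conventional nr = 10/12/14 round counts, not the module's own rounds encoding): the two aesenc chains are data-independent, so their latencies overlap, and with latency L cycles at one issue per cycle an L-way interleave saturates the unit.

    #include <wmmintrin.h>

    static void encrypt2(__m128i *b0, __m128i *b1, const __m128i *rk, int nr)
    {
        __m128i x = _mm_xor_si128(*b0, rk[0]);   /* round-0 whitening */
        __m128i y = _mm_xor_si128(*b1, rk[0]);
        for (int r = 1; r < nr; r++) {
            x = _mm_aesenc_si128(x, rk[r]);      /* independent chains      */
            y = _mm_aesenc_si128(y, rk[r]);      /* overlap in the pipeline */
        }
        *b0 = _mm_aesenclast_si128(x, rk[nr]);
        *b1 = _mm_aesenclast_si128(y, rk[nr]);
    }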
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+.L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop2
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ ret
+.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
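Note that the loop above carries no separate round counter: $rounds is turned into a negative byte offset from the end of the key schedule, so the add that advances the offset also produces the flags jnz tests. A single-block sketch of the same idiom under the usual rk[0..nr] convention (the generated code walks two keys per iteration and biases the count differently):

    #include <wmmintrin.h>
    #include <stdint.h>

    static __m128i encrypt1(__m128i b, const __m128i *rk, int nr)
    {
        const __m128i *end = rk + nr;   /* last round key */
        intptr_t off = 1 - nr;          /* rk[1], indexed back from the end */
        b = _mm_xor_si128(b, rk[0]);
        do
            b = _mm_aesenc_si128(b, end[off]);
        while (++off != 0);             /* the increment is also the loop test */
        return _mm_aesenclast_si128(b, *end);
    }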
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
@@ -285,25 +346,26 @@ $code.=<<___;
.align 16
_aesni_${dir}rypt3:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
xorps $rndkey0,$inout2
- $movkey ($key),$rndkey0
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
.L${dir}_loop3:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
aes${dir} $rndkey1,$inout2
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop3
aes${dir} $rndkey1,$inout0
@@ -329,28 +391,30 @@ $code.=<<___;
.align 16
_aesni_${dir}rypt4:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
xorps $rndkey0,$inout2
xorps $rndkey0,$inout3
- $movkey ($key),$rndkey0
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ .byte 0x0f,0x1f,0x00
+ add \$16,%rax
.L${dir}_loop4:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop4
aes${dir} $rndkey1,$inout0
@@ -374,43 +438,40 @@ $code.=<<___;
.align 16
_aesni_${dir}rypt6:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
pxor $rndkey0,$inout1
- aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout3
- aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout4
- aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout5
- dec $rounds
- aes${dir} $rndkey1,$inout4
- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
jmp .L${dir}_loop6_enter
.align 16
.L${dir}_loop6:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
aes${dir} $rndkey1,$inout2
+.L${dir}_loop6_enter:
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
-.L${dir}_loop6_enter: # happens to be 16-byte aligned
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
aes${dir} $rndkey0,$inout4
aes${dir} $rndkey0,$inout5
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop6
aes${dir} $rndkey1,$inout0
@@ -438,52 +499,46 @@ $code.=<<___;
.align 16
_aesni_${dir}rypt8:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
- aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout2
- aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout3
- aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout4
- aes${dir} $rndkey1,$inout3
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout5
- dec $rounds
- aes${dir} $rndkey1,$inout4
pxor $rndkey0,$inout6
- aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout7
- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout6
- aes${dir} $rndkey1,$inout7
- $movkey 16($key),$rndkey1
- jmp .L${dir}_loop8_enter
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
.align 16
.L${dir}_loop8:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
+.L${dir}_loop8_inner:
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
aes${dir} $rndkey1,$inout6
aes${dir} $rndkey1,$inout7
- $movkey 16($key),$rndkey1
-.L${dir}_loop8_enter: # happens to be 16-byte aligned
+.L${dir}_loop8_enter:
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
aes${dir} $rndkey0,$inout4
aes${dir} $rndkey0,$inout5
aes${dir} $rndkey0,$inout6
aes${dir} $rndkey0,$inout7
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop8
aes${dir} $rndkey1,$inout0
@@ -506,6 +561,8 @@ _aesni_${dir}rypt8:
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -528,15 +585,15 @@ aesni_ecb_encrypt:
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
+ movaps %xmm6,(%rsp) # offload $inout4..7
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
- and \$-16,$len
- jz .Lecb_ret
+ and \$-16,$len # if ($len<16)
+ jz .Lecb_ret # return
mov 240($key),$rounds # key->rounds
$movkey ($key),$rndkey0
@@ -545,10 +602,10 @@ $code.=<<___;
test %r8d,%r8d # 5th argument
jz .Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
- cmp \$0x80,$len
- jb .Lecb_enc_tail
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_enc_tail # short input
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
@@ -556,14 +613,14 @@ $code.=<<___;
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
- sub \$0x80,$len
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
@@ -578,17 +635,17 @@ $code.=<<___;
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
+ lea 0x80($out),$out # $out+=8*16
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
+ lea 0x80($inp),$inp # $inp+=8*16
.Lecb_enc_loop8_enter:
call _aesni_encrypt8
sub \$0x80,$len
- jnc .Lecb_enc_loop8
+ jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
@@ -598,11 +655,11 @@ $code.=<<___;
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
- lea 0x80($out),$out
- add \$0x80,$len
- jz .Lecb_ret
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
-.Lecb_enc_tail:
+.Lecb_enc_tail: # $len is less than 8*16
movups ($inp),$inout0
cmp \$0x20,$len
jb .Lecb_enc_one
@@ -619,8 +676,9 @@ $code.=<<___;
movups 0x50($inp),$inout5
je .Lecb_enc_six
movdqu 0x60($inp),$inout6
+ xorps $inout7,$inout7
call _aesni_encrypt8
- movups $inout0,($out)
+ movups $inout0,($out) # store 7 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -633,26 +691,25 @@ $code.=<<___;
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
- movups $inout0,($out)
+ call _aesni_encrypt2
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,0x10($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_three:
call _aesni_encrypt3
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_four:
call _aesni_encrypt4
- movups $inout0,($out)
+ movups $inout0,($out) # store 4 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -661,7 +718,7 @@ $code.=<<___;
.Lecb_enc_five:
xorps $inout5,$inout5
call _aesni_encrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 5 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -670,7 +727,7 @@ $code.=<<___;
.align 16
.Lecb_enc_six:
call _aesni_encrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 6 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
@@ -680,10 +737,10 @@ $code.=<<___;
#--------------------------- ECB DECRYPT ------------------------------#
.align 16
.Lecb_decrypt:
- cmp \$0x80,$len
- jb .Lecb_dec_tail
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_dec_tail # short input
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
@@ -691,14 +748,14 @@ $code.=<<___;
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
- sub \$0x80,$len
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
@@ -713,30 +770,38 @@ $code.=<<___;
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
+ lea 0x80($out),$out # $out+=8*16
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
+ lea 0x80($inp),$inp # $inp+=8*16
.Lecb_dec_loop8_enter:
call _aesni_decrypt8
$movkey ($key_),$rndkey0
sub \$0x80,$len
- jnc .Lecb_dec_loop8
+ jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
+ pxor $inout0,$inout0 # clear register bank
mov $key_,$key # restore $key
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
movups $inout6,0x60($out)
+ pxor $inout6,$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
- add \$0x80,$len
- jz .Lecb_ret
+ pxor $inout7,$inout7
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
.Lecb_dec_tail:
movups ($inp),$inout0
@@ -756,71 +821,107 @@ $code.=<<___;
je .Lecb_dec_six
movups 0x60($inp),$inout6
$movkey ($key),$rndkey0
+ xorps $inout7,$inout7
call _aesni_decrypt8
- movups $inout0,($out)
+ movups $inout0,($out) # store 7 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
+ pxor $inout0,$inout0 # clear register bank
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
- xorps $inout2,$inout2
- call _aesni_decrypt3
- movups $inout0,($out)
+ call _aesni_decrypt2
+ movups $inout0,($out) # store 2 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
- movups $inout0,($out)
+ movups $inout0,($out) # store 4 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 5 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 6 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
.Lecb_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lecb_enc_ret:
___
@@ -842,7 +943,8 @@ ___
{
my $cmac="%r9"; # 6th argument
-my $increment="%xmm6";
+my $increment="%xmm9";
+my $iv="%xmm6";
my $bswap_mask="%xmm7";
$code.=<<___;
@@ -853,10 +955,10 @@ aesni_ccm64_encrypt_blocks:
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
.Lccm64_enc_body:
___
$code.=<<___;
@@ -865,58 +967,68 @@ $code.=<<___;
movdqa .Lincrement64(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask
- shr \$1,$rounds
+ shl \$4,$rounds
+ mov \$16,$rnds_
lea 0($key),$key_
movdqu ($cmac),$inout1
movdqa $iv,$inout0
- mov $rounds,$rnds_
+ lea 32($key,$rounds),$key # end of key schedule
pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
$movkey ($key_),$rndkey0
- mov $rnds_,$rounds
+ mov %r10,%rax
movups ($inp),$in0 # load inp
xorps $rndkey0,$inout0 # counter
$movkey 16($key_),$rndkey1
xorps $in0,$rndkey0
- lea 32($key_),$key
xorps $rndkey0,$inout1 # cmac^=inp
- $movkey ($key),$rndkey0
+ $movkey 32($key_),$rndkey0
.Lccm64_enc2_loop:
aesenc $rndkey1,$inout0
- dec $rounds
aesenc $rndkey1,$inout1
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aesenc $rndkey0,$inout0
- lea 32($key),$key
aesenc $rndkey0,$inout1
- $movkey 0($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .Lccm64_enc2_loop
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
- dec $len
lea 16($inp),$inp
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
movups $in0,($out) # save output
- lea 16($out),$out
pshufb $bswap_mask,$inout0
- jnz .Lccm64_enc_outer
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
- movups $inout1,($cmac)
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
@@ -933,10 +1045,10 @@ aesni_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+	movaps	%xmm8,0x20(%rsp)	# $in0
+ movaps %xmm9,0x30(%rsp) # $increment
.Lccm64_dec_body:
___
$code.=<<___;
@@ -953,63 +1065,77 @@ $code.=<<___;
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
movups ($inp),$in0 # load inp
paddq $increment,$iv
- lea 16($inp),$inp
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
- mov $rnds_,$rounds
movups $in0,($out) # save output
- lea 16($out),$out
+ lea 16($out),$out # $out+=16
pshufb $bswap_mask,$inout0
- sub \$1,$len
- jz .Lccm64_dec_break
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
$movkey ($key_),$rndkey0
- shr \$1,$rounds
+ mov %r10,%rax
$movkey 16($key_),$rndkey1
xorps $rndkey0,$in0
- lea 32($key_),$key
xorps $rndkey0,$inout0
xorps $in0,$inout1 # cmac^=out
- $movkey ($key),$rndkey0
-
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+.align 16
.Lccm64_dec2_loop:
aesenc $rndkey1,$inout0
- dec $rounds
aesenc $rndkey1,$inout1
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aesenc $rndkey0,$inout0
- lea 32($key),$key
aesenc $rndkey0,$inout1
- $movkey 0($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .Lccm64_dec2_loop
- movups ($inp),$in0 # load inp
+ movups ($inp),$in0 # load input
paddq $increment,$iv
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- lea 16($inp),$inp
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
#xorps $in0,$inout1 # cmac^=out
+ mov 240($key_),$rounds
___
&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
- movups $inout1,($cmac)
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
@@ -1024,285 +1150,567 @@ ___
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
-# does not update *ivec! (see engine/eng_aesni.c for details)
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
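Because round 0 is a plain xor with rk[0], the eight counter blocks kept on the stack can be stored already xor-ed with rk[0]; stepping the counter then means patching only the big-endian counter dword, itself xor-ed with the matching dword of the round-0 key (the `mov 12($key),$key0` below). A minimal sketch, where bswap32 and ctr_blocks are hypothetical helpers:

    #include <smmintrin.h>   /* SSE4.1: pinsrd/pextrd */
    #include <stdint.h>
    #include <stddef.h>

    static inline uint32_t bswap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0xff00u) | ((v << 8) & 0xff0000u) | (v << 24);
    }

    /* Fill n counter blocks pre-whitened with rk0; only lane 3,
       the big-endian counter word, changes from block to block. */
    static void ctr_blocks(__m128i *blk, const uint8_t iv[16], __m128i rk0, size_t n)
    {
        __m128i  base = _mm_xor_si128(_mm_loadu_si128((const __m128i *)iv), rk0);
        uint32_t k0   = (uint32_t)_mm_extract_epi32(rk0, 3);
        uint32_t ctr  = bswap32((uint32_t)_mm_extract_epi32(base, 3) ^ k0);
        for (size_t i = 0; i < n; i++)
            blk[i] = _mm_insert_epi32(base, (int)(bswap32(ctr + (uint32_t)i) ^ k0), 3);
    }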
{
-my $reserved = $win64?0:-0x28;
-my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
-my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
-my $bswap_mask="%xmm15";
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("${key_}d","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+___
+ &aesni_generate1("enc",$key,"%edx");
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0xc8(%rsp),%rsp
- movaps %xmm6,0x20(%rsp)
- movaps %xmm7,0x30(%rsp)
- movaps %xmm8,0x40(%rsp)
- movaps %xmm9,0x50(%rsp)
- movaps %xmm10,0x60(%rsp)
- movaps %xmm11,0x70(%rsp)
- movaps %xmm12,0x80(%rsp)
- movaps %xmm13,0x90(%rsp)
- movaps %xmm14,0xa0(%rsp)
- movaps %xmm15,0xb0(%rsp)
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lctr32_body:
___
$code.=<<___;
- cmp \$1,$len
- je .Lctr32_one_shortcut
-
- movdqu ($ivp),$ivec
- movdqa .Lbswap_mask(%rip),$bswap_mask
- xor $rounds,$rounds
- pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
- pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
-
+ lea -8(%rax),%rbp
+
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
mov 240($key),$rounds # key->rounds
- bswap $rnds_
- pxor $iv0,$iv0 # vector of 3 32-bit counters
- pxor $iv1,$iv1 # vector of 3 32-bit counters
- pinsrd \$0,$rnds_,$iv0
- lea 3($rnds_),$key_
- pinsrd \$0,$key_,$iv1
- inc $rnds_
- pinsrd \$1,$rnds_,$iv0
- inc $key_
- pinsrd \$1,$key_,$iv1
- inc $rnds_
- pinsrd \$2,$rnds_,$iv0
- inc $key_
- pinsrd \$2,$key_,$iv1
- movdqa $iv0,$reserved(%rsp)
- pshufb $bswap_mask,$iv0
- movdqa $iv1,`$reserved+0x10`(%rsp)
- pshufb $bswap_mask,$iv1
-
- pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
- pshufd \$`2<<6`,$iv0,$inout1
- pshufd \$`1<<6`,$iv0,$inout2
- cmp \$6,$len
- jb .Lctr32_tail
- shr \$1,$rounds
- mov $key,$key_ # backup $key
- mov $rounds,$rnds_ # backup $rounds
- sub \$6,$len
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout0 # merge counter-less ivec
- $movkey ($key_),$rndkey0
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout1
- $movkey 16($key_),$rndkey1
- pshufd \$`1<<6`,$iv1,$inout5
- por $ivec,$inout2
- por $ivec,$inout3
- xorps $rndkey0,$inout0
- por $ivec,$inout4
- por $ivec,$inout5
-
- # inline _aesni_encrypt6 and interleave last rounds
- # with own code...
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
- pxor $rndkey0,$inout1
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr # next counter value
+ movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
- movdqa .Lincrement32(%rip),$iv1
- pxor $rndkey0,$inout3
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
- movdqa $reserved(%rsp),$iv0
- pxor $rndkey0,$inout4
+ xor $key0,%r9d
+ nop
aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
+ mov %r9d,0x00+12(%rsp) # store next counter value
+ lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
- jmp .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ xor $key0,%r9d
+ .byte 0x66,0x90
+ aesenc $rndkeyx,$inout2
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ movdqu 0x00($inp),$in0 # start loading input
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ cmp \$11,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ jb .Lctr32_enc_done
+
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lctr32_enc_loop6_enter:
- $movkey 16($key),$rndkey1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
- $movkey ($key),$rndkey0
- jnz .Lctr32_enc_loop6
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
aesenc $rndkey1,$inout0
- paddd $iv1,$iv0 # increment counter vector
aesenc $rndkey1,$inout1
- paddd `$reserved+0x10`(%rsp),$iv1
aesenc $rndkey1,$inout2
- movdqa $iv0,$reserved(%rsp) # save counter vector
aesenc $rndkey1,$inout3
- movdqa $iv1,`$reserved+0x10`(%rsp)
aesenc $rndkey1,$inout4
- pshufb $bswap_mask,$iv0 # byte swap
aesenc $rndkey1,$inout5
- pshufb $bswap_mask,$iv1
-
- aesenclast $rndkey0,$inout0
- movups ($inp),$in0 # load input
- aesenclast $rndkey0,$inout1
- movups 0x10($inp),$in1
- aesenclast $rndkey0,$inout2
- movups 0x20($inp),$in2
- aesenclast $rndkey0,$inout3
- movups 0x30($inp),$in3
- aesenclast $rndkey0,$inout4
- movups 0x40($inp),$rndkey1
- aesenclast $rndkey0,$inout5
- movups 0x50($inp),$rndkey0
- lea 0x60($inp),$inp
-
- xorps $inout0,$in0 # xor
- pshufd \$`3<<6`,$iv0,$inout0
- xorps $inout1,$in1
- pshufd \$`2<<6`,$iv0,$inout1
- movups $in0,($out) # store output
- xorps $inout2,$in2
- pshufd \$`1<<6`,$iv0,$inout2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- xorps $inout5,$rndkey0
- movups $rndkey1,0x40($out)
- movups $rndkey0,0x50($out)
- lea 0x60($out),$out
- mov $rnds_,$rounds
- sub \$6,$len
- jnc .Lctr32_loop6
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
- add \$6,$len
- jz .Lctr32_done
- mov $key_,$key # restore $key
- lea 1($rounds,$rounds),$rounds # restore original value
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0 # input^=round[last]
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
+ lea 0x80($inp),$inp # $inp+=8*16
+
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
+ movdqu 0x70-0x80($inp),$in0
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ aesenclast $in3,$inout3
+ movdqa 0x10(%rsp),$in2
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ aesenclast $in5,$inout5
+ movdqa 0x30(%rsp),$in4
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+	$movkey		0x10-0x80($key),$rndkey1	# real 1st-round key
+ aesenclast $in0,$inout7
+
+ movups $inout0,($out) # store 8 output blocks
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+
+ sub \$8,$len
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
+
+	add	\$8,$len		# restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+ lea -0x80($key),$key
.Lctr32_tail:
- por $ivec,$inout0
- movups ($inp),$in0
- cmp \$2,$len
- jb .Lctr32_one
-
- por $ivec,$inout1
- movups 0x10($inp),$in1
- je .Lctr32_two
-
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout2
- movups 0x20($inp),$in2
+ # note that at this point $inout0..5 are populated with
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
cmp \$4,$len
- jb .Lctr32_three
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout3
- movups 0x30($inp),$in3
- je .Lctr32_four
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
- por $ivec,$inout4
- xorps $inout5,$inout5
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+	lea	32-16($key,$rounds),$key	# prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done # $len was 5, stop store
- call _aesni_encrypt6
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done # $len was 6, stop store
- movups 0x40($inp),$rndkey1
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- movups $rndkey1,0x40($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_one_shortcut:
- movups ($ivp),$inout0
- movups ($inp),$in0
- mov 240($key),$rounds # key->rounds
-.Lctr32_one:
-___
- &aesni_generate1("enc",$key,$rounds);
-$code.=<<___;
- xorps $inout0,$in0
- movups $in0,($out)
- jmp .Lctr32_done
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done # $len was 7, stop store
-.align 16
-.Lctr32_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- movups $in1,0x10($out)
- jmp .Lctr32_done
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ movups ($inp),$in0 # load input
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
+ movups 0x20($inp),$in2
+ movups 0x30($inp),$in3
-.align 16
-.Lctr32_three:
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- movups $in2,0x20($out)
- jmp .Lctr32_done
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done # $len was 4, stop store
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ cmp \$2,$len
+ jb .Lctr32_done # $len was 1, stop store
-.align 16
-.Lctr32_four:
- call _aesni_encrypt4
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- movups $in3,0x30($out)
+ movups 0x10($inp),$in1
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done # $len was 2, stop store
+
+ movups 0x20($inp),$in2
+ xorps $in2,$inout2
+ movups $inout2,0x20($out) # $len was 3, stop store
.Lctr32_done:
+	xorps	%xmm0,%xmm0			# clear register bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps 0x20(%rsp),%xmm6
- movaps 0x30(%rsp),%xmm7
- movaps 0x40(%rsp),%xmm8
- movaps 0x50(%rsp),%xmm9
- movaps 0x60(%rsp),%xmm10
- movaps 0x70(%rsp),%xmm11
- movaps 0x80(%rsp),%xmm12
- movaps 0x90(%rsp),%xmm13
- movaps 0xa0(%rsp),%xmm14
- movaps 0xb0(%rsp),%xmm15
- lea 0xc8(%rsp),%rsp
-.Lctr32_ret:
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+ movaps %xmm0,0x70(%rsp)
___
$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
@@ -1317,252 +1725,297 @@ ___
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
-my $frame_size = 0x68 + ($win64?160:0);
+my $frame_size = 0x70 + ($win64?160:0);
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea -$frame_size(%rsp),%rsp
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lxts_enc_body:
___
$code.=<<___;
- movups ($ivp),@tweak[5] # load clear-text tweak
+ lea -8(%rax),%rbp
+ movups ($ivp),$inout0 # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
- &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
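+	# the update itself multiplies the tweak by x in GF(2^128) with
+	# reduction polynomial x^128+x^7+x^2+x+1; in scalar C the
+	# psrad/pand/paddq sequence below amounts to (a sketch, tweak
+	# stored little-endian with t[1] the high 64 bits,
+	# .Lxts_magic = 0x87):
+	#
+	#	void xts_double(uint64_t t[2])
+	#	{
+	#		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
+	#		t[1] = (t[1] << 1) | (t[0] >> 63);
+	#		t[0] = (t[0] << 1) ^ carry;
+	#	}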
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
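+	# the pre-xors above rely on AES opening with AddRoundKey(round[0])
+	# and closing with AddRoundKey(round[last]): feeding aesenc the
+	# value input^tweak^round[0] and aesenclast the operand
+	# round[last]^tweak yields E_k(input^tweak)^tweak directly, so
+	# both XTS xors cost nothing extra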
sub \$16*6,$len
- jc .Lxts_enc_short
+ jc .Lxts_enc_short # if $len-=6*16 borrowed
- shr \$1,$rounds
- sub \$1,$rounds
- mov $rounds,$rnds_
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0 # input^=tweak^round[0]
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_encrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesenc $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesenc $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
aesenc $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
aesenc $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_enc_loop6_enter
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
-.align 16
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lxts_enc_loop6_enter:
- $movkey 16($key),$rndkey1
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
- $movkey ($key),$rndkey0
+ $movkey -80($key,%rax),$rndkey0
jnz .Lxts_enc_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesenc $rndkey1,$inout5
- $movkey 16($key),$rndkey1
+ $movkey -64($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
aesenc $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
aesenc $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
aesenc $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
aesenc $rndkey0,$inout5
- $movkey 32($key),$rndkey0
+ $movkey -48($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
aesenc $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesenclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesenclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesenclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesenclast $rndkey0,$inout3
- aesenclast $rndkey0,$inout4
- aesenclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesenclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
pxor $twres,@tweak[5]
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
- lea `16*6`($out),$out
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
- jnc .Lxts_enc_grandloop
+ jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
- lea 3($rounds,$rounds),$rounds # restore original value
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
mov $key_,$key # restore $key
- mov $rounds,$rnds_ # backup $rounds
+ shr \$4,$rounds # restore original value
.Lxts_enc_short:
- add \$16*6,$len
- jz .Lxts_enc_done
+	# at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_enc_done # done if ($len==0)
+ pxor $rndkey0,@tweak[1]
cmp \$0x20,$len
- jb .Lxts_enc_one
- je .Lxts_enc_two
+ jb .Lxts_enc_one # $len is 1*16
+ pxor $rndkey0,@tweak[2]
+ je .Lxts_enc_two # $len is 2*16
+ pxor $rndkey0,@tweak[3]
cmp \$0x40,$len
- jb .Lxts_enc_three
- je .Lxts_enc_four
-
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
+ jb .Lxts_enc_three # $len is 3*16
+ pxor $rndkey0,@tweak[4]
+ je .Lxts_enc_four # $len is 4*16
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
- lea 16*5($inp),$inp
+ lea 16*5($inp),$inp # $inp+=5*16
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
+ pxor $inout5,$inout5
call _aesni_encrypt6
@@ -1570,46 +2023,46 @@ $code.=<<___;
movdqa @tweak[5],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
- movdqu $inout0,($out)
+ movdqu $inout0,($out) # store 5 output blocks
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout2,16*2($out)
movdqu $inout3,16*3($out)
movdqu $inout4,16*4($out)
- lea 16*5($out),$out
+ lea 16*5($out),$out # $out+=5*16
jmp .Lxts_enc_done
.align 16
.Lxts_enc_one:
movups ($inp),$inout0
- lea 16*1($inp),$inp
+ lea 16*1($inp),$inp # inp+=1*16
xorps @tweak[0],$inout0
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
- movups $inout0,($out)
- lea 16*1($out),$out
+ movups $inout0,($out) # store one output block
+ lea 16*1($out),$out # $out+=1*16
jmp .Lxts_enc_done
.align 16
.Lxts_enc_two:
movups ($inp),$inout0
movups 16($inp),$inout1
- lea 32($inp),$inp
+ lea 32($inp),$inp # $inp+=2*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
- movups $inout0,($out)
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,16*1($out)
- lea 16*2($out),$out
+ lea 16*2($out),$out # $out+=2*16
jmp .Lxts_enc_done
.align 16
@@ -1617,7 +2070,7 @@ $code.=<<___;
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
- lea 16*3($inp),$inp
+ lea 16*3($inp),$inp # $inp+=3*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
@@ -1628,10 +2081,10 @@ $code.=<<___;
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,16*1($out)
movups $inout2,16*2($out)
- lea 16*3($out),$out
+ lea 16*3($out),$out # $out+=3*16
jmp .Lxts_enc_done
.align 16
@@ -1641,28 +2094,28 @@ $code.=<<___;
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
- lea 16*4($inp),$inp
+ lea 16*4($inp),$inp # $inp+=4*16
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_encrypt4
- xorps @tweak[0],$inout0
- movdqa @tweak[5],@tweak[0]
- xorps @tweak[1],$inout1
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
- lea 16*4($out),$out
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
jmp .Lxts_enc_done
.align 16
.Lxts_enc_done:
- and \$15,$len_
+ and \$15,$len_ # see if $len%16 is 0
jz .Lxts_enc_ret
mov $len_,$len
@@ -1689,21 +2142,64 @@ $code.=<<___;
movups $inout0,-16($out)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ lea (%rbp),%rsp
+ pop %rbp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1714,28 +2210,32 @@ $code.=<<___;
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea -$frame_size(%rsp),%rsp
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lxts_dec_body:
___
$code.=<<___;
- movups ($ivp),@tweak[5] # load clear-text tweak
+ lea -8(%rax),%rbp
+ movups ($ivp),$inout0 # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
- &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
xor %eax,%eax # if ($len%16) len-=16;
test \$15,$len
@@ -1743,219 +2243,256 @@ $code.=<<___;
shl \$4,%rax
sub %rax,$len
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
___
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
- jc .Lxts_dec_short
+ jc .Lxts_dec_short # if $len-=6*16 borrowed
- shr \$1,$rounds
- sub \$1,$rounds
- mov $rounds,$rnds_
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+	pxor	@tweak[0],$inout0	# input^=tweak^round[0]
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_decrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesdec $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
aesdec $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+	movdqa	@tweak[0],`16*0`(%rsp)	# put aside tweaks^round[last]
aesdec $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_dec_loop6_enter
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
-.align 16
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
- dec $rounds
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
-.Lxts_dec_loop6_enter:
- $movkey 16($key),$rndkey1
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
- lea 32($key),$key
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
- $movkey ($key),$rndkey0
+ $movkey -80($key,%rax),$rndkey0
jnz .Lxts_dec_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesdec $rndkey1,$inout5
- $movkey 16($key),$rndkey1
+ $movkey -64($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
aesdec $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
aesdec $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
aesdec $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
aesdec $rndkey0,$inout5
- $movkey 32($key),$rndkey0
+ $movkey -48($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
aesdec $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesdeclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesdeclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesdeclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesdeclast $rndkey0,$inout3
- aesdeclast $rndkey0,$inout4
- aesdeclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesdeclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
pxor $twres,@tweak[5]
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
- lea `16*6`($out),$out
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
- jnc .Lxts_dec_grandloop
+ jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
- lea 3($rounds,$rounds),$rounds # restore original value
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
mov $key_,$key # restore $key
- mov $rounds,$rnds_ # backup $rounds
+ shr \$4,$rounds # restore original value
.Lxts_dec_short:
- add \$16*6,$len
- jz .Lxts_dec_done
+	# at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_dec_done # done if ($len==0)
+ pxor $rndkey0,@tweak[2]
cmp \$0x20,$len
- jb .Lxts_dec_one
- je .Lxts_dec_two
+ jb .Lxts_dec_one # $len is 1*16
+ pxor $rndkey0,@tweak[3]
+ je .Lxts_dec_two # $len is 2*16
+ pxor $rndkey0,@tweak[4]
cmp \$0x40,$len
- jb .Lxts_dec_three
- je .Lxts_dec_four
-
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
+ jb .Lxts_dec_three # $len is 3*16
+ je .Lxts_dec_four # $len is 4*16
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
- lea 16*5($inp),$inp
+ lea 16*5($inp),$inp # $inp+=5*16
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
@@ -1965,7 +2502,7 @@ $code.=<<___;
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
- movdqu $inout0,($out)
+ movdqu $inout0,($out) # store 5 output blocks
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
@@ -1974,7 +2511,7 @@ $code.=<<___;
movdqu $inout3,16*3($out)
pcmpgtd @tweak[5],$twtmp
movdqu $inout4,16*4($out)
- lea 16*5($out),$out
+ lea 16*5($out),$out # $out+=5*16
pshufd \$0x13,$twtmp,@tweak[1] # $twres
and \$15,$len_
jz .Lxts_dec_ret
@@ -1988,35 +2525,35 @@ $code.=<<___;
.align 16
.Lxts_dec_one:
movups ($inp),$inout0
- lea 16*1($inp),$inp
+ lea 16*1($inp),$inp # $inp+=1*16
xorps @tweak[0],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
movdqa @tweak[2],@tweak[1]
- lea 16*1($out),$out
+ lea 16*1($out),$out # $out+=1*16
jmp .Lxts_dec_done
.align 16
.Lxts_dec_two:
movups ($inp),$inout0
movups 16($inp),$inout1
- lea 32($inp),$inp
+ lea 32($inp),$inp # $inp+=2*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[3],@tweak[1]
- movups $inout0,($out)
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,16*1($out)
- lea 16*2($out),$out
+ lea 16*2($out),$out # $out+=2*16
jmp .Lxts_dec_done
.align 16
@@ -2024,7 +2561,7 @@ $code.=<<___;
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
- lea 16*3($inp),$inp
+ lea 16*3($inp),$inp # $inp+=3*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
@@ -2034,50 +2571,44 @@ $code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
- movdqa @tweak[5],@tweak[1]
+ movdqa @tweak[4],@tweak[1]
xorps @tweak[2],$inout2
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,16*1($out)
movups $inout2,16*2($out)
- lea 16*3($out),$out
+ lea 16*3($out),$out # $out+=3*16
jmp .Lxts_dec_done
.align 16
.Lxts_dec_four:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movups ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movups 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
- lea 16*4($inp),$inp
+ lea 16*4($inp),$inp # $inp+=4*16
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_decrypt4
- xorps @tweak[0],$inout0
+ pxor @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
- xorps @tweak[1],$inout1
+ pxor @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
- lea 16*4($out),$out
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
jmp .Lxts_dec_done
.align 16
.Lxts_dec_done:
- and \$15,$len_
+ and \$15,$len_ # see if $len%16 is 0
jz .Lxts_dec_ret
.Lxts_dec_done2:
mov $len_,$len
@@ -2115,21 +2646,64 @@ $code.=<<___;
movups $inout0,($out)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ lea (%rbp),%rsp
+ pop %rbp
.Lxts_dec_epilogue:
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2141,7 +2715,10 @@ ___
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
-my $reserved = $win64?0x40:-0x18; # used in decrypt
+my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+my $inp_=$key_;
+
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
@@ -2177,7 +2754,11 @@ $code.=<<___;
jnc .Lcbc_enc_loop
add \$16,$len
jnz .Lcbc_enc_tail
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($ivp)
+ pxor $inout0,$inout0
+ pxor $inout1,$inout1
jmp .Lcbc_ret
.Lcbc_enc_tail:
@@ -2197,283 +2778,483 @@ $code.=<<___;
#--------------------------- CBC DECRYPT ------------------------------#
.align 16
.Lcbc_decrypt:
+ cmp \$16,$len
+ jne .Lcbc_decrypt_bulk
+
+	# handle a single block without allocating a stack frame,
+	# useful in ciphertext stealing mode
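+	#
+	# in C terms this is the textbook one-block CBC step (a sketch:
+	# aes_decrypt_block() stands in for one-block AES decryption):
+	#
+	#	uint8_t next_iv[16];
+	#	memcpy(next_iv, c, 16);		/* ciphertext becomes next IV */
+	#	aes_decrypt_block(c, p, key);	/* p = D_k(c) */
+	#	for (int i = 0; i < 16; i++) p[i] ^= iv[i];
+	#	memcpy(ivp, next_iv, 16);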
+ movdqu ($inp),$inout0 # load input
+ movdqu ($ivp),$inout1 # load iv
+ movdqa $inout0,$inout2 # future iv
+___
+ &aesni_generate1("dec",$key,$rnds_);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movdqu $inout2,($ivp) # store iv
+ xorps $inout1,$inout0 # ^=iv
+ pxor $inout1,$inout1
+ movups $inout0,($out) # store output
+ pxor $inout0,$inout0
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),$iv
mov $rnds_,$rounds
- cmp \$0x70,$len
+ cmp \$0x50,$len
jbe .Lcbc_dec_tail
- shr \$1,$rnds_
- sub \$0x70,$len
- mov $rnds_,$rounds
- movaps $iv,$reserved(%rsp)
+
+ $movkey ($key),$rndkey0
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+ mov OPENSSL_ia32cap_P+4(%rip),%r9d
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_six_or_seven
+
+ and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
+ sub \$0x50,$len # $len is biased by -5*16
+ cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
+ je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
+ sub \$0x20,$len # $len is biased by -7*16
+ lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
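+	# in C terms the dispatch above reads:
+	#	cap = OPENSSL_ia32cap_P[1] & (1<<26 | 1<<22);
+	#	if (cap == 1<<22)	/* MOVBE set, XSAVE clear */
+	#		goto loop6;	/* the Atom Silvermont path */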
.align 16
.Lcbc_dec_loop8:
- movaps $rndkey0,$reserved(%rsp) # save IV
movups $inout7,($out)
lea 0x10($out),$out
.Lcbc_dec_loop8_enter:
- $movkey ($key),$rndkey0
- movups ($inp),$inout0 # load input
- movups 0x10($inp),$inout1
- $movkey 16($key),$rndkey1
+ movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$inout0
+ movdqu 0x70($inp),$inout7
+ pxor $rndkey0,$inout1
+ $movkey 0x10-0x70($key),$rndkey1
+ pxor $rndkey0,$inout2
+ xor $inp_,$inp_
+	cmp	\$0x70,$len		# are there at least 0x60 bytes ahead?
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
- lea 32($key),$key
- movdqu 0x20($inp),$inout2
- xorps $rndkey0,$inout0
- movdqu 0x30($inp),$inout3
- xorps $rndkey0,$inout1
- movdqu 0x40($inp),$inout4
aesdec $rndkey1,$inout0
- pxor $rndkey0,$inout2
- movdqu 0x50($inp),$inout5
+ pxor $rndkey0,$inout7
+ $movkey 0x20-0x70($key),$rndkey0
aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqu 0x60($inp),$inout6
aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqu 0x70($inp),$inout7
aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- dec $rounds
aesdec $rndkey1,$inout4
- pxor $rndkey0,$inout6
aesdec $rndkey1,$inout5
- pxor $rndkey0,$inout7
- $movkey ($key),$rndkey0
aesdec $rndkey1,$inout6
+ setnc ${inp_}b
+ shl \$7,$inp_
aesdec $rndkey1,$inout7
- $movkey 16($key),$rndkey1
-
- call .Ldec_loop8_enter
+ add $inp,$inp_
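+	# $inp_ is now $inp+0x80 if at least one more full batch follows
+	# ($len>=0x70), else $inp itself, so the pre-loads below never
+	# read past the end of the input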
+ $movkey 0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___ if ($i==7);
+ cmp \$11,$rounds
+___
+$code.=<<___;
+ aesdec $rndkeyx,$inout0
+ aesdec $rndkeyx,$inout1
+ aesdec $rndkeyx,$inout2
+ aesdec $rndkeyx,$inout3
+ aesdec $rndkeyx,$inout4
+ aesdec $rndkeyx,$inout5
+ aesdec $rndkeyx,$inout6
+ aesdec $rndkeyx,$inout7
+ $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
+ nop
+___
+$code.=<<___ if ($i==7);
+ jb .Lcbc_dec_done
+___
+$code.=<<___ if ($i==9);
+ je .Lcbc_dec_done
+___
+$code.=<<___ if ($i==11);
+ jmp .Lcbc_dec_done
+___
+}
+$code.=<<___;
+.align 16
+.Lcbc_dec_done:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$iv
+ pxor $rndkey0,$in0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$in1
+ pxor $rndkey0,$in2
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$in3
+ pxor $rndkey0,$in4
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ movdqu 0x50($inp),$rndkey1
+
+ aesdeclast $iv,$inout0
+ movdqu 0x60($inp),$iv # borrow $iv
+ pxor $rndkey0,$rndkey1
+ aesdeclast $in0,$inout1
+ pxor $rndkey0,$iv
+ movdqu 0x70($inp),$rndkey0 # next IV
+ aesdeclast $in1,$inout2
+ lea 0x80($inp),$inp
+ movdqu 0x00($inp_),$in0
+ aesdeclast $in2,$inout3
+ aesdeclast $in3,$inout4
+ movdqu 0x10($inp_),$in1
+ movdqu 0x20($inp_),$in2
+ aesdeclast $in4,$inout5
+ aesdeclast $rndkey1,$inout6
+ movdqu 0x30($inp_),$in3
+ movdqu 0x40($inp_),$in4
+ aesdeclast $iv,$inout7
+ movdqa $rndkey0,$iv # return $iv
+ movdqu 0x50($inp_),$rndkey1
+ $movkey -0x70($key),$rndkey0
+
+ movups $inout0,($out) # store output
+ movdqa $in0,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in1,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in2,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in3,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in4,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey1,$inout5
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
- movups ($inp),$rndkey1 # re-load input
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$rndkey1
- xorps $rndkey0,$inout6
- movups 0x70($inp),$rndkey0 # IV
- xorps $rndkey1,$inout7
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,0x40($out)
- mov $key_,$key # restore $key
- movups $inout5,0x50($out)
- lea 0x80($inp),$inp
- movups $inout6,0x60($out)
- lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
- movaps $rndkey0,$iv
+ lea -0x70($key),$key
add \$0x70,$len
- jle .Lcbc_dec_tail_collected
- movups $inout0,($out)
- lea 1($rnds_,$rnds_),$rounds
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout7,($out)
lea 0x10($out),$out
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ movaps $in0,$inout0
+.Lcbc_dec_six_or_seven:
+ cmp \$0x60,$len
+ ja .Lcbc_dec_seven
+
+ movaps $inout5,$inout6
+ call _aesni_decrypt6
+ pxor $iv,$inout0 # ^= IV
+ movaps $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ lea 0x50($out),$out
+ movdqa $inout5,$inout0
+ pxor $inout5,$inout5
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups 0x50($inp),$inout7
+ pxor $iv,$inout0 # ^= IV
+ movups 0x60($inp),$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout7,$inout6
+ movdqu $inout5,0x50($out)
+ pxor $inout5,$inout5
+ lea 0x60($out),$out
+ movdqa $inout6,$inout0
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+ movups $inout5,($out)
+ lea 0x10($out),$out
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+.Lcbc_dec_loop6_enter:
+ lea 0x60($inp),$inp
+ movdqa $inout5,$inout6
+
+ call _aesni_decrypt6
+
+ pxor $iv,$inout0 # ^= IV
+ movdqa $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ mov $key_,$key
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ mov $rnds_,$rounds
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ sub \$0x60,$len
+ ja .Lcbc_dec_loop6
+
+ movdqa $inout5,$inout0
+ add \$0x50,$len
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout5,($out)
+ lea 0x10($out),$out
+
.Lcbc_dec_tail:
movups ($inp),$inout0
- movaps $inout0,$in0
- cmp \$0x10,$len
- jbe .Lcbc_dec_one
+ sub \$0x10,$len
+ jbe .Lcbc_dec_one # $len is 1*16 or less
movups 0x10($inp),$inout1
- movaps $inout1,$in1
- cmp \$0x20,$len
- jbe .Lcbc_dec_two
+ movaps $inout0,$in0
+ sub \$0x10,$len
+ jbe .Lcbc_dec_two # $len is 2*16 or less
movups 0x20($inp),$inout2
- movaps $inout2,$in2
- cmp \$0x30,$len
- jbe .Lcbc_dec_three
+ movaps $inout1,$in1
+ sub \$0x10,$len
+ jbe .Lcbc_dec_three # $len is 3*16 or less
movups 0x30($inp),$inout3
- cmp \$0x40,$len
- jbe .Lcbc_dec_four
-
- movups 0x40($inp),$inout4
- cmp \$0x50,$len
- jbe .Lcbc_dec_five
-
- movups 0x50($inp),$inout5
- cmp \$0x60,$len
- jbe .Lcbc_dec_six
+ movaps $inout2,$in2
+ sub \$0x10,$len
+ jbe .Lcbc_dec_four # $len is 4*16 or less
- movups 0x60($inp),$inout6
- movaps $iv,$reserved(%rsp) # save IV
- call _aesni_decrypt8
- movups ($inp),$rndkey1
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$iv # IV
- xorps $rndkey0,$inout6
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- movups $inout5,0x50($out)
- lea 0x60($out),$out
- movaps $inout6,$inout0
- sub \$0x70,$len
+ movups 0x40($inp),$inout4 # $len is 5*16 or less
+ movaps $inout3,$in3
+ movaps $inout4,$in4
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ pxor $iv,$inout0
+ movaps $in4,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ lea 0x40($out),$out
+ movdqa $inout4,$inout0
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
+ sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_one:
+ movaps $inout0,$in0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps $iv,$inout0
movaps $in0,$iv
- sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
- xorps $inout2,$inout2
- call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
+ movaps $inout1,$in1
+ call _aesni_decrypt2
+ pxor $iv,$inout0
movaps $in1,$iv
- movaps $inout1,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ movdqa $inout1,$inout0
+ pxor $inout1,$inout1 # clear register bank
lea 0x10($out),$out
- sub \$0x20,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
+ movaps $inout2,$in2
call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
+ pxor $iv,$inout0
movaps $in2,$iv
- movaps $inout2,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ movdqa $inout2,$inout0
+ pxor $inout2,$inout2
lea 0x20($out),$out
- sub \$0x30,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
+ movaps $inout3,$in3
call _aesni_decrypt4
- xorps $iv,$inout0
- movups 0x30($inp),$iv
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
- xorps $in2,$inout3
- movups $inout2,0x20($out)
- movaps $inout3,$inout0
+ pxor $iv,$inout0
+ movaps $in3,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movdqa $inout3,$inout0
+ pxor $inout3,$inout3
lea 0x30($out),$out
- sub \$0x40,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_five:
- xorps $inout5,$inout5
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$iv
- xorps $rndkey1,$inout4
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- lea 0x40($out),$out
- movaps $inout4,$inout0
- sub \$0x50,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$rndkey0
- xorps $rndkey1,$inout4
- movups 0x50($inp),$iv
- xorps $rndkey0,$inout5
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- lea 0x50($out),$out
- movaps $inout5,$inout0
- sub \$0x60,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor $inout1,$inout1 # clear register bank
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+___
+$code.=<<___ if (!$win64);
+ pxor $inout4,$inout4 # %xmm6..9
+ pxor $inout5,$inout5
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+___
+$code.=<<___;
.Lcbc_dec_tail_collected:
- and \$15,$len
movups $iv,($ivp)
+ and \$15,$len
jnz .Lcbc_dec_tail_partial
movups $inout0,($out)
+ pxor $inout0,$inout0
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
- movaps $inout0,$reserved(%rsp)
+ movaps $inout0,(%rsp)
+ pxor $inout0,$inout0
mov \$16,%rcx
mov $out,%rdi
sub $len,%rcx
- lea $reserved(%rsp),%rsi
- .long 0x9066A4F3 # rep movsb
+ lea (%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+ movdqa $inout0,(%rsp)
.Lcbc_dec_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- lea 0x58(%rsp),%rsp
+ movaps 0x10(%rsp),%xmm6
+ movaps %xmm0,0x10(%rsp) # clear stack
+ movaps 0x20(%rsp),%xmm7
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm8
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm9
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm10
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm11
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm12
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm13
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm14
+ movaps %xmm0,0x90(%rsp)
+ movaps 0xa0(%rsp),%xmm15
+ movaps %xmm0,0xa0(%rsp)
___
$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
.Lcbc_ret:
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
-# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
# int bits, AES_KEY *key)
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C)
+# *$key key schedule
+#
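+# a call-site sketch (the symbol is libcrypto-internal; AES_KEY as in
+# <openssl/aes.h>):
+#
+#	AES_KEY dk;
+#	int rv = aesni_set_decrypt_key(user_key, 128, &dk);
+#	/* rv == 0 on success, -1 for a NULL argument, -2 for an
+#	 * unsupported bit length */
+#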
{ my ($inp,$bits,$key) = @_4args;
$bits =~ s/%r/%e/;
@@ -2510,7 +3291,9 @@ ${PREFIX}_set_decrypt_key:
$movkey ($key),%xmm0 # inverse middle
aesimc %xmm0,%xmm0
+ pxor %xmm1,%xmm1
$movkey %xmm0,($inp)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
add \$8,%rsp
ret
@@ -2527,6 +3310,22 @@ ___
# Aggressively optimized with respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
+# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+# int bits, AES_KEY * const key);
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C)
+# $bits rounds-1 (used in aesni_set_decrypt_key)
+# *$key key schedule
+# $key pointer to key schedule (used in
+# aesni_set_decrypt_key)
+#
+# Subroutine is frame-less, which means that only volatile registers
+# are used. Note that it's declared "abi-omnipotent", which means that
+# the number of volatile registers is smaller on Windows.
+#
$code.=<<___;
.globl ${PREFIX}_set_encrypt_key
.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
@@ -2540,9 +3339,11 @@ __aesni_set_encrypt_key:
test $key,$key
jz .Lenc_key_ret
+ mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
movups ($inp),%xmm0 # pull first 128 bits of *userKey
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
- lea 16($key),%rax
+ and OPENSSL_ia32cap_P+4(%rip),%r10d
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
cmp \$256,$bits
je .L14rounds
cmp \$192,$bits
@@ -2552,6 +3353,9 @@ __aesni_set_encrypt_key:
.L10rounds:
mov \$9,$bits # 10 rounds for 128-bit key
+	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
+ je .L10rounds_alt
+
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
call .Lkey_expansion_128_cold
@@ -2579,9 +3383,79 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
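+	# what one .Loop_key128 iteration computes, as C intrinsics (a
+	# sketch, not code from this module; needs <tmmintrin.h> and
+	# <wmmintrin.h>, compile with -mssse3 -maes):
+	#
+	#	__m128i expand128_step(__m128i key, __m128i rcon)
+	#	{
+	#		/* broadcast RotWord(w3): with all four columns equal,
+	#		 * ShiftRows inside AESENCLAST is a no-op, so it yields
+	#		 * SubWord(RotWord(w3))^rcon in every dword */
+	#		__m128i t = _mm_shuffle_epi8(key,
+	#			_mm_set1_epi32(0x0c0f0e0d));	/* .Lkey_rotate */
+	#		t = _mm_aesenclast_si128(t, rcon);
+	#		/* the pslldq/pxor ladder: dword i becomes w0^...^wi */
+	#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+	#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+	#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+	#		return _mm_xor_si128(key, t);	/* next round key */
+	#	}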
+
+.align 16
.L12rounds:
movq 16($inp),%xmm2 # remaining 1/3 of *userKey
mov \$11,$bits # 12 rounds for 192
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L12rounds_alt
+
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
call .Lkey_expansion_192a_cold
@@ -2605,10 +3479,54 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1, %xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L14rounds:
movups 16($inp),%xmm2 # remaining half of *userKey
mov \$13,$bits # 14 rounds for 256
lea 16(%rax),%rax
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L14rounds_alt
+
$movkey %xmm0,($key) # round 0
$movkey %xmm2,16($key) # round 1
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
@@ -2643,9 +3561,69 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.Lbad_keybits:
mov \$-2,%rax
.Lenc_key_ret:
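+	# wipe key material from the working registers before returning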
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
add \$8,%rsp
ret
.LSEH_end_set_encrypt_key:
@@ -2733,6 +3711,16 @@ $code.=<<___;
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+ .long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+ .long 1,1,1,1
+.Lkey_rcon1b:
+ .long 0x1b,0x1b,0x1b,0x1b
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
@@ -2791,45 +3779,9 @@ ecb_ccm64_se_handler:
jmp .Lcommon_seh_tail
.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
-.type ctr32_se_handler,\@abi-omnipotent
+.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
-ctr32_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- lea .Lctr32_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<"prologue" label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lctr32_ret(%rip),%r10
- cmp %r10,%rbx
- jae .Lcommon_seh_tail
-
- lea 0x20(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
- .long 0xa548f3fc # cld; rep movsq
- lea 0xc8(%rax),%rax # adjust stack pointer
-
- jmp .Lcommon_seh_tail
-.size ctr32_se_handler,.-ctr32_se_handler
-
-.type xts_se_handler,\@abi-omnipotent
-.align 16
-xts_se_handler:
+ctr_xts_se_handler:
push %rsi
push %rdi
push %rbx
@@ -2859,14 +3811,14 @@ xts_se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 0x60(%rax),%rsi # %xmm save area
+ mov 160($context),%rax # pull context->Rbp
+ lea -0xa0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x68+160(%rax),%rax # adjust stack pointer
- jmp .Lcommon_seh_tail
-.size xts_se_handler,.-xts_se_handler
+ jmp .Lcommon_rbp_tail
+.size ctr_xts_se_handler,.-ctr_xts_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
@@ -2886,7 +3838,7 @@ cbc_se_handler:
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
- lea .Lcbc_decrypt(%rip),%r10
+ lea .Lcbc_decrypt_bulk(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
@@ -2898,11 +3850,16 @@ cbc_se_handler:
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lcommon_seh_tail
- lea 0(%rax),%rsi # top of stack
+ lea 16(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
- mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x58(%rax),%rax # adjust stack pointer
+
+.Lcommon_rbp_tail:
+ mov 160($context),%rax # pull context->Rbp
+ mov (%rax),%rbp # restore saved %rbp
+ lea 8(%rax),%rax # adjust stack pointer
+ mov %rbp,160($context) # restore context->Rbp
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
@@ -3006,14 +3963,15 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
.byte 9,0,0,0
- .rva ctr32_se_handler
+ .rva ctr_xts_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
$code.=<<___;
@@ -3060,11 +4018,30 @@ sub aesni {
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
}
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
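+	# hand-encode aesenc/aesdec with an %rsp-relative memory source
+	# for assemblers without AES-NI support: an optional REX.R for
+	# %xmm8-15, the 0x0f,0x38 escape, the instruction opcode, then
+	# a ModR/M+SIB pair addressing disp8(%rsp)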
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
return $line;
}
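+
+# Hand-encode "movbe %eax,disp8(%rsp)" for assemblers that don't know
+# the instruction: 0x0f,0x38,0xf1 is the MOVBE m32,r32 opcode and
+# 0x44,0x24 is the ModR/M+SIB pair selecting (%rsp) with the 8-bit
+# displacement supplied by the caller.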
+sub movbe {
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
print $code;
diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
new file mode 100755
index 000000000000..a1891cc03caa
--- /dev/null
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -0,0 +1,1942 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in the sense that it supports both
+# big- and little-endian cases. Data alignment in parallelizable
+# modes is handled with VSX loads and stores, which implies the
+# MSR.VSX flag being set. It should also be noted that the ISA
+# specification doesn't prohibit alignment exceptions for these
+# instructions on page boundaries. Initially alignment was handled in
+# a pure AltiVec/VMX way [when data is aligned programmatically,
+# which in turn guarantees exception-free execution], but that turned
+# out to hamper performance when vcipher instructions are
+# interleaved. It's reckoned that the eventual misalignment penalties
+# at page boundaries are on average lower than the additional
+# overhead of the pure AltiVec approach.
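+#
+# For reference, the entry points generated below (with $prefix set
+# to "aes_p8") mirror the standard AES_* calling conventions; the C
+# prototypes sketched here are inferred from the register assignments
+# in this file rather than quoted from a header:
+#
+#	int  aes_p8_set_encrypt_key(const unsigned char *inp, int bits,
+#	                            AES_KEY *key);
+#	int  aes_p8_set_decrypt_key(const unsigned char *inp, int bits,
+#	                            AES_KEY *key);
+#	void aes_p8_encrypt(const unsigned char *in, unsigned char *out,
+#	                    const AES_KEY *key);
+#	void aes_p8_decrypt(const unsigned char *in, unsigned char *out,
+#	                    const AES_KEY *key);
+#	void aes_p8_cbc_encrypt(const unsigned char *in,
+#	                    unsigned char *out, size_t len,
+#	                    const AES_KEY *key, unsigned char *ivp,
+#	                    int enc);
+#	void aes_p8_ctr32_encrypt_blocks(const unsigned char *in,
+#	                    unsigned char *out, size_t blocks,
+#	                    const AES_KEY *key, const unsigned char *ivp);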
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+ $UCMP ="cmpld";
+ $SHL ="sldi";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+ $SHL ="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{ # Key setup procedures #
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine "any"
+
+.text
+
+.align 7
+rcon:
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
+.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
+.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
+.long 0,0,0,0 ?asis
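+# (the "?rev"/"?asis" tags on the table entries above are consumed by
+# the post-processing loop at the bottom of this file, which
+# reverses the bytes of "?rev" entries for little-endian flavours and
+# emits "?asis" entries unchanged)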
+Lconsts:
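+	# compute the address of the rcon table above in a position-
+	# independent way: bcl sets the link register to the address of
+	# the following instruction, which sits 0x48 bytes past rcon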
+ mflr r0
+ bcl 20,31,\$+4
+	mflr	$ptr		# distance between . and rcon is 0x48
+ addi $ptr,$ptr,-0x48
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+.align 5
+.${prefix}_set_encrypt_key:
+Lset_encrypt_key:
+ mflr r11
+ $PUSH r11,$LRSAVE($sp)
+
+ li $ptr,-1
+ ${UCMP}i $inp,0
+ beq- Lenc_key_abort # if ($inp==0) return -1;
+ ${UCMP}i $out,0
+ beq- Lenc_key_abort # if ($out==0) return -1;
+ li $ptr,-2
+ cmpwi $bits,128
+ blt- Lenc_key_abort
+ cmpwi $bits,256
+ bgt- Lenc_key_abort
+ andi. r0,$bits,0x3f
+ bne- Lenc_key_abort
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ bl Lconsts
+ mtlr r11
+
+ neg r9,$inp
+ lvx $in0,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ lvsr $key,0,r9 # borrow $key
+ li r8,0x20
+ cmpwi $bits,192
+ lvx $in1,0,$inp
+ le?vspltisb $mask,0x0f # borrow $mask
+ lvx $rcon,0,$ptr
+ le?vxor $key,$key,$mask # adjust for byte swap
+ lvx $mask,r8,$ptr
+ addi $ptr,$ptr,0x10
+ vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
+ li $cnt,8
+ vxor $zero,$zero,$zero
+ mtctr $cnt
+
+ ?lvsr $outperm,0,$out
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$zero,$outmask,$outperm
+
+ blt Loop128
+ addi $inp,$inp,8
+ beq L192
+ addi $inp,$inp,8
+ b L256
+
+.align 4
+Loop128:
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ bdnz Loop128
+
+ lvx $rcon,0,$ptr # last two round keys
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,0x50
+
+ li $rounds,10
+ b Ldone
+
+.align 4
+L192:
+ lvx $tmp,0,$inp
+ li $cnt,4
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ vspltisb $key,8 # borrow $key
+ mtctr $cnt
+ vsububm $mask,$mask,$key # adjust the mask
+
+Loop192:
+	vperm	$key,$in1,$in1,$mask	# rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vcipherlast $key,$key,$rcon
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+
+ vsldoi $stage,$zero,$in1,8
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vsldoi $stage,$stage,$in0,8
+
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vsldoi $stage,$in0,$in1,8
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdnz Loop192
+
+ li $rounds,12
+ addi $out,$out,0x20
+ b Ldone
+
+.align 4
+L256:
+ lvx $tmp,0,$inp
+ li $cnt,7
+ li $rounds,14
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ mtctr $cnt
+
+Loop256:
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in1,$in1,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdz Ldone
+
+ vspltw $key,$in0,3 # just splat
+ vsldoi $tmp,$zero,$in1,12 # >>32
+ vsbox $key,$key
+
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+
+ vxor $in1,$in1,$key
+ b Loop256
+
+.align 4
+Ldone:
+ lvx $in1,0,$inp # redundant in aligned case
+ vsel $in1,$outhead,$in1,$outmask
+ stvx $in1,0,$inp
+ li $ptr,0
+ mtspr 256,$vrsave
+ stw $rounds,0($out)
+
+Lenc_key_abort:
+ mr r3,$ptr
+ blr
+ .long 0
+ .byte 0,12,0x14,1,0,0,3,0
+ .long 0
+.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+.align 5
+.${prefix}_set_decrypt_key:
+ $STU $sp,-$FRAME($sp)
+ mflr r10
+ $PUSH r10,$FRAME+$LRSAVE($sp)
+ bl Lset_encrypt_key
+ mtlr r10
+
+ cmpwi r3,0
+ bne- Ldec_key_abort
+
+ slwi $cnt,$rounds,4
+ subi $inp,$out,240 # first round key
+ srwi $rounds,$rounds,1
+ add $out,$inp,$cnt # last round key
+ mtctr $rounds
+
+Ldeckey:
+ lwz r0, 0($inp)
+ lwz r6, 4($inp)
+ lwz r7, 8($inp)
+ lwz r8, 12($inp)
+ addi $inp,$inp,16
+ lwz r9, 0($out)
+ lwz r10,4($out)
+ lwz r11,8($out)
+ lwz r12,12($out)
+ stw r0, 0($out)
+ stw r6, 4($out)
+ stw r7, 8($out)
+ stw r8, 12($out)
+ subi $out,$out,16
+ stw r9, -16($inp)
+ stw r10,-12($inp)
+ stw r11,-8($inp)
+ stw r12,-4($inp)
+ bdnz Ldeckey
+
+ xor r3,r3,r3 # return value
+Ldec_key_abort:
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,0,3,0
+ .long 0
+.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{ # Single block en- and decrypt procedures #
+sub gen_block () {
+my $dir = shift;
+my $n = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_${dir}crypt
+.align 5
+.${prefix}_${dir}crypt:
+ lwz $rounds,240($key)
+ lis r0,0xfc00
+ mfspr $vrsave,256
+ li $idx,15 # 15 is not typo
+ mtspr 256,r0
+
+ lvx v0,0,$inp
+ neg r11,$out
+ lvx v1,$idx,$inp
+ lvsl v2,0,$inp # inpperm
+ le?vspltisb v4,0x0f
+ ?lvsl v3,0,r11 # outperm
+ le?vxor v2,v2,v4
+ li $idx,16
+ vperm v0,v0,v1,v2 # align [and byte swap in LE]
+ lvx v1,0,$key
+ ?lvsl v5,0,$key # keyperm
+ srwi $rounds,$rounds,1
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ subi $rounds,$rounds,1
+ ?vperm v1,v1,v2,v5 # align round key
+
+ vxor v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Loop_${dir}c:
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ ?vperm v1,v1,v2,v5
+ v${n}cipher v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_${dir}c
+
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ ?vperm v1,v1,v2,v5
+ v${n}cipherlast v0,v0,v1
+
+ vspltisb v2,-1
+ vxor v1,v1,v1
+ li $idx,15 # 15 is not typo
+ ?vperm v2,v1,v2,v3 # outmask
+ le?vxor v3,v3,v4
+ lvx v1,0,$out # outhead
+ vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
+ vsel v1,v1,v0,v2
+ lvx v4,$idx,$out
+ stvx v1,0,$out
+ vsel v0,v0,v4,v2
+ stvx v0,$idx,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+#########################################################################
+{{{ # CBC en- and decrypt procedures #
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
+ map("v$_",(4..10));
+$code.=<<___;
+.globl .${prefix}_cbc_encrypt
+.align 5
+.${prefix}_cbc_encrypt:
+ ${UCMP}i $len,16
+ bltlr-
+
+ cmpwi $enc,0 # test direction
+ lis r0,0xffe0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ li $idx,15
+ vxor $rndkey0,$rndkey0,$rndkey0
+ le?vspltisb $tmp,0x0f
+
+ lvx $ivec,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $ivec,$ivec,$inptail,$inpperm
+
+ neg r11,$inp
+ ?lvsl $keyperm,0,$key # prepare for unaligned key
+ lwz $rounds,240($key)
+
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inptail,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ?lvsr $outperm,0,$out # prepare for unaligned store
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+
+ srwi $rounds,$rounds,1
+ li $idx,16
+ subi $rounds,$rounds,1
+ beq Lcbc_dec
+
+Lcbc_enc:
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ mtctr $rounds
+ subi $len,$len,16 # len-=16
+
+ lvx $rndkey0,0,$key
+ vperm $inout,$inout,$inptail,$inpperm
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ vxor $inout,$inout,$ivec
+
+Loop_cbc_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_cbc_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $ivec,$inout,$rndkey0
+ ${UCMP}i $len,16
+
+ vperm $tmp,$ivec,$ivec,$outperm
+ vsel $inout,$outhead,$tmp,$outmask
+ vmr $outhead,$tmp
+ stvx $inout,0,$out
+ addi $out,$out,16
+ bge Lcbc_enc
+
+ b Lcbc_done
+
+.align 4
+Lcbc_dec:
+ ${UCMP}i $len,128
+ bge _aesp8_cbc_decrypt8x
+ vmr $tmp,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ mtctr $rounds
+ subi $len,$len,16 # len-=16
+
+ lvx $rndkey0,0,$key
+ vperm $tmp,$tmp,$inptail,$inpperm
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$tmp,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+
+Loop_cbc_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_cbc_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipherlast $inout,$inout,$rndkey0
+ ${UCMP}i $len,16
+
+ vxor $inout,$inout,$ivec
+ vmr $ivec,$tmp
+ vperm $tmp,$inout,$inout,$outperm
+ vsel $inout,$outhead,$tmp,$outmask
+ vmr $outhead,$tmp
+ stvx $inout,0,$out
+ addi $out,$out,16
+ bge Lcbc_dec
+
+Lcbc_done:
+ addi $out,$out,-1
+ lvx $inout,0,$out # redundant in aligned case
+ vsel $inout,$outhead,$inout,$outmask
+ stvx $inout,0,$out
+
+ neg $enc,$ivp # write [unaligned] iv
+ li $idx,15 # 15 is not typo
+ vxor $rndkey0,$rndkey0,$rndkey0
+ vspltisb $outmask,-1
+ le?vspltisb $tmp,0x0f
+ ?lvsl $outperm,0,$enc
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+ lvx $outhead,0,$ivp
+ vperm $ivec,$ivec,$ivec,$outperm
+ vsel $inout,$outhead,$ivec,$outmask
+ lvx $inptail,$idx,$ivp
+ stvx $inout,0,$ivp
+ vsel $inout,$ivec,$inptail,$outmask
+ stvx $inout,$idx,$ivp
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,6,0
+ .long 0
+___
+#########################################################################
+{{ # Optimized CBC decrypt procedure #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
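+# The eight data blocks monopolize most of the vector register file,
+# so only rounds [1] and [2] plus the last six round keys stay
+# resident (v24-v25 and v26-v31); the remaining round keys are staged
+# on the stack by Load_cbc_dec_key and reloaded two at a time inside
+# the main loop.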
+
+$code.=<<___;
+.align 5
+_aesp8_cbc_decrypt8x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+ subi $len,$len,128 # bias
+
+ lvx $rndkey0,$x00,$key # load key schedule
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ lvx v31,$x00,$key
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_cbc_dec_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_cbc_dec_key
+
+ lvx v26,$x10,$key
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key
+ ?vperm v29,v29,v30,$keyperm
+ lvx $out0,$x70,$key # borrow $out0
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$out0,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ #lvx $inptail,0,$inp # "caller" already did this
+ #addi $inp,$inp,15 # 15 is not typo
+ subi $inp,$inp,15 # undo "caller"
+
+ le?li $idx,8
+ lvx_u $in0,$x00,$inp # load first 8 "words"
+ le?lvsl $inpperm,0,$idx
+ le?vspltisb $tmp,0x0f
+ lvx_u $in1,$x10,$inp
+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
+ lvx_u $in2,$x20,$inp
+ le?vperm $in0,$in0,$in0,$inpperm
+ lvx_u $in3,$x30,$inp
+ le?vperm $in1,$in1,$in1,$inpperm
+ lvx_u $in4,$x40,$inp
+ le?vperm $in2,$in2,$in2,$inpperm
+ vxor $out0,$in0,$rndkey0
+ lvx_u $in5,$x50,$inp
+ le?vperm $in3,$in3,$in3,$inpperm
+ vxor $out1,$in1,$rndkey0
+ lvx_u $in6,$x60,$inp
+ le?vperm $in4,$in4,$in4,$inpperm
+ vxor $out2,$in2,$rndkey0
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+ le?vperm $in5,$in5,$in5,$inpperm
+ vxor $out3,$in3,$rndkey0
+ le?vperm $in6,$in6,$in6,$inpperm
+ vxor $out4,$in4,$rndkey0
+ le?vperm $in7,$in7,$in7,$inpperm
+ vxor $out5,$in5,$rndkey0
+ vxor $out6,$in6,$rndkey0
+ vxor $out7,$in7,$rndkey0
+
+ mtctr $rounds
+ b Loop_cbc_dec8x
+.align 5
+Loop_cbc_dec8x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_cbc_dec8x
+
+ subic $len,$len,128 # $len-=128
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+
+ and r0,r0,$len
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+ vncipher $out6,$out6,v26
+ vncipher $out7,$out7,v26
+
+	add	$inp,$inp,r0		# $inp is adjusted in such a
+					# way that at exit from the
+ # loop inX-in7 are loaded
+ # with last "words"
+ vncipher $out0,$out0,v27
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+ vncipher $out6,$out6,v27
+ vncipher $out7,$out7,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ vncipher $out6,$out6,v28
+ vncipher $out7,$out7,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ vncipher $out6,$out6,v29
+ vncipher $out7,$out7,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+
+ vncipher $out0,$out0,v30
+ vxor $ivec,$ivec,v31 # xor with last round key
+ vncipher $out1,$out1,v30
+ vxor $in0,$in0,v31
+ vncipher $out2,$out2,v30
+ vxor $in1,$in1,v31
+ vncipher $out3,$out3,v30
+ vxor $in2,$in2,v31
+ vncipher $out4,$out4,v30
+ vxor $in3,$in3,v31
+ vncipher $out5,$out5,v30
+ vxor $in4,$in4,v31
+ vncipher $out6,$out6,v30
+ vxor $in5,$in5,v31
+ vncipher $out7,$out7,v30
+ vxor $in6,$in6,v31
+
+ vncipherlast $out0,$out0,$ivec
+ vncipherlast $out1,$out1,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vncipherlast $out2,$out2,$in1
+ lvx_u $in1,$x10,$inp
+ vncipherlast $out3,$out3,$in2
+ le?vperm $in0,$in0,$in0,$inpperm
+ lvx_u $in2,$x20,$inp
+ vncipherlast $out4,$out4,$in3
+ le?vperm $in1,$in1,$in1,$inpperm
+ lvx_u $in3,$x30,$inp
+ vncipherlast $out5,$out5,$in4
+ le?vperm $in2,$in2,$in2,$inpperm
+ lvx_u $in4,$x40,$inp
+ vncipherlast $out6,$out6,$in5
+ le?vperm $in3,$in3,$in3,$inpperm
+ lvx_u $in5,$x50,$inp
+ vncipherlast $out7,$out7,$in6
+ le?vperm $in4,$in4,$in4,$inpperm
+ lvx_u $in6,$x60,$inp
+ vmr $ivec,$in7
+ le?vperm $in5,$in5,$in5,$inpperm
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $in6,$in6,$in6,$inpperm
+ vxor $out0,$in0,$rndkey0
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $in7,$in7,$in7,$inpperm
+ vxor $out1,$in1,$rndkey0
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$rndkey0
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$rndkey0
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$rndkey0
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ vxor $out5,$in5,$rndkey0
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x60,$out
+ vxor $out6,$in6,$rndkey0
+ stvx_u $out7,$x70,$out
+ addi $out,$out,0x80
+ vxor $out7,$in7,$rndkey0
+
+ mtctr $rounds
+ beq Loop_cbc_dec8x # did $len-=128 borrow?
+
+ addic. $len,$len,128
+ beq Lcbc_dec8x_done
+ nop
+ nop
+
+Loop_cbc_dec8x_tail: # up to 7 "words" tail...
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_cbc_dec8x_tail
+
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+ vncipher $out6,$out6,v26
+ vncipher $out7,$out7,v26
+
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+ vncipher $out6,$out6,v27
+ vncipher $out7,$out7,v27
+
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ vncipher $out6,$out6,v28
+ vncipher $out7,$out7,v28
+
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ vncipher $out6,$out6,v29
+ vncipher $out7,$out7,v29
+
+ vncipher $out1,$out1,v30
+ vxor $ivec,$ivec,v31 # last round key
+ vncipher $out2,$out2,v30
+ vxor $in1,$in1,v31
+ vncipher $out3,$out3,v30
+ vxor $in2,$in2,v31
+ vncipher $out4,$out4,v30
+ vxor $in3,$in3,v31
+ vncipher $out5,$out5,v30
+ vxor $in4,$in4,v31
+ vncipher $out6,$out6,v30
+ vxor $in5,$in5,v31
+ vncipher $out7,$out7,v30
+ vxor $in6,$in6,v31
+
+ cmplwi $len,32 # switch($len)
+ blt Lcbc_dec8x_one
+ nop
+ beq Lcbc_dec8x_two
+ cmplwi $len,64
+ blt Lcbc_dec8x_three
+ nop
+ beq Lcbc_dec8x_four
+ cmplwi $len,96
+ blt Lcbc_dec8x_five
+ nop
+ beq Lcbc_dec8x_six
+
+Lcbc_dec8x_seven:
+ vncipherlast $out1,$out1,$ivec
+ vncipherlast $out2,$out2,$in1
+ vncipherlast $out3,$out3,$in2
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out1,$out1,$out1,$inpperm
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x00,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x10,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x20,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x30,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x40,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x50,$out
+ stvx_u $out7,$x60,$out
+ addi $out,$out,0x70
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_six:
+ vncipherlast $out2,$out2,$ivec
+ vncipherlast $out3,$out3,$in2
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out2,$out2,$out2,$inpperm
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x00,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x10,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x20,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x30,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x40,$out
+ stvx_u $out7,$x50,$out
+ addi $out,$out,0x60
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_five:
+ vncipherlast $out3,$out3,$ivec
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out3,$out3,$out3,$inpperm
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x00,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x10,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x20,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x30,$out
+ stvx_u $out7,$x40,$out
+ addi $out,$out,0x50
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_four:
+ vncipherlast $out4,$out4,$ivec
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out4,$out4,$out4,$inpperm
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x00,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x10,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x20,$out
+ stvx_u $out7,$x30,$out
+ addi $out,$out,0x40
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_three:
+ vncipherlast $out5,$out5,$ivec
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out5,$out5,$out5,$inpperm
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x00,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x10,$out
+ stvx_u $out7,$x20,$out
+ addi $out,$out,0x30
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_two:
+ vncipherlast $out6,$out6,$ivec
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out6,$out6,$out6,$inpperm
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x00,$out
+ stvx_u $out7,$x10,$out
+ addi $out,$out,0x20
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_one:
+ vncipherlast $out7,$out7,$ivec
+ vmr $ivec,$in7
+
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out7,0,$out
+ addi $out,$out,0x10
+
+Lcbc_dec8x_done:
+ le?vperm $ivec,$ivec,$ivec,$inpperm
+ stvx_u $ivec,0,$ivp # write [unaligned] iv
+
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $inpperm,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}} }}}
+
+#########################################################################
+{{{ # CTR procedure[s] #
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+ map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl .${prefix}_ctr32_encrypt_blocks
+.align 5
+.${prefix}_ctr32_encrypt_blocks:
+ ${UCMP}i $len,1
+ bltlr-
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ li $idx,15
+ vxor $rndkey0,$rndkey0,$rndkey0
+ le?vspltisb $tmp,0x0f
+
+ lvx $ivec,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ vspltisb $one,1
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $ivec,$ivec,$inptail,$inpperm
+ vsldoi $one,$rndkey0,$one,1
+
+ neg r11,$inp
+ ?lvsl $keyperm,0,$key # prepare for unaligned key
+ lwz $rounds,240($key)
+
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inptail,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ srwi $rounds,$rounds,1
+ li $idx,16
+ subi $rounds,$rounds,1
+
+ ${UCMP}i $len,8
+ bge _aesp8_ctr32_encrypt8x
+
+ ?lvsr $outperm,0,$out # prepare for unaligned store
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+
+ lvx $rndkey0,0,$key
+ mtctr $rounds
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$ivec,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ b Loop_ctr32_enc
+
+.align 5
+Loop_ctr32_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_ctr32_enc
+
+ vadduwm $ivec,$ivec,$one
+ vmr $dat,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ subic. $len,$len,1 # blocks--
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ vperm $dat,$dat,$inptail,$inpperm
+ li $idx,16
+ ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
+ lvx $rndkey0,0,$key
+ vxor $dat,$dat,$rndkey1 # last round key
+ vcipherlast $inout,$inout,$dat
+
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inout,$outperm
+ vsel $dat,$outhead,$inout,$outmask
+ mtctr $rounds
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vmr $outhead,$inout
+ vxor $inout,$ivec,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ stvx $dat,0,$out
+ addi $out,$out,16
+ bne Loop_ctr32_enc
+
+ addi $out,$out,-1
+ lvx $inout,0,$out # redundant in aligned case
+ vsel $inout,$outhead,$inout,$outmask
+ stvx $inout,0,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,6,0
+ .long 0
+___
+#########################################################################
+{{ # Optimized CTR procedure #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align 5
+_aesp8_ctr32_encrypt8x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key # load key schedule
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ lvx v31,$x00,$key
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_ctr32_enc_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_ctr32_enc_key
+
+ lvx v26,$x10,$key
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key
+ ?vperm v29,v29,v30,$keyperm
+ lvx $out0,$x70,$key # borrow $out0
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$out0,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ vadduwm $two,$one,$one
+ subi $inp,$inp,15 # undo "caller"
+ $SHL $len,$len,4
+
+ vadduwm $out1,$ivec,$one # counter values ...
+ vadduwm $out2,$ivec,$two
+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
+ le?li $idx,8
+ vadduwm $out3,$out1,$two
+ vxor $out1,$out1,$rndkey0
+ le?lvsl $inpperm,0,$idx
+ vadduwm $out4,$out2,$two
+ vxor $out2,$out2,$rndkey0
+ le?vspltisb $tmp,0x0f
+ vadduwm $out5,$out3,$two
+ vxor $out3,$out3,$rndkey0
+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
+ vadduwm $out6,$out4,$two
+ vxor $out4,$out4,$rndkey0
+ vadduwm $out7,$out5,$two
+ vxor $out5,$out5,$rndkey0
+ vadduwm $ivec,$out6,$two # next counter value
+ vxor $out6,$out6,$rndkey0
+ vxor $out7,$out7,$rndkey0
+
+ mtctr $rounds
+ b Loop_ctr32_enc8x
+.align 5
+Loop_ctr32_enc8x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ vcipher $out6,$out6,v24
+ vcipher $out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ vcipher $out6,$out6,v25
+ vcipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_ctr32_enc8x
+
+ subic r11,$len,256 # $len-256, borrow $key_
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ vcipher $out6,$out6,v24
+ vcipher $out7,$out7,v24
+
+ subfe r0,r0,r0 # borrow?-1:0
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ vcipher $out6,$out6,v25
+ vcipher $out7,$out7,v25
+
+ and r0,r0,r11
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v26
+ vcipher $out1,$out1,v26
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ vcipher $out4,$out4,v26
+ vcipher $out5,$out5,v26
+ vcipher $out6,$out6,v26
+ vcipher $out7,$out7,v26
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ subic $len,$len,129 # $len-=129
+ vcipher $out0,$out0,v27
+ addi $len,$len,1 # $len-=128 really
+ vcipher $out1,$out1,v27
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vcipher $out4,$out4,v27
+ vcipher $out5,$out5,v27
+ vcipher $out6,$out6,v27
+ vcipher $out7,$out7,v27
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+
+ vcipher $out0,$out0,v28
+ lvx_u $in0,$x00,$inp # load input
+ vcipher $out1,$out1,v28
+ lvx_u $in1,$x10,$inp
+ vcipher $out2,$out2,v28
+ lvx_u $in2,$x20,$inp
+ vcipher $out3,$out3,v28
+ lvx_u $in3,$x30,$inp
+ vcipher $out4,$out4,v28
+ lvx_u $in4,$x40,$inp
+ vcipher $out5,$out5,v28
+ lvx_u $in5,$x50,$inp
+ vcipher $out6,$out6,v28
+ lvx_u $in6,$x60,$inp
+ vcipher $out7,$out7,v28
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+
+ vcipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$inpperm
+ vcipher $out1,$out1,v29
+ le?vperm $in1,$in1,$in1,$inpperm
+ vcipher $out2,$out2,v29
+ le?vperm $in2,$in2,$in2,$inpperm
+ vcipher $out3,$out3,v29
+ le?vperm $in3,$in3,$in3,$inpperm
+ vcipher $out4,$out4,v29
+ le?vperm $in4,$in4,$in4,$inpperm
+ vcipher $out5,$out5,v29
+ le?vperm $in5,$in5,$in5,$inpperm
+ vcipher $out6,$out6,v29
+ le?vperm $in6,$in6,$in6,$inpperm
+ vcipher $out7,$out7,v29
+ le?vperm $in7,$in7,$in7,$inpperm
+
+	add	$inp,$inp,r0		# $inp is adjusted in such a
+					# way that at exit from the
+ # loop inX-in7 are loaded
+ # with last "words"
+ subfe. r0,r0,r0 # borrow?-1:0
+ vcipher $out0,$out0,v30
+ vxor $in0,$in0,v31 # xor with last round key
+ vcipher $out1,$out1,v30
+ vxor $in1,$in1,v31
+ vcipher $out2,$out2,v30
+ vxor $in2,$in2,v31
+ vcipher $out3,$out3,v30
+ vxor $in3,$in3,v31
+ vcipher $out4,$out4,v30
+ vxor $in4,$in4,v31
+ vcipher $out5,$out5,v30
+ vxor $in5,$in5,v31
+ vcipher $out6,$out6,v30
+ vxor $in6,$in6,v31
+ vcipher $out7,$out7,v30
+ vxor $in7,$in7,v31
+
+ bne Lctr32_enc8x_break # did $len-129 borrow?
+
+ vcipherlast $in0,$out0,$in0
+ vcipherlast $in1,$out1,$in1
+ vadduwm $out1,$ivec,$one # counter values ...
+ vcipherlast $in2,$out2,$in2
+ vadduwm $out2,$ivec,$two
+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
+ vcipherlast $in3,$out3,$in3
+ vadduwm $out3,$out1,$two
+ vxor $out1,$out1,$rndkey0
+ vcipherlast $in4,$out4,$in4
+ vadduwm $out4,$out2,$two
+ vxor $out2,$out2,$rndkey0
+ vcipherlast $in5,$out5,$in5
+ vadduwm $out5,$out3,$two
+ vxor $out3,$out3,$rndkey0
+ vcipherlast $in6,$out6,$in6
+ vadduwm $out6,$out4,$two
+ vxor $out4,$out4,$rndkey0
+ vcipherlast $in7,$out7,$in7
+ vadduwm $out7,$out5,$two
+ vxor $out5,$out5,$rndkey0
+ le?vperm $in0,$in0,$in0,$inpperm
+ vadduwm $ivec,$out6,$two # next counter value
+ vxor $out6,$out6,$rndkey0
+ le?vperm $in1,$in1,$in1,$inpperm
+ vxor $out7,$out7,$rndkey0
+ mtctr $rounds
+
+ vcipher $out0,$out0,v24
+ stvx_u $in0,$x00,$out
+ le?vperm $in2,$in2,$in2,$inpperm
+ vcipher $out1,$out1,v24
+ stvx_u $in1,$x10,$out
+ le?vperm $in3,$in3,$in3,$inpperm
+ vcipher $out2,$out2,v24
+ stvx_u $in2,$x20,$out
+ le?vperm $in4,$in4,$in4,$inpperm
+ vcipher $out3,$out3,v24
+ stvx_u $in3,$x30,$out
+ le?vperm $in5,$in5,$in5,$inpperm
+ vcipher $out4,$out4,v24
+ stvx_u $in4,$x40,$out
+ le?vperm $in6,$in6,$in6,$inpperm
+ vcipher $out5,$out5,v24
+ stvx_u $in5,$x50,$out
+ le?vperm $in7,$in7,$in7,$inpperm
+ vcipher $out6,$out6,v24
+ stvx_u $in6,$x60,$out
+ vcipher $out7,$out7,v24
+ stvx_u $in7,$x70,$out
+ addi $out,$out,0x80
+
+ b Loop_ctr32_enc8x_middle
+
+.align 5
+Lctr32_enc8x_break:
+ cmpwi $len,-0x60
+ blt Lctr32_enc8x_one
+ nop
+ beq Lctr32_enc8x_two
+ cmpwi $len,-0x40
+ blt Lctr32_enc8x_three
+ nop
+ beq Lctr32_enc8x_four
+ cmpwi $len,-0x20
+ blt Lctr32_enc8x_five
+ nop
+ beq Lctr32_enc8x_six
+ cmpwi $len,0x00
+ blt Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+ vcipherlast $out0,$out0,$in0
+ vcipherlast $out1,$out1,$in1
+ vcipherlast $out2,$out2,$in2
+ vcipherlast $out3,$out3,$in3
+ vcipherlast $out4,$out4,$in4
+ vcipherlast $out5,$out5,$in5
+ vcipherlast $out6,$out6,$in6
+ vcipherlast $out7,$out7,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x60,$out
+ stvx_u $out7,$x70,$out
+ addi $out,$out,0x80
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_seven:
+ vcipherlast $out0,$out0,$in1
+ vcipherlast $out1,$out1,$in2
+ vcipherlast $out2,$out2,$in3
+ vcipherlast $out3,$out3,$in4
+ vcipherlast $out4,$out4,$in5
+ vcipherlast $out5,$out5,$in6
+ vcipherlast $out6,$out6,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ stvx_u $out6,$x60,$out
+ addi $out,$out,0x70
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_six:
+ vcipherlast $out0,$out0,$in2
+ vcipherlast $out1,$out1,$in3
+ vcipherlast $out2,$out2,$in4
+ vcipherlast $out3,$out3,$in5
+ vcipherlast $out4,$out4,$in6
+ vcipherlast $out5,$out5,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ stvx_u $out5,$x50,$out
+ addi $out,$out,0x60
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_five:
+ vcipherlast $out0,$out0,$in3
+ vcipherlast $out1,$out1,$in4
+ vcipherlast $out2,$out2,$in5
+ vcipherlast $out3,$out3,$in6
+ vcipherlast $out4,$out4,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_four:
+ vcipherlast $out0,$out0,$in4
+ vcipherlast $out1,$out1,$in5
+ vcipherlast $out2,$out2,$in6
+ vcipherlast $out3,$out3,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_three:
+ vcipherlast $out0,$out0,$in5
+ vcipherlast $out1,$out1,$in6
+ vcipherlast $out2,$out2,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ b Lcbc_dec8x_done
+
+.align 5
+Lctr32_enc8x_two:
+ vcipherlast $out0,$out0,$in6
+ vcipherlast $out1,$out1,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ b Lcbc_dec8x_done
+
+.align 5
+Lctr32_enc8x_one:
+ vcipherlast $out0,$out0,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ stvx_u $out0,0,$out
+ addi $out,$out,0x10
+
+Lctr32_enc8x_done:
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $inpperm,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}} }}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ # constants table endian-specific conversion
+ if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+ my $conv=$3;
+ my @bytes=();
+
+ # convert to endian-agnostic format
+ if ($1 eq "long") {
+ foreach (split(/,\s*/,$2)) {
+ my $l = /^0/?oct:int;
+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+ }
+ } else {
+ @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+ }
+
+ # little-endian conversion
+ if ($flavour =~ /le$/o) {
+ SWITCH: for($conv) {
+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ }
+ }
+
+ #emit
+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+ next;
+ }
+ $consts=0 if (m/Lconsts:/o); # end of table
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
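+	# e.g. "?lvsr" assembles as lvsl on little-endian targets and,
+	# with the '?' simply stripped, as lvsr on big-endian ones;
+	# "?vperm" swaps its two source registers and "?vsldoi" swaps
+	# sources and complements the shift count (16-n), so a single
+	# source file serves both endiannesses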
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o or
+ s/\?lvsr/lvsl/o or
+ s/\?lvsl/lvsr/o or
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+ } else { # big-endian
+ s/le\?/#le#/o or
+ s/be\?//o or
+ s/\?([a-z]+)/$1/o;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/aes/asm/aest4-sparcv9.pl b/crypto/aes/asm/aest4-sparcv9.pl
new file mode 100755
index 000000000000..536f23b47c70
--- /dev/null
+++ b/crypto/aes/asm/aest4-sparcv9.pl
@@ -0,0 +1,919 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. October 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# AES for SPARC T4.
+#
+# AES round instructions complete in 3 cycles and can be issued every
+# cycle. This means that round calculations should take 4*rounds
+# cycles, because any given round instruction depends on the result
+# of *both* previous instructions:
+# previous instructions:
+#
+# |0 |1 |2 |3 |4
+# |01|01|01|
+# |23|23|23|
+# |01|01|...
+# |23|...
+#
+# Provided that fxor [with IV] takes 3 cycles to complete, the
+# critical path length for CBC encrypt would be 3+4*rounds cycles; in
+# other words, it should process one byte in at least
+# (3+4*rounds)/16 cycles. This
+# estimate doesn't account for "collateral" instructions, such as
+# fetching input from memory, xor-ing it with zero-round key and
+# storing the result. Yet, *measured* performance [for data aligned
+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
+#
+# 128-bit key 192- 256-
+# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
+# (*) numbers after slash are for
+# misaligned data.
+#
+# Out-of-order execution logic managed to fully overlap "collateral"
+# instructions with those on critical path. Amazing!
+#
+# As with Intel AES-NI, the question is whether it's possible to
+# improve performance of parallelizable modes by interleaving round
+# instructions. Given the round instruction latency and throughput,
+# the optimal interleave factor is 2. But can we expect a 2x
+# performance improvement? Well, as round instructions can be issued
+# one per cycle, they don't saturate the 2-way issue pipeline and
+# therefore there is room for "collateral" calculations... Yet, a 2x
+# speed-up over CBC encrypt remains unattainable:
+#
+# 128-bit key 192- 256-
+# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
+# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
+# (*) numbers after slash are for
+# misaligned data.
+#
+# Estimates based on the number of instructions, under the assumption
+# that round instructions are not pairable with any other instruction,
+# suggest that the latter is the actual case and the pipeline runs
+# underutilized. It should be noted that the T4 out-of-order execution
+# logic is so capable that the performance gain from 2x interleave is
+# not even impressive, ~7-13% over non-interleaved code, largest
+# for 256-bit keys.
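Dividing the aligned-data columns of the two tables makes the shortfall concrete:

    my @cbc_enc = (2.70, 3.20, 3.70);      # serial CBC encrypt, from above
    my @cbc_dec = (1.64, 1.89, 2.23);      # 2x-interleaved CBC decrypt
    printf "%d-bit key: %.2fx speed-up\n", 128 + 64*$_,
           $cbc_enc[$_]/$cbc_dec[$_] for 0..2;

giving roughly 1.65x, 1.69x and 1.66x rather than the naive 2x.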
+
+# To anchor to something else, a software implementation processes
+# one byte in 29 cycles with a 128-bit key on the same processor. Intel
+# Sandy Bridge encrypts a byte in 5.07 cycles in CBC mode and decrypts
+# in 0.93, naturally with AES-NI.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$::evp=1;	# if $evp is set to 0, the script generates a module with
+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
+# points. These, however, are not fully compatible with openssl/aes.h,
+# because they expect AES_KEY to be aligned at a 64-bit boundary. When
+# used through EVP, alignment is arranged at the EVP layer. The second
+# thing arranged by EVP is at least 32-bit alignment of the IV.
+
+######################################################################
+# single-round subroutines
+#
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
+
+$code.=<<___ if ($::abibits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+
+___
+$code.=<<___;
+.text
+
+.globl aes_t4_encrypt
+.align 32
+aes_t4_encrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Lenc:
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_eround01 %f16, %f4, %f2, %f0
+ aes_eround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Lenc
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ aes_eround01_l %f16, %f4, %f2, %f0
+ aes_eround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_encrypt,#function
+.size aes_t4_encrypt,.-aes_t4_encrypt
+
+.globl aes_t4_decrypt
+.align 32
+aes_t4_decrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Ldec:
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_dround01 %f16, %f4, %f2, %f0
+ aes_dround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Ldec
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ aes_dround01_l %f16, %f4, %f2, %f0
+ aes_dround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_decrypt,#function
+.size aes_t4_decrypt,.-aes_t4_decrypt
+___
+}
+
+######################################################################
+# key setup subroutines
+#
+{
+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
+$code.=<<___;
+.globl aes_t4_set_encrypt_key
+.align 32
+aes_t4_set_encrypt_key:
+.Lset_encrypt_key:
+ and $inp, 7, $tmp
+ alignaddr $inp, %g0, $inp
+ cmp $bits, 192
+ ldd [$inp + 0], %f0
+ bl,pt %icc,.L128
+ ldd [$inp + 8], %f2
+
+ be,pt %icc,.L192
+ ldd [$inp + 16], %f4
+ brz,pt $tmp, .L256aligned
+ ldd [$inp + 24], %f6
+
+ ldd [$inp + 32], %f8
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f8, %f6
+.L256aligned:
+___
+for ($i=0; $i<6; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ aes_kexpand0 %f4, %f2, %f4
+ std %f6, [$out + `32*$i+24`]
+ aes_kexpand2 %f6, %f4, %f6
+___
+}
+$code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ std %f6, [$out + `32*$i+24`]
+ std %f0, [$out + `32*$i+32`]
+ std %f2, [$out + `32*$i+40`]
+
+ mov 14, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L192:
+ brz,pt $tmp, .L192aligned
+ nop
+
+ ldd [$inp + 24], %f6
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+.L192aligned:
+___
+for ($i=0; $i<7; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ aes_kexpand2 %f4, %f2, %f4
+___
+}
+$code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ std %f0, [$out + `24*$i+24`]
+ std %f2, [$out + `24*$i+32`]
+
+ mov 12, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L128:
+ brz,pt $tmp, .L128aligned
+ nop
+
+ ldd [$inp + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+.L128aligned:
+___
+for ($i=0; $i<10; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ aes_kexpand1 %f0, %f2, $i, %f0
+ std %f2, [$out + `16*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+___
+}
+$code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ std %f2, [$out + `16*$i+8`]
+
+ mov 10, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_encrypt_key,#function
+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
+
+.globl aes_t4_set_decrypt_key
+.align 32
+aes_t4_set_decrypt_key:
+ mov %o7, %o5
+ call .Lset_encrypt_key
+ nop
+
+ mov %o5, %o7
+ sll $tmp, 4, $inp ! $tmp is number of rounds
+ add $tmp, 2, $tmp
+ add $out, $inp, $inp ! $inp=$out+16*rounds
+ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
+
+.Lkey_flip:
+ ldd [$out + 0], %f0
+ ldd [$out + 8], %f2
+ ldd [$out + 16], %f4
+ ldd [$out + 24], %f6
+ ldd [$inp + 0], %f8
+ ldd [$inp + 8], %f10
+ ldd [$inp - 16], %f12
+ ldd [$inp - 8], %f14
+ sub $tmp, 1, $tmp
+ std %f0, [$inp + 0]
+ std %f2, [$inp + 8]
+ std %f4, [$inp - 16]
+ std %f6, [$inp - 8]
+ std %f8, [$out + 0]
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ brnz $tmp, .Lkey_flip
+ sub $inp, 32, $inp
+
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_decrypt_key,#function
+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
+___
+}
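Note that .Lkey_flip above only reorders the round keys; unlike the ARMv8 path later in this patch (which runs aesimc over the inner keys), no per-key transform is applied, evidently because the T4 dround instructions can consume the encryption-order subkeys directly. A scalar model of the net effect (a sketch; the assembly moves two 16-byte keys from each end per iteration, collapsed here to one swap per step):

    # dec_key[i] ends up as enc_key[rounds-i]
    sub flip_schedule {
        my ($rounds, @ks) = @_;     # @ks: rounds+1 round keys, front to back
        my ($lo, $hi) = (0, $rounds);
        while ($lo < $hi) {
            @ks[$lo, $hi] = @ks[$hi, $lo];
            $lo++; $hi--;
        }
        return @ks;
    }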
+
+{{{
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
+
+$code.=<<___;
+.align 32
+_aes128_encrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f4
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f54, %f4, %f2, %f2
+.type _aes128_encrypt_1x,#function
+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
+
+.align 32
+_aes128_encrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f8
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01 %f48, %f4, %f6, %f10
+ aes_eround23 %f50, %f4, %f6, %f6
+ aes_eround01_l %f52, %f8, %f2, %f0
+ aes_eround23_l %f54, %f8, %f2, %f2
+ aes_eround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f54, %f10, %f6, %f6
+.type _aes128_encrypt_2x,#function
+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
+
+.align 32
+_aes128_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<22;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes128_loadkey,#function
+.size _aes128_loadkey,.-_aes128_loadkey
+_aes128_load_enckey=_aes128_loadkey
+_aes128_load_deckey=_aes128_loadkey
+
+___
+
+&alg_cbc_encrypt_implement("aes",128);
+if ($::evp) {
+ &alg_ctr32_implement("aes",128);
+ &alg_xts_implement("aes",128,"en");
+ &alg_xts_implement("aes",128,"de");
+}
+&alg_cbc_decrypt_implement("aes",128);
+
+$code.=<<___;
+.align 32
+_aes128_decrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f4
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f54, %f4, %f2, %f2
+.type _aes128_decrypt_1x,#function
+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
+
+.align 32
+_aes128_decrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f8
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01 %f48, %f4, %f6, %f10
+ aes_dround23 %f50, %f4, %f6, %f6
+ aes_dround01_l %f52, %f8, %f2, %f0
+ aes_dround23_l %f54, %f8, %f2, %f2
+ aes_dround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f54, %f10, %f6, %f6
+.type _aes128_decrypt_2x,#function
+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
+___
+
+$code.=<<___;
+.align 32
+_aes192_encrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f4
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f62, %f4, %f2, %f2
+.type _aes192_encrypt_1x,#function
+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
+
+.align 32
+_aes192_encrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f8
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01 %f56, %f4, %f6, %f10
+ aes_eround23 %f58, %f4, %f6, %f6
+ aes_eround01_l %f60, %f8, %f2, %f0
+ aes_eround23_l %f62, %f8, %f2, %f2
+ aes_eround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f62, %f10, %f6, %f6
+.type _aes192_encrypt_2x,#function
+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
+
+.align 32
+_aes256_encrypt_1x:
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f4, %f2, %f0
+ aes_eround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f4, %f2, %f0
+ aes_eround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_1x,#function
+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
+
+.align 32
+_aes256_encrypt_2x:
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f8, %f2, %f0
+ aes_eround23 %f22, %f8, %f2, %f2
+ aes_eround01 %f20, %f10, %f6, %f4
+ aes_eround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f8, %f2, %f0
+ aes_eround23_l %f22, %f8, %f2, %f2
+ aes_eround01_l %f20, %f10, %f6, %f4
+ aes_eround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_2x,#function
+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
+
+.align 32
+_aes192_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<26;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes192_loadkey,#function
+.size _aes192_loadkey,.-_aes192_loadkey
+_aes256_loadkey=_aes192_loadkey
+_aes192_load_enckey=_aes192_loadkey
+_aes192_load_deckey=_aes192_loadkey
+_aes256_load_enckey=_aes192_loadkey
+_aes256_load_deckey=_aes192_loadkey
+___
+
+&alg_cbc_encrypt_implement("aes",256);
+&alg_cbc_encrypt_implement("aes",192);
+if ($::evp) {
+ &alg_ctr32_implement("aes",256);
+ &alg_xts_implement("aes",256,"en");
+ &alg_xts_implement("aes",256,"de");
+ &alg_ctr32_implement("aes",192);
+}
+&alg_cbc_decrypt_implement("aes",192);
+&alg_cbc_decrypt_implement("aes",256);
+
+$code.=<<___;
+.align 32
+_aes256_decrypt_1x:
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f4, %f2, %f0
+ aes_dround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f4, %f2, %f0
+ aes_dround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_1x,#function
+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
+
+.align 32
+_aes256_decrypt_2x:
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f8, %f2, %f0
+ aes_dround23 %f22, %f8, %f2, %f2
+ aes_dround01 %f20, %f10, %f6, %f4
+ aes_dround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f8, %f2, %f0
+ aes_dround23_l %f22, %f8, %f2, %f2
+ aes_dround01_l %f20, %f10, %f6, %f4
+ aes_dround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_2x,#function
+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
+
+.align 32
+_aes192_decrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f4
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f62, %f4, %f2, %f2
+.type _aes192_decrypt_1x,#function
+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
+
+.align 32
+_aes192_decrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f8
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01 %f56, %f4, %f6, %f10
+ aes_dround23 %f58, %f4, %f6, %f6
+ aes_dround01_l %f60, %f8, %f2, %f0
+ aes_dround23_l %f62, %f8, %f2, %f2
+ aes_dround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f62, %f10, %f6, %f6
+.type _aes192_decrypt_2x,#function
+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
+___
+}}}
+
+if (!$::evp) {
+$code.=<<___;
+.global AES_encrypt
+AES_encrypt=aes_t4_encrypt
+.global AES_decrypt
+AES_decrypt=aes_t4_decrypt
+.global AES_set_encrypt_key
+.align 32
+AES_set_encrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_encrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_encrypt_key,#function
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global AES_set_decrypt_key
+.align 32
+AES_set_decrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_decrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_decrypt_key,#function
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+___
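The two wrappers above perform the same coarse argument screen before tail-calling the T4 routines. Modelled in Perl (a sketch of the branch sequence, not an exhaustive validation):

    sub set_key_screen {
        my ($userkey_addr, $bits, $key_addr) = @_;
        return -1 if !$userkey_addr || !$key_addr;  # null pointers
        return -1 if $key_addr & 7;                 # AES_KEY not 64-bit aligned
        return -2 if $bits & ~0x1c0;                # bits outside the 64/128/256
                                                    # mask (192 = 128+64)
        return -2 if $bits < 128;                   # rejects e.g. 64
        return 0;                                   # proceed to aes_t4_set_*_key
    }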
+
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
+
+$code.=<<___;
+.globl AES_cbc_encrypt
+.align 32
+AES_cbc_encrypt:
+ ld [$key + 240], %g1
+ nop
+ brz $enc, .Lcbc_decrypt
+ cmp %g1, 12
+
+ bl,pt %icc, aes128_t4_cbc_encrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_encrypt
+ nop
+ ba aes256_t4_cbc_encrypt
+ nop
+
+.Lcbc_decrypt:
+ bl,pt %icc, aes128_t4_cbc_decrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_decrypt
+ nop
+ ba aes256_t4_cbc_decrypt
+ nop
+.type AES_cbc_encrypt,#function
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
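The AES_cbc_encrypt entry dispatches on the round count stored at key+240 (cmp with 12: bl selects the 128-bit path, be the 192-bit one, otherwise 256). As a table, in sketch form:

    sub cbc_entry {
        my ($rounds, $enc) = @_;    # 10/12/14 rounds, encrypt flag
        my $bits = $rounds < 12 ? 128 : $rounds == 12 ? 192 : 256;
        return sprintf "aes%d_t4_cbc_%s", $bits, $enc ? "encrypt" : "decrypt";
    }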
+$code.=<<___;
+.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler();
+
+close STDOUT;
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
new file mode 100755
index 000000000000..95ebae3beb9b
--- /dev/null
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -0,0 +1,989 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for ARMv8 AES instructions. The
+# module is endian-agnostic in the sense that it supports both big- and
+# little-endian cases. It likewise supports both 32- and 64-bit modes
+# of operation. The latter is achieved by limiting the number of utilized
+# registers to 16, which implies additional NEON load and integer
+# instructions. This has no effect on the mighty Apple A7, where results
+# are literally equal to the theoretical estimates based on AES
+# instruction latencies and issue rates. On Cortex-A53, an in-order
+# execution core, this costs up to 10-15%, which is partially
+# compensated by a dedicated code path for the 128-bit
+# CBC encrypt case. On Cortex-A57 parallelizable-mode performance
+# seems to be limited by the sheer number of NEON instructions...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+# CBC enc CBC dec CTR
+# Apple A7 2.39 1.20 1.20
+# Cortex-A53 1.32 1.29 1.46
+# Cortex-A57(*) 1.95 0.85 0.93
+# Denver 1.96 0.86 0.80
+#
+# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
+#	and remain the same even for the updated module.
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=7
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
+		#^^^^^^ this is done to simplify adoption by not depending
+		#	on the latest binutils.
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
+# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is to
+# maintain both 32- and 64-bit code within a single module and
+# transliterate the common code to either flavour with regex voodoo.
+#
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+ $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align 5
+rcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___;
+ mov $ptr,#-1
+ cmp $inp,#0
+ b.eq .Lenc_key_abort
+ cmp $out,#0
+ b.eq .Lenc_key_abort
+ mov $ptr,#-2
+ cmp $bits,#128
+ b.lt .Lenc_key_abort
+ cmp $bits,#256
+ b.gt .Lenc_key_abort
+ tst $bits,#0x3f
+ b.ne .Lenc_key_abort
+
+ adr $ptr,rcon
+ cmp $bits,#192
+
+ veor $zero,$zero,$zero
+ vld1.8 {$in0},[$inp],#16
+ mov $bits,#8 // reuse $bits
+ vld1.32 {$rcon,$mask},[$ptr],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ b.ne .Loop128
+
+ vld1.32 {$rcon},[$ptr]
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out]
+ add $out,$out,#0x50
+
+ mov $rounds,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {$in1},[$inp],#8
+ vmov.i8 $key,#8 // borrow $key
+ vst1.32 {$in0},[$out],#16
+ vsub.i8 $mask,$mask,$key // adjust the mask
+
+.Loop192:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#8
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+
+ vdup.32 $tmp,${in0}[3]
+ veor $tmp,$tmp,$in1
+ veor $key,$key,$rcon
+ vext.8 $in1,$zero,$in1,#12
+ vshl.u8 $rcon,$rcon,#1
+ veor $in1,$in1,$tmp
+ veor $in0,$in0,$key
+ veor $in1,$in1,$key
+ vst1.32 {$in0},[$out],#16
+ b.ne .Loop192
+
+ mov $rounds,#12
+ add $out,$out,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {$in1},[$inp]
+ mov $bits,#7
+ mov $rounds,#14
+ vst1.32 {$in0},[$out],#16
+
+.Loop256:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out],#16
+ b.eq .Ldone
+
+ vdup.32 $key,${in0}[3] // just splat
+ vext.8 $tmp,$zero,$in1,#12
+ aese $key,$zero
+
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+
+ veor $in1,$in1,$key
+ b .Loop256
+
+.Ldone:
+ str $rounds,[$out]
+ mov $ptr,#0
+
+.Lenc_key_abort:
+ mov x0,$ptr // return value
+ `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ stmdb sp!,{r4,lr}
+___
+$code.=<<___;
+ bl .Lenc_key
+
+ cmp x0,#0
+ b.ne .Ldec_key_abort
+
+ sub $out,$out,#240 // restore original $out
+ mov x4,#-16
+ add $inp,$out,x12,lsl#4 // end of key schedule
+
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+
+.Loop_imc:
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+ cmp $inp,$out
+ b.hi .Loop_imc
+
+ vld1.32 {v0.16b},[$out]
+ aesimc v0.16b,v0.16b
+ vst1.32 {v0.16b},[$inp]
+
+ eor x0,x0,x0 // return value
+.Ldec_key_abort:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ ldmia sp!,{r4,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldp x29,x30,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
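As a cross-check for the key expansion above: .Loop128 computes four schedule words per iteration, with vtbl splatting the rotated last word across the vector so that aese against an all-zero round key effectively reduces to SubWord (ShiftRows only shuffles identical words), and vshl.u8 doubling rcon each pass; the decrypt-key path then reverses the schedule and runs aesimc over the inner keys. Below is a plain-Perl reference model of the FIPS-197 AES-128 schedule, computing one word at a time; it is a verification sketch only, not part of the module, and derives the S-box computationally rather than from a table:

    sub xtime { my $a = shift; (($a << 1) ^ (($a >> 7) * 0x11b)) & 0xff }
    sub gmul  { my ($a, $b) = @_; my $p = 0;    # GF(2^8) multiplication
                for (0..7) { $p ^= $a if $b & 1; $a = xtime($a); $b >>= 1; } $p }
    sub rotl8 { my ($x, $n) = @_; (($x << $n) | ($x >> (8 - $n))) & 0xff }
    sub sbox  {                     # inverse in GF(2^8), then the affine map
        my $x = shift; my $inv = 0;
        if ($x) { for my $y (1..255) { $inv = $y, last if gmul($x, $y) == 1 } }
        $inv ^ rotl8($inv,1) ^ rotl8($inv,2) ^ rotl8($inv,3)
             ^ rotl8($inv,4) ^ 0x63;
    }
    sub expand128 {                 # 16 key bytes in, 44 schedule words out
        my @w = map { [ @_[4*$_ .. 4*$_+3] ] } 0..3;
        my $rcon = 0x01;            # xtime() yields 01,02,...,80,1b, cf. rcon
        for my $i (4 .. 43) {
            my @t = @{$w[$i-1]};
            if ($i % 4 == 0) {      # RotWord + SubWord + rcon
                @t = map { sbox($_) } @t[1,2,3,0];
                $t[0] ^= $rcon;
                $rcon = xtime($rcon);
            }
            $w[$i] = [ map { $t[$_] ^ $w[$i-4][$_] } 0..3 ];
        }
        @w;
    }

For the FIPS-197 example key 2b7e1516 28aed2a6 abf71588 09cf4f3c this yields w[4] = a0 fa fe 17, matching the published schedule.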
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ ldr $rounds,[$key,#240]
+ vld1.32 {$rndkey0},[$key],#16
+ vld1.8 {$inout},[$inp]
+ sub $rounds,$rounds,#2
+ vld1.32 {$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+ aes$e $inout,$rndkey0
+ aes$mc $inout,$inout
+ vld1.32 {$rndkey0},[$key],#16
+ subs $rounds,$rounds,#2
+ aes$e $inout,$rndkey1
+ aes$mc $inout,$inout
+ vld1.32 {$rndkey1},[$key],#16
+ b.gt .Loop_${dir}c
+
+ aes$e $inout,$rndkey0
+ aes$mc $inout,$inout
+ vld1.32 {$rndkey0},[$key]
+ aes$e $inout,$rndkey1
+ veor $inout,$inout,$rndkey0
+
+ vst1.8 {$inout},[$out]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r8,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load remaining args
+___
+$code.=<<___;
+ subs $len,$len,#16
+ mov $step,#16
+ b.lo .Lcbc_abort
+ cclr $step,eq
+
+ cmp $enc,#0 // en- or decrypting?
+ ldr $rounds,[$key,#240]
+ and $len,$len,#-16
+ vld1.8 {$ivec},[$ivp]
+ vld1.8 {$dat},[$inp],$step
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ b.eq .Lcbc_dec
+
+ cmp $rounds,#2
+ veor $dat,$dat,$ivec
+ veor $rndzero_n_last,q8,$rndlast
+ b.eq .Lcbc_enc128
+
+ vld1.32 {$in0-$in1},[$key_]
+ add $key_,$key,#16
+ add $key4,$key,#16*4
+ add $key5,$key,#16*5
+ aese $dat,q8
+ aesmc $dat,$dat
+ add $key6,$key,#16*6
+ add $key7,$key,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+ aese $dat,q8
+ aesmc $dat,$dat
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc:
+ aese $dat,q9
+ aesmc $dat,$dat
+ aese $dat,$in0
+ aesmc $dat,$dat
+ vld1.32 {q8},[$key4]
+ cmp $rounds,#4
+ aese $dat,$in1
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key5]
+ b.eq .Lcbc_enc192
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ vld1.32 {q8},[$key6]
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key7]
+ nop
+
+.Lcbc_enc192:
+ aese $dat,q8
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,q9
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q13
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
+ aese $dat,q14
+ aesmc $dat,$dat
+ aese $dat,q15
+ veor $ivec,$dat,$rndlast
+ b.hs .Loop_cbc_enc
+
+ vst1.8 {$ivec},[$out],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {$in0-$in1},[$key_]
+ aese $dat,q8
+ aesmc $dat,$dat
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese $dat,q8
+ aesmc $dat,$dat
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc128:
+ aese $dat,q9
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,$in0
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,$in1
+ aesmc $dat,$dat
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q15
+ veor $ivec,$dat,$rndlast
+ b.hs .Loop_cbc_enc128
+
+ vst1.8 {$ivec},[$out],#16
+ b .Lcbc_done
+___
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+$code.=<<___;
+.align 5
+.Lcbc_dec:
+ vld1.8 {$dat2},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $cnt,$rounds,#2
+ vorr $in1,$dat,$dat
+ vorr $dat1,$dat,$dat
+ vorr $in2,$dat2,$dat2
+ b.lo .Lcbc_dec_tail
+
+ vorr $dat1,$dat2,$dat2
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in0,$dat,$dat
+ vorr $in1,$dat1,$dat1
+ vorr $in2,$dat2,$dat2
+
+.Loop3x_cbc_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop3x_cbc_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ veor $tmp0,$ivec,$rndlast
+ subs $len,$len,#0x30
+ veor $tmp1,$in0,$rndlast
+ mov.lo x6,$len // x6, $cnt, is zero at this point
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ veor $tmp2,$in1,$rndlast
+	add	$inp,$inp,x6		// $inp is adjusted in such a way that
+					// at exit from the loop $dat1-$dat2
+					// are loaded with the last "words"
+ vorr $ivec,$in2,$in2
+ mov $key_,$key
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat0,q15
+ aesd $dat1,q15
+ aesd $dat2,q15
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ add $cnt,$rounds,#2
+ veor $tmp0,$tmp0,$dat0
+ veor $tmp1,$tmp1,$dat1
+ veor $dat2,$dat2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat0,$in0,$in0
+ vst1.8 {$tmp1},[$out],#16
+ vorr $dat1,$in1,$in1
+ vst1.8 {$dat2},[$out],#16
+ vorr $dat2,$in2,$in2
+ b.hs .Loop3x_cbc_dec
+
+ cmn $len,#0x30
+ b.eq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lcbc_dec_tail
+
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ cmn $len,#0x20
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ veor $tmp1,$ivec,$rndlast
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ veor $tmp2,$in1,$rndlast
+ aesd $dat1,q15
+ aesd $dat2,q15
+ b.eq .Lcbc_dec_one
+ veor $tmp1,$tmp1,$dat1
+ veor $tmp2,$tmp2,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ veor $tmp1,$tmp1,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
+
+.Lcbc_done:
+ vst1.8 {$ivec},[$ivp]
+.Lcbc_abort:
+___
+}
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
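The structural reason the decrypt path above can keep three blocks in flight while encrypt processes one block at a time falls out of the CBC equations. The sketch below uses a deliberately fake one-byte "cipher" (xor with the key, an assumption purely for shape) to show that every D(C[i]) is independent of the others, whereas C[i] cannot be formed before C[i-1]:

    sub E { my ($k, $b) = @_; $b ^ $k }    # placeholder block cipher
    sub D { my ($k, $b) = @_; $b ^ $k }
    sub cbc_encrypt {                      # C[i] depends on C[i-1]: serial
        my ($k, $iv, @p) = @_;
        my (@c, $prev); $prev = $iv;
        for my $p (@p) { $prev = E($k, $p ^ $prev); push @c, $prev; }
        @c;
    }
    sub cbc_decrypt {                      # every D(C[i]) is independent
        my ($k, $iv, @c) = @_;
        my @d = map { D($k, $_) } @c;      # the 3x-interleavable part
        my (@p, $prev); $prev = $iv;
        for my $i (0..$#c) { push @p, $d[$i] ^ $prev; $prev = $c[$i]; }
        @p;
    }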
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_)=("w5","w6","x7");
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my $step="x12"; # aliases with $tctr2
+
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#4
+ mov $step,#16
+ cmp $len,#2
+ add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ cclr $step,lo
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
+ vorr $dat1,$dat0,$dat0
+ add $tctr1, $ctr, #1
+ vorr $dat2,$dat0,$dat0
+ add $ctr, $ctr, #2
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $tctr1
+ vmov.32 ${dat1}[3],$tctr1
+ b.ls .Lctr32_tail
+ rev $tctr2, $ctr
+ sub $len,$len,#3 // bias
+ vmov.32 ${dat2}[3],$tctr2
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop3x_ctr32
+
+ aese $dat0,q8
+ aesmc $tmp0,$dat0
+ aese $dat1,q8
+ aesmc $tmp1,$dat1
+ vld1.8 {$in0},[$inp],#16
+ vorr $dat0,$ivec,$ivec
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
+ vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q9
+ aesmc $tmp1,$tmp1
+ vld1.8 {$in2},[$inp],#16
+ mov $key_,$key
+ aese $dat2,q9
+ aesmc $tmp2,$dat2
+ vorr $dat2,$ivec,$ivec
+ add $tctr0,$ctr,#1
+ aese $tmp0,q12
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q12
+ aesmc $tmp1,$tmp1
+ veor $in0,$in0,$rndlast
+ add $tctr1,$ctr,#2
+ aese $tmp2,q12
+ aesmc $tmp2,$tmp2
+ veor $in1,$in1,$rndlast
+ add $ctr,$ctr,#3
+ aese $tmp0,q13
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q13
+ aesmc $tmp1,$tmp1
+ veor $in2,$in2,$rndlast
+ rev $tctr0,$tctr0
+ aese $tmp2,q13
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat0}[3], $tctr0
+ rev $tctr1,$tctr1
+ aese $tmp0,q14
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q14
+ aesmc $tmp1,$tmp1
+ vmov.32 ${dat1}[3], $tctr1
+ rev $tctr2,$ctr
+ aese $tmp2,q14
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat2}[3], $tctr2
+ subs $len,$len,#3
+ aese $tmp0,q15
+ aese $tmp1,q15
+ aese $tmp2,q15
+
+ veor $in0,$in0,$tmp0
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ vst1.8 {$in0},[$out],#16
+ veor $in1,$in1,$tmp1
+ mov $cnt,$rounds
+ vst1.8 {$in1},[$out],#16
+ veor $in2,$in2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$in2},[$out],#16
+ b.hs .Loop3x_ctr32
+
+ adds $len,$len,#3
+ b.eq .Lctr32_done
+ cmp $len,#1
+ mov $step,#16
+ cclr $step,eq
+
+.Lctr32_tail:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lctr32_tail
+
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ vld1.8 {$in0},[$inp],$step
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp]
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ veor $in0,$in0,$rndlast
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ veor $in1,$in1,$rndlast
+ aese $dat0,q15
+ aese $dat1,q15
+
+ cmp $len,#1
+ veor $in0,$in0,$dat0
+ veor $in1,$in1,$dat1
+ vst1.8 {$in0},[$out],#16
+ b.eq .Lctr32_done
+ vst1.8 {$in1},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
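The counter handling above keeps the low 32 bits of the 16-byte IV block as a big-endian value, which is why the code revs it on little-endian cores, increments, and revs again before the vmov.32 into lane 3. A scalar model of one keystream-block input (a sketch):

    sub ctr_block {
        my ($iv, $i) = @_;          # $iv: 16-byte string, $i: block index
        my $ctr = unpack("N", substr($iv, 12, 4));   # big-endian 32-bit load
        my $blk = $iv;
        substr($blk, 12, 4) = pack("N", ($ctr + $i) & 0xffffffff);  # ctr32 wrap
        return $blk;
    }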
+$code.=<<___;
+#endif
+___
+########################################
+if ($flavour =~ /64/) { ######## 64-bit code
+ my %opcode = (
+ "aesd" => 0x4e285800, "aese" => 0x4e284800,
+ "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5),
+ $mnemonic,$arg;
+ };
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+ #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vext\.8/ext/o or
+ s/vrev32\.8/rev32/o or
+ s/vtst\.8/cmtst/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+	# fix up remaining legacy suffixes
+ s/\.[ui]?8//o;
+ m/\],#8/o and s/\.16b/\.8b/go;
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ my %opcode = (
+ "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
+ "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<1) |(($2&8)<<2);
+	 # ARMv7 instructions are always encoded little-endian, hence
+	 # the byte order below. The correct solution is to use the .inst
+	 # directive, but older assemblers don't implement it :-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ };
+
+ sub unvtbl {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+ }
+
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+
+ sub unvmov32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ }
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+	# fix up remaining new-style suffixes
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
+ s/\],#[0-9]+/]!/o;
+
+ s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)mov\./$1mov/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT;
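To see what the 32-bit unaes() emitter above produces, its field packing can be evaluated by hand; e.g. for "aese q0,q1" (d=0, m=1, a hypothetical but representative input):

    my ($d, $m) = (0, 1);
    my $word = 0xf3b00300 | (($d&7)<<13) | (($d&8)<<19)
                          | (($m&7)<<1)  | (($m&8)<<2);
    printf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t\@ aese q0,q1\n",
           $word & 0xff, ($word>>8) & 0xff,
           ($word>>16) & 0xff, ($word>>24) & 0xff;
    # -> .byte 0x02,0x03,0xb0,0xf3   (little-endian instruction image)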
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl
new file mode 100755
index 000000000000..fcc81d1a4933
--- /dev/null
+++ b/crypto/aes/asm/bsaes-armv7.pl
@@ -0,0 +1,2469 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is a direct adaptation of the bsaes-x86_64 module
+# for ARM NEON, except that this module is endian-neutral [in the sense
+# that it can be compiled for either endianness] courtesy of vld1.8's
+# neutrality. The initial version doesn't implement an interface to
+# OpenSSL, only low-level primitives and unsupported entry points, just
+# enough to collect performance results, which for the Cortex-A8 core are:
+#
+# encrypt 19.5 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results, keep in mind that the NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results, keep in mind the key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+# <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[2], @b[2], @b[1]
+ veor @b[5], @b[5], @b[6]
+ veor @b[3], @b[3], @b[0]
+ veor @b[6], @b[6], @b[2]
+ veor @b[5], @b[5], @b[0]
+
+ veor @b[6], @b[6], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[4], @b[4], @b[5]
+
+ veor @b[2], @b[2], @b[7]
+ veor @b[3], @b[3], @b[1]
+ veor @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+ veor @b[4], @b[4], @b[6]
+ veor @b[2], @b[2], @b[0]
+ veor @b[6], @b[6], @b[1]
+
+ veor @b[1], @b[1], @b[5]
+ veor @b[5], @b[5], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[2], @b[2], @b[5]
+
+ veor @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ veor @b[1], @b[1], @b[7]
+ veor @b[4], @b[4], @b[7]
+
+ veor @b[7], @b[7], @b[5]
+ veor @b[1], @b[1], @b[3]
+ veor @b[2], @b[2], @b[5]
+ veor @b[3], @b[3], @b[7]
+
+ veor @b[6], @b[6], @b[1]
+ veor @b[2], @b[2], @b[0]
+ veor @b[5], @b[5], @b[3]
+ veor @b[4], @b[4], @b[6]
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ veor @b[1], @b[1], @b[5]
+ veor @b[2], @b[2], @b[7]
+
+ veor @b[3], @b[3], @b[1]
+ veor @b[4], @b[4], @b[5]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[5], @b[5], @b[0]
+ veor @b[3], @b[3], @b[7]
+ veor @b[6], @b[6], @b[2]
+ veor @b[2], @b[2], @b[1]
+ veor @b[6], @b[6], @b[3]
+
+ veor @b[3], @b[3], @b[0]
+ veor @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $t1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $t1, $t0
+ veor $x0, $x0, $t1
+___
+}
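For anyone rearranging the register allocation here, the seven-instruction dataflow of Mul_GF4 is easy to model on scalar bits instead of 128-bit NEON registers; a sketch of such a golden model, mirroring the veor/vand sequence one-for-one over all 16 input pairs:

    sub mul_gf4_model {
        my ($x0, $x1, $y0, $y1) = @_;
        my $t0 = ($y0 ^ $y1) & $x0;    # veor t0,y0,y1; vand t0,t0,x0
        $x0 ^= $x1;                    # veor x0,x0,x1
        my $t1 = $x1 & $y0;            # vand t1,x1,y0
        $x0 &= $y1;                    # vand x0,x0,y1
        $x1 = $t1 ^ $t0;               # veor x1,t1,t0
        $x0 ^= $t1;                    # veor x0,x0,t1
        return ($x0, $x1);
    }
    for my $v (0..15) {                # print the full truth table
        my ($x0, $x1, $y0, $y1) = map { ($v >> $_) & 1 } 0..3;
        my ($r0, $r1) = mul_gf4_model($x0, $x1, $y0, $y1);
        print "x1x0=$x1$x0 y1y0=$y1$y0 -> $r1$r0\n";
    }

Comparing two such tables is a quick way to check that a modified instruction ordering still computes the same GF(2^2) product.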
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $x1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $x1, $x0
+ veor $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ veor $t1, $y2, $y3
+ vand $t0, $t0, $x0
+ vand $t1, $t1, $x2
+ veor $x0, $x0, $x1
+ veor $x2, $x2, $x3
+ vand $x1, $x1, $y0
+ vand $x3, $x3, $y2
+ vand $x0, $x0, $y1
+ vand $x2, $x2, $y3
+ veor $x1, $x1, $x0
+ veor $x2, $x2, $x3
+ veor $x0, $x0, $t0
+ veor $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ veor @t[0], @x[0], @x[2]
+ veor @t[1], @x[1], @x[3]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @x[0], @x[0], @t[0]
+ veor @x[2], @x[2], @t[0]
+ veor @x[1], @x[1], @t[1]
+ veor @x[3], @x[3], @t[1]
+
+ veor @t[0], @x[4], @x[6]
+ veor @t[1], @x[5], @x[7]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @x[4], @x[4], @t[0]
+ veor @x[6], @x[6], @t[0]
+ veor @x[5], @x[5], @t[1]
+ veor @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ veor @t[3], @x[4], @x[6]
+ veor @t[2], @x[5], @x[7]
+ veor @t[1], @x[1], @x[3]
+ veor @s[1], @x[7], @x[6]
+ vmov @t[0], @t[2]
+ veor @s[0], @x[0], @x[2]
+
+ vorr @t[2], @t[2], @t[1]
+ veor @s[3], @t[3], @t[0]
+ vand @s[2], @t[3], @s[0]
+ vorr @t[3], @t[3], @s[0]
+ veor @s[0], @s[0], @t[1]
+ vand @t[0], @t[0], @t[1]
+ veor @t[1], @x[3], @x[2]
+ vand @s[3], @s[3], @s[0]
+ vand @s[1], @s[1], @t[1]
+ veor @t[1], @x[4], @x[5]
+ veor @s[0], @x[1], @x[0]
+ veor @t[3], @t[3], @s[1]
+ veor @t[2], @t[2], @s[1]
+ vand @s[1], @t[1], @s[0]
+ vorr @t[1], @t[1], @s[0]
+ veor @t[3], @t[3], @s[3]
+ veor @t[0], @t[0], @s[1]
+ veor @t[2], @t[2], @s[2]
+ veor @t[1], @t[1], @s[3]
+ veor @t[0], @t[0], @s[2]
+ vand @s[0], @x[7], @x[3]
+ veor @t[1], @t[1], @s[2]
+ vand @s[1], @x[6], @x[2]
+ vand @s[2], @x[5], @x[1]
+ vorr @s[3], @x[4], @x[0]
+ veor @t[3], @t[3], @s[0]
+ veor @t[1], @t[1], @s[2]
+ veor @t[0], @t[0], @s[3]
+ veor @t[2], @t[2], @s[1]
+
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ @ new smaller inversion
+
+ vand @s[2], @t[3], @t[1]
+ vmov @s[0], @t[0]
+
+ veor @s[1], @t[2], @s[2]
+ veor @s[3], @t[0], @s[2]
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
+
+ vbsl @s[1], @t[1], @t[0]
+ vbsl @s[3], @t[3], @t[2]
+ veor @t[3], @t[3], @t[2]
+
+ vbsl @s[0], @s[1], @s[2]
+ vbsl @t[0], @s[2], @s[1]
+
+ vand @s[2], @s[0], @s[3]
+ veor @t[1], @t[1], @t[0]
+
+ veor @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+ vldmia $key!, {@t[0]-@t[3]}
+ veor @t[0], @t[0], @x[0]
+ veor @t[1], @t[1], @x[1]
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+ vldmia $key!, {@t[0]}
+ veor @t[2], @t[2], @x[2]
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+ vldmia $key!, {@t[1]}
+ veor @t[3], @t[3], @x[3]
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+ vldmia $key!, {@t[2]}
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+ vldmia $key!, {@t[3]}
+ veor @t[0], @t[0], @x[4]
+ veor @t[1], @t[1], @x[5]
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+ veor @t[2], @t[2], @x[6]
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+ veor @t[3], @t[3], @x[7]
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
+ vext.8 @t[2], @x[2], @x[2], #12
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[2], @x[2], @t[2]
+ vext.8 @t[4], @x[4], @x[4], #12
+ veor @x[3], @x[3], @t[3]
+ vext.8 @t[5], @x[5], @x[5], #12
+ veor @x[4], @x[4], @t[4]
+ vext.8 @t[6], @x[6], @x[6], #12
+ veor @x[5], @x[5], @t[5]
+ vext.8 @t[7], @x[7], @x[7], #12
+ veor @x[6], @x[6], @t[6]
+
+ veor @t[1], @t[1], @x[0]
+ veor @x[7], @x[7], @t[7]
+	vext.8	@x[0], @x[0], @x[0], #8	@ (x0 ^ (x0 <<< 32)) <<< 64
+ veor @t[2], @t[2], @x[1]
+ veor @t[0], @t[0], @x[7]
+ veor @t[1], @t[1], @x[7]
+ vext.8 @x[1], @x[1], @x[1], #8
+ veor @t[5], @t[5], @x[4]
+ veor @x[0], @x[0], @t[0]
+ veor @t[6], @t[6], @x[5]
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[0], @x[4], @x[4], #8
+ veor @t[4], @t[4], @x[3]
+ vext.8 @t[1], @x[5], @x[5], #8
+ veor @t[7], @t[7], @x[6]
+ vext.8 @x[4], @x[3], @x[3], #8
+ veor @t[3], @t[3], @x[2]
+ vext.8 @x[5], @x[7], @x[7], #8
+ veor @t[4], @t[4], @x[7]
+ vext.8 @x[3], @x[6], @x[6], #8
+ veor @t[3], @t[3], @x[7]
+ vext.8 @x[6], @x[2], @x[2], #8
+ veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+ veor @x[2], @t[0], @t[4]
+ veor @x[4], @x[4], @t[3]
+ veor @x[5], @x[5], @t[7]
+ veor @x[3], @x[3], @t[6]
+ @ vmov @x[2], @t[0]
+ veor @x[6], @x[6], @t[2]
+ @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ @ multiplication by 0x0e
+ vext.8 @t[7], @x[7], @x[7], #12
+ vmov @t[2], @x[2]
+ veor @x[2], @x[2], @x[5] @ 2 5
+ veor @x[7], @x[7], @x[5] @ 7 5
+ vext.8 @t[0], @x[0], @x[0], #12
+ vmov @t[5], @x[5]
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
+ veor @x[0], @x[0], @x[1] @ 0 1
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[1], @x[1], @x[2] @ 1 25
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
+ veor @x[3], @x[3], @x[7] @ 3 75
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
+ vext.8 @t[6], @x[6], @x[6], #12
+ vmov @t[4], @x[4]
+ veor @x[6], @x[6], @x[4] @ 6 4
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
+ veor @x[3], @x[3], @x[7] @ 375 756=36
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
+ veor @x[3], @x[3], @t[2] @ 36 2
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ @ multiplication by 0x0b
+ veor @y[1], @y[1], @y[0]
+ veor @y[0], @y[0], @t[0]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[1], @y[1], @t[6]
+ veor @y[0], @y[0], @t[7]
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
+
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @y[0]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[2], @y[2], @t[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+ veor @y[2], @y[2], @t[2]
+ veor @y[3], @y[3], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[7]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[3], @y[3], @t[3]
+ veor @y[6], @y[6], @t[3]
+ veor @y[4], @y[4], @t[3]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[3], @t[3], @t[3], #12
+ veor @y[5], @y[5], @t[4]
+ veor @y[7], @y[7], @t[7]
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
+ veor @y[3], @y[3], @t[5]
+ veor @y[4], @y[4], @t[4]
+
+ veor @y[5], @y[5], @t[7]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[6], @y[6], @t[7]
+ veor @y[4], @y[4], @t[7]
+
+ veor @t[7], @t[7], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+
+ @ multiplication by 0x0d
+ veor @y[4], @y[4], @y[7]
+ veor @t[7], @t[7], @t[6] @ restore t[7]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @y[2], @y[2], @t[0]
+ veor @y[7], @y[7], @t[5]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @y[2], @y[2], @t[2]
+
+ veor @y[3], @y[3], @y[1]
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[0]
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @t[5]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[1], @y[1], @t[7]
+ veor @y[0], @y[0], @t[6]
+ veor @y[3], @y[3], @y[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+
+ veor @y[7], @y[7], @t[7]
+ veor @y[4], @y[4], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[6]
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[4], @y[4], @y[7]
+ veor @y[3], @y[3], @t[6]
+
+ veor @y[6], @y[6], @t[6]
+ veor @y[5], @y[5], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @y[6], @y[6], @t[4]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[5], @y[5], @t[6]
+ veor @y[6], @y[6], @t[7]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @t[6], @t[6], @t[3] @ restore t[6]
+ vext.8 @t[3], @t[3], @t[3], #12
+
+ @ multiplication by 0x09
+ veor @y[4], @y[4], @y[1]
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @t[1], @t[1], @t[5]
+ veor @y[3], @y[3], @t[0]
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
+ veor @t[1], @t[1], @t[6]
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
+ veor @y[4], @y[4], @t[1]
+ veor @y[7], @y[7], @t[4]
+ veor @y[6], @y[6], @t[3]
+ veor @y[5], @y[5], @t[2]
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
+ veor @t[3], @t[3], @t[7]
+ veor @XMM[5], @t[5], @t[6]
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
+ veor @XMM[2], @t[2], @t[6]
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
+
+ vmov @XMM[0], @t[0]
+ vmov @XMM[1], @t[1]
+ @ vmov @XMM[2], @t[2]
+ vmov @XMM[3], @t[3]
+ vmov @XMM[4], @t[4]
+ @ vmov @XMM[5], @t[5]
+ @ vmov @XMM[6], @t[6]
+ @ vmov @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing a pointer to the factorization
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
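+#
+# A scalar sanity check of the factorization above (a hedged sketch kept
+# behind if(0) so it cannot disturb the generated output; prints to
+# STDERR if enabled). Circulant matrices multiply by cyclic convolution
+# of their first rows in GF(2^8), so checking one row suffices; the
+# expected output is "0e 0b 0d 09".
+if (0) {
+sub gf8_mul {				# GF(2^8) multiply modulo 0x11b
+my ($a,$b)=@_; my $p=0;
+for (0..7) {
+	$p ^= $a if ($b & 1);
+	$a = ($a << 1) ^ (($a & 0x80) ? 0x11b : 0);
+	$b >>= 1;
+}
+	$p;
+}
+my @mc  = (0x02,0x03,0x01,0x01);	# first row of MixColumns
+my @pre = (0x05,0x00,0x04,0x00);	# first row of the pre-multiplier
+for my $k (0..3) {
+	my $r = 0;
+	$r ^= gf8_mul($mc[$_], $pre[($k-$_) % 4]) for (0..3);
+	printf STDERR "%02x ", $r;	# 0e 0b 0d 09
+}
+print STDERR "\n";
+}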
+
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+___
+}
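+# swapmove is the classic bit-matrix exchange step: for every bit k set
+# in $mask it swaps bit k of $a with bit k+$n of $b. A scalar reference
+# model of what the emitted NEON sequence computes (a sketch for
+# illustration only; nothing in the generated code depends on it):
+sub swapmove_ref {
+my ($a,$b,$n,$mask)=@_;
+my $t = (($b >> $n) ^ $a) & $mask;	# bits that differ, selected by mask
+	($a ^ $t, $b ^ ($t << $n));	# apply the exchange to both words
+}
+# e.g. swapmove_ref(0x00,0xff,1,0x55) returns (0x55,0x55): the even bits
+# of the first word are exchanged with the odd bits of the second.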
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ vmov.i8 $t0,#0x55 @ compose .LBS0
+ vmov.i8 $t1,#0x33 @ compose .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ vmov.i8 $t0,#0x0f @ compose .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
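+# The three swapmove passes above (distances 1, 2 and 4 with masks 0x55,
+# 0x33 and 0x0f) amount to an 8x8 bit-matrix transposition, which is
+# what turns byte-sliced state into bit-sliced form. A scalar demo built
+# on swapmove_ref (a sketch only; it uses a canonical row order and a
+# coarse-to-fine pass order, whereas bitslice reverses its register list
+# and works fine-to-coarse, but either order composes to a transpose):
+if (0) {
+my @r = map { int(rand(256)) } (0..7);	# rows of a random 8x8 bit matrix
+my @t = @r;
+for my $i (0..3)    { @t[$i+4,$i] = swapmove_ref($t[$i+4],$t[$i],4,0x0f); }
+for my $i (0,1,4,5) { @t[$i+2,$i] = swapmove_ref($t[$i+2],$t[$i],2,0x33); }
+for my $i (0,2,4,6) { @t[$i+1,$i] = swapmove_ref($t[$i+1],$t[$i],1,0x55); }
+for my $i (0..7) { for my $j (0..7) {	# bit j of t[i] == bit i of r[j]
+	die "not a transpose" if ((($t[$i]>>$j) ^ ($r[$j]>>$i)) & 1);
+} }
+}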
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr $const,_bsaes_decrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Ldec_loop
+ vldmia $const, {@XMM[12]} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr $const,_bsaes_encrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
+
+ vldmia $const!, {@XMM[8]} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LSR
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Lenc_loop
+ vldmia $const, {@XMM[12]} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+ vmov @x[2], @x[0]
+ vmov @x[3], @x[1]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ vmov @x[4], @x[0]
+ vmov @x[6], @x[2]
+ vmov @x[5], @x[1]
+ vmov @x[7], @x[3]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr $const,_bsaes_key_convert
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+ sub $const,$const,#_bsaes_key_convert-.LM0
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
+
+ vmov.i8 @XMM[8], #0x01 @ bit masks
+ vmov.i8 @XMM[9], #0x02
+ vmov.i8 @XMM[10], #0x04
+ vmov.i8 @XMM[11], #0x08
+ vmov.i8 @XMM[12], #0x10
+ vmov.i8 @XMM[13], #0x20
+ vldmia $const, {@XMM[14]} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 @XMM[7], @XMM[7]
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ sub $rounds,$rounds,#1
+ vstmia $out!, {@XMM[7]} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+ vmov.i8 @XMM[6], #0x40
+ vmov.i8 @XMM[15], #0x80
+
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
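+	@ vtst turned bit i of every key byte into a full byte mask,
+	@ i.e. the round key is now bit-sliced across eight registers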
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
+ vmvn @XMM[0], @XMM[0] @ "pnot"
+ vmvn @XMM[1], @XMM[1]
+ vmvn @XMM[5], @XMM[5]
+ vmvn @XMM[6], @XMM[6]
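+	@ inverting bit planes 0, 1, 5 and 6 xors the S-box affine
+	@ constant 0x63 into every key byte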
+#ifdef __ARMEL__
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ subs $rounds,$rounds,#1
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) {	# the following four functions are an unsupported interface
+		# used only for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Lenc128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_encrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Lenc128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ vldmia $out, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $out, {@XMM[7]}
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Ldec128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_decrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Ldec128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp $len, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+	add	r12, #`128-32`			@ size of bit-sliced key schedule
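+	@ (total is rounds*128-96 bytes: 128 per inner round key plus
+	@ 16 each for the first and last round keys)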
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ vldmia $keysched, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs $len, $len, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5, $rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
+ sub $inp, $inp, #0x60
+ vstmia $fp, {@XMM[15]} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ veor @XMM[5], @XMM[5], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[5]}, [$out]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds $len, $len, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ mov r5, $rounds
+ vstmia $fp, {@XMM[15]} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {@XMM[2]}, [$inp]!
+ cmp $len, #4
+ blo .Lcbc_dec_three
+ vld1.8 {@XMM[3]}, [$inp]!
+ beq .Lcbc_dec_four
+ vld1.8 {@XMM[4]}, [$inp]!
+ cmp $len, #6
+ blo .Lcbc_dec_five
+ vld1.8 {@XMM[5]}, [$inp]!
+ beq .Lcbc_dec_six
+ vld1.8 {@XMM[6]}, [$inp]!
+ sub $inp, $inp, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub $inp, $inp, #0x60
+ bl _bsaes_decrypt8
+ vldmia $fp,{@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub $inp, $inp, #0x50
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub $inp, $inp, #0x40
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub $inp, $inp, #0x30
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub $inp, $inp, #0x20
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[1], @XMM[1], @XMM[8]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub $inp, $inp, #0x10
+ mov $rounds, $out @ save original out pointer
+ mov $out, $fp @ use the iv scratch space as out buffer
+ mov r2, $key
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
+ vmov @XMM[5],@XMM[0] @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {@XMM[0]}, [$fp,:64] @ load result
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp $len, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+ vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
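+	@ (vmov.i32 sets all four 32-bit lanes to 1; vext.8 against the
+	@ zero q9 keeps only the most significant lane, i.e. 1<<96)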
+ vrev32.8 @XMM[4],@XMM[4]
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
+ mov r5, $rounds @ pass rounds
+ vstmia $fp, {@XMM[10]} @ save next counter
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs $len, $len, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ veor @XMM[6], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[7], @XMM[13]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ veor @XMM[5], @XMM[15]
+ vst1.8 {@XMM[6]}, [$out]!
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
+ vst1.8 {@XMM[3]}, [$out]!
+ veor @XMM[9], @XMM[9], @XMM[9]
+ vst1.8 {@XMM[7]}, [$out]!
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
+ vst1.8 {@XMM[2]}, [$out]!
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vst1.8 {@XMM[5]}, [$out]!
+ vldmia $fp, {@XMM[0]} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add $len, $len, #8
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
+ veor @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]! @ write output
+ cmp $len, #2
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[9]}, [$inp]!
+ veor @XMM[1], @XMM[9]
+ vst1.8 {@XMM[1]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ vst1.8 {@XMM[4]}, [$out]!
+ cmp $len, #4
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[11]}, [$inp]!
+ veor @XMM[6], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[3]}, [$out]!
+ cmp $len, #6
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[13]}, [$inp]!
+ veor @XMM[7], @XMM[13]
+ vst1.8 {@XMM[7]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[14]}, [$inp]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, $inp @ copy arguments
+ mov r5, $out
+ mov r6, $len
+ mov r7, $key
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [r4]! @ load input
+ vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor @XMM[0],@XMM[0],@XMM[1]
+ vst1.8 {@XMM[0]}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6"; # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ subs $len, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
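+# Each unrolled step above multiplies the tweak by x in GF(2^128) modulo
+# x^128+x^7+x^2+x+1: vadd.u64 doubles both 64-bit halves, while the
+# vshr.s64/vand/vswp/veor sequence carries the top bit of the low half
+# into the high half and folds the top bit of the high half back in as
+# 0x87. A scalar model (a sketch only, assuming a 64-bit perl; unused by
+# the generated code):
+sub xts_double_ref {
+my ($lo,$hi)=@_;			# 128-bit tweak as two 64-bit halves
+my $c_lo = ($lo >> 63) & 1;		# carry out of the low half
+my $c_hi = ($hi >> 63) & 1;		# carry wrapping around as 0x87
+$lo = (($lo << 1) & 0xffffffffffffffff) ^ ($c_hi ? 0x87 : 0);
+$hi = (($hi << 1) & 0xffffffffffffffff) ^ $c_lo;
+	($lo,$hi);
+}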
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds $len, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
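+	@ (0x87 encodes x^7+x^2+x+1, the feedback term of the XTS
+	@ reduction polynomial x^128+x^7+x^2+x+1)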
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_enc_ret
+ sub r6, $out, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [$inp], #1
+ ldrb r1, [$out, #-0x10]
+ strb r0, [$out, #-0x10]
+ strb r1, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne $len, #0x10 @ subtract another 16 bytes
+ subs $len, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds $len, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+ mov r5, $magic @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+ mov $magic, r5
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia $magic, {$twmask}
+ vshr.s64 @XMM[6], @XMM[8], #63
+ vand @XMM[6], @XMM[6], $twmask
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+ veor @XMM[9], @XMM[9], @XMM[6]
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {@XMM[0]}, [$inp]!
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[9]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[9]
+ vst1.8 {@XMM[0]}, [$out]
+
+ mov r6, $out
+.Lxts_dec_steal:
+ ldrb r1, [$out]
+ ldrb r0, [$inp], #1
+ strb r1, [$out, #0x10]
+ strb r0, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
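+# expand backtick-quoted perl expressions (e.g. `&Dlo(...)`) embedded in
+# the code generated above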
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
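+# emit this script's leading comment block into the output, with '#'
+# converted to the assembler comment character '@'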
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
index 41b90f08443f..3f7d33c45bce 100755
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -38,8 +38,9 @@
# Emilia's this(*) difference
#
# Core 2 9.30 8.69 +7%
-# Nehalem(**) 7.63 6.98 +9%
-# Atom 17.1 17.4 -2%(***)
+# Nehalem(**) 7.63 6.88 +11%
+# Atom 17.1 16.4 +4%
+# Silvermont - 12.9
#
# (*) Comparison is not completely fair, because "this" is ECB,
# i.e. no extra processing such as counter values calculation
@@ -50,14 +51,6 @@
# (**) Results were collected on Westmere, which is considered to
# be equivalent to Nehalem for this code.
#
-# (***) Slowdown on Atom is rather strange per se, because original
-# implementation has a number of 9+-bytes instructions, which
-# are bad for Atom front-end, and which I eliminated completely.
-# In attempt to address deterioration sbox() was tested in FP
-# SIMD "domain" (movaps instead of movdqa, xorps instead of
-# pxor, etc.). While it resulted in nominal 4% improvement on
-# Atom, it hurted Westmere by more than 2x factor.
-#
# As for key schedule conversion subroutine. Interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally
# has impact on performance, especially for short inputs. Conversion
@@ -67,7 +60,7 @@
# conversion conversion/8x block
# Core 2 240 0.22
# Nehalem 180 0.20
-# Atom 430 0.19
+# Atom 430 0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
@@ -83,9 +76,10 @@
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
-# Core 2 9.83
-# Nehalem 7.74
-# Atom 19.0
+# Core 2 9.98
+# Nehalem 7.80
+# Atom 17.9
+# Silvermont 14.0
#
# November 2011.
#
@@ -434,21 +428,21 @@ my $mask=pop;
$code.=<<___;
pxor 0x00($key),@x[0]
pxor 0x10($key),@x[1]
- pshufb $mask,@x[0]
pxor 0x20($key),@x[2]
- pshufb $mask,@x[1]
pxor 0x30($key),@x[3]
- pshufb $mask,@x[2]
+ pshufb $mask,@x[0]
+ pshufb $mask,@x[1]
pxor 0x40($key),@x[4]
- pshufb $mask,@x[3]
pxor 0x50($key),@x[5]
- pshufb $mask,@x[4]
+ pshufb $mask,@x[2]
+ pshufb $mask,@x[3]
pxor 0x60($key),@x[6]
- pshufb $mask,@x[5]
pxor 0x70($key),@x[7]
+ pshufb $mask,@x[4]
+ pshufb $mask,@x[5]
pshufb $mask,@x[6]
- lea 0x80($key),$key
pshufb $mask,@x[7]
+ lea 0x80($key),$key
___
}
@@ -820,18 +814,18 @@ _bsaes_encrypt8:
movdqa 0x50($const), @XMM[8] # .LM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
@@ -884,18 +878,18 @@ _bsaes_decrypt8:
movdqa -0x30($const), @XMM[8] # .LM0ISR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
___
@@ -1937,21 +1931,21 @@ $code.=<<___;
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
- lea .LBS0(%rip), %r11 # constants table
pshufb @XMM[8], @XMM[7]
+ lea .LBS0(%rip), %r11 # constants table
mov %ebx,%r10d # pass rounds
call _bsaes_encrypt8_bitslice
diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl
new file mode 100755
index 000000000000..7fda60ed9e4d
--- /dev/null
+++ b/crypto/aes/asm/vpaes-ppc.pl
@@ -0,0 +1,1512 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+# CBC encrypt/decrypt performance in cycles per byte processed with
+# 128-bit key.
+#
+# aes-ppc.pl this
+# G4e 35.5/52.1/(23.8) 11.9(*)/15.4
+# POWER6 42.7/54.3/(28.2) 63.0/92.8(**)
+# POWER7 32.3/42.9/(18.4) 18.5/23.3
+#
+# (*)	This is ~10% worse than reported in the paper. The reason is
+#	twofold. First, this module doesn't make any assumptions about
+#	key schedule (or data, for that matter) alignment and handles
+#	it in-line. Second, being transliterated from vpaes-x86_64.pl,
+#	it relies on "nested inversion", which is better suited
+#	for Intel CPUs.
+# (**) Inadequate POWER6 performance is due to astronomic AltiVec
+# latency, 9 cycles per simple logical operation.
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+ $UCMP ="cmpld";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+} else { die "nonsense $flavour"; }
+
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$code.=<<___;
+.machine "any"
+
+.text
+
+.align 7 # totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward: # mc_forward
+ .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
+ .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
+ .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
+ .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
+Lk_mc_backward: # mc_backward
+ .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
+ .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
+ .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
+ .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
+Lk_sr: # sr
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
+ .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
+ .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
+ .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
+
+##
+## "Hot" constants
+##
+Lk_inv: # inv, inva
+ .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
+ .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
+Lk_ipt: # input transform (lo, hi)
+ .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
+ .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
+Lk_sbo: # sbou, sbot
+ .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
+ .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
+Lk_sb1: # sb1u, sb1t
+ .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
+ .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
+Lk_sb2: # sb2u, sb2t
+ .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
+ .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
+
+##
+## Decryption stuff
+##
+Lk_dipt: # decryption input transform
+ .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
+ .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
+Lk_dsbo: # decryption sbox final output
+ .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
+ .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
+Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
+ .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
+Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
+ .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
+Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
+ .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
+Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
+ .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
+
+##
+## Key schedule constants
+##
+Lk_dksd: # decryption key schedule: invskew x*D
+ .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
+ .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
+Lk_dksb: # decryption key schedule: invskew x*B
+ .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
+ .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
+Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
+ .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
+Lk_dks9: # decryption key schedule: invskew x*9
+ .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
+ .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
+
+Lk_rcon: # rcon
+ .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
+Lk_s63:
+ .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
+
+Lk_opt: # output transform
+ .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
+ .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
+Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
+ .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
+.align 5
+Lconsts:
+ mflr r0
+ bcl 20,31,\$+4
+	mflr	r12	#vvvvv "distance between . and _vpaes_consts"
+ addi r12,r12,-0x308
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
+.align 6
+___
+
+my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
+{
+my ($inp,$out,$key) = map("r$_",(3..5));
+
+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
+
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.align 4
+_vpaes_encrypt_preheat:
+ mflr r8
+ bl Lconsts
+ mtlr r8
+ li r11, 0xc0 # Lk_inv
+ li r10, 0xd0
+ li r9, 0xe0 # Lk_ipt
+ li r8, 0xf0
+ vxor v7, v7, v7 # 0x00..00
+ vspltisb v8,4 # 0x04..04
+ vspltisb v9,0x0f # 0x0f..0f
+ lvx $invlo, r12, r11
+ li r11, 0x100
+ lvx $invhi, r12, r10
+ li r10, 0x110
+ lvx $iptlo, r12, r9
+ li r9, 0x120
+ lvx $ipthi, r12, r8
+ li r8, 0x130
+ lvx $sbou, r12, r11
+ li r11, 0x140
+ lvx $sbot, r12, r10
+ li r10, 0x150
+ lvx $sb1u, r12, r9
+ lvx $sb1t, r12, r8
+ lvx $sb2u, r12, r11
+ lvx $sb2t, r12, r10
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
+##
+##
+.align 5
+_vpaes_encrypt_core:
+ lwz r8, 240($key) # pull rounds
+ li r9, 16
+ lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
+ li r11, 0x10
+ lvx v6, r9, $key
+ addi r9, r9, 16
+ ?vperm v5, v5, v6, $keyperm # align round key
+ addi r10, r11, 0x40
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
+ vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
+ vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
+ vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
+ vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
+ mtctr r8
+ b Lenc_entry
+
+.align 4
+Lenc_loop:
+ # middle of middle round
+ vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ addi r11, r11, 16
+ vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
+ vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ addi r10, r11, 0x40
+ vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+
+Lenc_entry:
+ # top of round
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vand v0, v0, v9
+ vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vmr v5, v6
+ lvx v6, r9, $key # vmovdqu (%r9), %xmm5
+ vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ addi r9, r9, 16
+ vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ ?vperm v5, v5, v6, $keyperm # align round key
+ vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ bdnz Lenc_loop
+
+ # middle of last round
+ addi r10, r11, 0x80
+ # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .vpaes_encrypt
+.align 5
+.vpaes_encrypt:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r6
+ mfspr r7, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r7,`$FRAME-4`($sp) # save vrsave
+ li r0, -1
+ $PUSH r6,`$FRAME+$LRSAVE`($sp)
+ mtspr 256, r0 # preserve all AltiVec registers
+
+ bl _vpaes_encrypt_preheat
+
+ ?lvsl $inpperm, 0, $inp # prepare for unaligned access
+ lvx v0, 0, $inp
+ addi $inp, $inp, 15 # 15 is not a typo
+ ?lvsr $outperm, 0, $out
+ ?lvsl $keyperm, 0, $key # prepare for unaligned access
+ vnor $outmask, v7, v7 # 0xff..ff
+ lvx $inptail, 0, $inp # redundant in aligned case
+ ?vperm $outmask, v7, $outmask, $outperm
+ lvx $outhead, 0, $out
+ ?vperm v0, v0, $inptail, $inpperm
+
+ bl _vpaes_encrypt_core
+
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v1, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v1, 0, $out
+ addi $out, $out, 15 # 15 is not a typo
+ ########
+
+ lvx v1, 0, $out # redundant in aligned case
+ vsel v1, $outhead, v1, $outmask
+ stvx v1, 0, $out
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtlr r6
+ mtspr 256, r7 # restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_encrypt,.-.vpaes_encrypt
+
+.align 4
+_vpaes_decrypt_preheat:
+ mflr r8
+ bl Lconsts
+ mtlr r8
+ li r11, 0xc0 # Lk_inv
+ li r10, 0xd0
+ li r9, 0x160 # Ldipt
+ li r8, 0x170
+ vxor v7, v7, v7 # 0x00..00
+ vspltisb v8,4 # 0x04..04
+ vspltisb v9,0x0f # 0x0f..0f
+ lvx $invlo, r12, r11
+ li r11, 0x180
+ lvx $invhi, r12, r10
+ li r10, 0x190
+ lvx $iptlo, r12, r9
+ li r9, 0x1a0
+ lvx $ipthi, r12, r8
+ li r8, 0x1b0
+ lvx $sbou, r12, r11
+ li r11, 0x1c0
+ lvx $sbot, r12, r10
+ li r10, 0x1d0
+ lvx $sb9u, r12, r9
+ li r9, 0x1e0
+ lvx $sb9t, r12, r8
+ li r8, 0x1f0
+ lvx $sbdu, r12, r11
+ li r11, 0x200
+ lvx $sbdt, r12, r10
+ li r10, 0x210
+ lvx $sbbu, r12, r9
+ lvx $sbbt, r12, r8
+ lvx $sbeu, r12, r11
+ lvx $sbet, r12, r10
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.align 4
+_vpaes_decrypt_core:
+ lwz r8, 240($key) # pull rounds
+ li r9, 16
+ lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
+ li r11, 0x30
+ lvx v6, r9, $key
+ addi r9, r9, 16
+ ?vperm v5, v5, v6, $keyperm # align round key
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
+ vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
+ vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
+ vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
+ vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
+ mtctr r8
+ b Ldec_entry
+
+.align 4
+Ldec_loop:
+#
+# Inverse mix columns
+#
+ lvx v0, r12, r11 # v5 and v0 are flipped
+ # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ subi r11, r11, 16
+ vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ andi. r11, r11, 0x30
+ vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
+ # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+Ldec_entry:
+ # top of round
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vand v0, v0, v9
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vmr v5, v6
+ lvx v6, r9, $key # vmovdqu (%r9), %xmm0
+ vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ addi r9, r9, 16
+ vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ ?vperm v5, v5, v6, $keyperm # align round key
+ vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ bdnz Ldec_loop
+
+ # middle of last round
+ addi r10, r11, 0x80
+ # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .vpaes_decrypt
+.align 5
+.vpaes_decrypt:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r6
+ mfspr r7, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r7,`$FRAME-4`($sp) # save vrsave
+ li r0, -1
+ $PUSH r6,`$FRAME+$LRSAVE`($sp)
+ mtspr 256, r0 # preserve all AltiVec registers
+
+ bl _vpaes_decrypt_preheat
+
+ ?lvsl $inpperm, 0, $inp # prepare for unaligned access
+ lvx v0, 0, $inp
+ addi $inp, $inp, 15 # 15 is not a typo
+ ?lvsr $outperm, 0, $out
+ ?lvsl $keyperm, 0, $key
+ vnor $outmask, v7, v7 # 0xff..ff
+ lvx $inptail, 0, $inp # redundant in aligned case
+ ?vperm $outmask, v7, $outmask, $outperm
+ lvx $outhead, 0, $out
+ ?vperm v0, v0, $inptail, $inpperm
+
+ bl _vpaes_decrypt_core
+
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v1, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v1, 0, $out
+ addi $out, $out, 15 # 15 is not a typo
+ ########
+
+ lvx v1, 0, $out # redundant in aligned case
+ vsel v1, $outhead, v1, $outmask
+ stvx v1, 0, $out
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtlr r6
+ mtspr 256, r7 # restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_decrypt,.-.vpaes_decrypt
+
+.globl .vpaes_cbc_encrypt
+.align 5
+.vpaes_cbc_encrypt:
+ ${UCMP}i r5,16
+ bltlr-
+
+ $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
+ mflr r0
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mfspr r12, 256
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r12,`$FRAME-4`($sp) # save vrsave
+ $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
+ $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
+ li r9, -16
+ $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
+
+ and r30, r5, r9 # copy length&-16
+ mr r5, r6 # copy pointer to key
+ mr r31, r7 # copy pointer to iv
+ blt Lcbc_abort
+ cmpwi r8, 0 # test direction
+ li r6, -1
+ mr r7, r12 # copy vrsave
+ mtspr 256, r6 # preserve all AltiVec registers
+
+ lvx v24, 0, r31 # load [potentially unaligned] iv
+ li r9, 15
+ ?lvsl $inpperm, 0, r31
+ lvx v25, r9, r31
+ ?vperm v24, v24, v25, $inpperm
+
+ neg r8, $inp # prepare for unaligned access
+ vxor v7, v7, v7
+ ?lvsl $keyperm, 0, $key
+ ?lvsr $outperm, 0, $out
+ ?lvsr $inpperm, 0, r8 # -$inp
+ vnor $outmask, v7, v7 # 0xff..ff
+ lvx $inptail, 0, $inp
+ ?vperm $outmask, v7, $outmask, $outperm
+ addi $inp, $inp, 15 # 15 is not a typo
+ lvx $outhead, 0, $out
+
+ beq Lcbc_decrypt
+
+ bl _vpaes_encrypt_preheat
+ li r0, 16
+
+Lcbc_enc_loop:
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vxor v0, v0, v24 # ^= iv
+
+ bl _vpaes_encrypt_core
+
+ vmr v24, v0 # put aside iv
+ sub. r30, r30, r0 # len -= 16
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v1, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v1, 0, $out
+ addi $out, $out, 16
+ bne Lcbc_enc_loop
+
+ b Lcbc_done
+
+.align 5
+Lcbc_decrypt:
+ bl _vpaes_decrypt_preheat
+ li r0, 16
+
+Lcbc_dec_loop:
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vmr v25, v0 # put aside input
+
+ bl _vpaes_decrypt_core
+
+ vxor v0, v0, v24 # ^= iv
+ vmr v24, v25
+ sub. r30, r30, r0 # len -= 16
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v1, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v1, 0, $out
+ addi $out, $out, 16
+ bne Lcbc_dec_loop
+
+Lcbc_done:
+ addi $out, $out, -1
+ lvx v1, 0, $out # redundant in aligned case
+ vsel v1, $outhead, v1, $outmask
+ stvx v1, 0, $out
+
+ neg r8, r31 # write [potentially unaligned] iv
+ ?lvsl $outperm, 0, r8
+ li r6, 15
+ vnor $outmask, v7, v7 # 0xff..ff
+ ?vperm $outmask, v7, $outmask, $outperm
+ lvx $outhead, 0, r31
+ vperm v24, v24, v24, $outperm # rotate right/left
+ vsel v0, $outhead, v24, $outmask
+ lvx v1, r6, r31
+ stvx v0, 0, r31
+ vsel v1, v24, v1, $outmask
+ stvx v1, r6, r31
+
+ mtspr 256, r7 # restore vrsave
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+Lcbc_abort:
+ $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
+ $POP r30,`$FRAME+$SIZE_T*0`($sp)
+ $POP r31,`$FRAME+$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,`$FRAME+$SIZE_T*2`
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,2,6,0
+ .long 0
+.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
+___
+}
+{
+my ($inp,$bits,$out)=map("r$_",(3..5));
+my $dir="cr1";
+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
+
+$code.=<<___;
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.align 4
+_vpaes_key_preheat:
+ mflr r8
+ bl Lconsts
+ mtlr r8
+ li r11, 0xc0 # Lk_inv
+ li r10, 0xd0
+ li r9, 0xe0 # L_ipt
+ li r8, 0xf0
+
+ vspltisb v8,4 # 0x04..04
+ vxor v9,v9,v9 # 0x00..00
+ lvx $invlo, r12, r11 # Lk_inv
+ li r11, 0x120
+ lvx $invhi, r12, r10
+ li r10, 0x130
+ lvx $iptlo, r12, r9 # Lk_ipt
+ li r9, 0x220
+ lvx $ipthi, r12, r8
+ li r8, 0x230
+
+ lvx v14, r12, r11 # Lk_sb1
+ li r11, 0x240
+ lvx v15, r12, r10
+ li r10, 0x250
+
+ lvx v16, r12, r9 # Lk_dksd
+ li r9, 0x260
+ lvx v17, r12, r8
+ li r8, 0x270
+ lvx v18, r12, r11 # Lk_dksb
+ li r11, 0x280
+ lvx v19, r12, r10
+ li r10, 0x290
+ lvx v20, r12, r9 # Lk_dkse
+ li r9, 0x2a0
+ lvx v21, r12, r8
+ li r8, 0x2b0
+ lvx v22, r12, r11 # Lk_dks9
+ lvx v23, r12, r10
+
+ lvx v24, r12, r9 # Lk_rcon
+ lvx v25, 0, r12 # Lk_mc_forward[0]
+ lvx v26, r12, r8 # Lks63
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.align 4
+_vpaes_schedule_core:
+ mflr r7
+
+ bl _vpaes_key_preheat # load the tables
+
+ #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
+ neg r8, $inp # prepare for unaligned access
+ lvx v0, 0, $inp
+	addi	$inp, $inp, 15	# 15 is not a typo
+ ?lvsr $inpperm, 0, r8 # -$inp
+ lvx v6, 0, $inp # v6 serves as inptail
+ addi $inp, $inp, 8
+ ?vperm v0, v0, v6, $inpperm
+
+ # input transform
+ vmr v3, v0 # vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ vmr v7, v0 # vmovdqa %xmm0, %xmm7
+
+ bne $dir, Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ li r8, 0x30 # mov \$0x30,%r8d
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+
+ ?lvsr $outperm, 0, $out # prepare for unaligned access
+ vnor $outmask, v9, v9 # 0xff..ff
+ lvx $outhead, 0, $out
+ ?vperm $outmask, v9, $outmask, $outperm
+
+ #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
+ vperm v1, v0, v0, $outperm # rotate right/left
+ vsel v2, $outhead, v1, $outmask
+ vmr $outhead, v1
+ stvx v2, 0, $out
+ b Lschedule_go
+
+Lschedule_am_decrypting:
+ srwi r8, $bits, 1 # shr \$1,%r8d
+ andi. r8, r8, 32 # and \$32,%r8d
+ xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ # decrypting, output zeroth round key after shiftrows
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
+
+ neg r0, $out # prepare for unaligned access
+ ?lvsl $outperm, 0, r0
+	addi	$out, $out, 15	# 15 is not a typo
+ vnor $outmask, v9, v9 # 0xff..ff
+ lvx $outhead, 0, $out
+ ?vperm $outmask, $outmask, v9, $outperm
+
+ #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
+ vperm v4, v4, v4, $outperm # rotate right/left
+ vsel v2, $outhead, v4, $outmask
+ vmr $outhead, v4
+ stvx v2, 0, $out
+ xori r8, r8, 0x30 # xor \$0x30, %r8
+
+Lschedule_go:
+ cmplwi $bits, 192 # cmp \$192, %esi
+ bgt Lschedule_256
+ beq Lschedule_192
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+Lschedule_128:
+ li r0, 10 # mov \$10, %esi
+ mtctr r0
+
+Loop_schedule_128:
+ bl _vpaes_schedule_round
+ bdz Lschedule_mangle_last # dec %esi
+ bl _vpaes_schedule_mangle # write output
+ b Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+Lschedule_192:
+ li r0, 4 # mov \$4, %esi
+ lvx v0, 0, $inp
+ ?vperm v0, v6, v0, $inpperm
+ ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform # input transform
+ ?vsldoi v6, v0, v9, 8
+ ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
+ mtctr r0
+
+Loop_schedule_192:
+ bl _vpaes_schedule_round
+ ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle # save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle # save key n+1
+ bl _vpaes_schedule_round
+ bdz Lschedule_mangle_last # dec %esi
+ bl _vpaes_schedule_mangle # save key n+2
+ bl _vpaes_schedule_192_smear
+ b Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+Lschedule_256:
+ li r0, 7 # mov \$7, %esi
+ addi $inp, $inp, 8
+ lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ ?vperm v0, v6, v0, $inpperm
+ bl _vpaes_schedule_transform # input transform
+ mtctr r0
+
+Loop_schedule_256:
+ bl _vpaes_schedule_mangle # output low result
+ vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ bl _vpaes_schedule_round
+ bdz Lschedule_mangle_last # dec %esi
+ bl _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
+ vmr v5, v7 # vmovdqa %xmm7, %xmm5
+ vmr v7, v6 # vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ vmr v7, v5 # vmovdqa %xmm5, %xmm7
+
+ b Loop_schedule_256
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
+ li r9, 0x2f0
+ bne $dir, Lschedule_mangle_last_dec
+
+ # encrypting
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
+ li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
+ li r9, 0x2d0 # prepare to output transform
+ vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+ lvx $iptlo, r11, r12 # reload $ipt
+ lvx $ipthi, r9, r12
+ addi $out, $out, 16 # add \$16, %rdx
+ vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform # output transform
+
+ #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v2, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v2, 0, $out
+
+	addi	$out, $out, 15	# 15 is not a typo
+ lvx v1, 0, $out # redundant in aligned case
+ vsel v1, $outhead, v1, $outmask
+ stvx v1, 0, $out
+ b Lschedule_mangle_done
+
+.align 4
+Lschedule_mangle_last_dec:
+ lvx $iptlo, r11, r12 # reload $ipt
+ lvx $ipthi, r9, r12
+ addi $out, $out, -16 # add \$-16, %rdx
+ vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform # output transform
+
+ #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v2, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v2, 0, $out
+
+	addi	$out, $out, -15	# -15 is not a typo
+ lvx v1, 0, $out # redundant in aligned case
+ vsel v1, $outhead, v1, $outmask
+ stvx v1, 0, $out
+
+Lschedule_mangle_done:
+ mtlr r7
+ # cleanup
+ vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
+ vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
+ vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
+ vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
+ vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
+ vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
+ vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
+ vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.align 4
+_vpaes_schedule_192_smear:
+ ?vspltw v0, v7, 3
+ ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ vmr v0, v6
+ ?vsldoi v6, v6, v9, 8
+ ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.align 4
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
+ ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
+ ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
+ vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
+
+ # rotate
+ ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
+ ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
+ vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
+ vspltisb v1, 0x0f # 0x0f..0f
+ ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
+
+ # subbytes
+ vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
+ vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ # add in smeared stuff
+ vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
+ vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm2
+##
+.align 4
+_vpaes_schedule_transform:
+ #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
+ vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
+ # vmovdqa (%r11), %xmm2 # lo
+ vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
+ # vmovdqa 16(%r11), %xmm1 # hi
+ vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
+ vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.align 4
+_vpaes_schedule_mangle:
+ #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ # vmovdqa .Lk_mc_forward(%rip),%xmm5
+ bne $dir, Lschedule_mangle_dec
+
+ # encrypting
+ vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ addi $out, $out, 16 # add \$16, %rdx
+ vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
+ vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
+ vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
+ vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
+
+ vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ addi r8, r8, -16 # add \$-16, %r8
+ andi. r8, r8, 0x30 # and \$0x30, %r8
+
+ #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
+ vperm v1, v3, v3, $outperm # rotate right/left
+ vsel v2, $outhead, v1, $outmask
+ vmr $outhead, v1
+ stvx v2, 0, $out
+ blr
+
+.align 4
+Lschedule_mangle_dec:
+ # inverse mix columns
+ # lea .Lk_dksd(%rip),%r11
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
+ #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ # vmovdqa 0x00(%r11), %xmm2
+ vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ # vmovdqa 0x10(%r11), %xmm3
+ vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
+ vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
+
+ # vmovdqa 0x20(%r11), %xmm2
+ vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
+ # vmovdqa 0x30(%r11), %xmm3
+ vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
+ vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
+
+ # vmovdqa 0x40(%r11), %xmm2
+ vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
+ # vmovdqa 0x50(%r11), %xmm3
+ vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
+
+ # vmovdqa 0x60(%r11), %xmm2
+ vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
+ # vmovdqa 0x70(%r11), %xmm4
+ vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
+ vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
+
+ addi $out, $out, -16 # add \$-16, %rdx
+
+ vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ addi r8, r8, -16 # add \$-16, %r8
+ andi. r8, r8, 0x30 # and \$0x30, %r8
+
+ #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
+ vperm v1, v3, v3, $outperm # rotate right/left
+ vsel v2, $outhead, v1, $outmask
+ vmr $outhead, v1
+ stvx v2, 0, $out
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .vpaes_set_encrypt_key
+.align 5
+.vpaes_set_encrypt_key:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r0
+ mfspr r6, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r6,`$FRAME-4`($sp) # save vrsave
+ li r7, -1
+ $PUSH r0, `$FRAME+$LRSAVE`($sp)
+ mtspr 256, r7 # preserve all AltiVec registers
+
+ srwi r9, $bits, 5 # shr \$5,%eax
+ addi r9, r9, 6 # add \$5,%eax
+ stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ cmplw $dir, $bits, $bits # set encrypt direction
+ li r8, 0x30 # mov \$0x30,%r8d
+ bl _vpaes_schedule_core
+
+ $POP r0, `$FRAME+$LRSAVE`($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtspr 256, r6 # restore vrsave
+ mtlr r0
+ xor r3, r3, r3
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
+
+.globl .vpaes_set_decrypt_key
+.align 4
+.vpaes_set_decrypt_key:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r0
+ mfspr r6, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r6,`$FRAME-4`($sp) # save vrsave
+ li r7, -1
+ $PUSH r0, `$FRAME+$LRSAVE`($sp)
+ mtspr 256, r7 # preserve all AltiVec registers
+
+ srwi r9, $bits, 5 # shr \$5,%eax
+ addi r9, r9, 6 # add \$5,%eax
+ stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ slwi r9, r9, 4 # shl \$4,%eax
+ add $out, $out, r9 # lea (%rdx,%rax),%rdx
+
+ cmplwi $dir, $bits, 0 # set decrypt direction
+ srwi r8, $bits, 1 # shr \$1,%r8d
+ andi. r8, r8, 32 # and \$32,%r8d
+ xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ $POP r0, `$FRAME+$LRSAVE`($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtspr 256, r6 # restore vrsave
+ mtlr r0
+ xor r3, r3, r3
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
+___
+}
+
+my $consts=1;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ # constants table endian-specific conversion
+ if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
+ my $conv=$2;
+ my @bytes=();
+
+ # convert to endian-agnostic format
+ foreach (split(/,\s+/,$1)) {
+ my $l = /^0/?oct:int;
+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+ }
+
+ # little-endian conversion
+ if ($flavour =~ /le$/o) {
+ SWITCH: for($conv) {
+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ }
+ }
+
+ #emit
+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+ next;
+ }
+ $consts=0 if (m/Lconsts:/o); # end of table
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
+ if ($flavour =~ /le$/o) { # little-endian
+ s/\?lvsr/lvsl/o or
+ s/\?lvsl/lvsr/o or
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+ } else { # big-endian
+ s/\?([a-z]+)/$1/o;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
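
For reference, the ?inv and ?rev conversions applied by the emitter above reduce to simple byte rules: ?inv treats each byte as a 4-bit vperm index and maps i to i^0xf (i.e. 15-i) to match the reversed lane order of little-endian, while ?rev reverses the 16-byte sequence outright. A minimal C sketch of those two rules (function names are illustrative, not part of the script):

    #include <stdint.h>
    #include <stdio.h>

    /* ?inv: each 4-bit permute index i becomes i^0xf == 15-i */
    static void conv_inv(uint8_t b[16])
    {
        for (int i = 0; i < 16; i++)
            b[i] ^= 0x0f;
    }

    /* ?rev: reverse the whole 16-byte sequence */
    static void conv_rev(uint8_t b[16])
    {
        for (int i = 0; i < 8; i++) {
            uint8_t t = b[i]; b[i] = b[15 - i]; b[15 - i] = t;
        }
    }

    int main(void)
    {
        /* first row of Lk_sr: the identity permutation */
        uint8_t sr0[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

        conv_inv(sr0);  /* what ".long ... ?inv" emits for an LE flavour */
        for (int i = 0; i < 16; i++)
            printf("0x%02x%c", sr0[i], i == 15 ? '\n' : ',');
        (void)conv_rev;
        return 0;
    }
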
diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl
index 1533e2c3042c..2ba149c3f9d5 100755
--- a/crypto/aes/asm/vpaes-x86.pl
+++ b/crypto/aes/asm/vpaes-x86.pl
@@ -27,9 +27,10 @@
#
# aes-586.pl vpaes-x86.pl
#
-# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
-# Nehalem 27.9/40.4/18.1 10.3/12.0
-# Atom 102./119./60.1 64.5/85.3(***)
+# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
+# Nehalem 27.9/40.4/18.1 10.2/11.9
+# Atom 70.7/92.1/60.1 61.1/75.4(***)
+# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than to Intel HTT specifically. As vast
@@ -40,8 +41,8 @@
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
-# pshufb, yet it's respectable +32%/65% improvement on Core 2
-# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+# pshufb, yet it's a respectable +28%/64% improvement on Core 2
+# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
@@ -183,35 +184,35 @@ $k_dsbo=0x2c0; # decryption sbox final output
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
- &movdqu ("xmm5",&QWP(0,$key));
- &psrld ("xmm1",4);
&pand ("xmm0","xmm6");
+ &movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
- &pshufb ("xmm0","xmm1");
&pxor ("xmm2","xmm5");
- &pxor ("xmm0","xmm2");
+ &psrld ("xmm1",4);
&add ($key,16);
+ &pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
+ &pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
- &pshufb ("xmm4","xmm2"); # 4 = sb1u
- &pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
- &pxor ("xmm0","xmm4"); # 0 = A
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
- &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
- &pshufb ("xmm2","xmm3"); # 2 = sb2t
- &pxor ("xmm2","xmm5"); # 2 = 2A
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
+ &pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
@@ -220,30 +221,30 @@ $k_dsbo=0x2c0; # decryption sbox final output
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
- &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&sub ($round,1); # nr--
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
- &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm5","xmm0"); # 2 = a/k
- &pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
- &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
- &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
- &pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
- &movdqu ("xmm5",&QWP(0,$key));
+ &pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
@@ -265,8 +266,8 @@ $k_dsbo=0x2c0; # decryption sbox final output
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
- &mov ($round,&DWP(240,$key));
&lea ($base,&DWP($k_dsbd,$const));
+ &mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
@@ -292,62 +293,61 @@ $k_dsbo=0x2c0; # decryption sbox final output
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
- &pxor ("xmm4","xmm0");
- &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
- &pshufb ("xmm0","xmm3"); # 0 = sb9t
- &pxor ("xmm0","xmm4"); # 0 = ch
- &add ($key,16); # next round key
-
- &pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sb9t
+ &pxor ("xmm0","xmm4");
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
- &pshufb ("xmm4","xmm2"); # 4 = sbdu
- &pxor ("xmm4","xmm0"); # 4 = ch
- &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
- &pshufb ("xmm0","xmm3"); # 0 = sbdt
- &pxor ("xmm0","xmm4"); # 0 = ch
- &sub ($round,1); # nr--
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
&pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
- &pshufb ("xmm4","xmm2"); # 4 = sbbu
- &pxor ("xmm4","xmm0"); # 4 = ch
- &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
- &pshufb ("xmm0","xmm3"); # 0 = sbbt
- &pxor ("xmm0","xmm4"); # 0 = ch
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
&pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbbt
+ &pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
- &pshufb ("xmm4","xmm2"); # 4 = sbeu
- &pxor ("xmm4","xmm0"); # 4 = ch
- &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
- &pshufb ("xmm0","xmm3"); # 0 = sbet
- &pxor ("xmm0","xmm4"); # 0 = ch
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 4 = ch
+ &add ($key,16); # next round key
&palignr("xmm5","xmm5",12);
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &sub ($round,1); # nr--
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
- &psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
- &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &psrld ("xmm1",4); # 1 = i
&pshufb ("xmm2","xmm0"); # 2 = a/k
- &pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
- &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
- &pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
- &pxor ("xmm3","xmm1"); # 3 = jo
&movdqu ("xmm0",&QWP(0,$key));
+ &pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("dec_loop"));
# middle of last round
@@ -542,12 +542,12 @@ $k_dsbo=0x2c0; # decryption sbox final output
## %xmm0: b+c+d b+c b a
##
&function_begin_B("_vpaes_schedule_192_smear");
- &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
- &pxor ("xmm6","xmm0"); # -> c+d c 0 0
+ &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
+ &pxor ("xmm6","xmm1"); # -> c+d c 0 0
+ &pxor ("xmm1","xmm1");
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
&movdqa ("xmm0","xmm6");
- &pxor ("xmm1","xmm1");
&movhlps("xmm6","xmm1"); # clobber low side with zeros
&ret ();
&function_end_B("_vpaes_schedule_192_smear");
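
The _vpaes_schedule_192_smear rewrite above only retargets the scratch register and reorders instructions; the lane arithmetic documented in its header comment is unchanged. A small C model of that arithmetic, assuming lane 3 is the high dword and "+" in the comments means XOR, reproduces the b+c+d/b+c outputs:

    #include <stdint.h>

    /* lane model of _vpaes_schedule_192_smear: x7/x6 stand in for
     * xmm7/xmm6 with index 3 as the high dword */
    static void smear192(const uint32_t x7[4], uint32_t x6[4],
                         uint32_t x0[4])
    {
        uint32_t t[4];
        int i;

        /* pshufd $0x80: d c 0 0 -> c 0 0 0 */
        t[3] = x6[2]; t[2] = t[1] = t[0] = x6[0];
        for (i = 0; i < 4; i++) x6[i] ^= t[i];   /* -> c+d c 0 0 */

        /* pshufd $0xFE: b a _ _ -> b b b a */
        t[3] = t[2] = t[1] = x7[3]; t[0] = x7[2];
        for (i = 0; i < 4; i++) x6[i] ^= t[i];   /* -> b+c+d b+c b a */

        for (i = 0; i < 4; i++) x0[i] = x6[i];   /* movdqa %xmm6,%xmm0 */
        x6[1] = x6[0] = 0;                       /* movhlps: zero low side */
    }
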
diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl
index bd7f45b85091..f2ef318fae4e 100755
--- a/crypto/aes/asm/vpaes-x86_64.pl
+++ b/crypto/aes/asm/vpaes-x86_64.pl
@@ -27,9 +27,10 @@
#
# aes-x86_64.pl vpaes-x86_64.pl
#
-# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
-# Nehalem 30.5/42.2/14.6 9.8/11.8
-# Atom 63.9/79.0/32.1 64.0/84.8(***)
+# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
+# Nehalem 29.6/40.3/14.6 10.0/11.8
+# Atom 57.3/74.2/32.1 60.9/77.2(***)
+# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
#
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than to Intel HTT specifically. As vast
@@ -40,7 +41,7 @@
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
-# pshufb, yet it's respectable +40%/78% improvement on Core 2
+# pshufb, yet it's a respectable +36%/62% improvement on Core 2
# (as implied, over "hyper-threading-safe" code path).
#
# <appro@openssl.org>
@@ -95,8 +96,8 @@ _vpaes_encrypt_core:
movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
pshufb %xmm1, %xmm0
pxor %xmm5, %xmm2
- pxor %xmm2, %xmm0
add \$16, %r9
+ pxor %xmm2, %xmm0
lea .Lk_mc_backward(%rip),%r10
jmp .Lenc_entry
@@ -104,19 +105,19 @@ _vpaes_encrypt_core:
.Lenc_loop:
# middle of middle round
movdqa %xmm13, %xmm4 # 4 : sb1u
- pshufb %xmm2, %xmm4 # 4 = sb1u
- pxor %xmm5, %xmm4 # 4 = sb1u + k
movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm2, %xmm4 # 4 = sb1u
pshufb %xmm3, %xmm0 # 0 = sb1t
- pxor %xmm4, %xmm0 # 0 = A
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
movdqa %xmm15, %xmm5 # 4 : sb2u
- pshufb %xmm2, %xmm5 # 4 = sb2u
+ pxor %xmm4, %xmm0 # 0 = A
movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
movdqa %xmm14, %xmm2 # 2 : sb2t
pshufb %xmm3, %xmm2 # 2 = sb2t
- pxor %xmm5, %xmm2 # 2 = 2A
- movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
movdqa %xmm0, %xmm3 # 3 = A
+ pxor %xmm5, %xmm2 # 2 = 2A
pshufb %xmm1, %xmm0 # 0 = B
add \$16, %r9 # next key
pxor %xmm2, %xmm0 # 0 = 2A+B
@@ -125,30 +126,30 @@ _vpaes_encrypt_core:
pxor %xmm0, %xmm3 # 3 = 2A+B+D
pshufb %xmm1, %xmm0 # 0 = 2B+C
and \$0x30, %r11 # ... mod 4
- pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
sub \$1,%rax # nr--
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
.Lenc_entry:
# top of round
movdqa %xmm9, %xmm1 # 1 : i
+ movdqa %xmm11, %xmm5 # 2 : a/k
pandn %xmm0, %xmm1 # 1 = i<<4
psrld \$4, %xmm1 # 1 = i
pand %xmm9, %xmm0 # 0 = k
- movdqa %xmm11, %xmm5 # 2 : a/k
pshufb %xmm0, %xmm5 # 2 = a/k
- pxor %xmm1, %xmm0 # 0 = j
movdqa %xmm10, %xmm3 # 3 : 1/i
+ pxor %xmm1, %xmm0 # 0 = j
pshufb %xmm1, %xmm3 # 3 = 1/i
- pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
movdqa %xmm10, %xmm4 # 4 : 1/j
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
pshufb %xmm0, %xmm4 # 4 = 1/j
- pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
pshufb %xmm3, %xmm2 # 2 = 1/iak
- pxor %xmm0, %xmm2 # 2 = io
movdqa %xmm10, %xmm3 # 3 : 1/jak
- movdqu (%r9), %xmm5
+ pxor %xmm0, %xmm2 # 2 = io
pshufb %xmm4, %xmm3 # 3 = 1/jak
+ movdqu (%r9), %xmm5
pxor %xmm1, %xmm3 # 3 = jo
jnz .Lenc_loop
@@ -201,62 +202,61 @@ _vpaes_decrypt_core:
## Inverse mix columns
##
movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ movdqa -0x10(%r10),%xmm1 # 0 : sb9t
pshufb %xmm2, %xmm4 # 4 = sb9u
- pxor %xmm0, %xmm4
- movdqa -0x10(%r10),%xmm0 # 0 : sb9t
- pshufb %xmm3, %xmm0 # 0 = sb9t
- pxor %xmm4, %xmm0 # 0 = ch
- add \$16, %r9 # next round key
-
- pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sb9t
+ pxor %xmm4, %xmm0
movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
pshufb %xmm2, %xmm4 # 4 = sbdu
- pxor %xmm0, %xmm4 # 4 = ch
- movdqa 0x10(%r10),%xmm0 # 0 : sbdt
- pshufb %xmm3, %xmm0 # 0 = sbdt
- pxor %xmm4, %xmm0 # 0 = ch
- sub \$1,%rax # nr--
-
pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 4 = ch
movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x30(%r10),%xmm1 # 0 : sbbt
+
pshufb %xmm2, %xmm4 # 4 = sbbu
- pxor %xmm0, %xmm4 # 4 = ch
- movdqa 0x30(%r10),%xmm0 # 0 : sbbt
- pshufb %xmm3, %xmm0 # 0 = sbbt
- pxor %xmm4, %xmm0 # 0 = ch
-
pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 4 = ch
movdqa 0x40(%r10),%xmm4 # 4 : sbeu
- pshufb %xmm2, %xmm4 # 4 = sbeu
- pxor %xmm0, %xmm4 # 4 = ch
- movdqa 0x50(%r10),%xmm0 # 0 : sbet
- pshufb %xmm3, %xmm0 # 0 = sbet
- pxor %xmm4, %xmm0 # 0 = ch
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x50(%r10),%xmm1 # 0 : sbet
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbet
+ pxor %xmm4, %xmm0 # 4 = ch
+ add \$16, %r9 # next round key
palignr \$12, %xmm5, %xmm5
-
+ pxor %xmm1, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
+
.Ldec_entry:
# top of round
movdqa %xmm9, %xmm1 # 1 : i
pandn %xmm0, %xmm1 # 1 = i<<4
+ movdqa %xmm11, %xmm2 # 2 : a/k
psrld \$4, %xmm1 # 1 = i
pand %xmm9, %xmm0 # 0 = k
- movdqa %xmm11, %xmm2 # 2 : a/k
pshufb %xmm0, %xmm2 # 2 = a/k
- pxor %xmm1, %xmm0 # 0 = j
movdqa %xmm10, %xmm3 # 3 : 1/i
+ pxor %xmm1, %xmm0 # 0 = j
pshufb %xmm1, %xmm3 # 3 = 1/i
- pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
movdqa %xmm10, %xmm4 # 4 : 1/j
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
pshufb %xmm0, %xmm4 # 4 = 1/j
pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
movdqa %xmm10, %xmm2 # 2 : 1/iak
pshufb %xmm3, %xmm2 # 2 = 1/iak
- pxor %xmm0, %xmm2 # 2 = io
movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pxor %xmm0, %xmm2 # 2 = io
pshufb %xmm4, %xmm3 # 3 = 1/jak
- pxor %xmm1, %xmm3 # 3 = jo
movdqu (%r9), %xmm0
+ pxor %xmm1, %xmm3 # 3 = jo
jnz .Ldec_loop
# middle of last round
@@ -464,12 +464,12 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,\@abi-omnipotent
.align 16
_vpaes_schedule_192_smear:
- pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
- pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm1, %xmm6 # -> c+d c 0 0
+ pxor %xmm1, %xmm1
pxor %xmm0, %xmm6 # -> b+c+d b+c b a
movdqa %xmm6, %xmm0
- pxor %xmm1, %xmm1
movhlps %xmm1, %xmm6 # clobber low side with zeros
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
diff --git a/crypto/arm64cpuid.S b/crypto/arm64cpuid.S
new file mode 100644
index 000000000000..4778ac1deacc
--- /dev/null
+++ b/crypto/arm64cpuid.S
@@ -0,0 +1,46 @@
+#include "arm_arch.h"
+
+.text
+.arch armv8-a+crypto
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ orr v15.16b, v15.16b, v15.16b
+ ret
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrs x0, CNTVCT_EL0
+ ret
+.size _armv7_tick,.-_armv7_tick
+
+.global _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+ aese v0.16b, v0.16b
+ ret
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.global _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+ sha1h s0, s0
+ ret
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.global _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+ sha256su0 v0.4s, v0.4s
+ ret
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.global _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+ pmull v0.1q, v0.1d, v0.1d
+ ret
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
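
Each of these stubs executes exactly one instruction of the extension under test, so a caller can detect support by trapping SIGILL. A hedged sketch of that calling pattern (the real armcap.c logic, shown further below, differs in its signal-mask handling):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    extern void _armv8_aes_probe(void);

    static sigjmp_buf ill_jmp;
    static void ill_handler(int sig) { siglongjmp(ill_jmp, sig); }

    /* returns nonzero if the AES instruction executed without SIGILL */
    static int probe_aes(void)
    {
        struct sigaction ill_act, ill_oact;
        int ok = 0;

        memset(&ill_act, 0, sizeof(ill_act));
        ill_act.sa_handler = ill_handler;
        sigaction(SIGILL, &ill_act, &ill_oact);
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_aes_probe();  /* faults without Crypto Extensions */
            ok = 1;
        }
        sigaction(SIGILL, &ill_oact, NULL);
        return ok;
    }
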
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index b65437162596..9d6e58880d0e 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -10,13 +10,24 @@
# define __ARMEL__
# endif
# elif defined(__GNUC__)
+# if defined(__aarch64__)
+# define __ARM_ARCH__ 8
+# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
/*
 * Why doesn't gcc define __ARM_ARCH__? Instead it defines a
 * bunch of the macros below. See the all_architectures[] table in
* gcc/config/arm/arm.c. On a side note it defines
* __ARMEL__/__ARMEB__ for little-/big-endian.
*/
-# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+# elif defined(__ARM_ARCH)
+# define __ARM_ARCH__ __ARM_ARCH
+# elif defined(__ARM_ARCH_8A__)
+# define __ARM_ARCH__ 8
+# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
defined(__ARM_ARCH_7EM__)
# define __ARM_ARCH__ 7
@@ -41,11 +52,27 @@
# include <openssl/fipssyms.h>
# endif
+# if !defined(__ARM_MAX_ARCH__)
+# define __ARM_MAX_ARCH__ __ARM_ARCH__
+# endif
+
+# if __ARM_MAX_ARCH__<__ARM_ARCH__
+# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
+# elif __ARM_MAX_ARCH__!=__ARM_ARCH__
+# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
+# error "can't build universal big-endian binary"
+# endif
+# endif
+
# if !__ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
-
-# define ARMV7_NEON (1<<0)
-# define ARMV7_TICK (1<<1)
# endif
+# define ARMV7_NEON (1<<0)
+# define ARMV7_TICK (1<<1)
+# define ARMV8_AES (1<<2)
+# define ARMV8_SHA1 (1<<3)
+# define ARMV8_SHA256 (1<<4)
+# define ARMV8_PMULL (1<<5)
+
#endif
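
The new ARMV8_* bits are consumed by testing OPENSSL_armcap_P at run
time. A minimal sketch of how a caller might branch on them (a sketch
only; the real dispatch happens inside the assembly modules):

    #include "arm_arch.h"

    /* sketch: branch on the new capability bits at run time
     * (OPENSSL_armcap_P is declared in arm_arch.h and filled in
     * by OPENSSL_cpuid_setup()) */
    static int have_hw_aes(void)
    {
        return (OPENSSL_armcap_P & ARMV8_AES) != 0;
    }
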
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 28522ea86784..356fa152871f 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -7,8 +7,18 @@
#include "arm_arch.h"
-unsigned int OPENSSL_armcap_P;
+unsigned int OPENSSL_armcap_P = 0;
+#if __ARM_MAX_ARCH__<7
+void OPENSSL_cpuid_setup(void)
+{
+}
+
+unsigned long OPENSSL_rdtsc(void)
+{
+ return 0;
+}
+#else
static sigset_t all_masked;
static sigjmp_buf ill_jmp;
@@ -22,9 +32,13 @@ static void ill_handler(int sig)
* ARM compilers support inline assembler...
*/
void _armv7_neon_probe(void);
-unsigned int _armv7_tick(void);
+void _armv8_aes_probe(void);
+void _armv8_sha1_probe(void);
+void _armv8_sha256_probe(void);
+void _armv8_pmull_probe(void);
+unsigned long _armv7_tick(void);
-unsigned int OPENSSL_rdtsc(void)
+unsigned long OPENSSL_rdtsc(void)
{
if (OPENSSL_armcap_P & ARMV7_TICK)
return _armv7_tick();
@@ -32,9 +46,44 @@ unsigned int OPENSSL_rdtsc(void)
return 0;
}
-#if defined(__GNUC__) && __GNUC__>=2
+/*
+ * Use a weak reference to getauxval() so we can use it if it is available but
+ * don't break the build if it is not.
+ */
+# if defined(__GNUC__) && __GNUC__>=2
void OPENSSL_cpuid_setup(void) __attribute__ ((constructor));
-#endif
+extern unsigned long getauxval(unsigned long type) __attribute__ ((weak));
+# else
+static unsigned long (*getauxval) (unsigned long) = NULL;
+# endif
+
+/*
+ * ARM puts the feature bits for the Crypto Extensions in AT_HWCAP2, whereas
+ * AArch64 uses AT_HWCAP.
+ */
+# if defined(__arm__) || defined (__arm)
+# define HWCAP 16
+ /* AT_HWCAP */
+# define HWCAP_NEON (1 << 12)
+
+# define HWCAP_CE 26
+ /* AT_HWCAP2 */
+# define HWCAP_CE_AES (1 << 0)
+# define HWCAP_CE_PMULL (1 << 1)
+# define HWCAP_CE_SHA1 (1 << 2)
+# define HWCAP_CE_SHA256 (1 << 3)
+# elif defined(__aarch64__)
+# define HWCAP 16
+ /* AT_HWCAP */
+# define HWCAP_NEON (1 << 1)
+
+# define HWCAP_CE HWCAP
+# define HWCAP_CE_AES (1 << 3)
+# define HWCAP_CE_PMULL (1 << 4)
+# define HWCAP_CE_SHA1 (1 << 5)
+# define HWCAP_CE_SHA256 (1 << 6)
+# endif
+
void OPENSSL_cpuid_setup(void)
{
char *e;
@@ -47,7 +96,7 @@ void OPENSSL_cpuid_setup(void)
trigger = 1;
if ((e = getenv("OPENSSL_armcap"))) {
- OPENSSL_armcap_P = strtoul(e, NULL, 0);
+ OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0);
return;
}
@@ -67,9 +116,42 @@ void OPENSSL_cpuid_setup(void)
sigprocmask(SIG_SETMASK, &ill_act.sa_mask, &oset);
sigaction(SIGILL, &ill_act, &ill_oact);
- if (sigsetjmp(ill_jmp, 1) == 0) {
+ if (getauxval != NULL) {
+ if (getauxval(HWCAP) & HWCAP_NEON) {
+ unsigned long hwcap = getauxval(HWCAP_CE);
+
+ OPENSSL_armcap_P |= ARMV7_NEON;
+
+ if (hwcap & HWCAP_CE_AES)
+ OPENSSL_armcap_P |= ARMV8_AES;
+
+ if (hwcap & HWCAP_CE_PMULL)
+ OPENSSL_armcap_P |= ARMV8_PMULL;
+
+ if (hwcap & HWCAP_CE_SHA1)
+ OPENSSL_armcap_P |= ARMV8_SHA1;
+
+ if (hwcap & HWCAP_CE_SHA256)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ } else if (sigsetjmp(ill_jmp, 1) == 0) {
_armv7_neon_probe();
OPENSSL_armcap_P |= ARMV7_NEON;
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_pmull_probe();
+ OPENSSL_armcap_P |= ARMV8_PMULL | ARMV8_AES;
+ } else if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_aes_probe();
+ OPENSSL_armcap_P |= ARMV8_AES;
+ }
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha1_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA1;
+ }
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha256_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
}
if (sigsetjmp(ill_jmp, 1) == 0) {
_armv7_tick();
@@ -79,3 +161,4 @@ void OPENSSL_cpuid_setup(void)
sigaction(SIGILL, &ill_oact, NULL);
sigprocmask(SIG_SETMASK, &oset, NULL);
}
+#endif
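
The weak-symbol pattern above can be exercised in isolation; a minimal
sketch, assuming a glibc-style getauxval() and the AT_HWCAP value 16
hard-coded as in the patch:

    #include <stdio.h>

    extern unsigned long getauxval(unsigned long type) __attribute__ ((weak));

    int main(void)
    {
        if (getauxval != NULL)  /* resolves to NULL when libc lacks it */
            printf("AT_HWCAP = 0x%lx\n", getauxval(16));
        else
            printf("no getauxval, would fall back to SIGILL probes\n");
        return 0;
    }
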
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index 2d618deaa43e..65010ae4fe06 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -4,20 +4,6 @@
.code 32
.align 5
-.global _armv7_neon_probe
-.type _armv7_neon_probe,%function
-_armv7_neon_probe:
- .word 0xf26ee1fe @ vorr q15,q15,q15
- .word 0xe12fff1e @ bx lr
-.size _armv7_neon_probe,.-_armv7_neon_probe
-
-.global _armv7_tick
-.type _armv7_tick,%function
-_armv7_tick:
- mrc p15,0,r0,c9,c13,0
- .word 0xe12fff1e @ bx lr
-.size _armv7_tick,.-_armv7_tick
-
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
OPENSSL_atomic_add:
@@ -28,7 +14,7 @@ OPENSSL_atomic_add:
cmp r2,#0
bne .Ladd
mov r0,r3
- .word 0xe12fff1e @ bx lr
+ bx lr
#else
stmdb sp!,{r4-r6,lr}
ldr r2,.Lspinlock
@@ -81,62 +67,131 @@ OPENSSL_cleanse:
adds r1,r1,#4
bne .Little
.Lcleanse_done:
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_cleanse,.-OPENSSL_cleanse
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ vorr q0,q0,q0
+ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrrc p15,1,r0,r1,c14 @ CNTVCT
+ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+ .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
+ bx lr
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.global _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+ .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
+ bx lr
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.global _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+ .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
+ bx lr
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.global _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+ .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
+ bx lr
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
+#endif
+
.global OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,%function
OPENSSL_wipe_cpu:
+#if __ARM_MAX_ARCH__>=7
ldr r0,.LOPENSSL_armcap
adr r1,.LOPENSSL_armcap
ldr r0,[r1,r0]
+#endif
eor r2,r2,r2
eor r3,r3,r3
eor ip,ip,ip
+#if __ARM_MAX_ARCH__>=7
tst r0,#1
beq .Lwipe_done
- .word 0xf3000150 @ veor q0, q0, q0
- .word 0xf3022152 @ veor q1, q1, q1
- .word 0xf3044154 @ veor q2, q2, q2
- .word 0xf3066156 @ veor q3, q3, q3
- .word 0xf34001f0 @ veor q8, q8, q8
- .word 0xf34221f2 @ veor q9, q9, q9
- .word 0xf34441f4 @ veor q10, q10, q10
- .word 0xf34661f6 @ veor q11, q11, q11
- .word 0xf34881f8 @ veor q12, q12, q12
- .word 0xf34aa1fa @ veor q13, q13, q13
- .word 0xf34cc1fc @ veor q14, q14, q14
- .word 0xf34ee1fe @ veor q15, q15, q15
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+ veor q3, q3, q3
+ veor q8, q8, q8
+ veor q9, q9, q9
+ veor q10, q10, q10
+ veor q11, q11, q11
+ veor q12, q12, q12
+ veor q13, q13, q13
+ veor q14, q14, q14
+ veor q15, q15, q15
.Lwipe_done:
+#endif
mov r0,sp
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.global OPENSSL_instrument_bus
.type OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
.global OPENSSL_instrument_bus2
.type OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
.align 5
+#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LOPENSSL_armcap
+#endif
#if __ARM_ARCH__>=6
.align 5
#else
diff --git a/crypto/asn1/Makefile b/crypto/asn1/Makefile
index d774f7821238..330fe81b740c 100644
--- a/crypto/asn1/Makefile
+++ b/crypto/asn1/Makefile
@@ -176,7 +176,7 @@ a_gentm.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
a_gentm.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
a_gentm.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
a_gentm.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-a_gentm.o: ../cryptlib.h ../o_time.h a_gentm.c
+a_gentm.o: ../cryptlib.h ../o_time.h a_gentm.c asn1_locl.h
a_i2d_fp.o: ../../e_os.h ../../include/openssl/asn1.h
a_i2d_fp.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
a_i2d_fp.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
@@ -277,6 +277,7 @@ a_time.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
a_time.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
a_time.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
a_time.o: ../../include/openssl/symhacks.h ../cryptlib.h ../o_time.h a_time.c
+a_time.o: asn1_locl.h
a_type.o: ../../e_os.h ../../include/openssl/asn1.h
a_type.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h
a_type.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
@@ -293,7 +294,7 @@ a_utctm.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
a_utctm.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
a_utctm.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
a_utctm.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-a_utctm.o: ../cryptlib.h ../o_time.h a_utctm.c
+a_utctm.o: ../cryptlib.h ../o_time.h a_utctm.c asn1_locl.h
a_utf8.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
a_utf8.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
a_utf8.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/crypto/asn1/a_gentm.c b/crypto/asn1/a_gentm.c
index 8b3ef716e334..fa76dcac91f3 100644
--- a/crypto/asn1/a_gentm.c
+++ b/crypto/asn1/a_gentm.c
@@ -65,6 +65,7 @@
#include "cryptlib.h"
#include "o_time.h"
#include <openssl/asn1.h>
+#include "asn1_locl.h"
#if 0
@@ -117,7 +118,7 @@ ASN1_GENERALIZEDTIME *d2i_ASN1_GENERALIZEDTIME(ASN1_GENERALIZEDTIME **a,
#endif
-int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
+int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d)
{
static const int min[9] = { 0, 0, 1, 1, 0, 0, 0, 0, 0 };
static const int max[9] = { 99, 99, 12, 31, 23, 59, 59, 12, 59 };
@@ -139,6 +140,8 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
for (i = 0; i < 7; i++) {
if ((i == 6) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
i++;
+ if (tm)
+ tm->tm_sec = 0;
break;
}
if ((a[o] < '0') || (a[o] > '9'))
@@ -155,6 +158,31 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
if ((n < min[i]) || (n > max[i]))
goto err;
+ if (tm) {
+ switch (i) {
+ case 0:
+ tm->tm_year = n * 100 - 1900;
+ break;
+ case 1:
+ tm->tm_year += n;
+ break;
+ case 2:
+ tm->tm_mon = n - 1;
+ break;
+ case 3:
+ tm->tm_mday = n;
+ break;
+ case 4:
+ tm->tm_hour = n;
+ break;
+ case 5:
+ tm->tm_min = n;
+ break;
+ case 6:
+ tm->tm_sec = n;
+ break;
+ }
+ }
}
/*
* Optional fractional seconds: decimal point followed by one or more
@@ -174,6 +202,7 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
if (a[o] == 'Z')
o++;
else if ((a[o] == '+') || (a[o] == '-')) {
+ int offsign = a[o] == '-' ? -1 : 1, offset = 0;
o++;
if (o + 4 > l)
goto err;
@@ -187,9 +216,17 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
n = (n * 10) + a[o] - '0';
if ((n < min[i]) || (n > max[i]))
goto err;
+ if (tm) {
+ if (i == 7)
+ offset = n * 3600;
+ else if (i == 8)
+ offset += n * 60;
+ }
o++;
}
- } else {
+ if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
+ return 0;
+ } else if (a[o]) {
/* Missing time zone information. */
goto err;
}
@@ -198,6 +235,11 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
return (0);
}
+int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *d)
+{
+ return asn1_generalizedtime_to_tm(NULL, d);
+}
+
int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str)
{
ASN1_GENERALIZEDTIME t;
diff --git a/crypto/asn1/a_time.c b/crypto/asn1/a_time.c
index c81f0de5a9e8..fcb2d565cdd0 100644
--- a/crypto/asn1/a_time.c
+++ b/crypto/asn1/a_time.c
@@ -66,6 +66,7 @@
#include "cryptlib.h"
#include "o_time.h"
#include <openssl/asn1t.h>
+#include "asn1_locl.h"
IMPLEMENT_ASN1_MSTRING(ASN1_TIME, B_ASN1_TIME)
@@ -196,3 +197,32 @@ int ASN1_TIME_set_string(ASN1_TIME *s, const char *str)
return 1;
}
+
+static int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *t)
+{
+ if (t == NULL) {
+ time_t now_t;
+ time(&now_t);
+ if (OPENSSL_gmtime(&now_t, tm))
+ return 1;
+ return 0;
+ }
+
+ if (t->type == V_ASN1_UTCTIME)
+ return asn1_utctime_to_tm(tm, t);
+ else if (t->type == V_ASN1_GENERALIZEDTIME)
+ return asn1_generalizedtime_to_tm(tm, t);
+
+ return 0;
+}
+
+int ASN1_TIME_diff(int *pday, int *psec,
+ const ASN1_TIME *from, const ASN1_TIME *to)
+{
+ struct tm tm_from, tm_to;
+ if (!asn1_time_to_tm(&tm_from, from))
+ return 0;
+ if (!asn1_time_to_tm(&tm_to, to))
+ return 0;
+ return OPENSSL_gmtime_diff(pday, psec, &tm_from, &tm_to);
+}
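
A short usage sketch for the new ASN1_TIME_diff() (the `cert` variable
is hypothetical; X509_get_notAfter() is the existing accessor, and
passing NULL for `from` means "now", per asn1_time_to_tm() above):

    #include <openssl/asn1.h>
    #include <openssl/x509.h>

    /* sketch: does the cert's notAfter lie in the future?
     * returns 1 yes, 0 no, -1 on a malformed time */
    static int cert_not_expired(X509 *cert)
    {
        int days, secs;

        if (!ASN1_TIME_diff(&days, &secs, NULL, X509_get_notAfter(cert)))
            return -1;
        return (days > 0 || secs > 0) ? 1 : 0;
    }
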
diff --git a/crypto/asn1/a_utctm.c b/crypto/asn1/a_utctm.c
index 179de6d39590..724a10be4ed6 100644
--- a/crypto/asn1/a_utctm.c
+++ b/crypto/asn1/a_utctm.c
@@ -61,6 +61,7 @@
#include "cryptlib.h"
#include "o_time.h"
#include <openssl/asn1.h>
+#include "asn1_locl.h"
#if 0
int i2d_ASN1_UTCTIME(ASN1_UTCTIME *a, unsigned char **pp)
@@ -109,7 +110,7 @@ ASN1_UTCTIME *d2i_ASN1_UTCTIME(ASN1_UTCTIME **a, unsigned char **pp,
#endif
-int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
+int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d)
{
static const int min[8] = { 0, 1, 1, 0, 0, 0, 0, 0 };
static const int max[8] = { 99, 12, 31, 23, 59, 59, 12, 59 };
@@ -127,6 +128,8 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
for (i = 0; i < 6; i++) {
if ((i == 5) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
i++;
+ if (tm)
+ tm->tm_sec = 0;
break;
}
if ((a[o] < '0') || (a[o] > '9'))
@@ -143,10 +146,33 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
if ((n < min[i]) || (n > max[i]))
goto err;
+ if (tm) {
+ switch (i) {
+ case 0:
+ tm->tm_year = n < 50 ? n + 100 : n;
+ break;
+ case 1:
+ tm->tm_mon = n - 1;
+ break;
+ case 2:
+ tm->tm_mday = n;
+ break;
+ case 3:
+ tm->tm_hour = n;
+ break;
+ case 4:
+ tm->tm_min = n;
+ break;
+ case 5:
+ tm->tm_sec = n;
+ break;
+ }
+ }
}
if (a[o] == 'Z')
o++;
else if ((a[o] == '+') || (a[o] == '-')) {
+ int offsign = a[o] == '-' ? -1 : 1, offset = 0;
o++;
if (o + 4 > l)
goto err;
@@ -160,12 +186,25 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
n = (n * 10) + a[o] - '0';
if ((n < min[i]) || (n > max[i]))
goto err;
+ if (tm) {
+ if (i == 6)
+ offset = n * 3600;
+ else if (i == 7)
+ offset += n * 60;
+ }
o++;
}
+ if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
+ return 0;
}
- return (o == l);
+ return o == l;
err:
- return (0);
+ return 0;
+}
+
+int ASN1_UTCTIME_check(const ASN1_UTCTIME *d)
+{
+ return asn1_utctime_to_tm(NULL, d);
}
int ASN1_UTCTIME_set_string(ASN1_UTCTIME *s, const char *str)
@@ -249,43 +288,26 @@ ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, time_t t,
int ASN1_UTCTIME_cmp_time_t(const ASN1_UTCTIME *s, time_t t)
{
- struct tm *tm;
- struct tm data;
- int offset;
- int year;
-
-#define g2(p) (((p)[0]-'0')*10+(p)[1]-'0')
-
- if (s->data[12] == 'Z')
- offset = 0;
- else {
- offset = g2(s->data + 13) * 60 + g2(s->data + 15);
- if (s->data[12] == '-')
- offset = -offset;
- }
+ struct tm stm, ttm;
+ int day, sec;
- t -= offset * 60; /* FIXME: may overflow in extreme cases */
+ if (!asn1_utctime_to_tm(&stm, s))
+ return -2;
- tm = OPENSSL_gmtime(&t, &data);
- /*
- * NB: -1, 0, 1 already valid return values so use -2 to indicate error.
- */
- if (tm == NULL)
+ if (!OPENSSL_gmtime(&t, &ttm))
return -2;
-#define return_cmp(a,b) if ((a)<(b)) return -1; else if ((a)>(b)) return 1
- year = g2(s->data);
- if (year < 50)
- year += 100;
- return_cmp(year, tm->tm_year);
- return_cmp(g2(s->data + 2) - 1, tm->tm_mon);
- return_cmp(g2(s->data + 4), tm->tm_mday);
- return_cmp(g2(s->data + 6), tm->tm_hour);
- return_cmp(g2(s->data + 8), tm->tm_min);
- return_cmp(g2(s->data + 10), tm->tm_sec);
-#undef g2
-#undef return_cmp
+ if (!OPENSSL_gmtime_diff(&day, &sec, &ttm, &stm))
+ return -2;
+ if (day > 0)
+ return 1;
+ if (day < 0)
+ return -1;
+ if (sec > 0)
+ return 1;
+ if (sec < 0)
+ return -1;
return 0;
}
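
Callers should keep treating -2 as an error rather than an ordering; a
minimal sketch (hypothetical helper, not part of the patch):

    #include <time.h>
    #include <openssl/asn1.h>

    /* sketch: classify a UTCTime against "now"; -2 stays an error code */
    static const char *utctime_vs_now(const ASN1_UTCTIME *s)
    {
        switch (ASN1_UTCTIME_cmp_time_t(s, time(NULL))) {
        case -2: return "malformed";
        case -1: return "in the past";
        case  0: return "now";
        default: return "in the future";
        }
    }
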
diff --git a/crypto/asn1/ameth_lib.c b/crypto/asn1/ameth_lib.c
index 45f3f4056049..5389c0434740 100644
--- a/crypto/asn1/ameth_lib.c
+++ b/crypto/asn1/ameth_lib.c
@@ -68,6 +68,7 @@
extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[];
extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD dhx_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
@@ -92,7 +93,10 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = {
&eckey_asn1_meth,
#endif
&hmac_asn1_meth,
- &cmac_asn1_meth
+ &cmac_asn1_meth,
+#ifndef OPENSSL_NO_DH
+ &dhx_asn1_meth
+#endif
};
typedef int sk_cmp_fn_type(const char *const *a, const char *const *b);
@@ -460,3 +464,21 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
{
ameth->pkey_ctrl = pkey_ctrl;
}
+
+void EVP_PKEY_asn1_set_item(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*item_verify) (EVP_MD_CTX *ctx,
+ const ASN1_ITEM *it,
+ void *asn,
+ X509_ALGOR *a,
+ ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey),
+ int (*item_sign) (EVP_MD_CTX *ctx,
+ const ASN1_ITEM *it,
+ void *asn,
+ X509_ALGOR *alg1,
+ X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig))
+{
+ ameth->item_sign = item_sign;
+ ameth->item_verify = item_verify;
+}
diff --git a/crypto/asn1/asn1.h b/crypto/asn1/asn1.h
index 39b7833f582f..68e791fcdbe8 100644
--- a/crypto/asn1/asn1.h
+++ b/crypto/asn1/asn1.h
@@ -207,13 +207,13 @@ typedef struct asn1_const_ctx_st {
# define ASN1_OBJECT_FLAG_CRITICAL 0x02/* critical x509v3 object id */
# define ASN1_OBJECT_FLAG_DYNAMIC_STRINGS 0x04/* internal use */
# define ASN1_OBJECT_FLAG_DYNAMIC_DATA 0x08/* internal use */
-typedef struct asn1_object_st {
+struct asn1_object_st {
const char *sn, *ln;
int nid;
int length;
const unsigned char *data; /* data remains const after init */
int flags; /* Should we free this one */
-} ASN1_OBJECT;
+};
# define ASN1_STRING_FLAG_BITS_LEFT 0x08/* Set if 0x07 has bits left value */
/*
@@ -843,7 +843,7 @@ int ASN1_INTEGER_cmp(const ASN1_INTEGER *x, const ASN1_INTEGER *y);
DECLARE_ASN1_FUNCTIONS(ASN1_ENUMERATED)
-int ASN1_UTCTIME_check(ASN1_UTCTIME *a);
+int ASN1_UTCTIME_check(const ASN1_UTCTIME *a);
ASN1_UTCTIME *ASN1_UTCTIME_set(ASN1_UTCTIME *s, time_t t);
ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, time_t t,
int offset_day, long offset_sec);
@@ -853,13 +853,15 @@ int ASN1_UTCTIME_cmp_time_t(const ASN1_UTCTIME *s, time_t t);
time_t ASN1_UTCTIME_get(const ASN1_UTCTIME *s);
# endif
-int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *a);
+int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *a);
ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_set(ASN1_GENERALIZEDTIME *s,
time_t t);
ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_adj(ASN1_GENERALIZEDTIME *s,
time_t t, int offset_day,
long offset_sec);
int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str);
+int ASN1_TIME_diff(int *pday, int *psec,
+ const ASN1_TIME *from, const ASN1_TIME *to);
DECLARE_ASN1_FUNCTIONS(ASN1_OCTET_STRING)
ASN1_OCTET_STRING *ASN1_OCTET_STRING_dup(const ASN1_OCTET_STRING *a);
diff --git a/crypto/asn1/asn1_locl.h b/crypto/asn1/asn1_locl.h
index 9f5ed8472718..4c004fab9a8d 100644
--- a/crypto/asn1/asn1_locl.h
+++ b/crypto/asn1/asn1_locl.h
@@ -59,6 +59,9 @@
/* Internal ASN1 structures and functions: not for application use */
+int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d);
+int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
+
/* ASN1 print context structure */
struct asn1_pctx_st {
diff --git a/crypto/asn1/t_x509.c b/crypto/asn1/t_x509.c
index 4e7c45dd5ddc..8aab55130c95 100644
--- a/crypto/asn1/t_x509.c
+++ b/crypto/asn1/t_x509.c
@@ -228,6 +228,21 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags,
}
}
+ if (!(cflag & X509_FLAG_NO_IDS)) {
+ if (ci->issuerUID) {
+ if (BIO_printf(bp, "%8sIssuer Unique ID: ", "") <= 0)
+ goto err;
+ if (!X509_signature_dump(bp, ci->issuerUID, 12))
+ goto err;
+ }
+ if (ci->subjectUID) {
+ if (BIO_printf(bp, "%8sSubject Unique ID: ", "") <= 0)
+ goto err;
+ if (!X509_signature_dump(bp, ci->subjectUID, 12))
+ goto err;
+ }
+ }
+
if (!(cflag & X509_FLAG_NO_EXTENSIONS))
X509V3_extensions_print(bp, "X509v3 extensions",
ci->extensions, cflag, 8);
diff --git a/crypto/asn1/x_crl.c b/crypto/asn1/x_crl.c
index e258c714b249..027950330d8b 100644
--- a/crypto/asn1/x_crl.c
+++ b/crypto/asn1/x_crl.c
@@ -58,8 +58,8 @@
#include <stdio.h>
#include "cryptlib.h"
-#include "asn1_locl.h"
#include <openssl/asn1t.h>
+#include "asn1_locl.h"
#include <openssl/x509.h>
#include <openssl/x509v3.h>
@@ -341,6 +341,8 @@ ASN1_SEQUENCE_ref(X509_CRL, crl_cb, CRYPTO_LOCK_X509_CRL) = {
IMPLEMENT_ASN1_FUNCTIONS(X509_REVOKED)
+IMPLEMENT_ASN1_DUP_FUNCTION(X509_REVOKED)
+
IMPLEMENT_ASN1_FUNCTIONS(X509_CRL_INFO)
IMPLEMENT_ASN1_FUNCTIONS(X509_CRL)
diff --git a/crypto/asn1/x_x509.c b/crypto/asn1/x_x509.c
index f56e837b3a08..5f266a26b4c2 100644
--- a/crypto/asn1/x_x509.c
+++ b/crypto/asn1/x_x509.c
@@ -208,3 +208,23 @@ int i2d_X509_AUX(X509 *a, unsigned char **pp)
length += i2d_X509_CERT_AUX(a->aux, pp);
return length;
}
+
+int i2d_re_X509_tbs(X509 *x, unsigned char **pp)
+{
+ x->cert_info->enc.modified = 1;
+ return i2d_X509_CINF(x->cert_info, pp);
+}
+
+void X509_get0_signature(ASN1_BIT_STRING **psig, X509_ALGOR **palg,
+ const X509 *x)
+{
+ if (psig)
+ *psig = x->signature;
+ if (palg)
+ *palg = x->sig_alg;
+}
+
+int X509_get_signature_nid(const X509 *x)
+{
+ return OBJ_obj2nid(x->sig_alg->algorithm);
+}
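
A usage sketch for the new X509 accessors (hypothetical function; the
NID constant is the existing one from objects.h):

    #include <openssl/x509.h>
    #include <openssl/objects.h>

    /* sketch: inspect signature algorithm and raw signature bits */
    static int uses_sha256_rsa(X509 *cert)
    {
        ASN1_BIT_STRING *sig;
        X509_ALGOR *alg;

        X509_get0_signature(&sig, &alg, cert);   /* no copies, no frees */
        return X509_get_signature_nid(cert) == NID_sha256WithRSAEncryption;
    }
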
diff --git a/crypto/bio/b_dump.c b/crypto/bio/b_dump.c
index 3293c724c94f..ed8e521449a4 100644
--- a/crypto/bio/b_dump.c
+++ b/crypto/bio/b_dump.c
@@ -182,3 +182,28 @@ int BIO_dump_indent(BIO *bp, const char *s, int len, int indent)
{
return BIO_dump_indent_cb(write_bio, bp, s, len, indent);
}
+
+int BIO_hex_string(BIO *out, int indent, int width, unsigned char *data,
+ int datalen)
+{
+ int i, j = 0;
+
+ if (datalen < 1)
+ return 1;
+
+ for (i = 0; i < datalen - 1; i++) {
+ if (i && !j)
+ BIO_printf(out, "%*s", indent, "");
+
+ BIO_printf(out, "%02X:", data[i]);
+
+ j = (j + 1) % width;
+ if (!j)
+ BIO_printf(out, "\n");
+ }
+
+ if (i && !j)
+ BIO_printf(out, "%*s", indent, "");
+ BIO_printf(out, "%02X", data[datalen - 1]);
+ return 1;
+}
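
A usage sketch for the new BIO_hex_string() (hypothetical wrapper; note
the function itself emits no newline after the final byte):

    #include <openssl/bio.h>

    /* sketch: dump a buffer as colon-separated hex, 16 bytes per line */
    static void dump_hex(BIO *out, unsigned char *buf, int len)
    {
        BIO_hex_string(out, 4, 16, buf, len);
        BIO_printf(out, "\n");
    }
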
diff --git a/crypto/bio/b_sock.c b/crypto/bio/b_sock.c
index bda882c40b08..5bad0a2bada2 100644
--- a/crypto/bio/b_sock.c
+++ b/crypto/bio/b_sock.c
@@ -225,13 +225,17 @@ int BIO_get_port(const char *str, unsigned short *port_ptr)
int BIO_sock_error(int sock)
{
int j, i;
- int size;
+ union {
+ size_t s;
+ int i;
+ } size;
# if defined(OPENSSL_SYS_BEOS_R5)
return 0;
# endif
- size = sizeof(int);
+ /* heuristic way to adapt for platforms that expect 64-bit optlen */
+ size.s = 0, size.i = sizeof(j);
/*
* Note: under Windows the third parameter is of type (char *) whereas
* under other systems it is (void *) if you don't have a cast it will
diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index d583cc108508..f78796b069f5 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h
@@ -174,6 +174,7 @@ extern "C" {
# define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT 45/* Next DTLS handshake timeout
* to adjust socket timeouts */
+# define BIO_CTRL_DGRAM_SET_DONT_FRAG 48
# define BIO_CTRL_DGRAM_GET_MTU_OVERHEAD 49
@@ -725,6 +726,9 @@ int BIO_dump_indent(BIO *b, const char *bytes, int len, int indent);
int BIO_dump_fp(FILE *fp, const char *s, int len);
int BIO_dump_indent_fp(FILE *fp, const char *s, int len, int indent);
# endif
+int BIO_hex_string(BIO *out, int indent, int width, unsigned char *data,
+ int datalen);
+
struct hostent *BIO_gethostbyname(const char *name);
/*-
* We might want a thread-safe interface too:
@@ -761,8 +765,8 @@ int BIO_dgram_sctp_wait_for_dry(BIO *b);
int BIO_dgram_sctp_msg_waiting(BIO *b);
# endif
BIO *BIO_new_fd(int fd, int close_flag);
-BIO *BIO_new_connect(char *host_port);
-BIO *BIO_new_accept(char *host_port);
+BIO *BIO_new_connect(const char *host_port);
+BIO *BIO_new_accept(const char *host_port);
int BIO_new_bio_pair(BIO **bio1, size_t writebuf1,
BIO **bio2, size_t writebuf2);
diff --git a/crypto/bio/bio_err.c b/crypto/bio/bio_err.c
index 6dd6162fc93a..d9007aa3d32d 100644
--- a/crypto/bio/bio_err.c
+++ b/crypto/bio/bio_err.c
@@ -1,6 +1,6 @@
/* crypto/bio/bio_err.c */
/* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2015 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
diff --git a/crypto/bio/bss_acpt.c b/crypto/bio/bss_acpt.c
index d08292c3e9ac..4a5e39bd387b 100644
--- a/crypto/bio/bss_acpt.c
+++ b/crypto/bio/bss_acpt.c
@@ -445,7 +445,7 @@ static int acpt_puts(BIO *bp, const char *str)
return (ret);
}
-BIO *BIO_new_accept(char *str)
+BIO *BIO_new_accept(const char *str)
{
BIO *ret;
diff --git a/crypto/bio/bss_conn.c b/crypto/bio/bss_conn.c
index 6a5e8de8812a..42d0afffbc6c 100644
--- a/crypto/bio/bss_conn.c
+++ b/crypto/bio/bss_conn.c
@@ -585,7 +585,7 @@ static int conn_puts(BIO *bp, const char *str)
return (ret);
}
-BIO *BIO_new_connect(char *str)
+BIO *BIO_new_connect(const char *str)
{
BIO *ret;
diff --git a/crypto/bio/bss_dgram.c b/crypto/bio/bss_dgram.c
index e3e3dd0503a8..7fcd831da06b 100644
--- a/crypto/bio/bss_dgram.c
+++ b/crypto/bio/bss_dgram.c
@@ -65,7 +65,7 @@
#include <openssl/bio.h>
#ifndef OPENSSL_NO_DGRAM
-# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VMS)
+# if defined(OPENSSL_SYS_VMS)
# include <sys/timeb.h>
# endif
@@ -80,6 +80,10 @@
# define IP_MTU 14 /* linux is lame */
# endif
+# if OPENSSL_USE_IPV6 && !defined(IPPROTO_IPV6)
+# define IPPROTO_IPV6 41 /* windows is lame */
+# endif
+
# if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
/* Standard definition causes type-punning problems. */
# undef IN6_IS_ADDR_V4MAPPED
@@ -496,8 +500,8 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
int *ip;
struct sockaddr *to = NULL;
bio_dgram_data *data = NULL;
-# if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
int sockopt_val = 0;
+# if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
socklen_t sockopt_len; /* assume that system supporting IP_MTU is
* modern enough to define socklen_t */
socklen_t addr_len;
@@ -882,6 +886,61 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
break;
# endif
+ case BIO_CTRL_DGRAM_SET_DONT_FRAG:
+ sockopt_val = num ? 1 : 0;
+
+ switch (data->peer.sa.sa_family) {
+ case AF_INET:
+# if defined(IP_DONTFRAG)
+ if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAG,
+ &sockopt_val, sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# elif defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined (IP_PMTUDISC_PROBE)
+ if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
+ (ret = setsockopt(b->num, IPPROTO_IP, IP_MTU_DISCOVER,
+ &sockopt_val, sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# elif defined(OPENSSL_SYS_WINDOWS) && defined(IP_DONTFRAGMENT)
+ if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAGMENT,
+ (const char *)&sockopt_val,
+ sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# else
+ ret = -1;
+# endif
+ break;
+# if OPENSSL_USE_IPV6
+ case AF_INET6:
+# if defined(IPV6_DONTFRAG)
+ if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_DONTFRAG,
+ (const void *)&sockopt_val,
+ sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# elif defined(OPENSSL_SYS_LINUX) && defined(IPV6_MTUDISCOVER)
+ if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
+ (ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
+ &sockopt_val, sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# else
+ ret = -1;
+# endif
+ break;
+# endif
+ default:
+ ret = -1;
+ break;
+ }
+ break;
case BIO_CTRL_DGRAM_GET_MTU_OVERHEAD:
ret = dgram_get_mtu_overhead(data);
break;
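
Applications would drive the new control through the generic BIO_ctrl()
call; a minimal sketch (hypothetical wrapper, since this patch adds no
dedicated convenience macro):

    #include <openssl/bio.h>

    /* sketch: ask the kernel not to fragment outgoing datagrams */
    static long dgram_set_dont_frag(BIO *dgram_bio, int on)
    {
        return BIO_ctrl(dgram_bio, BIO_CTRL_DGRAM_SET_DONT_FRAG,
                        on ? 1 : 0, NULL);
    }
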
@@ -1995,11 +2054,22 @@ int BIO_dgram_non_fatal_error(int err)
static void get_current_time(struct timeval *t)
{
-# ifdef OPENSSL_SYS_WIN32
- struct _timeb tb;
- _ftime(&tb);
- t->tv_sec = (long)tb.time;
- t->tv_usec = (long)tb.millitm * 1000;
+# if defined(_WIN32)
+ SYSTEMTIME st;
+ union {
+ unsigned __int64 ul;
+ FILETIME ft;
+ } now;
+
+ GetSystemTime(&st);
+ SystemTimeToFileTime(&st, &now.ft);
+# ifdef __MINGW32__
+ now.ul -= 116444736000000000ULL;
+# else
+ now.ul -= 116444736000000000UI64; /* re-bias to 1/1/1970 */
+# endif
+ t->tv_sec = (long)(now.ul / 10000000);
+ t->tv_usec = ((int)(now.ul % 10000000)) / 10;
# elif defined(OPENSSL_SYS_VMS)
struct timeb tb;
ftime(&tb);
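
The magic constant in the Windows path above is the number of 100 ns
FILETIME intervals between the FILETIME epoch (1601-01-01) and the Unix
epoch (1970-01-01); a quick check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        /* 1601..1969 spans 369 years, 89 of them leap (1700/1800/1900 are not) */
        unsigned long long days = 369ULL * 365 + 89;
        unsigned long long bias = days * 86400 * 10000000ULL; /* 100 ns units */

        printf("%llu\n", bias); /* prints 116444736000000000 */
        return 0;
    }
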
diff --git a/crypto/bio/bss_fd.c b/crypto/bio/bss_fd.c
index ccef578154d7..5f4e34481b0e 100644
--- a/crypto/bio/bss_fd.c
+++ b/crypto/bio/bss_fd.c
@@ -63,9 +63,27 @@
#if defined(OPENSSL_NO_POSIX_IO)
/*
- * One can argue that one should implement dummy placeholder for
- * BIO_s_fd here...
+ * Dummy placeholder for BIO_s_fd...
*/
+BIO *BIO_new_fd(int fd, int close_flag)
+{
+ return NULL;
+}
+
+int BIO_fd_non_fatal_error(int err)
+{
+ return 0;
+}
+
+int BIO_fd_should_retry(int i)
+{
+ return 0;
+}
+
+BIO_METHOD *BIO_s_fd(void)
+{
+ return NULL;
+}
#else
/*
* As for unconditional usage of "UPLINK" interface in this module.
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 3d0158c15aea..215855ecae91 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -77,6 +77,12 @@ sparcv9a-mont.s: asm/sparcv9a-mont.pl
$(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@
sparcv9-mont.s: asm/sparcv9-mont.pl
$(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
+vis3-mont.s: asm/vis3-mont.pl
+ $(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
+sparct4-mont.S: asm/sparct4-mont.pl
+ $(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
+sparcv9-gf2m.S: asm/sparcv9-gf2m.pl
+ $(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@
bn-mips3.o: asm/mips3.s
@if [ "$(CC)" = "gcc" ]; then \
@@ -102,8 +108,10 @@ x86_64-mont5.s: asm/x86_64-mont5.pl
$(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
x86_64-gf2m.s: asm/x86_64-gf2m.pl
$(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
-modexp512-x86_64.s: asm/modexp512-x86_64.pl
- $(PERL) asm/modexp512-x86_64.pl $(PERLASM_SCHEME) > $@
+rsaz-x86_64.s: asm/rsaz-x86_64.pl
+ $(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@
+rsaz-avx2.s: asm/rsaz-avx2.pl
+ $(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@
bn-ia64.s: asm/ia64.S
$(CC) $(CFLAGS) -E asm/ia64.S > $@
@@ -125,14 +133,15 @@ ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@
ppc64-mont.s: asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@
alpha-mont.s: asm/alpha-mont.pl
- (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
+ (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
$(PERL) asm/alpha-mont.pl > $$preproc && \
- $(CC) -E $$preproc > $@ && rm $$preproc)
+ $(CC) -E -P $$preproc > $@ && rm $$preproc)
# GNU make "catch all"
-%-mont.s: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+%-mont.S: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
%-gf2m.S: asm/%-gf2m.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+armv4-mont.o: armv4-mont.S
armv4-gf2m.o: armv4-gf2m.S
files:
@@ -244,6 +253,7 @@ bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
+bn_exp.o: rsaz_exp.h
bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index c52e0b75b5b6..8f529c95cf05 100755
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -20,48 +20,26 @@
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even fewer cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
+#
+# April 2014
+#
+# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
+# referred below, which improves ECDH and ECDSA verify benchmarks
+# by 18-40%.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
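
As a plain-C reference for the operation being accelerated, a sketch of
the 1x1 carryless multiply underlying bn_GF2m_mul_2x2 (illustration
only, using 64-bit words; the ARM code works on 32-bit BN_ULONGs):

    #include <stdint.h>

    /* r[1]:r[0] = a * b over GF(2)[x], i.e. multiplication with XOR
     * in place of addition (no carries) */
    static void gf2m_mul_1x1(uint64_t r[2], uint64_t a, uint64_t b)
    {
        uint64_t lo = 0, hi = 0;
        int i;

        for (i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                lo ^= a << i;
                hi ^= i ? a >> (64 - i) : 0; /* bits shifted past word */
            }
        r[0] = lo;
        r[1] = hi;
    }
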
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
-
$code=<<___;
#include "arm_arch.h"
.text
.code 32
-
-#if __ARM_ARCH__>=7
-.fpu neon
-
-.type mul_1x1_neon,%function
-.align 5
-mul_1x1_neon:
- vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
- vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
- vshl.u64 `&Dlo("q2")`,d16,#16
- vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
- vshl.u64 `&Dlo("q3")`,d16,#24
- vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
- vshr.u64 `&Dlo("q1")`,#8
- vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
- vshl.u64 `&Dhi("q1")`,#24
- veor d0,`&Dlo("q1")`
- vshr.u64 `&Dlo("q2")`,#16
- veor d0,`&Dhi("q1")`
- vshl.u64 `&Dhi("q2")`,#16
- veor d0,`&Dlo("q2")`
- vshr.u64 `&Dlo("q3")`,#24
- veor d0,`&Dhi("q2")`
- vshl.u64 `&Dhi("q3")`,#8
- veor d0,`&Dlo("q3")`
- veor d0,`&Dhi("q3")`
- bx lr
-.size mul_1x1_neon,.-mul_1x1_neon
-#endif
___
################
# private interface to mul_1x1_ialu
@@ -159,56 +137,17 @@ ___
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
-
-($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
-
+{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
- beq .Lialu
-
- veor $A1,$A1
- vmov.32 $B1,r3,r3 @ two copies of b1
- vmov.32 ${A1}[0],r1 @ a1
-
- veor $A0,$A0
- vld1.32 ${B0}[],[sp,:32] @ two copies of b0
- vmov.32 ${A0}[0],r2 @ a0
- mov r12,lr
-
- vmov d16,$A1
- vmov d17,$B1
- bl mul_1x1_neon @ a1·b1
- vmov $A1B1,d0
-
- vmov d16,$A0
- vmov d17,$B0
- bl mul_1x1_neon @ a0·b0
- vmov $A0B0,d0
-
- veor d16,$A0,$A1
- veor d17,$B0,$B1
- veor $A0,$A0B0,$A1B1
- bl mul_1x1_neon @ (a0+a1)·(b0+b1)
-
- veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
- vshl.u64 d1,d0,#32
- vshr.u64 d0,d0,#32
- veor $A0B0,d1
- veor $A1B1,d0
- vst1.32 {${A0B0}[0]},[r0,:32]!
- vst1.32 {${A0B0}[1]},[r0,:32]!
- vst1.32 {${A1B1}[0]},[r0,:32]!
- vst1.32 {${A1B1}[1]},[r0,:32]
- bx r12
-.align 4
-.Lialu:
+ bne .LNEON
#endif
___
$ret="r10"; # reassigned 1st argument
@@ -260,8 +199,72 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
+___
+}
+{
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.align 5
+.LNEON:
+ ldr r12, [sp] @ 5th argument
+ vmov.32 $a, r2, r1
+ vmov.32 $b, r12, r3
+ vmov.i64 $k48, #0x0000ffffffffffff
+ vmov.i64 $k32, #0x00000000ffffffff
+ vmov.i64 $k16, #0x000000000000ffff
+
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+
+ vst1.32 {$r}, [r0]
+ ret @ bx lr
+#endif
+___
+}
+$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
@@ -269,10 +272,18 @@ $code.=<<___;
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
+#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
+#endif
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
close STDOUT; # enforce flush
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index f78a8b5f0f55..1d330e9f8aa3 100755
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -23,6 +23,21 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
+# November 2013
+#
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
+# performance improvement on Cortex-A8 is ~45-100% depending on key
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
+# On Snapdragon S4 the improvement was measured to vary from ~70% to
+# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
+# rather because the original integer-only code seems to perform
+# suboptimally on S4. The situation on Cortex-A9 is unfortunately
+# different. It's being looked into, but the trouble is that
+# performance for vectors longer than 256 bits is actually a couple
+# of percent worse than for integer-only code. The code is chosen
+# for execution on all NEON-capable processors, because the gain on
+# others outweighs the marginal loss on Cortex-A9.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -52,16 +67,40 @@ $_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
+#include "arm_arch.h"
+
.text
+.code 32
+
+#if __ARM_MAX_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-bn_mul_mont
+#endif
.global bn_mul_mont
.type bn_mul_mont,%function
-.align 2
+.align 5
bn_mul_mont:
+ ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
- ldr $num,[sp,#3*4] @ load num
- cmp $num,#2
+#if __ARM_MAX_ARCH__>=7
+ tst ip,#7
+ bne .Lialu
+ adr r0,bn_mul_mont
+ ldr r2,.LOPENSSL_armcap
+ ldr r0,[r0,r2]
+ tst r0,#1 @ NEON available?
+ ldmia sp, {r0,r2}
+ beq .Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+.Lialu:
+#endif
+ cmp ip,#2
+ mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
@@ -191,14 +230,447 @@ bn_mul_mont:
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
-.Labrt: tst lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size bn_mul_mont,.-bn_mul_mont
-.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+{
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
+my ($Z,$Temp)=("q4","q5");
+my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
+my $zero=&Dlo($Z);
+my $temp=&Dlo($Temp);
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4-r11}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load rest of parameter block
+
+ sub $toutptr,sp,#16
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ sub $toutptr,$toutptr,$num,lsl#4
+ vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
+ and $toutptr,$toutptr,#-64
+ vld1.32 {${M0}[0]}, [$n0,:32]
+ mov sp,$toutptr @ alloca
+ veor $zero,$zero,$zero
+ subs $inner,$num,#8
+ vzip.16 $Bi,$zero
+
+ vmull.u32 $A0xB,$Bi,${A0}[0]
+ vmull.u32 $A1xB,$Bi,${A0}[1]
+ vmull.u32 $A2xB,$Bi,${A1}[0]
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ vmull.u32 $A3xB,$Bi,${A1}[1]
+
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ veor $zero,$zero,$zero
+ vmul.u32 $Ni,$temp,$M0
+
+ vmull.u32 $A4xB,$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmull.u32 $A5xB,$Bi,${A2}[1]
+ vmull.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmull.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_1st
+
+ @ special case for num=8, everything is in register bank...
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ sub $outer,$num,#1
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vmov $Temp,$A0xB
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vmov $A0xB,$A1xB
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmov $A1xB,$A2xB
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vmov $A2xB,$A3xB
+ vmov $A3xB,$A4xB
+ vshr.u64 $temp,$temp,#16
+ vmov $A4xB,$A5xB
+ vmov $A5xB,$A6xB
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vmov $A6xB,$A7xB
+ veor $A7xB,$A7xB
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ veor $zero,$zero,$zero
+ vzip.16 $Bi,$zero
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ veor $zero,$zero,$zero
+ subs $outer,$outer,#1
+ vmul.u32 $Ni,$temp,$M0
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vmov $Temp,$A0xB
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vmov $A0xB,$A1xB
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmov $A1xB,$A2xB
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vmov $A2xB,$A3xB
+ vmov $A3xB,$A4xB
+ vshr.u64 $temp,$temp,#16
+ vmov $A4xB,$A5xB
+ vmov $A5xB,$A6xB
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vmov $A6xB,$A7xB
+ veor $A7xB,$A7xB
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+ mov $toutptr,sp
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
+ mov $inner,$num
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+ add $tinptr,sp,#16
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+ b .LNEON_tail2
+
+.align 4
+.LNEON_1st:
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vld1.32 {$A0-$A3}, [$aptr]!
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ subs $inner,$inner,#8
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vld1.32 {$N0-$N1}, [$nptr]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+
+ vmull.u32 $A0xB,$Bi,${A0}[0]
+ vld1.32 {$N2-$N3}, [$nptr]!
+ vmull.u32 $A1xB,$Bi,${A0}[1]
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vmull.u32 $A2xB,$Bi,${A1}[0]
+ vmull.u32 $A3xB,$Bi,${A1}[1]
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+
+ vmull.u32 $A4xB,$Bi,${A2}[0]
+ vmull.u32 $A5xB,$Bi,${A2}[1]
+ vmull.u32 $A6xB,$Bi,${A3}[0]
+ vmull.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_1st
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ add $tinptr,sp,#16
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vld1.64 {$Temp}, [sp,:128]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ sub $outer,$num,#1
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vshr.u64 $temp,$temp,#16
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ veor $Z,$Z,$Z
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vst1.64 {$Z}, [$toutptr,:128]
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer
+
+.align 4
+.LNEON_outer:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ vld1.32 {$A0-$A3}, [$aptr]!
+ veor $zero,$zero,$zero
+ mov $toutptr,sp
+ vzip.16 $Bi,$zero
+ sub $inner,$num,#8
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ veor $zero,$zero,$zero
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ vld1.64 {$A7xB},[$tinptr,:128]!
+ vmul.u32 $Ni,$temp,$M0
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+.LNEON_inner:
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vld1.32 {$A0-$A3}, [$aptr]!
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ subs $inner,$inner,#8
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+ vld1.32 {$N0-$N3}, [$nptr]!
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vld1.64 {$A7xB}, [$tinptr, :128]!
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_inner
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ add $tinptr,sp,#16
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vld1.64 {$Temp}, [sp,:128]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ subs $outer,$outer,#1
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vshr.u64 $temp,$temp,#16
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer
+
+ mov $toutptr,sp
+ mov $inner,$num
+
+.LNEON_tail:
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
+ vld1.64 {$A7xB}, [$tinptr, :128]!
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+.LNEON_tail2:
+ vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
+ vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A1xB")`,#16
+ vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A1xB")`,#16
+ vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
+
+ vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
+ vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A2xB")`,#16
+ vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A2xB")`,#16
+ vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
+
+ vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
+ vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A3xB")`,#16
+ vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A3xB")`,#16
+ vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
+
+ vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
+ vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A4xB")`,#16
+ vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A4xB")`,#16
+ vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
+
+ vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
+ vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A5xB")`,#16
+ vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A5xB")`,#16
+ vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
+
+ vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
+ vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A6xB")`,#16
+ vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vshr.u64 $temp,`&Dhi("$A6xB")`,#16
+ vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
+
+ vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
+ vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A7xB")`,#16
+ vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dhi("$A7xB")`,#16
+ vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+ subs $inner,$inner,#8
+ vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
+
+ bne .LNEON_tail
+
+ vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ subs $aptr,sp,#0 @ clear carry flag
+ add $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+ ldmia $aptr!, {r4-r7}
+ ldmia $nptr!, {r8-r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_sub
+
+ ldr r10, [$aptr] @ load top-most bit
+ veor q0,q0,q0
+ sub r11,$bptr,sp @ this is num*4
+ veor q1,q1,q1
+ mov $aptr,sp
+ sub $rptr,$rptr,r11 @ rewind $rptr
+ mov $nptr,$bptr @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia $aptr!, {r4-r7}
+ ldmia $rptr, {r8-r11}
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r11,r7
+ ldmia $aptr, {r4-r7}
+ stmia $rptr!, {r8-r11}
+ sub $aptr,$aptr,#16
+ ldmia $rptr, {r8-r11}
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r11,r7
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_copy_n_zap
+
+ sub sp,ip,#96
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r11}
+ ret @ bx lr
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
index caae04ed3a86..a33cdf411121 100755
--- a/crypto/bn/asm/mips-mont.pl
+++ b/crypto/bn/asm/mips-mont.pl
@@ -46,7 +46,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
index 215c9a748325..acafde5e5685 100755
--- a/crypto/bn/asm/mips.pl
+++ b/crypto/bn/asm/mips.pl
@@ -48,7 +48,7 @@
# has to be content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.
-$flavour = shift;
+$flavour = shift || "o32";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s
new file mode 100644
index 000000000000..dca4105c7db1
--- /dev/null
+++ b/crypto/bn/asm/mips3.s
@@ -0,0 +1,2201 @@
+.rdata
+.asciiz "mips3.s, Version 1.1"
+.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted according to the OpenSSL license. Warranty of any kind is
+ * disclaimed.
+ * ====================================================================
+ */
+
+/*
+ * This is my modest contribution to the OpenSSL project (see
+ * http://www.openssl.org/ for more information about it) and is
+ * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
+ * module. For updates see http://fy.chalmers.se/~appro/hpe/.
+ *
+ * The module is designed to work with either of the "new" MIPS ABI(5),
+ * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
+ * IRIX 5.x, not only because it doesn't support the new ABIs but also
+ * because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
+ * 64-bit instructions (daddu, dmultu, etc.) found below would only
+ * cause an illegal instruction exception:-(
+ *
+ * In addition the code depends on preprocessor flags set up by MIPSpro
+ * compiler driver (either as or cc) and therefore (probably?) can't be
+ * compiled by the GNU assembler. GNU C driver manages fine though...
+ * I mean as long as -mmips-as is specified or is the default option,
+ * because then it simply invokes /usr/bin/as which in turn takes
+ * perfect care of the preprocessor definitions. Another neat feature
+ * offered by the MIPSpro assembler is an optimization pass. This gave
+ * me the opportunity to have the code looking more regular as all those
+ * architecture dependent instruction rescheduling details were left to
+ * the assembler. Cool, huh?
+ *
+ * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+ * goes way over 3 times faster!
+ *
+ * <appro@fy.chalmers.se>
+ */
+#include <asm.h>
+#include <regdef.h>
+
+#if _MIPS_ISA>=4
+#define MOVNZ(cond,dst,src) \
+ movn dst,src,cond
+#else
+#define MOVNZ(cond,dst,src) \
+ .set noreorder; \
+ bnezl cond,.+8; \
+ move dst,src; \
+ .set reorder
+#endif
+
+.text
+
+.set noat
+.set reorder
+
+#define MINUS4 v1
+
+.align 5
+LEAF(bn_mul_add_words)
+ .set noreorder
+ bgtzl a2,.L_bn_mul_add_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_mul_add_words_proceed:
+ li MINUS4,-4
+ and ta0,a2,MINUS4
+ move v0,zero
+ beqz ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ dmultu t0,a3
+ ld t1,0(a0)
+ ld t2,8(a1)
+ ld t3,8(a0)
+ ld ta0,16(a1)
+ ld ta1,16(a0)
+ daddu t1,v0
+ sltu v0,t1,v0 /* All manuals say sltu "compares
+  * 32-bit values", but it seems to work
+  * fine even on 64-bit registers; see
+  * the note after this function. */
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,0(a0)
+ daddu v0,AT
+
+ dmultu t2,a3
+ ld ta2,24(a1)
+ ld ta3,24(a0)
+ daddu t3,v0
+ sltu v0,t3,v0
+ mflo AT
+ mfhi t2
+ daddu t3,AT
+ daddu v0,t2
+ sltu AT,t3,AT
+ sd t3,8(a0)
+ daddu v0,AT
+
+ dmultu ta0,a3
+ subu a2,4
+ PTR_ADD a0,32
+ PTR_ADD a1,32
+ daddu ta1,v0
+ sltu v0,ta1,v0
+ mflo AT
+ mfhi ta0
+ daddu ta1,AT
+ daddu v0,ta0
+ sltu AT,ta1,AT
+ sd ta1,-16(a0)
+ daddu v0,AT
+
+
+ dmultu ta2,a3
+ and ta0,a2,MINUS4
+ daddu ta3,v0
+ sltu v0,ta3,v0
+ mflo AT
+ mfhi ta2
+ daddu ta3,AT
+ daddu v0,ta2
+ sltu AT,ta3,AT
+ sd ta3,-8(a0)
+ daddu v0,AT
+ .set noreorder
+ bgtzl ta0,.L_bn_mul_add_words_loop
+ ld t0,0(a1)
+
+ bnezl a2,.L_bn_mul_add_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_mul_add_words_return:
+ jr ra
+
+.L_bn_mul_add_words_tail:
+ dmultu t0,a3
+ ld t1,0(a0)
+ subu a2,1
+ daddu t1,v0
+ sltu v0,t1,v0
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,0(a0)
+ daddu v0,AT
+ beqz a2,.L_bn_mul_add_words_return
+
+ ld t0,8(a1)
+ dmultu t0,a3
+ ld t1,8(a0)
+ subu a2,1
+ daddu t1,v0
+ sltu v0,t1,v0
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,8(a0)
+ daddu v0,AT
+ beqz a2,.L_bn_mul_add_words_return
+
+ ld t0,16(a1)
+ dmultu t0,a3
+ ld t1,16(a0)
+ daddu t1,v0
+ sltu v0,t1,v0
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,16(a0)
+ daddu v0,AT
+ jr ra
+END(bn_mul_add_words)
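+
+/*
+ * The carry handling above is the usual unsigned-compare idiom: an
+ * addition wrapped around iff the sum is now smaller than one of its
+ * operands. In C terms (a sketch only), each accumulation step is:
+ *
+ *	t1 += v0;			// daddu t1,v0
+ *	v0  = (t1 < v0);		// sltu v0,t1,v0
+ *	t1 += lo;			// daddu t1,AT (lo/hi from mflo/mfhi)
+ *	v0 += hi + (t1 < lo);		// daddu v0,t0; sltu AT,t1,AT;
+ *					// daddu v0,AT
+ */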
+
+.align 5
+LEAF(bn_mul_words)
+ .set noreorder
+ bgtzl a2,.L_bn_mul_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_mul_words_proceed:
+ li MINUS4,-4
+ and ta0,a2,MINUS4
+ move v0,zero
+ beqz ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ dmultu t0,a3
+ ld t2,8(a1)
+ ld ta0,16(a1)
+ ld ta2,24(a1)
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,0(a0)
+ daddu v0,t1,t0
+
+ dmultu t2,a3
+ subu a2,4
+ PTR_ADD a0,32
+ PTR_ADD a1,32
+ mflo AT
+ mfhi t2
+ daddu v0,AT
+ sltu t3,v0,AT
+ sd v0,-24(a0)
+ daddu v0,t3,t2
+
+ dmultu ta0,a3
+ mflo AT
+ mfhi ta0
+ daddu v0,AT
+ sltu ta1,v0,AT
+ sd v0,-16(a0)
+ daddu v0,ta1,ta0
+
+
+ dmultu ta2,a3
+ and ta0,a2,MINUS4
+ mflo AT
+ mfhi ta2
+ daddu v0,AT
+ sltu ta3,v0,AT
+ sd v0,-8(a0)
+ daddu v0,ta3,ta2
+ .set noreorder
+ bgtzl ta0,.L_bn_mul_words_loop
+ ld t0,0(a1)
+
+ bnezl a2,.L_bn_mul_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_mul_words_return:
+ jr ra
+
+.L_bn_mul_words_tail:
+ dmultu t0,a3
+ subu a2,1
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,0(a0)
+ daddu v0,t1,t0
+ beqz a2,.L_bn_mul_words_return
+
+ ld t0,8(a1)
+ dmultu t0,a3
+ subu a2,1
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,8(a0)
+ daddu v0,t1,t0
+ beqz a2,.L_bn_mul_words_return
+
+ ld t0,16(a1)
+ dmultu t0,a3
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,16(a0)
+ daddu v0,t1,t0
+ jr ra
+END(bn_mul_words)
+
+.align 5
+LEAF(bn_sqr_words)
+ .set noreorder
+ bgtzl a2,.L_bn_sqr_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_sqr_words_proceed:
+ li MINUS4,-4
+ and ta0,a2,MINUS4
+ move v0,zero
+ beqz ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ dmultu t0,t0
+ ld t2,8(a1)
+ ld ta0,16(a1)
+ ld ta2,24(a1)
+ mflo t1
+ mfhi t0
+ sd t1,0(a0)
+ sd t0,8(a0)
+
+ dmultu t2,t2
+ subu a2,4
+ PTR_ADD a0,64
+ PTR_ADD a1,32
+ mflo t3
+ mfhi t2
+ sd t3,-48(a0)
+ sd t2,-40(a0)
+
+ dmultu ta0,ta0
+ mflo ta1
+ mfhi ta0
+ sd ta1,-32(a0)
+ sd ta0,-24(a0)
+
+
+ dmultu ta2,ta2
+ and ta0,a2,MINUS4
+ mflo ta3
+ mfhi ta2
+ sd ta3,-16(a0)
+ sd ta2,-8(a0)
+
+ .set noreorder
+ bgtzl ta0,.L_bn_sqr_words_loop
+ ld t0,0(a1)
+
+ bnezl a2,.L_bn_sqr_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_sqr_words_return:
+ move v0,zero
+ jr ra
+
+.L_bn_sqr_words_tail:
+ dmultu t0,t0
+ subu a2,1
+ mflo t1
+ mfhi t0
+ sd t1,0(a0)
+ sd t0,8(a0)
+ beqz a2,.L_bn_sqr_words_return
+
+ ld t0,8(a1)
+ dmultu t0,t0
+ subu a2,1
+ mflo t1
+ mfhi t0
+ sd t1,16(a0)
+ sd t0,24(a0)
+ beqz a2,.L_bn_sqr_words_return
+
+ ld t0,16(a1)
+ dmultu t0,t0
+ mflo t1
+ mfhi t0
+ sd t1,32(a0)
+ sd t0,40(a0)
+ jr ra
+END(bn_sqr_words)
+
+.align 5
+LEAF(bn_add_words)
+ .set noreorder
+ bgtzl a3,.L_bn_add_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_add_words_proceed:
+ li MINUS4,-4
+ and AT,a3,MINUS4
+ move v0,zero
+ beqz AT,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ ld ta0,0(a2)
+ subu a3,4
+ ld t1,8(a1)
+ and AT,a3,MINUS4
+ ld t2,16(a1)
+ PTR_ADD a2,32
+ ld t3,24(a1)
+ PTR_ADD a0,32
+ ld ta1,-24(a2)
+ PTR_ADD a1,32
+ ld ta2,-16(a2)
+ ld ta3,-8(a2)
+ daddu ta0,t0
+ sltu t8,ta0,t0
+ daddu t0,ta0,v0
+ sltu v0,t0,ta0
+ sd t0,-32(a0)
+ daddu v0,t8
+
+ daddu ta1,t1
+ sltu t9,ta1,t1
+ daddu t1,ta1,v0
+ sltu v0,t1,ta1
+ sd t1,-24(a0)
+ daddu v0,t9
+
+ daddu ta2,t2
+ sltu t8,ta2,t2
+ daddu t2,ta2,v0
+ sltu v0,t2,ta2
+ sd t2,-16(a0)
+ daddu v0,t8
+
+ daddu ta3,t3
+ sltu t9,ta3,t3
+ daddu t3,ta3,v0
+ sltu v0,t3,ta3
+ sd t3,-8(a0)
+ daddu v0,t9
+
+ .set noreorder
+ bgtzl AT,.L_bn_add_words_loop
+ ld t0,0(a1)
+
+ bnezl a3,.L_bn_add_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_add_words_return:
+ jr ra
+
+.L_bn_add_words_tail:
+ ld ta0,0(a2)
+ daddu ta0,t0
+ subu a3,1
+ sltu t8,ta0,t0
+ daddu t0,ta0,v0
+ sltu v0,t0,ta0
+ sd t0,0(a0)
+ daddu v0,t8
+ beqz a3,.L_bn_add_words_return
+
+ ld t1,8(a1)
+ ld ta1,8(a2)
+ daddu ta1,t1
+ subu a3,1
+ sltu t9,ta1,t1
+ daddu t1,ta1,v0
+ sltu v0,t1,ta1
+ sd t1,8(a0)
+ daddu v0,t9
+ beqz a3,.L_bn_add_words_return
+
+ ld t2,16(a1)
+ ld ta2,16(a2)
+ daddu ta2,t2
+ sltu t8,ta2,t2
+ daddu t2,ta2,v0
+ sltu v0,t2,ta2
+ sd t2,16(a0)
+ daddu v0,t8
+ jr ra
+END(bn_add_words)
+
+.align 5
+LEAF(bn_sub_words)
+ .set noreorder
+ bgtzl a3,.L_bn_sub_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_sub_words_proceed:
+ li MINUS4,-4
+ and AT,a3,MINUS4
+ move v0,zero
+ beqz AT,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ ld ta0,0(a2)
+ subu a3,4
+ ld t1,8(a1)
+ and AT,a3,MINUS4
+ ld t2,16(a1)
+ PTR_ADD a2,32
+ ld t3,24(a1)
+ PTR_ADD a0,32
+ ld ta1,-24(a2)
+ PTR_ADD a1,32
+ ld ta2,-16(a2)
+ ld ta3,-8(a2)
+ sltu t8,t0,ta0
+ dsubu t0,ta0
+ dsubu ta0,t0,v0
+ sd ta0,-32(a0)
+ MOVNZ (t0,v0,t8)
+
+ sltu t9,t1,ta1
+ dsubu t1,ta1
+ dsubu ta1,t1,v0
+ sd ta1,-24(a0)
+ MOVNZ (t1,v0,t9)
+
+
+ sltu t8,t2,ta2
+ dsubu t2,ta2
+ dsubu ta2,t2,v0
+ sd ta2,-16(a0)
+ MOVNZ (t2,v0,t8)
+
+ sltu t9,t3,ta3
+ dsubu t3,ta3
+ dsubu ta3,t3,v0
+ sd ta3,-8(a0)
+ MOVNZ (t3,v0,t9)
+
+ .set noreorder
+ bgtzl AT,.L_bn_sub_words_loop
+ ld t0,0(a1)
+
+ bnezl a3,.L_bn_sub_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_sub_words_return:
+ jr ra
+
+.L_bn_sub_words_tail:
+ ld ta0,0(a2)
+ subu a3,1
+ sltu t8,t0,ta0
+ dsubu t0,ta0
+ dsubu ta0,t0,v0
+ MOVNZ (t0,v0,t8)
+ sd ta0,0(a0)
+ beqz a3,.L_bn_sub_words_return
+
+ ld t1,8(a1)
+ subu a3,1
+ ld ta1,8(a2)
+ sltu t9,t1,ta1
+ dsubu t1,ta1
+ dsubu ta1,t1,v0
+ MOVNZ (t1,v0,t9)
+ sd ta1,8(a0)
+ beqz a3,.L_bn_sub_words_return
+
+ ld t2,16(a1)
+ ld ta2,16(a2)
+ sltu t8,t2,ta2
+ dsubu t2,ta2
+ dsubu ta2,t2,v0
+ MOVNZ (t2,v0,t8)
+ sd ta2,16(a0)
+ jr ra
+END(bn_sub_words)
+
+#undef MINUS4
+
+.align 5
+LEAF(bn_div_3_words)
+ .set reorder
+ move a3,a0 /* we know that bn_div_words doesn't
+  * touch a3, ta2, ta3 and preserves a2,
+  * so we can keep two arguments and the
+  * return address in registers instead
+  * of on the stack:-)
+  */
+ ld a0,(a3)
+ move ta2,a1
+ ld a1,-8(a3)
+ bne a0,a2,.L_bn_div_3_words_proceed
+ li v0,-1
+ jr ra
+.L_bn_div_3_words_proceed:
+ move ta3,ra
+ bal bn_div_words
+ move ra,ta3
+ dmultu ta2,v0
+ ld t2,-16(a3)
+ move ta0,zero
+ mfhi t1
+ mflo t0
+ sltu t8,t1,v1
+.L_bn_div_3_words_inner_loop:
+ bnez t8,.L_bn_div_3_words_inner_loop_done
+ sgeu AT,t2,t0
+ seq t9,t1,v1
+ and AT,t9
+ sltu t3,t0,ta2
+ daddu v1,a2
+ dsubu t1,t3
+ dsubu t0,ta2
+ sltu t8,t1,v1
+ sltu ta0,v1,a2
+ or t8,ta0
+ .set noreorder
+ beqzl AT,.L_bn_div_3_words_inner_loop
+ dsubu v0,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ jr ra
+END(bn_div_3_words)
+
+.align 5
+LEAF(bn_div_words)
+ .set noreorder
+ bnezl a2,.L_bn_div_words_proceed
+ move v1,zero
+ jr ra
+ li v0,-1 /* I'd rather signal div-by-zero,
+  * which could be done with 'break 7' */
+
+.L_bn_div_words_proceed:
+ bltz a2,.L_bn_div_words_body
+ move t9,v1
+ dsll a2,1
+ bgtz a2,.-4
+ addu t9,1
+
+ .set reorder
+ negu t1,t9
+ li t2,-1
+ dsll t2,t1
+ and t2,a0
+ dsrl AT,a1,t1
+ .set noreorder
+ bnezl t2,.+8
+ break 6 /* signal overflow */
+ .set reorder
+ dsll a0,t9
+ dsll a1,t9
+ or a0,AT
+
+#define QT ta0
+#define HH ta1
+#define DH v1
+.L_bn_div_words_body:
+ dsrl DH,a2,32
+ sgeu AT,a0,a2
+ .set noreorder
+ bnezl AT,.+8
+ dsubu a0,a2
+ .set reorder
+
+ li QT,-1
+ dsrl HH,a0,32
+ dsrl QT,32 /* q=0xffffffff */
+ beq DH,HH,.L_bn_div_words_skip_div1
+ ddivu zero,a0,DH
+ mflo QT
+.L_bn_div_words_skip_div1:
+ dmultu a2,QT
+ dsll t3,a0,32
+ dsrl AT,a1,32
+ or t3,AT
+ mflo t0
+ mfhi t1
+.L_bn_div_words_inner_loop1:
+ sltu t2,t3,t0
+ seq t8,HH,t1
+ sltu AT,HH,t1
+ and t2,t8
+ sltu v0,t0,a2
+ or AT,t2
+ .set noreorder
+ beqz AT,.L_bn_div_words_inner_loop1_done
+ dsubu t1,v0
+ dsubu t0,a2
+ b .L_bn_div_words_inner_loop1
+ dsubu QT,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ dsll a1,32
+ dsubu a0,t3,t0
+ dsll v0,QT,32
+
+ li QT,-1
+ dsrl HH,a0,32
+ dsrl QT,32 /* q=0xffffffff */
+ beq DH,HH,.L_bn_div_words_skip_div2
+ ddivu zero,a0,DH
+ mflo QT
+.L_bn_div_words_skip_div2:
+#undef DH
+ dmultu a2,QT
+ dsll t3,a0,32
+ dsrl AT,a1,32
+ or t3,AT
+ mflo t0
+ mfhi t1
+.L_bn_div_words_inner_loop2:
+ sltu t2,t3,t0
+ seq t8,HH,t1
+ sltu AT,HH,t1
+ and t2,t8
+ sltu v1,t0,a2
+ or AT,t2
+ .set noreorder
+ beqz AT,.L_bn_div_words_inner_loop2_done
+ dsubu t1,v1
+ dsubu t0,a2
+ b .L_bn_div_words_inner_loop2
+ dsubu QT,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+#undef HH
+
+ dsubu a0,t3,t0
+ or v0,QT
+ dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
+ dsrl a2,t9 /* restore a2 */
+ jr ra
+#undef QT
+END(bn_div_words)
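+
+/*
+ * A sketch of the contract, mirroring bn_div_words() in bn_asm.c: given
+ * h < d, return the one-word quotient of the double word h:l divided by
+ * d. The code above first shifts d up until its top bit is set, then
+ * derives the quotient as two 32-bit digits:
+ *
+ *	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+ *	// returns (BN_ULONG)((((unsigned __int128)h << 64) | l) / d)
+ */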
+
+#define a_0 t0
+#define a_1 t1
+#define a_2 t2
+#define a_3 t3
+#define b_0 ta0
+#define b_1 ta1
+#define b_2 ta2
+#define b_3 ta3
+
+#define a_4 s0
+#define a_5 s2
+#define a_6 s4
+#define a_7 a1 /* once we load a[7] we don't need a anymore */
+#define b_4 s1
+#define b_5 s3
+#define b_6 s5
+#define b_7 a2 /* once we load b[7] we don't need b anymore */
+
+#define t_1 t8
+#define t_2 t9
+
+#define c_1 v0
+#define c_2 v1
+#define c_3 a3
+
+#define FRAME_SIZE 48
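+
+/*
+ * The mul_add_c(a[i],b[j],c1,c2,c3) annotations below name the bn_asm.c
+ * comba macro being open-coded: add the double-word product a[i]*b[j]
+ * into the three-word accumulator (low word first, the three carry
+ * registers rotating from column to column). A C sketch, with unsigned
+ * __int128 assumed only to express the product:
+ *
+ *	t   = (unsigned __int128)a * b;
+ *	lo  = (BN_ULONG)t;  hi = (BN_ULONG)(t >> 64);
+ *	c1 += lo;  hi += (c1 < lo);	// hi <= 2^64-2, cannot overflow
+ *	c2 += hi;  c3 += (c2 < hi);
+ */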
+
+.align 5
+LEAF(bn_mul_comba8)
+ .set noreorder
+ PTR_SUB sp,FRAME_SIZE
+ .frame sp,FRAME_SIZE,ra
+ .set reorder
+ ld a_0,0(a1) /* If compiled with the -mips3 option
+  * on an R5000 box, the assembler barks
+  * on this line with a "shouldn't have
+  * mult/div as last instruction in bb
+  * (R10K bug)" warning. If anybody out
+  * there has a clue about how to
+  * circumvent this, do send me a note.
+  * <appro@fy.chalmers.se>
+  */
+ ld b_0,0(a2)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ ld a_3,24(a1)
+ ld b_1,8(a2)
+ ld b_2,16(a2)
+ ld b_3,24(a2)
+ dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ sd s0,0(sp)
+ sd s1,8(sp)
+ sd s2,16(sp)
+ sd s3,24(sp)
+ sd s4,32(sp)
+ sd s5,40(sp)
+ mflo c_1
+ mfhi c_2
+
+ dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
+ ld a_4,32(a1)
+ ld a_5,40(a1)
+ ld a_6,48(a1)
+ ld a_7,56(a1)
+ ld b_4,32(a2)
+ ld b_5,40(a2)
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
+ ld b_6,48(a2)
+ ld b_7,56(a2)
+ sd c_1,0(a0) /* r[0]=c1; */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ sd c_2,8(a0) /* r[1]=c2; */
+
+ dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0) /* r[2]=c3; */
+
+ dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,24(a0) /* r[3]=c1; */
+
+ dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0) /* r[4]=c2; */
+
+ dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,40(a0) /* r[5]=c3; */
+
+ dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,48(a0) /* r[6]=c1; */
+
+ dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,56(a0) /* r[7]=c2; */
+
+ dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,64(a0) /* r[8]=c3; */
+
+ dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,72(a0) /* r[9]=c1; */
+
+ dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,80(a0) /* r[10]=c2; */
+
+ dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,88(a0) /* r[11]=c3; */
+
+ dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,96(a0) /* r[12]=c1; */
+
+ dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,104(a0) /* r[13]=c2; */
+
+ dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
+ ld s0,0(sp)
+ ld s1,8(sp)
+ ld s2,16(sp)
+ ld s3,24(sp)
+ ld s4,32(sp)
+ ld s5,40(sp)
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sd c_3,112(a0) /* r[14]=c3; */
+ sd c_1,120(a0) /* r[15]=c1; */
+
+ PTR_ADD sp,FRAME_SIZE
+
+ jr ra
+END(bn_mul_comba8)
+
+.align 5
+LEAF(bn_mul_comba4)
+ .set reorder
+ ld a_0,0(a1)
+ ld b_0,0(a2)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ ld a_3,24(a1)
+ ld b_1,8(a2)
+ ld b_2,16(a2)
+ ld b_3,24(a2)
+ mflo c_1
+ mfhi c_2
+ sd c_1,0(a0)
+
+ dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ sd c_2,8(a0)
+
+ dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0)
+
+ dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,24(a0)
+
+ dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0)
+
+ dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,40(a0)
+
+ dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sd c_1,48(a0)
+ sd c_2,56(a0)
+
+ jr ra
+END(bn_mul_comba4)
+
+#undef a_4
+#undef a_5
+#undef a_6
+#undef a_7
+#define a_4 b_0
+#define a_5 b_1
+#define a_6 b_2
+#define a_7 b_3
+
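+/*
+ * In the squaring routines the symmetric cross terms are folded with
+ * mul_add_c2(a[i],b[j],c1,c2,c3), which adds the product twice. The
+ * slt/dsll pairs below double the 128-bit product in registers, banking
+ * the bit shifted out of each half before the shift. A C sketch
+ * (unsigned __int128 assumed only to express the product):
+ *
+ *	t   = (unsigned __int128)a * b;
+ *	lo  = (BN_ULONG)t;  hi = (BN_ULONG)(t >> 64);
+ *	c3 += hi >> 63;			// slt: bit 127 of 2*t
+ *	hi  = (hi << 1) | (lo >> 63);	// dsll hi,1; pull in bit 63 of lo
+ *	lo <<= 1;			// dsll lo,1
+ *	c1 += lo;  hi += (c1 < lo);	// then as in mul_add_c
+ *	c2 += hi;  c3 += (c2 < hi);
+ */
+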
+.align 5
+LEAF(bn_sqr_comba8)
+ .set reorder
+ ld a_0,0(a1)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ ld a_3,24(a1)
+
+ dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ ld a_4,32(a1)
+ ld a_5,40(a1)
+ ld a_6,48(a1)
+ ld a_7,56(a1)
+ mflo c_1
+ mfhi c_2
+ sd c_1,0(a0)
+
+ dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ sd c_2,8(a0)
+
+ dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt c_2,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0)
+
+ dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt c_3,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_3,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,24(a0)
+
+ dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_1,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0)
+
+ dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt c_2,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_2,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_2,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,40(a0)
+
+ dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt c_3,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_3,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_3,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,48(a0)
+
+ dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_1,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_1,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_1,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,56(a0)
+
+ dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt c_2,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_2,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_2,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,64(a0)
+
+ dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt c_3,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_3,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_3,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,72(a0)
+
+ dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_1,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,80(a0)
+
+ dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt c_2,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_2,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,88(a0)
+
+ dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt c_3,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,96(a0)
+
+ dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,104(a0)
+
+ dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sd c_3,112(a0)
+ sd c_1,120(a0)
+
+ jr ra
+END(bn_sqr_comba8)
+
+.align 5
+LEAF(bn_sqr_comba4)
+ .set reorder
+ ld a_0,0(a1)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ ld a_3,24(a1)
+ dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ mflo c_1
+ mfhi c_2
+ sd c_1,0(a0)
+
+ dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ sd c_2,8(a0)
+
+ dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt c_2,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0)
+
+ dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt c_3,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ slt AT,t_2,zero
+ daddu c_3,AT
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,24(a0)
+
+ dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0)
+
+ dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ slt c_2,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,40(a0)
+
+ dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sd c_1,48(a0)
+ sd c_2,56(a0)
+
+ jr ra
+END(bn_sqr_comba4)
diff --git a/crypto/bn/asm/modexp512-x86_64.pl b/crypto/bn/asm/modexp512-x86_64.pl
deleted file mode 100755
index bfd6e975416d..000000000000
--- a/crypto/bn/asm/modexp512-x86_64.pl
+++ /dev/null
@@ -1,1497 +0,0 @@
-#!/usr/bin/env perl
-#
-# Copyright (c) 2010-2011 Intel Corp.
-# Author: Vinodh.Gopal@intel.com
-# Jim Guilford
-# Erdinc.Ozturk@intel.com
-# Maxim.Perminov@intel.com
-#
-# More information about algorithm used can be found at:
-# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
-#
-# ====================================================================
-# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-#
-# 3. All advertising materials mentioning features or use of this
-# software must display the following acknowledgment:
-# "This product includes software developed by the OpenSSL Project
-# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
-#
-# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
-# endorse or promote products derived from this software without
-# prior written permission. For written permission, please contact
-# licensing@OpenSSL.org.
-#
-# 5. Products derived from this software may not be called "OpenSSL"
-# nor may "OpenSSL" appear in their names without prior written
-# permission of the OpenSSL Project.
-#
-# 6. Redistributions of any form whatsoever must retain the following
-# acknowledgment:
-# "This product includes software developed by the OpenSSL Project
-# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
-#
-# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
-# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
-# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-# OF THE POSSIBILITY OF SUCH DAMAGE.
-# ====================================================================
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
-
-use strict;
-my $code=".text\n\n";
-my $m=0;
-
-#
-# Define x512 macros
-#
-
-#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
-#
-# uses rax, rdx, and args
-sub MULSTEP_512_ADD
-{
- my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
- my @X=@$x; # make a copy
-$code.=<<___;
- mov (+8*0)($SRC2), %rax
- mul $OP # rdx:rax = %OP * [0]
- mov ($ASRC), $X[0]
- add %rax, $X[0]
- adc \$0, %rdx
- mov $X[0], $DST
-___
-for(my $i=1;$i<8;$i++) {
-$code.=<<___;
- mov %rdx, $TMP
-
- mov (+8*$i)($SRC2), %rax
- mul $OP # rdx:rax = %OP * [$i]
- mov (+8*$i)($ASRC), $X[$i]
- add %rax, $X[$i]
- adc \$0, %rdx
- add $TMP, $X[$i]
- adc \$0, %rdx
-___
-}
-$code.=<<___;
- mov %rdx, $X[0]
-___
-}
-
-#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
-#
-# uses rax, rdx, and args
-sub MULSTEP_512
-{
- my ($x, $DST, $SRC2, $OP, $TMP)=@_;
- my @X=@$x; # make a copy
-$code.=<<___;
- mov (+8*0)($SRC2), %rax
- mul $OP # rdx:rax = %OP * [0]
- add %rax, $X[0]
- adc \$0, %rdx
- mov $X[0], $DST
-___
-for(my $i=1;$i<8;$i++) {
-$code.=<<___;
- mov %rdx, $TMP
-
- mov (+8*$i)($SRC2), %rax
- mul $OP # rdx:rax = %OP * [$i]
- add %rax, $X[$i]
- adc \$0, %rdx
- add $TMP, $X[$i]
- adc \$0, %rdx
-___
-}
-$code.=<<___;
- mov %rdx, $X[0]
-___
-}
-
-#
-# Swizzle Macros
-#
-
-# macro to copy data from flat space to swizzled table
-#MACRO swizzle pDst, pSrc, tmp1, tmp2
-# pDst and pSrc are modified
-sub swizzle
-{
- my ($pDst, $pSrc, $cnt, $d0)=@_;
-$code.=<<___;
- mov \$8, $cnt
-loop_$m:
- mov ($pSrc), $d0
- mov $d0#w, ($pDst)
- shr \$16, $d0
- mov $d0#w, (+64*1)($pDst)
- shr \$16, $d0
- mov $d0#w, (+64*2)($pDst)
- shr \$16, $d0
- mov $d0#w, (+64*3)($pDst)
- lea 8($pSrc), $pSrc
- lea 64*4($pDst), $pDst
- dec $cnt
- jnz loop_$m
-___
-
- $m++;
-}
-
-# macro to copy data from swizzled table to flat space
-#MACRO unswizzle pDst, pSrc, tmp*3
-sub unswizzle
-{
- my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
-$code.=<<___;
- mov \$4, $cnt
-loop_$m:
- movzxw (+64*3+256*0)($pSrc), $d0
- movzxw (+64*3+256*1)($pSrc), $d1
- shl \$16, $d0
- shl \$16, $d1
- mov (+64*2+256*0)($pSrc), $d0#w
- mov (+64*2+256*1)($pSrc), $d1#w
- shl \$16, $d0
- shl \$16, $d1
- mov (+64*1+256*0)($pSrc), $d0#w
- mov (+64*1+256*1)($pSrc), $d1#w
- shl \$16, $d0
- shl \$16, $d1
- mov (+64*0+256*0)($pSrc), $d0#w
- mov (+64*0+256*1)($pSrc), $d1#w
- mov $d0, (+8*0)($pDst)
- mov $d1, (+8*1)($pDst)
- lea 256*2($pSrc), $pSrc
- lea 8*2($pDst), $pDst
- sub \$1, $cnt
- jnz loop_$m
-___
-
- $m++;
-}
-
-#
-# Data Structures
-#
-
-# Reduce Data
-#
-#
-# Offset Value
-# 0C0 Carries
-# 0B8 X2[10]
-# 0B0 X2[9]
-# 0A8 X2[8]
-# 0A0 X2[7]
-# 098 X2[6]
-# 090 X2[5]
-# 088 X2[4]
-# 080 X2[3]
-# 078 X2[2]
-# 070 X2[1]
-# 068 X2[0]
-# 060 X1[12] P[10]
-# 058 X1[11] P[9] Z[8]
-# 050 X1[10] P[8] Z[7]
-# 048 X1[9] P[7] Z[6]
-# 040 X1[8] P[6] Z[5]
-# 038 X1[7] P[5] Z[4]
-# 030 X1[6] P[4] Z[3]
-# 028 X1[5] P[3] Z[2]
-# 020 X1[4] P[2] Z[1]
-# 018 X1[3] P[1] Z[0]
-# 010 X1[2] P[0] Y[2]
-# 008 X1[1] Q[1] Y[1]
-# 000 X1[0] Q[0] Y[0]
-
-my $X1_offset = 0; # 13 qwords
-my $X2_offset = $X1_offset + 13*8; # 11 qwords
-my $Carries_offset = $X2_offset + 11*8; # 1 qword
-my $Q_offset = 0; # 2 qwords
-my $P_offset = $Q_offset + 2*8; # 11 qwords
-my $Y_offset = 0; # 3 qwords
-my $Z_offset = $Y_offset + 3*8; # 9 qwords
-
-my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
-
-#
-# Stack Frame
-#
-#
-# offset value
-# ... <old stack contents>
-# ...
-# 280 Garray
-
-# 278 tmp16[15]
-# ... ...
-# 200 tmp16[0]
-
-# 1F8 tmp[7]
-# ... ...
-# 1C0 tmp[0]
-
-# 1B8 GT[7]
-# ... ...
-# 180 GT[0]
-
-# 178 Reduce Data
-# ... ...
-# 0B8 Reduce Data
-# 0B0 reserved
-# 0A8 reserved
-# 0A0 reserved
-# 098 reserved
-# 090 reserved
-# 088 reduce result addr
-# 080 exp[8]
-
-# ...
-# 048 exp[1]
-# 040 exp[0]
-
-# 038 reserved
-# 030 loop_idx
-# 028 pg
-# 020 i
-# 018 pData ; arg 4
-# 010 pG ; arg 2
-# 008 pResult ; arg 1
-# 000 rsp ; stack pointer before subtract
-
-my $rsp_offset = 0;
-my $pResult_offset = 8*1 + $rsp_offset;
-my $pG_offset = 8*1 + $pResult_offset;
-my $pData_offset = 8*1 + $pG_offset;
-my $i_offset = 8*1 + $pData_offset;
-my $pg_offset = 8*1 + $i_offset;
-my $loop_idx_offset = 8*1 + $pg_offset;
-my $reserved1_offset = 8*1 + $loop_idx_offset;
-my $exp_offset = 8*1 + $reserved1_offset;
-my $red_result_addr_offset= 8*9 + $exp_offset;
-my $reserved2_offset = 8*1 + $red_result_addr_offset;
-my $Reduce_Data_offset = 8*5 + $reserved2_offset;
-my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
-my $tmp_offset = 8*8 + $GT_offset;
-my $tmp16_offset = 8*8 + $tmp_offset;
-my $garray_offset = 8*16 + $tmp16_offset;
-my $mem_size = 8*8*32 + $garray_offset;
-
-#
-# Offsets within Reduce Data
-#
-#
-# struct MODF_2FOLD_MONT_512_C1_DATA {
-# UINT64 t[8][8];
-# UINT64 m[8];
-# UINT64 m1[8]; /* 2^768 % m */
-# UINT64 m2[8]; /* 2^640 % m */
-# UINT64 k1[2]; /* (- 1/m) % 2^128 */
-# };
-
-my $T = 0;
-my $M = 512; # = 8 * 8 * 8
-my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
-my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
-my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
-
-#
-# FUNCTIONS
-#
-
-{{{
-#
-# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
-# and add 512-bits (8 qwords)
-# to get 640 bits (10 qwords)
-# Input: 128-bit mul source: [rdi+8*1], rbp
-# 512-bit mul source: [rsi+8*n]
-# 512-bit add source: r15, r14, ..., r9, r8
-# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
-# Clobbers all regs except: rcx, rsi, rdi
-$code.=<<___;
-.type MULADD_128x512,\@abi-omnipotent
-.align 16
-MULADD_128x512:
-___
- &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
-$code.=<<___;
- mov (+8*1)(%rdi), %rbp
-___
- &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
-$code.=<<___;
- ret
-.size MULADD_128x512,.-MULADD_128x512
-___
-}}}
-
-{{{
-#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
-#
-# Inputs: pDst: Destination (768 bits, 12 qwords)
-# pA: Multiplicand (1024 bits, 16 qwords)
-# pB: Multiplicand (512 bits, 8 qwords)
-# Dst = Ah * B + Al
-# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
-# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
-# Uses registers: arguments, RAX, RDX
-sub MULADD_256x512
-{
- my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
-$code.=<<___;
- mov (+8*12)($pA), $OP
-___
- &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
- push(@$X,shift(@$X));
-
-$code.=<<___;
- mov (+8*13)($pA), $OP
-___
- &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
- push(@$X,shift(@$X));
-
-$code.=<<___;
- mov (+8*14)($pA), $OP
-___
- &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
- push(@$X,shift(@$X));
-
-$code.=<<___;
- mov (+8*15)($pA), $OP
-___
- &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
- push(@$X,shift(@$X));
-}
-
-#
-# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
-# UINT64 *m, /* 512 bits, 8 qwords */
-# MODF_2FOLD_MONT_512_C1_DATA *data,
-# UINT64 *r) /* 512 bits, 8 qwords */
-# Input: x (number to be reduced): tmp16 (Implicit)
-# m (modulus): [pM] (Implicit)
-# data (reduce data): [pData] (Implicit)
-# Output: r (result): Address in [red_res_addr]
-# result also in: r9, r8, r15, r14, r13, r12, r11, r10
-
-my @X=map("%r$_",(8..15));
-
-$code.=<<___;
-.type mont_reduce,\@abi-omnipotent
-.align 16
-mont_reduce:
-___
-
-my $STACK_DEPTH = 8;
- #
- # X1 = Xh * M1 + Xl
-$code.=<<___;
- lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
- mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
- add \$$M1, %rsi
- lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
-
-___
-
- &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
- # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
-
-$code.=<<___;
- xor %rax, %rax
- # X1 += xl
- add (+8*8)(%rcx), $X[4]
- adc (+8*9)(%rcx), $X[5]
- adc (+8*10)(%rcx), $X[6]
- adc (+8*11)(%rcx), $X[7]
- adc \$0, %rax
- # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
-
- #
- # check for carry ;; carry stored in rax
- mov $X[4], (+8*8)(%rdi) # rdi points to X1
- mov $X[5], (+8*9)(%rdi)
- mov $X[6], %rbp
- mov $X[7], (+8*11)(%rdi)
-
- mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
-
- mov (+8*0)(%rdi), $X[4]
- mov (+8*1)(%rdi), $X[5]
- mov (+8*2)(%rdi), $X[6]
- mov (+8*3)(%rdi), $X[7]
-
- # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
- # rdi -> X1
- # rsi -> M1
-
- #
- # X2 = Xh * M2 + Xl
- # do first part (X2 = Xh * M2)
- add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
- # Xh is actually { [rdi+8*1], rbp }
- add \$`$M2-$M1`, %rsi # rsi -> M2
- lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
-___
- unshift(@X,pop(@X)); unshift(@X,pop(@X));
-$code.=<<___;
-
- call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
- # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
- mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
-
- # X2 += Xl
- add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl
- adc (+8*9-8*10)(%rdi), $X[7]
- mov $X[6], (+8*8)(%rcx)
- mov $X[7], (+8*9)(%rcx)
-
- adc %rax, %rax
- mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
-
- lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
- add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
-
- # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
- # B1:B0 = rsi[1:0] = K1[1:0]
- # A1:A0 = rcx[1:0] = X2[1:0]
- # Result = rdi[1],rbp = Q[1],rbp
- mov (%rsi), %r8 # B0
- mov (+8*1)(%rsi), %rbx # B1
-
- mov (%rcx), %rax # A0
- mul %r8 # B0
- mov %rax, %rbp
- mov %rdx, %r9
-
- mov (+8*1)(%rcx), %rax # A1
- mul %r8 # B0
- add %rax, %r9
-
- mov (%rcx), %rax # A0
- mul %rbx # B1
- add %rax, %r9
-
- mov %r9, (+8*1)(%rdi)
- # end MUL_128x128t128
-
- sub \$`$K1-$M`, %rsi
-
- mov (%rcx), $X[6]
- mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
-
- call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
- # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
-
- # load first half of m to rdx, rdi, rbx, rax
- # moved this here for efficiency
- mov (+8*0)(%rsi), %rax
- mov (+8*1)(%rsi), %rbx
- mov (+8*2)(%rsi), %rdi
- mov (+8*3)(%rsi), %rdx
-
- # continue with reduction
- mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
-
- add (+8*8)(%rcx), $X[6]
- adc (+8*9)(%rcx), $X[7]
-
- #accumulate the final carry to rbp
- adc %rbp, %rbp
-
- # Add in overflow corrections: R = (X2>>128) += T[overflow]
- # R = {r9, r8, r15, r14, ..., r10}
- shl \$3, %rbp
- mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T)
- add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
-
- # rsi will be used to generate a mask after the addition
- xor %rsi, %rsi
-
- add (+8*8*0)(%rbp), $X[0]
- adc (+8*8*1)(%rbp), $X[1]
- adc (+8*8*2)(%rbp), $X[2]
- adc (+8*8*3)(%rbp), $X[3]
- adc (+8*8*4)(%rbp), $X[4]
- adc (+8*8*5)(%rbp), $X[5]
- adc (+8*8*6)(%rbp), $X[6]
- adc (+8*8*7)(%rbp), $X[7]
-
- # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
- # if carry is clear: rsi = 0x0000000000000000
- sbb \$0, %rsi
-
- # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
- and %rsi, %rax
- and %rsi, %rbx
- and %rsi, %rdi
- and %rsi, %rdx
-
- mov \$1, %rbp
- sub %rax, $X[0]
- sbb %rbx, $X[1]
- sbb %rdi, $X[2]
- sbb %rdx, $X[3]
-
- # if there is a borrow: rbp = 0
- # if there is no borrow: rbp = 1
- # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
- sbb \$0, %rbp
-
- #load second half of m to rdx, rdi, rbx, rax
-
- add \$$M, %rcx
- mov (+8*4)(%rcx), %rax
- mov (+8*5)(%rcx), %rbx
- mov (+8*6)(%rcx), %rdi
- mov (+8*7)(%rcx), %rdx
-
- # use the rsi mask as before
- # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
- and %rsi, %rax
- and %rsi, %rbx
- and %rsi, %rdi
- and %rsi, %rdx
-
- # if rbp = 0, there was a borrow before, it is moved to the carry flag
- # if rbp = 1, there was not a borrow before, carry flag is cleared
- sub \$1, %rbp
-
- sbb %rax, $X[4]
- sbb %rbx, $X[5]
- sbb %rdi, $X[6]
- sbb %rdx, $X[7]
-
- # write R back to memory
-
- mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
- mov $X[0], (+8*0)(%rsi)
- mov $X[1], (+8*1)(%rsi)
- mov $X[2], (+8*2)(%rsi)
- mov $X[3], (+8*3)(%rsi)
- mov $X[4], (+8*4)(%rsi)
- mov $X[5], (+8*5)(%rsi)
- mov $X[6], (+8*6)(%rsi)
- mov $X[7], (+8*7)(%rsi)
-
- ret
-.size mont_reduce,.-mont_reduce
-___
-}}}
-
-{{{
-#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
-#
-# Inputs: pDst: Destination (1024 bits, 16 qwords)
-# pA: Multiplicand (512 bits, 8 qwords)
-# pB: Multiplicand (512 bits, 8 qwords)
-# Uses registers rax, rdx, args
-# B operand in [pB] and also in x7...x0
-sub MUL_512x512
-{
- my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
- my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
- my @X=@$x; # make a copy
-
-$code.=<<___;
- mov (+8*0)($pA), $OP
-
- mov $X[0], %rax
- mul $OP # rdx:rax = %OP * [0]
- mov %rax, (+$pDst_o+8*0)($pDst)
- mov %rdx, $X[0]
-___
-for(my $i=1;$i<8;$i++) {
-$code.=<<___;
- mov $X[$i], %rax
- mul $OP # rdx:rax = %OP * [$i]
- add %rax, $X[$i-1]
- adc \$0, %rdx
- mov %rdx, $X[$i]
-___
-}
-
-for(my $i=1;$i<8;$i++) {
-$code.=<<___;
- mov (+8*$i)($pA), $OP
-___
-
- &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
- push(@X,shift(@X));
-}
-
-$code.=<<___;
- mov $X[0], (+$pDst_o+8*8)($pDst)
- mov $X[1], (+$pDst_o+8*9)($pDst)
- mov $X[2], (+$pDst_o+8*10)($pDst)
- mov $X[3], (+$pDst_o+8*11)($pDst)
- mov $X[4], (+$pDst_o+8*12)($pDst)
- mov $X[5], (+$pDst_o+8*13)($pDst)
- mov $X[6], (+$pDst_o+8*14)($pDst)
- mov $X[7], (+$pDst_o+8*15)($pDst)
-___
-}
-
-#
-# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
-# Input: src1: Address of source 1: rdi
-# src2: Address of source 2: rsi
-# Output: dst: Address of destination: [red_res_addr]
-# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
-# Temp: Clobbers [tmp16], all registers
-$code.=<<___;
-.type mont_mul_a3b,\@abi-omnipotent
-.align 16
-mont_mul_a3b:
- #
- # multiply tmp = src1 * src2
- # For multiply: dst = rcx, src1 = rdi, src2 = rsi
- # stack depth is extra 8 from call
-___
- &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
-$code.=<<___;
- #
- # Dst = tmp % m
- # Call reduce(tmp, m, data, dst)
-
- # tail recursion optimization: jmp to mont_reduce and return from there
- jmp mont_reduce
- # call mont_reduce
- # ret
-.size mont_mul_a3b,.-mont_mul_a3b
-___
-}}}
-
-{{{
-#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
-#
-# Input in memory [pA] and also in x7...x0
-# Uses all argument registers plus rax and rdx
-#
-# This version computes all of the off-diagonal terms into memory,
-# and then it adds in the diagonal terms
-
-sub SQR_512
-{
- my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
- my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
- my @X=@$x; # make a copy
-$code.=<<___;
- # ------------------
- # first pass 01...07
- # ------------------
- mov $X[0], $A
-
- mov $X[1],%rax
- mul $A
- mov %rax, (+$pDst_o+8*1)($pDst)
-___
-for(my $i=2;$i<8;$i++) {
-$code.=<<___;
- mov %rdx, $X[$i-2]
- mov $X[$i],%rax
- mul $A
- add %rax, $X[$i-2]
- adc \$0, %rdx
-___
-}
-$code.=<<___;
- mov %rdx, $x7
-
- mov $X[0], (+$pDst_o+8*2)($pDst)
-
- # ------------------
- # second pass 12...17
- # ------------------
-
- mov (+8*1)($pA), $A
-
- mov (+8*2)($pA),%rax
- mul $A
- add %rax, $X[1]
- adc \$0, %rdx
- mov $X[1], (+$pDst_o+8*3)($pDst)
-
- mov %rdx, $X[0]
- mov (+8*3)($pA),%rax
- mul $A
- add %rax, $X[2]
- adc \$0, %rdx
- add $X[0], $X[2]
- adc \$0, %rdx
- mov $X[2], (+$pDst_o+8*4)($pDst)
-
- mov %rdx, $X[0]
- mov (+8*4)($pA),%rax
- mul $A
- add %rax, $X[3]
- adc \$0, %rdx
- add $X[0], $X[3]
- adc \$0, %rdx
-
- mov %rdx, $X[0]
- mov (+8*5)($pA),%rax
- mul $A
- add %rax, $X[4]
- adc \$0, %rdx
- add $X[0], $X[4]
- adc \$0, %rdx
-
- mov %rdx, $X[0]
- mov $X[6],%rax
- mul $A
- add %rax, $X[5]
- adc \$0, %rdx
- add $X[0], $X[5]
- adc \$0, %rdx
-
- mov %rdx, $X[0]
- mov $X[7],%rax
- mul $A
- add %rax, $x7
- adc \$0, %rdx
- add $X[0], $x7
- adc \$0, %rdx
-
- mov %rdx, $X[1]
-
- # ------------------
- # third pass 23...27
- # ------------------
- mov (+8*2)($pA), $A
-
- mov (+8*3)($pA),%rax
- mul $A
- add %rax, $X[3]
- adc \$0, %rdx
- mov $X[3], (+$pDst_o+8*5)($pDst)
-
- mov %rdx, $X[0]
- mov (+8*4)($pA),%rax
- mul $A
- add %rax, $X[4]
- adc \$0, %rdx
- add $X[0], $X[4]
- adc \$0, %rdx
- mov $X[4], (+$pDst_o+8*6)($pDst)
-
- mov %rdx, $X[0]
- mov (+8*5)($pA),%rax
- mul $A
- add %rax, $X[5]
- adc \$0, %rdx
- add $X[0], $X[5]
- adc \$0, %rdx
-
- mov %rdx, $X[0]
- mov $X[6],%rax
- mul $A
- add %rax, $x7
- adc \$0, %rdx
- add $X[0], $x7
- adc \$0, %rdx
-
- mov %rdx, $X[0]
- mov $X[7],%rax
- mul $A
- add %rax, $X[1]
- adc \$0, %rdx
- add $X[0], $X[1]
- adc \$0, %rdx
-
- mov %rdx, $X[2]
-
- # ------------------
- # fourth pass 34...37
- # ------------------
-
- mov (+8*3)($pA), $A
-
- mov (+8*4)($pA),%rax
- mul $A
- add %rax, $X[5]
- adc \$0, %rdx
- mov $X[5], (+$pDst_o+8*7)($pDst)
-
- mov %rdx, $X[0]
- mov (+8*5)($pA),%rax
- mul $A
- add %rax, $x7
- adc \$0, %rdx
- add $X[0], $x7
- adc \$0, %rdx
- mov $x7, (+$pDst_o+8*8)($pDst)
-
- mov %rdx, $X[0]
- mov $X[6],%rax
- mul $A
- add %rax, $X[1]
- adc \$0, %rdx
- add $X[0], $X[1]
- adc \$0, %rdx
-
- mov %rdx, $X[0]
- mov $X[7],%rax
- mul $A
- add %rax, $X[2]
- adc \$0, %rdx
- add $X[0], $X[2]
- adc \$0, %rdx
-
- mov %rdx, $X[5]
-
- # ------------------
- # fifth pass 45...47
- # ------------------
- mov (+8*4)($pA), $A
-
- mov (+8*5)($pA),%rax
- mul $A
- add %rax, $X[1]
- adc \$0, %rdx
- mov $X[1], (+$pDst_o+8*9)($pDst)
-
- mov %rdx, $X[0]
- mov $X[6],%rax
- mul $A
- add %rax, $X[2]
- adc \$0, %rdx
- add $X[0], $X[2]
- adc \$0, %rdx
- mov $X[2], (+$pDst_o+8*10)($pDst)
-
- mov %rdx, $X[0]
- mov $X[7],%rax
- mul $A
- add %rax, $X[5]
- adc \$0, %rdx
- add $X[0], $X[5]
- adc \$0, %rdx
-
- mov %rdx, $X[1]
-
- # ------------------
- # sixth pass 56...57
- # ------------------
- mov (+8*5)($pA), $A
-
- mov $X[6],%rax
- mul $A
- add %rax, $X[5]
- adc \$0, %rdx
- mov $X[5], (+$pDst_o+8*11)($pDst)
-
- mov %rdx, $X[0]
- mov $X[7],%rax
- mul $A
- add %rax, $X[1]
- adc \$0, %rdx
- add $X[0], $X[1]
- adc \$0, %rdx
- mov $X[1], (+$pDst_o+8*12)($pDst)
-
- mov %rdx, $X[2]
-
- # ------------------
- # seventh pass 67
- # ------------------
- mov $X[6], $A
-
- mov $X[7],%rax
- mul $A
- add %rax, $X[2]
- adc \$0, %rdx
- mov $X[2], (+$pDst_o+8*13)($pDst)
-
- mov %rdx, (+$pDst_o+8*14)($pDst)
-
- # start finalize (add in squares, and double off-terms)
- mov (+$pDst_o+8*1)($pDst), $X[0]
- mov (+$pDst_o+8*2)($pDst), $X[1]
- mov (+$pDst_o+8*3)($pDst), $X[2]
- mov (+$pDst_o+8*4)($pDst), $X[3]
- mov (+$pDst_o+8*5)($pDst), $X[4]
- mov (+$pDst_o+8*6)($pDst), $X[5]
-
- mov (+8*3)($pA), %rax
- mul %rax
- mov %rax, $x6
- mov %rdx, $X[6]
-
- add $X[0], $X[0]
- adc $X[1], $X[1]
- adc $X[2], $X[2]
- adc $X[3], $X[3]
- adc $X[4], $X[4]
- adc $X[5], $X[5]
- adc \$0, $X[6]
-
- mov (+8*0)($pA), %rax
- mul %rax
- mov %rax, (+$pDst_o+8*0)($pDst)
- mov %rdx, $A
-
- mov (+8*1)($pA), %rax
- mul %rax
-
- add $A, $X[0]
- adc %rax, $X[1]
- adc \$0, %rdx
-
- mov %rdx, $A
- mov $X[0], (+$pDst_o+8*1)($pDst)
- mov $X[1], (+$pDst_o+8*2)($pDst)
-
- mov (+8*2)($pA), %rax
- mul %rax
-
- add $A, $X[2]
- adc %rax, $X[3]
- adc \$0, %rdx
-
- mov %rdx, $A
-
- mov $X[2], (+$pDst_o+8*3)($pDst)
- mov $X[3], (+$pDst_o+8*4)($pDst)
-
- xor $tmp, $tmp
- add $A, $X[4]
- adc $x6, $X[5]
- adc \$0, $tmp
-
- mov $X[4], (+$pDst_o+8*5)($pDst)
- mov $X[5], (+$pDst_o+8*6)($pDst)
-
- # %%tmp has 0/1 in column 7
- # %%A6 has a full value in column 7
-
- mov (+$pDst_o+8*7)($pDst), $X[0]
- mov (+$pDst_o+8*8)($pDst), $X[1]
- mov (+$pDst_o+8*9)($pDst), $X[2]
- mov (+$pDst_o+8*10)($pDst), $X[3]
- mov (+$pDst_o+8*11)($pDst), $X[4]
- mov (+$pDst_o+8*12)($pDst), $X[5]
- mov (+$pDst_o+8*13)($pDst), $x6
- mov (+$pDst_o+8*14)($pDst), $x7
-
- mov $X[7], %rax
- mul %rax
- mov %rax, $X[7]
- mov %rdx, $A
-
- add $X[0], $X[0]
- adc $X[1], $X[1]
- adc $X[2], $X[2]
- adc $X[3], $X[3]
- adc $X[4], $X[4]
- adc $X[5], $X[5]
- adc $x6, $x6
- adc $x7, $x7
- adc \$0, $A
-
- add $tmp, $X[0]
-
- mov (+8*4)($pA), %rax
- mul %rax
-
- add $X[6], $X[0]
- adc %rax, $X[1]
- adc \$0, %rdx
-
- mov %rdx, $tmp
-
- mov $X[0], (+$pDst_o+8*7)($pDst)
- mov $X[1], (+$pDst_o+8*8)($pDst)
-
- mov (+8*5)($pA), %rax
- mul %rax
-
- add $tmp, $X[2]
- adc %rax, $X[3]
- adc \$0, %rdx
-
- mov %rdx, $tmp
-
- mov $X[2], (+$pDst_o+8*9)($pDst)
- mov $X[3], (+$pDst_o+8*10)($pDst)
-
- mov (+8*6)($pA), %rax
- mul %rax
-
- add $tmp, $X[4]
- adc %rax, $X[5]
- adc \$0, %rdx
-
- mov $X[4], (+$pDst_o+8*11)($pDst)
- mov $X[5], (+$pDst_o+8*12)($pDst)
-
- add %rdx, $x6
- adc $X[7], $x7
- adc \$0, $A
-
- mov $x6, (+$pDst_o+8*13)($pDst)
- mov $x7, (+$pDst_o+8*14)($pDst)
- mov $A, (+$pDst_o+8*15)($pDst)
-___
-}
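
As the header notes, the off-diagonal products a[i]*a[j] (i < j) are accumulated first, and the finalize pass doubles them and folds in the diagonal squares, roughly halving the number of mul instructions versus a full multiply. The identity in Perl, assuming 8 Math::BigInt limbs:

    use Math::BigInt;

    # a^2 = sum_i a[i]^2 * 2^(128*i) + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
    sub sqr_512_model {
        my ($a) = @_;                      # array ref, 8 BigInt limbs
        my ($diag, $off) = (Math::BigInt->bzero(), Math::BigInt->bzero());
        for my $i (0 .. 7) {
            $diag += $a->[$i] * $a->[$i] << (128 * $i);
            for my $j ($i + 1 .. 7) {      # off-diagonal terms, i < j
                $off += $a->[$i] * $a->[$j] << (64 * ($i + $j));
            }
        }
        return $diag + ($off << 1);        # double the cross products
    }
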
-
-#
-# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
-#
-# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
-#
-$code.=<<___;
-.type sqr_reduce,\@abi-omnipotent
-.align 16
-sqr_reduce:
- mov (+$pResult_offset+8)(%rsp), %rcx
-___
- &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
-$code.=<<___;
- # tail recursion optimization: jmp to mont_reduce and return from there
- jmp mont_reduce
- # call mont_reduce
- # ret
-.size sqr_reduce,.-sqr_reduce
-___
-}}}
-
-#
-# MAIN FUNCTION
-#
-
-#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
-# UINT64 *g, /* 512 bits, 8 qwords */
-# UINT64 *exp, /* 512 bits, 8 qwords */
-# struct mod_ctx_512 *data)
-
-# window size = 5
-# table size = 2^5 = 32
-#table_entries equ 32
-#table_size equ table_entries * 8
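
Putting the pieces together: the function precomputes G^0..G^31 in Montgomery form, seeds the result from the top five exponent bits, then per 5-bit window performs five squarings and one table multiply. The window after the seed holds only two bits, since 512 = 5 + 2 + 101*5; the and $0x07FF store below masks out the consumed top bits for the same reason. A Perl sketch of that schedule, with plain modular arithmetic standing in for the Montgomery form and the swizzled table:

    use Math::BigInt;

    sub mod_exp_512_model {
        my ($g, $e, $m) = @_;                        # Math::BigInt, 0 <= e < 2^512
        my @tbl = (Math::BigInt->bone());
        push @tbl, ($tbl[-1] * $g) % $m for 1 .. 31; # table of g^0 .. g^31
        my $r = $tbl[$e->copy->brsft(507)->numify];  # seed with exp{511:507}
        $e = $e % Math::BigInt->new(2)->bpow(507);   # drop the consumed bits
        for (my $i = 505; $i >= 0; $i -= 5) {
            my $n = $i == 505 ? 2 : 5;               # short first window
            $r = ($r * $r) % $m for 1 .. $n;
            my $w = $e->copy->brsft($i)->band(0x1F)->numify;
            $r = ($r * $tbl[$w]) % $m;
        }
        return $r;                         # equals $g->copy->bmodpow($e, $m)
    }
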
-$code.=<<___;
-.globl mod_exp_512
-.type mod_exp_512,\@function,4
-mod_exp_512:
- push %rbp
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # adjust stack down and then align it with cache boundary
- mov %rsp, %r8
- sub \$$mem_size, %rsp
- and \$-64, %rsp
-
- # store previous stack pointer and arguments
- mov %r8, (+$rsp_offset)(%rsp)
- mov %rdi, (+$pResult_offset)(%rsp)
- mov %rsi, (+$pG_offset)(%rsp)
- mov %rcx, (+$pData_offset)(%rsp)
-.Lbody:
- # transform g into montgomery space
- # GT = reduce(g * C2) = reduce(g * (2^256))
- # reduce expects to have the input in [tmp16]
- pxor %xmm4, %xmm4
- movdqu (+16*0)(%rsi), %xmm0
- movdqu (+16*1)(%rsi), %xmm1
- movdqu (+16*2)(%rsi), %xmm2
- movdqu (+16*3)(%rsi), %xmm3
- movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
- movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
- movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
- movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
- movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
- movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
- movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
- movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
-
- # load pExp before rdx gets blown away
- movdqu (+16*0)(%rdx), %xmm0
- movdqu (+16*1)(%rdx), %xmm1
- movdqu (+16*2)(%rdx), %xmm2
- movdqu (+16*3)(%rdx), %xmm3
-
- lea (+$GT_offset)(%rsp), %rbx
- mov %rbx, (+$red_result_addr_offset)(%rsp)
- call mont_reduce
-
- # Initialize tmp = C
- lea (+$tmp_offset)(%rsp), %rcx
- xor %rax, %rax
- mov %rax, (+8*0)(%rcx)
- mov %rax, (+8*1)(%rcx)
- mov %rax, (+8*3)(%rcx)
- mov %rax, (+8*4)(%rcx)
- mov %rax, (+8*5)(%rcx)
- mov %rax, (+8*6)(%rcx)
- mov %rax, (+8*7)(%rcx)
- mov %rax, (+$exp_offset+8*8)(%rsp)
- movq \$1, (+8*2)(%rcx)
-
- lea (+$garray_offset)(%rsp), %rbp
- mov %rcx, %rsi # pTmp
- mov %rbp, %rdi # Garray[][0]
-___
-
- &swizzle("%rdi", "%rcx", "%rax", "%rbx");
-
- # for (rax = 31; rax != 0; rax--) {
- # tmp = reduce(tmp * G)
- # swizzle(pg, tmp);
- # pg += 2; }
-$code.=<<___;
- mov \$31, %rax
- mov %rax, (+$i_offset)(%rsp)
- mov %rbp, (+$pg_offset)(%rsp)
- # rsi -> pTmp
- mov %rsi, (+$red_result_addr_offset)(%rsp)
- mov (+8*0)(%rsi), %r10
- mov (+8*1)(%rsi), %r11
- mov (+8*2)(%rsi), %r12
- mov (+8*3)(%rsi), %r13
- mov (+8*4)(%rsi), %r14
- mov (+8*5)(%rsi), %r15
- mov (+8*6)(%rsi), %r8
- mov (+8*7)(%rsi), %r9
-init_loop:
- lea (+$GT_offset)(%rsp), %rdi
- call mont_mul_a3b
- lea (+$tmp_offset)(%rsp), %rsi
- mov (+$pg_offset)(%rsp), %rbp
- add \$2, %rbp
- mov %rbp, (+$pg_offset)(%rsp)
- mov %rsi, %rcx # rcx = rsi = addr of tmp
-___
-
- &swizzle("%rbp", "%rcx", "%rax", "%rbx");
-$code.=<<___;
- mov (+$i_offset)(%rsp), %rax
- sub \$1, %rax
- mov %rax, (+$i_offset)(%rsp)
- jne init_loop
-
- #
- # Copy exponent onto stack
- movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
- movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
- movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
- movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
-
-
- #
- # Do exponentiation
- # Initialize result to G[exp{511:507}]
- mov (+$exp_offset+62)(%rsp), %eax
- mov %rax, %rdx
- shr \$11, %rax
- and \$0x07FF, %edx
- mov %edx, (+$exp_offset+62)(%rsp)
- lea (+$garray_offset)(%rsp,%rax,2), %rsi
- mov (+$pResult_offset)(%rsp), %rdx
-___
-
- &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
-
- #
- # Loop variables
- # rcx = [loop_idx] = index: 510-5 to 0 by 5
-$code.=<<___;
- movq \$505, (+$loop_idx_offset)(%rsp)
-
- mov (+$pResult_offset)(%rsp), %rcx
- mov %rcx, (+$red_result_addr_offset)(%rsp)
- mov (+8*0)(%rcx), %r10
- mov (+8*1)(%rcx), %r11
- mov (+8*2)(%rcx), %r12
- mov (+8*3)(%rcx), %r13
- mov (+8*4)(%rcx), %r14
- mov (+8*5)(%rcx), %r15
- mov (+8*6)(%rcx), %r8
- mov (+8*7)(%rcx), %r9
- jmp sqr_2
-
-main_loop_a3b:
- call sqr_reduce
- call sqr_reduce
- call sqr_reduce
-sqr_2:
- call sqr_reduce
- call sqr_reduce
-
- #
- # Do multiply, first look up proper value in Garray
- mov (+$loop_idx_offset)(%rsp), %rcx # bit index
- mov %rcx, %rax
- shr \$4, %rax # rax is word pointer
- mov (+$exp_offset)(%rsp,%rax,2), %edx
- and \$15, %rcx
- shrq %cl, %rdx
- and \$0x1F, %rdx
-
- lea (+$garray_offset)(%rsp,%rdx,2), %rsi
- lea (+$tmp_offset)(%rsp), %rdx
- mov %rdx, %rdi
-___
-
- &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
- # rdi = tmp = pG
-
- #
- # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData)
- # result result pG M Data
-$code.=<<___;
- mov (+$pResult_offset)(%rsp), %rsi
- call mont_mul_a3b
-
- #
- # finish loop
- mov (+$loop_idx_offset)(%rsp), %rcx
- sub \$5, %rcx
- mov %rcx, (+$loop_idx_offset)(%rsp)
- jge main_loop_a3b
-
- #
-
-end_main_loop_a3b:
- # transform result out of Montgomery space
- # result = reduce(result)
- mov (+$pResult_offset)(%rsp), %rdx
- pxor %xmm4, %xmm4
- movdqu (+16*0)(%rdx), %xmm0
- movdqu (+16*1)(%rdx), %xmm1
- movdqu (+16*2)(%rdx), %xmm2
- movdqu (+16*3)(%rdx), %xmm3
- movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
- movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
- movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
- movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
- movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
- movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
- movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
- movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
- call mont_reduce
-
-	# If result > m, subtract m
- # load result into r15:r8
- mov (+$pResult_offset)(%rsp), %rax
- mov (+8*0)(%rax), %r8
- mov (+8*1)(%rax), %r9
- mov (+8*2)(%rax), %r10
- mov (+8*3)(%rax), %r11
- mov (+8*4)(%rax), %r12
- mov (+8*5)(%rax), %r13
- mov (+8*6)(%rax), %r14
- mov (+8*7)(%rax), %r15
-
- # subtract m
- mov (+$pData_offset)(%rsp), %rbx
- add \$$M, %rbx
-
- sub (+8*0)(%rbx), %r8
- sbb (+8*1)(%rbx), %r9
- sbb (+8*2)(%rbx), %r10
- sbb (+8*3)(%rbx), %r11
- sbb (+8*4)(%rbx), %r12
- sbb (+8*5)(%rbx), %r13
- sbb (+8*6)(%rbx), %r14
- sbb (+8*7)(%rbx), %r15
-
- # if Carry is clear, replace result with difference
- mov (+8*0)(%rax), %rsi
- mov (+8*1)(%rax), %rdi
- mov (+8*2)(%rax), %rcx
- mov (+8*3)(%rax), %rdx
- cmovnc %r8, %rsi
- cmovnc %r9, %rdi
- cmovnc %r10, %rcx
- cmovnc %r11, %rdx
- mov %rsi, (+8*0)(%rax)
- mov %rdi, (+8*1)(%rax)
- mov %rcx, (+8*2)(%rax)
- mov %rdx, (+8*3)(%rax)
-
- mov (+8*4)(%rax), %rsi
- mov (+8*5)(%rax), %rdi
- mov (+8*6)(%rax), %rcx
- mov (+8*7)(%rax), %rdx
- cmovnc %r12, %rsi
- cmovnc %r13, %rdi
- cmovnc %r14, %rcx
- cmovnc %r15, %rdx
- mov %rsi, (+8*4)(%rax)
- mov %rdi, (+8*5)(%rax)
- mov %rcx, (+8*6)(%rax)
- mov %rdx, (+8*7)(%rax)
-
- mov (+$rsp_offset)(%rsp), %rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbx
- mov 40(%rsi),%rbp
- lea 48(%rsi),%rsp
-.Lepilogue:
- ret
-.size mod_exp_512, . - mod_exp_512
-___
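
The epilogue of mod_exp_512 above subtracts m with a sub/sbb ripple and uses cmovnc to keep the difference only when no borrow came out, i.e. when result >= m, without branching on secret data. A limb-level Perl model of that selection (Perl itself is of course not constant time; this only shows the logic):

    use Math::BigInt;

    sub cond_sub_512_model {
        my ($r, $m) = @_;                 # array refs, 8 Math::BigInt limbs each
        my $base = Math::BigInt->new(2)->bpow(64);
        my ($borrow, @d) = (0);
        for my $i (0 .. 7) {              # the sub/sbb chain
            my $t = $r->[$i] - $m->[$i] - $borrow;
            $borrow = $t->is_neg() ? 1 : 0;
            push @d, $borrow ? $t + $base : $t;
        }
        # carry clear (no final borrow) means r >= m: keep the difference
        return $borrow ? $r : \@d;
    }
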
-
-if ($win64) {
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-my $rec="%rcx";
-my $frame="%rdx";
-my $context="%r8";
-my $disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type mod_exp_512_se_handler,\@abi-omnipotent
-.align 16
-mod_exp_512_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- lea .Lbody(%rip),%r10
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lin_prologue
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lin_prologue
-
- mov $rsp_offset(%rax),%rax # pull saved Rsp
-
- mov 32(%rax),%rbx
- mov 40(%rax),%rbp
- mov 24(%rax),%r12
- mov 16(%rax),%r13
- mov 8(%rax),%r14
- mov 0(%rax),%r15
- lea 48(%rax),%rax
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
-
-.Lin_prologue:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
-.size mod_exp_512_se_handler,.-mod_exp_512_se_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_mod_exp_512
- .rva .LSEH_end_mod_exp_512
- .rva .LSEH_info_mod_exp_512
-
-.section .xdata
-.align 8
-.LSEH_info_mod_exp_512:
- .byte 9,0,0,0
- .rva mod_exp_512_se_handler
-___
-}
-
-sub reg_part {
-my ($reg,$conv)=@_;
- if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
- elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
- elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
- elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
- return $reg;
-}
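
reg_part maps a 64-bit register name plus a size suffix to the matching sub-register; the first substitution below applies it wherever the generated code writes e.g. %rax#b. Sample expansions, taken straight from the regexes above:

    printf "%s\n", reg_part("%rax", "b");   # %al
    printf "%s\n", reg_part("%rax", "w");   # %ax
    printf "%s\n", reg_part("%rax", "d");   # %eax
    printf "%s\n", reg_part("%r10", "b");   # %r10b (numbered regs just append)
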
-
-$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/(\(\+[^)]+\))/eval $1/gem;
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index f9b6992ccc82..da69c6aaaf6a 100755
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -325,6 +325,7 @@ Lcopy: ; copy or in-place refresh
.long 0
.byte 0,12,4,0,0x80,12,6,0
.long 0
+.size .bn_mul_mont_int,.-.bn_mul_mont_int
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index 1249ce229988..04df1fe5cc65 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -392,6 +392,7 @@ $data=<<EOF;
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
+.size .bn_sqr_comba4,.-.bn_sqr_comba4
#
# NOTE: The following label name should be changed to
@@ -819,6 +820,7 @@ $data=<<EOF;
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
+.size .bn_sqr_comba8,.-.bn_sqr_comba8
#
# NOTE: The following label name should be changed to
@@ -972,6 +974,7 @@ $data=<<EOF;
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
+.size .bn_mul_comba4,.-.bn_mul_comba4
#
# NOTE: The following label name should be changed to
@@ -1510,6 +1513,7 @@ $data=<<EOF;
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
+.size .bn_mul_comba8,.-.bn_mul_comba8
#
# NOTE: The following label name should be changed to
@@ -1560,6 +1564,7 @@ Lppcasm_sub_adios:
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
+.size .bn_sub_words,.-.bn_sub_words
#
# NOTE: The following label name should be changed to
@@ -1605,6 +1610,7 @@ Lppcasm_add_adios:
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
+.size .bn_add_words,.-.bn_add_words
#
# NOTE: The following label name should be changed to
@@ -1720,6 +1726,7 @@ Lppcasm_div9:
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
+.size .bn_div_words,.-.bn_div_words
#
# NOTE: The following label name should be changed to
@@ -1761,6 +1768,7 @@ Lppcasm_sqr_adios:
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
+.size .bn_sqr_words,.-.bn_sqr_words
#
# NOTE: The following label name should be changed to
@@ -1866,6 +1874,7 @@ Lppcasm_mw_OVER:
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
+.size bn_mul_words,.-bn_mul_words
#
# NOTE: The following label name should be changed to
@@ -1991,6 +2000,7 @@ Lppcasm_maw_adios:
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
+.size .bn_mul_add_words,.-.bn_mul_add_words
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index a14e769ad055..68e3733e3f79 100755
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -65,6 +65,14 @@
# others alternative would be to break dependence on upper halves of
# GPRs by sticking to 32-bit integer operations...
+# December 2012
+
+# Remove above mentioned dependence on GPRs' upper halves in 32-bit
+# build. No signal masking overhead, but integer instructions are
+# *more* numerous... It's still "universally" faster than 32-bit
+# ppc-mont.pl, but improvement coefficient is not as impressive
+# for longer keys...
+
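
The rewrite that follows replaces each 64-bit mulld on the (no longer relied upon) GPR upper halves with a mullw/mulhwu pair plus two cross-product mullw/add steps. The recipe for the low 64 bits, as a Perl model with 32-bit masks standing in for the halfword registers (which half lands in which register follows the target's endianness):

    use Math::BigInt;

    sub mulld_model {
        my ($a, $b) = @_;                  # Math::BigInt, both < 2^64
        my $m32 = (Math::BigInt->bone() << 32) - 1;
        my ($al, $ah) = ($a & $m32, $a >> 32);
        my ($bl, $bh) = ($b & $m32, $b >> 32);
        my $lo = $al * $bl;                # mullw + mulhwu
        my $hi = (($lo >> 32) + $ah * $bl + $al * $bh) & $m32;  # two mullw, adds
        return ($hi << 32) | ($lo & $m32); # low 64 bits of a*b
    }
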
$flavour = shift;
if ($flavour =~ /32/) {
@@ -110,6 +118,9 @@ $tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
+$c1="r19";
+$n1="r20";
+$a1="r21";
$nap_d="r22"; # interleaved ap and np in double format
$a0="r23"; # ap[0]
$t0="r24"; # temporary registers
@@ -180,8 +191,8 @@ $T3a="f30"; $T3b="f31";
# . .
# +-------------------------------+
# . .
-# -12*size_t +-------------------------------+
-# | 10 saved gpr, r22-r31 |
+# -13*size_t +-------------------------------+
+# | 13 saved gpr, r19-r31 |
# . .
# . .
# -12*8 +-------------------------------+
@@ -215,6 +226,9 @@ $code=<<___;
mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
+ $PUSH r19,`-12*8-13*$SIZE_T`($i)
+ $PUSH r20,`-12*8-12*$SIZE_T`($i)
+ $PUSH r21,`-12*8-11*$SIZE_T`($i)
$PUSH r22,`-12*8-10*$SIZE_T`($i)
$PUSH r23,`-12*8-9*$SIZE_T`($i)
$PUSH r24,`-12*8-8*$SIZE_T`($i)
@@ -237,40 +251,26 @@ $code=<<___;
stfd f29,`-3*8`($i)
stfd f30,`-2*8`($i)
stfd f31,`-1*8`($i)
-___
-$code.=<<___ if ($SIZE_T==8);
- ld $a0,0($ap) ; pull ap[0] value
- ld $n0,0($n0) ; pull n0[0] value
- ld $t3,0($bp) ; bp[0]
-___
-$code.=<<___ if ($SIZE_T==4);
- mr $t1,$n0
- lwz $a0,0($ap) ; pull ap[0,1] value
- lwz $t0,4($ap)
- lwz $n0,0($t1) ; pull n0[0,1] value
- lwz $t1,4($t1)
- lwz $t3,0($bp) ; bp[0,1]
- lwz $t2,4($bp)
- insrdi $a0,$t0,32,0
- insrdi $n0,$t1,32,0
- insrdi $t3,$t2,32,0
-___
-$code.=<<___;
+
addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64
add $nap_d,$tp,$num
and $nap_d,$nap_d,$i ; align to 64 bytes
-
- mulld $t7,$a0,$t3 ; ap[0]*bp[0]
; nap_d is off by 1, because it's used with stfdu/lfdu
addi $nap_d,$nap_d,-8
srwi $j,$num,`3+1` ; counter register, num/2
- mulld $t7,$t7,$n0 ; tp[0]*n0
addi $j,$j,-1
addi $tp,$sp,`$FRAME+$TRANSFER-8`
li $carry,0
mtctr $j
+___
+
+$code.=<<___ if ($SIZE_T==8);
+ ld $a0,0($ap) ; pull ap[0] value
+ ld $t3,0($bp) ; bp[0]
+ ld $n0,0($n0) ; pull n0[0] value
+ mulld $t7,$a0,$t3 ; ap[0]*bp[0]
; transfer bp[0] to FPU as 4x16-bit values
extrdi $t0,$t3,16,48
extrdi $t1,$t3,16,32
@@ -280,6 +280,8 @@ $code.=<<___;
std $t1,`$FRAME+8`($sp)
std $t2,`$FRAME+16`($sp)
std $t3,`$FRAME+24`($sp)
+
+ mulld $t7,$t7,$n0 ; tp[0]*n0
; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
extrdi $t4,$t7,16,48
extrdi $t5,$t7,16,32
@@ -289,21 +291,61 @@ $code.=<<___;
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
-___
-$code.=<<___ if ($SIZE_T==8);
- lwz $t0,4($ap) ; load a[j] as 32-bit word pair
- lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
+
+ extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
+ extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
+ lwz $t2,12($ap) ; load a[1] as 32-bit word pair
lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[j] as 32-bit word pair
+ lwz $t4,4($np) ; load n[0] as 32-bit word pair
lwz $t5,0($np)
- lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
+ lwz $t6,12($np) ; load n[1] as 32-bit word pair
lwz $t7,8($np)
___
$code.=<<___ if ($SIZE_T==4);
- lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
- lwz $t1,4($ap)
- lwz $t2,8($ap)
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ mr $n1,$n0
+ lwz $a1,4($ap)
+ li $c1,0
+ lwz $t1,0($bp) ; bp[0,1]
+ lwz $t3,4($bp)
+ lwz $n0,0($n1) ; pull n0[0,1] value
+ lwz $n1,4($n1)
+
+ mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
+ mulhwu $t5,$a0,$t1
+ mullw $t6,$a1,$t1
+ mullw $t7,$a0,$t3
+ add $t5,$t5,$t6
+ add $t5,$t5,$t7
+ ; transfer bp[0] to FPU as 4x16-bit values
+ extrwi $t0,$t1,16,16
+ extrwi $t1,$t1,16,0
+ extrwi $t2,$t3,16,16
+ extrwi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+
+ mullw $t0,$t4,$n0 ; mulld tp[0]*n0
+ mulhwu $t1,$t4,$n0
+ mullw $t2,$t5,$n0
+ mullw $t3,$t4,$n1
+ add $t1,$t1,$t2
+ add $t1,$t1,$t3
+ ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+ extrwi $t4,$t0,16,16
+ extrwi $t5,$t0,16,0
+ extrwi $t6,$t1,16,16
+ extrwi $t7,$t1,16,0
+ std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+
+ mr $t0,$a0 ; lwz $t0,0($ap)
+ mr $t1,$a1 ; lwz $t1,4($ap)
+ lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
lwz $t3,12($ap)
lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
lwz $t5,4($np)
@@ -319,7 +361,7 @@ $code.=<<___;
lfd $nb,`$FRAME+40`($sp)
lfd $nc,`$FRAME+48`($sp)
lfd $nd,`$FRAME+56`($sp)
- std $t0,`$FRAME+64`($sp)
+ std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
std $t3,`$FRAME+88`($sp)
@@ -441,7 +483,7 @@ $code.=<<___ if ($SIZE_T==4);
lwz $t7,12($np)
___
$code.=<<___;
- std $t0,`$FRAME+64`($sp)
+ std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
std $t3,`$FRAME+88`($sp)
@@ -449,6 +491,9 @@ $code.=<<___;
std $t5,`$FRAME+104`($sp)
std $t6,`$FRAME+112`($sp)
std $t7,`$FRAME+120`($sp)
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
ld $t0,`$FRAME+0`($sp)
ld $t1,`$FRAME+8`($sp)
ld $t2,`$FRAME+16`($sp)
@@ -457,6 +502,20 @@ $code.=<<___;
ld $t5,`$FRAME+40`($sp)
ld $t6,`$FRAME+48`($sp)
ld $t7,`$FRAME+56`($sp)
+___
+} else {
+$code.=<<___;
+ lwz $t1,`$FRAME+0`($sp)
+ lwz $t0,`$FRAME+4`($sp)
+ lwz $t3,`$FRAME+8`($sp)
+ lwz $t2,`$FRAME+12`($sp)
+ lwz $t5,`$FRAME+16`($sp)
+ lwz $t4,`$FRAME+20`($sp)
+ lwz $t7,`$FRAME+24`($sp)
+ lwz $t6,`$FRAME+28`($sp)
+___
+}
+$code.=<<___;
lfd $A0,`$FRAME+64`($sp)
lfd $A1,`$FRAME+72`($sp)
lfd $A2,`$FRAME+80`($sp)
@@ -488,7 +547,9 @@ $code.=<<___;
fmadd $T0b,$A0,$bb,$dotb
stfd $A2,24($nap_d) ; save a[j+1] in double format
stfd $A3,32($nap_d)
-
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
fmadd $T1a,$A0,$bc,$T1a
fmadd $T1b,$A0,$bd,$T1b
fmadd $T2a,$A1,$bc,$T2a
@@ -561,11 +622,123 @@ $code.=<<___;
stfd $T3b,`$FRAME+56`($sp)
std $t0,8($tp) ; tp[j-1]
stdu $t4,16($tp) ; tp[j]
+___
+} else {
+$code.=<<___;
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lwz $t3,`$FRAME+32`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ lwz $t7,`$FRAME+40`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ stw $t0,12($tp) ; tp[j-1]
+ stw $t4,8($tp)
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ lwz $t1,`$FRAME+48`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ lwz $t5,`$FRAME+56`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ stw $t2,20($tp) ; tp[j]
+ stwu $t0,16($tp)
+___
+}
+$code.=<<___;
bdnz- L1st
fctid $dota,$dota
fctid $dotb,$dotb
-
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
ld $t0,`$FRAME+0`($sp)
ld $t1,`$FRAME+8`($sp)
ld $t2,`$FRAME+16`($sp)
@@ -611,33 +784,117 @@ $code.=<<___;
insrdi $t6,$t7,48,0
srdi $ovf,$t7,48
std $t6,8($tp) ; tp[num-1]
+___
+} else {
+$code.=<<___;
+ lwz $t1,`$FRAME+0`($sp)
+ lwz $t0,`$FRAME+4`($sp)
+ lwz $t3,`$FRAME+8`($sp)
+ lwz $t2,`$FRAME+12`($sp)
+ lwz $t5,`$FRAME+16`($sp)
+ lwz $t4,`$FRAME+20`($sp)
+ lwz $t7,`$FRAME+24`($sp)
+ lwz $t6,`$FRAME+28`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ stw $t0,12($tp) ; tp[j-1]
+ stw $t4,8($tp)
+
+ lwz $t3,`$FRAME+32`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+ stw $t2,20($tp) ; tp[j]
+ stwu $t0,16($tp)
+
+ lwz $t7,`$FRAME+64`($sp)
+ lwz $t6,`$FRAME+68`($sp)
+ lwz $t5,`$FRAME+72`($sp)
+ lwz $t4,`$FRAME+76`($sp)
+
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+
+ insrwi $t6,$t4,16,0
+ srwi $t4,$t4,16
+ insrwi $t4,$t5,16,0
+ srwi $ovf,$t5,16
+ stw $t6,12($tp) ; tp[num-1]
+ stw $t4,8($tp)
+___
+}
+$code.=<<___;
slwi $t7,$num,2
subf $nap_d,$t7,$nap_d ; rewind pointer
li $i,8 ; i=1
.align 5
Louter:
-___
-$code.=<<___ if ($SIZE_T==8);
- ldx $t3,$bp,$i ; bp[i]
-___
-$code.=<<___ if ($SIZE_T==4);
- add $t0,$bp,$i
- lwz $t3,0($t0) ; bp[i,i+1]
- lwz $t0,4($t0)
- insrdi $t3,$t0,32,0
-___
-$code.=<<___;
- ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
- mulld $t7,$a0,$t3 ; ap[0]*bp[i]
-
addi $tp,$sp,`$FRAME+$TRANSFER`
- add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
li $carry,0
- mulld $t7,$t7,$n0 ; tp[0]*n0
mtctr $j
+___
+$code.=<<___ if ($SIZE_T==8);
+ ldx $t3,$bp,$i ; bp[i]
+ ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
+ mulld $t7,$a0,$t3 ; ap[0]*bp[i]
+ add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
; transfer bp[i] to FPU as 4x16-bit values
extrdi $t0,$t3,16,48
extrdi $t1,$t3,16,32
@@ -647,6 +904,8 @@ $code.=<<___;
std $t1,`$FRAME+8`($sp)
std $t2,`$FRAME+16`($sp)
std $t3,`$FRAME+24`($sp)
+
+ mulld $t7,$t7,$n0 ; tp[0]*n0
; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
extrdi $t4,$t7,16,48
extrdi $t5,$t7,16,32
@@ -656,7 +915,50 @@ $code.=<<___;
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ li $c1,0
+ lwz $t1,0($t0) ; bp[i,i+1]
+ lwz $t3,4($t0)
+
+ mullw $t4,$a0,$t1 ; ap[0]*bp[i]
+ lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
+ mulhwu $t5,$a0,$t1
+ lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
+ mullw $t6,$a1,$t1
+ mullw $t7,$a0,$t3
+ add $t5,$t5,$t6
+ add $t5,$t5,$t7
+ addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
+ adde $t5,$t5,$t2
+ ; transfer bp[i] to FPU as 4x16-bit values
+ extrwi $t0,$t1,16,16
+ extrwi $t1,$t1,16,0
+ extrwi $t2,$t3,16,16
+ extrwi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+ mullw $t0,$t4,$n0 ; mulld tp[0]*n0
+ mulhwu $t1,$t4,$n0
+ mullw $t2,$t5,$n0
+ mullw $t3,$t4,$n1
+ add $t1,$t1,$t2
+ add $t1,$t1,$t3
+ ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+ extrwi $t4,$t0,16,16
+ extrwi $t5,$t0,16,0
+ extrwi $t6,$t1,16,16
+ extrwi $t7,$t1,16,0
+ std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___;
lfd $A0,8($nap_d) ; load a[j] in double format
lfd $A1,16($nap_d)
lfd $A2,24($nap_d) ; load a[j+1] in double format
@@ -769,7 +1071,9 @@ Linner:
fmul $dotb,$A3,$bd
lfd $A2,24($nap_d) ; load a[j+1] in double format
lfd $A3,32($nap_d)
-
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
fmadd $T1a,$N1,$na,$T1a
fmadd $T1b,$N1,$nb,$T1b
ld $t0,`$FRAME+0`($sp)
@@ -856,10 +1160,131 @@ $code.=<<___;
addze $carry,$carry
std $t3,-16($tp) ; tp[j-1]
std $t5,-8($tp) ; tp[j]
+___
+} else {
+$code.=<<___;
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lwz $t1,`$FRAME+0`($sp)
+ lwz $t0,`$FRAME+4`($sp)
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ lwz $t3,`$FRAME+8`($sp)
+ lwz $t2,`$FRAME+12`($sp)
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ lwz $t5,`$FRAME+16`($sp)
+ lwz $t4,`$FRAME+20`($sp)
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ lwz $t7,`$FRAME+24`($sp)
+ lwz $t6,`$FRAME+28`($sp)
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ lwz $t2,12($tp) ; tp[j]
+ lwz $t3,8($tp)
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+
+ fctid $T0a,$T0a
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fctid $T0b,$T0b
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+ fctid $T1a,$T1a
+ addc $t0,$t0,$t2
+ adde $t4,$t4,$t3
+ lwz $t3,`$FRAME+32`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ fctid $T1b,$T1b
+ addze $carry,$carry
+ addze $c1,$c1
+ stw $t0,4($tp) ; tp[j-1]
+ stw $t4,0($tp)
+ fctid $T2a,$T2a
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ lwz $t7,`$FRAME+40`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ fctid $T2b,$T2b
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ lwz $t1,`$FRAME+48`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ fctid $T3a,$T3a
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ lwz $t5,`$FRAME+56`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ fctid $T3b,$T3b
+
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ lwz $t6,20($tp)
+ lwzu $t7,16($tp)
+ addc $t0,$t0,$carry
+ stfd $T0a,`$FRAME+0`($sp)
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ stfd $T0b,`$FRAME+8`($sp)
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t4,$t4,$carry
+ stfd $T1a,`$FRAME+16`($sp)
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ stfd $T1b,`$FRAME+24`($sp)
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+
+ addc $t2,$t2,$t6
+ stfd $T2a,`$FRAME+32`($sp)
+ adde $t0,$t0,$t7
+ stfd $T2b,`$FRAME+40`($sp)
+ addze $carry,$carry
+ stfd $T3a,`$FRAME+48`($sp)
+ addze $c1,$c1
+ stfd $T3b,`$FRAME+56`($sp)
+ stw $t2,-4($tp) ; tp[j]
+ stw $t0,-8($tp)
+___
+}
+$code.=<<___;
bdnz- Linner
fctid $dota,$dota
fctid $dotb,$dotb
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
ld $t0,`$FRAME+0`($sp)
ld $t1,`$FRAME+8`($sp)
ld $t2,`$FRAME+16`($sp)
@@ -926,7 +1351,116 @@ $code.=<<___;
insrdi $t6,$t7,48,0
srdi $ovf,$t7,48
std $t6,0($tp) ; tp[num-1]
+___
+} else {
+$code.=<<___;
+ lwz $t1,`$FRAME+0`($sp)
+ lwz $t0,`$FRAME+4`($sp)
+ lwz $t3,`$FRAME+8`($sp)
+ lwz $t2,`$FRAME+12`($sp)
+ lwz $t5,`$FRAME+16`($sp)
+ lwz $t4,`$FRAME+20`($sp)
+ lwz $t7,`$FRAME+24`($sp)
+ lwz $t6,`$FRAME+28`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ lwz $t2,12($tp) ; tp[j]
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ lwz $t3,8($tp)
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+
+ addc $t0,$t0,$t2
+ adde $t4,$t4,$t3
+ addze $carry,$carry
+ addze $c1,$c1
+ stw $t0,4($tp) ; tp[j-1]
+ stw $t4,0($tp)
+
+ lwz $t3,`$FRAME+32`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ lwz $t6,20($tp)
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ lwzu $t7,16($tp)
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+
+ addc $t2,$t2,$t6
+ adde $t0,$t0,$t7
+ lwz $t7,`$FRAME+64`($sp)
+ lwz $t6,`$FRAME+68`($sp)
+ addze $carry,$carry
+ addze $c1,$c1
+ lwz $t5,`$FRAME+72`($sp)
+ lwz $t4,`$FRAME+76`($sp)
+
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ stw $t2,-4($tp) ; tp[j]
+ stw $t0,-8($tp)
+ addc $t6,$t6,$ovf
+ addze $t7,$t7
+ srwi $carry,$t6,16
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+
+ insrwi $t6,$t4,16,0
+ srwi $t4,$t4,16
+ insrwi $t4,$t5,16,0
+ srwi $ovf,$t5,16
+ stw $t6,4($tp) ; tp[num-1]
+ stw $t4,0($tp)
+___
+}
+$code.=<<___;
slwi $t7,$num,2
addi $i,$i,8
subf $nap_d,$t7,$nap_d ; rewind pointer
@@ -994,14 +1528,14 @@ $code.=<<___ if ($SIZE_T==4);
mtctr $j
.align 4
-Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
- ldu $t2,16($tp)
+Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
+ lwz $t1,8($tp)
+ lwz $t2,20($tp)
+ lwzu $t3,16($tp)
lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
lwz $t5,8($np)
lwz $t6,12($np)
lwzu $t7,16($np)
- extrdi $t1,$t0,32,0
- extrdi $t3,$t2,32,0
subfe $t4,$t4,$t0 ; tp[j]-np[j]
stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
@@ -1052,6 +1586,9 @@ ___
$code.=<<___;
$POP $i,0($sp)
li r3,1 ; signal "handled"
+ $POP r19,`-12*8-13*$SIZE_T`($i)
+ $POP r20,`-12*8-12*$SIZE_T`($i)
+ $POP r21,`-12*8-11*$SIZE_T`($i)
$POP r22,`-12*8-10*$SIZE_T`($i)
$POP r23,`-12*8-9*$SIZE_T`($i)
$POP r24,`-12*8-8*$SIZE_T`($i)
@@ -1077,8 +1614,9 @@ $code.=<<___;
mr $sp,$i
blr
.long 0
- .byte 0,12,4,0,0x8c,10,6,0
+ .byte 0,12,4,0,0x8c,13,6,0
.long 0
+.size .$fname,.-.$fname
.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
new file mode 100755
index 000000000000..3b6ccf83d13e
--- /dev/null
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -0,0 +1,1898 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright (c) 2012, Intel Corporation #
+# #
+# All rights reserved. #
+# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# * Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# * Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# * Neither the name of the Intel Corporation nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel #
+# (2) University of Haifa, Israel #
+##############################################################################
+# Reference: #
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
+# Exponentiation, Using Advanced Vector Instructions Architectures", #
+# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
+# pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
+# [2] S. Gueron: "Efficient Software Implementations of Modular #
+# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
+# Proceedings of 9th International Conference on Information Technology: #
+# New Generations (ITNG 2012), pp.821-823 (2012) #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+# resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
+# on AVX2 capable x86_64 platforms", #
+# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+##############################################################################
+#
+# +13% improvement over original submission by <appro@openssl.org>
+#
+# rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
+# 2.3GHz Haswell 621 765/+23% 1113/+79%
+# 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
+#
+# (*) if system doesn't support AVX2, for reference purposes;
+# (**) scaled to 2.3GHz to simplify comparison;
+# (***) scalar AD*X code is faster than AVX2 and is preferred code
+# path for Broadwell;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+ $addx = ($1>=2.23);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+ $addx = ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ $addx = ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+ $addx = ($ver>=3.03);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT = *OUT;
+
+if ($avx>1) {{{
+{ # void AMS_WW(
+my $rp="%rdi"; # BN_ULONG *rp,
+my $ap="%rsi"; # const BN_ULONG *ap,
+my $np="%rdx"; # const BN_ULONG *np,
+my $n0="%ecx"; # const BN_ULONG n0,
+my $rep="%r8d"; # int repeat);
+
+# The registers that hold the accumulated redundant result
+# The AMM works on 1024 bit operands, and redundant word size is 29
+# Therefore: ceil(1024/29)/4 = 9
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+# Registers that hold the broadcasted words of bp, currently used
+my $B1="%ymm10";
+my $B2="%ymm11";
+# Registers that hold the broadcasted words of Y, currently used
+my $Y1="%ymm12";
+my $Y2="%ymm13";
+# Helper registers
+my $TEMP1="%ymm14";
+my $AND_MASK="%ymm15";
+# alu registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d"; # loop counter
+my $tmp = "%r15";
+
+my $FrameSize=32*18+32*8; # place for A^2 and 2*A
+
+my $aap=$r0;
+my $tp0="%rbx";
+my $tp1=$r3;
+my $tpa=$tmp;
+
+$np="%r13"; # reassigned argument
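
The redundant representation deserves a note: 1024-bit operands are split into ceil(1024/29) = 36 limbs of 29 bits, four limbs per ymm register, hence the nine accumulators above. 29-bit limbs keep every vpmuludq input under 32 bits and leave 64 - 58 = 6 bits of headroom per lane to accumulate partial products before carries are propagated. A Perl sketch of the split and reassembly:

    use Math::BigInt;

    sub to_limbs29 {
        my ($x) = @_;                            # Math::BigInt, < 2^1024
        my @l;
        for (1 .. 36) {
            push @l, ($x & 0x1fffffff)->numify;  # low 29 bits
            $x = $x >> 29;
        }
        return \@l;
    }

    sub from_limbs29 {
        my ($l) = @_;
        my $x = Math::BigInt->bzero();
        $x += Math::BigInt->new($l->[$_]) << (29 * $_) for 0 .. 35;
        return $x;
    }
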
+
+$code.=<<___;
+.text
+
+.globl rsaz_1024_sqr_avx2
+.type rsaz_1024_sqr_avx2,\@function,5
+.align 64
+rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
+ lea (%rsp), %rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ vmovaps %xmm6,-0xd8(%rax)
+ vmovaps %xmm7,-0xc8(%rax)
+ vmovaps %xmm8,-0xb8(%rax)
+ vmovaps %xmm9,-0xa8(%rax)
+ vmovaps %xmm10,-0x98(%rax)
+ vmovaps %xmm11,-0x88(%rax)
+ vmovaps %xmm12,-0x78(%rax)
+ vmovaps %xmm13,-0x68(%rax)
+ vmovaps %xmm14,-0x58(%rax)
+ vmovaps %xmm15,-0x48(%rax)
+.Lsqr_1024_body:
+___
+$code.=<<___;
+ mov %rax,%rbp
+ mov %rdx, $np # reassigned argument
+ sub \$$FrameSize, %rsp
+ mov $np, $tmp
+ sub \$-128, $rp # size optimization
+ sub \$-128, $ap
+ sub \$-128, $np
+
+ and \$4095, $tmp # see if $np crosses page
+ add \$32*10, $tmp
+ shr \$12, $tmp
+ vpxor $ACC9,$ACC9,$ACC9
+ jz .Lsqr_1024_no_n_copy
+
+ # unaligned 256-bit load that crosses page boundary can
+ # cause >2x performance degradation here, so if $np does
+ # cross page boundary, copy it to stack and make sure stack
+ # frame doesn't...
+ sub \$32*10,%rsp
+ vmovdqu 32*0-128($np), $ACC0
+ and \$-2048, %rsp
+ vmovdqu 32*1-128($np), $ACC1
+ vmovdqu 32*2-128($np), $ACC2
+ vmovdqu 32*3-128($np), $ACC3
+ vmovdqu 32*4-128($np), $ACC4
+ vmovdqu 32*5-128($np), $ACC5
+ vmovdqu 32*6-128($np), $ACC6
+ vmovdqu 32*7-128($np), $ACC7
+ vmovdqu 32*8-128($np), $ACC8
+ lea $FrameSize+128(%rsp),$np
+ vmovdqu $ACC0, 32*0-128($np)
+ vmovdqu $ACC1, 32*1-128($np)
+ vmovdqu $ACC2, 32*2-128($np)
+ vmovdqu $ACC3, 32*3-128($np)
+ vmovdqu $ACC4, 32*4-128($np)
+ vmovdqu $ACC5, 32*5-128($np)
+ vmovdqu $ACC6, 32*6-128($np)
+ vmovdqu $ACC7, 32*7-128($np)
+ vmovdqu $ACC8, 32*8-128($np)
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
+
+.Lsqr_1024_no_n_copy:
+ and \$-1024, %rsp
+
+ vmovdqu 32*1-128($ap), $ACC1
+ vmovdqu 32*2-128($ap), $ACC2
+ vmovdqu 32*3-128($ap), $ACC3
+ vmovdqu 32*4-128($ap), $ACC4
+ vmovdqu 32*5-128($ap), $ACC5
+ vmovdqu 32*6-128($ap), $ACC6
+ vmovdqu 32*7-128($ap), $ACC7
+ vmovdqu 32*8-128($ap), $ACC8
+
+ lea 192(%rsp), $tp0 # 64+128=192
+ vpbroadcastq .Land_mask(%rip), $AND_MASK
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+ lea 32*18+128(%rsp), $aap # size optimization
+ lea 448(%rsp), $tp1 # 64+128+256=448
+
+ # the squaring is performed as described in Variant B of
+ # "Speeding up Big-Number Squaring", so start by calculating
+ # the A*2=A+A vector
+ vpaddq $ACC1, $ACC1, $ACC1
+ vpbroadcastq 32*0-128($ap), $B1
+ vpaddq $ACC2, $ACC2, $ACC2
+ vmovdqa $ACC1, 32*0-128($aap)
+ vpaddq $ACC3, $ACC3, $ACC3
+ vmovdqa $ACC2, 32*1-128($aap)
+ vpaddq $ACC4, $ACC4, $ACC4
+ vmovdqa $ACC3, 32*2-128($aap)
+ vpaddq $ACC5, $ACC5, $ACC5
+ vmovdqa $ACC4, 32*3-128($aap)
+ vpaddq $ACC6, $ACC6, $ACC6
+ vmovdqa $ACC5, 32*4-128($aap)
+ vpaddq $ACC7, $ACC7, $ACC7
+ vmovdqa $ACC6, 32*5-128($aap)
+ vpaddq $ACC8, $ACC8, $ACC8
+ vmovdqa $ACC7, 32*6-128($aap)
+ vpxor $ACC9, $ACC9, $ACC9
+ vmovdqa $ACC8, 32*7-128($aap)
+
+ vpmuludq 32*0-128($ap), $B1, $ACC0
+ vpbroadcastq 32*1-128($ap), $B2
+ vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
+ vpmuludq $B1, $ACC1, $ACC1
+ vmovdqu $ACC9, 32*10-448($tp1)
+ vpmuludq $B1, $ACC2, $ACC2
+ vmovdqu $ACC9, 32*11-448($tp1)
+ vpmuludq $B1, $ACC3, $ACC3
+ vmovdqu $ACC9, 32*12-448($tp1)
+ vpmuludq $B1, $ACC4, $ACC4
+ vmovdqu $ACC9, 32*13-448($tp1)
+ vpmuludq $B1, $ACC5, $ACC5
+ vmovdqu $ACC9, 32*14-448($tp1)
+ vpmuludq $B1, $ACC6, $ACC6
+ vmovdqu $ACC9, 32*15-448($tp1)
+ vpmuludq $B1, $ACC7, $ACC7
+ vmovdqu $ACC9, 32*16-448($tp1)
+ vpmuludq $B1, $ACC8, $ACC8
+ vpbroadcastq 32*2-128($ap), $B1
+ vmovdqu $ACC9, 32*17-448($tp1)
+
+ mov $ap, $tpa
+ mov \$4, $i
+ jmp .Lsqr_entry_1024
+___
+$TEMP0=$Y1;
+$TEMP2=$Y2;
+$code.=<<___;
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32*1-128($tpa), $B2
+ vpmuludq 32*0-128($ap), $B1, $ACC0
+ vpaddq 32*0-192($tp0), $ACC0, $ACC0
+ vpmuludq 32*0-128($aap), $B1, $ACC1
+ vpaddq 32*1-192($tp0), $ACC1, $ACC1
+ vpmuludq 32*1-128($aap), $B1, $ACC2
+ vpaddq 32*2-192($tp0), $ACC2, $ACC2
+ vpmuludq 32*2-128($aap), $B1, $ACC3
+ vpaddq 32*3-192($tp0), $ACC3, $ACC3
+ vpmuludq 32*3-128($aap), $B1, $ACC4
+ vpaddq 32*4-192($tp0), $ACC4, $ACC4
+ vpmuludq 32*4-128($aap), $B1, $ACC5
+ vpaddq 32*5-192($tp0), $ACC5, $ACC5
+ vpmuludq 32*5-128($aap), $B1, $ACC6
+ vpaddq 32*6-192($tp0), $ACC6, $ACC6
+ vpmuludq 32*6-128($aap), $B1, $ACC7
+ vpaddq 32*7-192($tp0), $ACC7, $ACC7
+ vpmuludq 32*7-128($aap), $B1, $ACC8
+ vpbroadcastq 32*2-128($tpa), $B1
+ vpaddq 32*8-192($tp0), $ACC8, $ACC8
+.Lsqr_entry_1024:
+ vmovdqu $ACC0, 32*0-192($tp0)
+ vmovdqu $ACC1, 32*1-192($tp0)
+
+ vpmuludq 32*1-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*1-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq 32*2-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq 32*3-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*4-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq 32*5-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq 32*6-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*7-128($aap), $B2, $ACC0
+ vpbroadcastq 32*3-128($tpa), $B2
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
+
+ vmovdqu $ACC2, 32*2-192($tp0)
+ vmovdqu $ACC3, 32*3-192($tp0)
+
+ vpmuludq 32*2-128($ap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq 32*2-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*3-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq 32*4-128($aap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq 32*5-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC0, $ACC0
+ vpmuludq 32*7-128($aap), $B1, $ACC1
+ vpbroadcastq 32*4-128($tpa), $B1
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
+
+ vmovdqu $ACC4, 32*4-192($tp0)
+ vmovdqu $ACC5, 32*5-192($tp0)
+
+ vpmuludq 32*3-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC6, $ACC6
+ vpmuludq 32*3-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC7, $ACC7
+ vpmuludq 32*4-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC8, $ACC8
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpmuludq 32*7-128($aap), $B2, $ACC2
+ vpbroadcastq 32*5-128($tpa), $B2
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
+
+ vmovdqu $ACC6, 32*6-192($tp0)
+ vmovdqu $ACC7, 32*7-192($tp0)
+
+ vpmuludq 32*4-128($ap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*4-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC0, $ACC0
+ vpmuludq 32*5-128($aap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*6-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*7-128($aap), $B1, $ACC3
+ vpbroadcastq 32*6-128($tpa), $B1
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
+
+ vmovdqu $ACC8, 32*8-192($tp0)
+ vmovdqu $ACC0, 32*9-192($tp0)
+ lea 8($tp0), $tp0
+
+ vpmuludq 32*5-128($ap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq 32*7-128($aap), $B2, $ACC4
+ vpbroadcastq 32*7-128($tpa), $B2
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
+
+ vmovdqu $ACC1, 32*10-448($tp1)
+ vmovdqu $ACC2, 32*11-448($tp1)
+
+ vpmuludq 32*6-128($ap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC3, $ACC3
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
+ vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
+ vpaddq $TEMP1, $ACC4, $ACC4
+ vpmuludq 32*7-128($aap), $B1, $ACC5
+ vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
+
+ vmovdqu $ACC3, 32*12-448($tp1)
+ vmovdqu $ACC4, 32*13-448($tp1)
+ lea 8($tpa), $tpa
+
+ vpmuludq 32*7-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*7-128($aap), $B2, $ACC6
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
+
+ vpmuludq 32*8-128($ap), $ACC0, $ACC7
+ vmovdqu $ACC5, 32*14-448($tp1)
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
+ vmovdqu $ACC6, 32*15-448($tp1)
+ vmovdqu $ACC7, 32*16-448($tp1)
+ lea 8($tp1), $tp1
+
+ dec $i
+ jnz .LOOP_SQR_1024
+___
+$ZERO = $ACC9;
+$TEMP0 = $B1;
+$TEMP2 = $B2;
+$TEMP3 = $Y1;
+$TEMP4 = $Y2;
+$code.=<<___;
+	# we need to fix indexes 32-39 to avoid overflow
+ vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
+ vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
+ vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
+ lea 192(%rsp), $tp0 # 64+128=192
+
+ vpsrlq \$29, $ACC8, $TEMP1
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpxor $ZERO, $ZERO, $ZERO
+ vpermq \$0x93, $TEMP2, $TEMP2
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpblendd \$3, $TEMP2, $ZERO, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vmovdqu $ACC1, 32*9-192($tp0)
+ vmovdqu $ACC2, 32*10-192($tp0)
+
+ mov (%rsp), %rax
+ mov 8(%rsp), $r1
+ mov 16(%rsp), $r2
+ mov 24(%rsp), $r3
+ vmovdqu 32*1(%rsp), $ACC1
+ vmovdqu 32*2-192($tp0), $ACC2
+ vmovdqu 32*3-192($tp0), $ACC3
+ vmovdqu 32*4-192($tp0), $ACC4
+ vmovdqu 32*5-192($tp0), $ACC5
+ vmovdqu 32*6-192($tp0), $ACC6
+ vmovdqu 32*7-192($tp0), $ACC7
+
+ mov %rax, $r0
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+ vmovd %eax, $Y1
+
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpbroadcastq $Y1, $Y1
+ add %rax, $r0
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ shr \$29, $r0
+ add %rax, $r1
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ add $r0, $r1
+ add %rax, $r2
+ imulq 24-128($np), %rdx
+ add %rdx, $r3
+
+ mov $r1, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ mov \$9, $i
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024:
+ vmovd %eax, $Y2
+ vpbroadcastq $Y2, $Y2
+
+ vpmuludq 32*1-128($np), $Y1, $TEMP0
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpaddq $TEMP0, $ACC1, $ACC1
+ add %rax, $r1
+ vpmuludq 32*2-128($np), $Y1, $TEMP1
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ vpaddq $TEMP1, $ACC2, $ACC2
+ vpmuludq 32*3-128($np), $Y1, $TEMP2
+ .byte 0x67
+ add %rax, $r2
+ .byte 0x67
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ shr \$29, $r1
+ vpaddq $TEMP2, $ACC3, $ACC3
+ vpmuludq 32*4-128($np), $Y1, $TEMP0
+ add %rax, $r3
+ add $r1, $r2
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpmuludq 32*5-128($np), $Y1, $TEMP1
+ mov $r2, %rax
+ imull $n0, %eax
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpmuludq 32*6-128($np), $Y1, $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpmuludq 32*7-128($np), $Y1, $TEMP0
+ vpaddq $TEMP0, $ACC7, $ACC7
+ vpmuludq 32*8-128($np), $Y1, $TEMP1
+ vmovd %eax, $Y1
+ #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
+ vpaddq $TEMP1, $ACC8, $ACC8
+ #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
+ vpbroadcastq $Y1, $Y1
+
+ vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
+ vmovdqu 32*3-8-128($np), $TEMP1
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
+ vmovdqu 32*4-8-128($np), $TEMP2
+ add %rax, $r2
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ vpaddq $TEMP0, $ACC2, $ACC2
+ add $r3, %rax
+ shr \$29, $r2
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*5-8-128($np), $TEMP0
+ add $r2, %rax
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*6-8-128($np), $TEMP1
+ .byte 0x67
+ mov %rax, $r3
+ imull $n0, %eax
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*8-8-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*9-8-128($np), $ACC9
+ vmovd %eax, $ACC0 # borrow ACC0 for Y2
+ imulq -128($np), %rax
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vmovdqu 32*1-16-128($np), $TEMP1
+ vpbroadcastq $ACC0, $ACC0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq $Y2, $ACC9, $ACC9
+ vmovdqu 32*2-16-128($np), $TEMP2
+ add %rax, $r3
+
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+ vmovdqu 32*1-24-128($np), $ACC0
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ vmovdqu 32*3-16-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpmuludq $Y2, $ACC0, $ACC0
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
+ vpaddq $ACC1, $ACC0, $ACC0
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ vmovdqu 32*5-16-128($np), $TEMP2
+ .byte 0x67
+ vmovq $ACC0, %rax
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
+ vpaddq $TEMP0, $ACC3, $ACC3
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ vmovdqu 32*6-16-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC4, $ACC4
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ vmovdqu 32*7-16-128($np), $TEMP1
+ vpaddq $TEMP2, $ACC5, $ACC5
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ vmovdqu 32*8-16-128($np), $TEMP2
+ vpaddq $TEMP0, $ACC6, $ACC6
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ shr \$29, $r3
+ vmovdqu 32*9-16-128($np), $TEMP0
+ add $r3, %rax
+ vpaddq $TEMP1, $ACC7, $ACC7
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
+ mov %rax, $r0
+ imull $n0, %eax
+ vpaddq $TEMP2, $ACC8, $ACC8
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ and \$0x1fffffff, %eax
+ vmovd %eax, $Y1
+ vmovdqu 32*3-24-128($np), $TEMP2
+ .byte 0x67
+ vpaddq $TEMP0, $ACC9, $ACC9
+ vpbroadcastq $Y1, $Y1
+
+ vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
+ vmovdqu 32*4-24-128($np), $TEMP0
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ mov 8(%rsp), $r1
+ vpaddq $TEMP1, $ACC2, $ACC1
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*5-24-128($np), $TEMP1
+ add %rax, $r0
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ .byte 0x67
+ shr \$29, $r0
+ mov 16(%rsp), $r2
+ vpaddq $TEMP2, $ACC3, $ACC2
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vmovdqu 32*6-24-128($np), $TEMP2
+ add %rax, $r1
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ vpaddq $TEMP0, $ACC4, $ACC3
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*7-24-128($np), $TEMP0
+ imulq 24-128($np), %rdx # future $r3
+ add %rax, $r2
+ lea ($r0,$r1), %rax
+ vpaddq $TEMP1, $ACC5, $ACC4
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*8-24-128($np), $TEMP1
+ mov %rax, $r1
+ imull $n0, %eax
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vpaddq $TEMP2, $ACC6, $ACC5
+ vmovdqu 32*9-24-128($np), $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP0, $ACC7, $ACC6
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ add 24(%rsp), %rdx
+ vpaddq $TEMP1, $ACC8, $ACC7
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vpaddq $TEMP2, $ACC9, $ACC8
+ vmovq $r3, $ACC9
+ mov %rdx, $r3
+
+ dec $i
+ jnz .LOOP_REDUCE_1024
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+ lea 448(%rsp), $tp1 # size optimization
+ vpaddq $ACC9, $Y2, $ACC0
+ vpxor $ZERO, $ZERO, $ZERO
+
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
+ vpaddq 32*17-448($tp1), $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vmovdqu $ACC0, 32*0-128($rp)
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vmovdqu $ACC1, 32*1-128($rp)
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vmovdqu $ACC2, 32*2-128($rp)
+ vpaddq $TEMP4, $ACC4, $ACC4
+ vmovdqu $ACC3, 32*3-128($rp)
+___
+$TEMP5=$ACC0;
+$code.=<<___;
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vmovdqu $ACC4, 32*4-128($rp)
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vmovdqu $ACC5, 32*5-128($rp)
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vmovdqu $ACC6, 32*6-128($rp)
+ vpaddq $TEMP4, $ACC8, $ACC8
+ vmovdqu $ACC7, 32*7-128($rp)
+ vmovdqu $ACC8, 32*8-128($rp)
+
+ mov $rp, $ap
+ dec $rep
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall
+ mov %rbp, %rax
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lsqr_1024_epilogue:
+ ret
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}
+
+{ # void AMM_WW(
+my $rp="%rdi"; # BN_ULONG *rp,
+my $ap="%rsi"; # const BN_ULONG *ap,
+my $bp="%rdx"; # const BN_ULONG *bp,
+my $np="%rcx"; # const BN_ULONG *np,
+my $n0="%r8d"; # unsigned int n0);
+
+# The registers that hold the accumulated redundant result.
+# The AMM works on 1024-bit operands and the redundant word size is 29,
+# therefore: ceil(1024/29)/4 = 36/4 = 9 registers.
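+#
+# Schematically, with A = sum(a[i]*2^(29*i)) over 36 digits a[0..35],
+# the digits are packed four per register, one per 64-bit lane, so
+# that products and carries can accumulate without being propagated
+# on every addition:
+#
+#	$ACC0 = { a[3],  a[2],  a[1],  a[0]  }
+#	...
+#	$ACC8 = { a[35], a[34], a[33], a[32] }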
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+
+# Registers that hold the currently-used broadcast words of the multiplier
+my $Bi="%ymm10";
+my $Yi="%ymm11";
+
+# Helper registers
+my $TEMP0=$ACC0;
+my $TEMP1="%ymm12";
+my $TEMP2="%ymm13";
+my $ZERO="%ymm14";
+my $AND_MASK="%ymm15";
+
+# ALU registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d";
+my $tmp="%r15";
+
+$bp="%r13"; # reassigned argument
+
+$code.=<<___;
+.globl rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,\@function,5
+.align 64
+rsaz_1024_mul_avx2:
+ lea (%rsp), %rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ vzeroupper
+ lea -0xa8(%rsp),%rsp
+ vmovaps %xmm6,-0xd8(%rax)
+ vmovaps %xmm7,-0xc8(%rax)
+ vmovaps %xmm8,-0xb8(%rax)
+ vmovaps %xmm9,-0xa8(%rax)
+ vmovaps %xmm10,-0x98(%rax)
+ vmovaps %xmm11,-0x88(%rax)
+ vmovaps %xmm12,-0x78(%rax)
+ vmovaps %xmm13,-0x68(%rax)
+ vmovaps %xmm14,-0x58(%rax)
+ vmovaps %xmm15,-0x48(%rax)
+.Lmul_1024_body:
+___
+$code.=<<___;
+ mov %rax,%rbp
+ vzeroall
+ mov %rdx, $bp # reassigned argument
+ sub \$64,%rsp
+
+	# an unaligned 256-bit load that crosses a page boundary can
+	# cause severe performance degradation here, so if $ap does
+	# cross a page boundary, swap it with $bp [meaning that the
+	# caller is advised to lay $ap and $bp down next to each other,
+	# so that only one of them can cross a page boundary].
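+	# (the check below computes (($ap&4095) + 32*10) >> 12, which is
+	# non-zero iff the 320-byte load window starting at $ap runs past
+	# the end of its 4K page; cmovnz then performs the swap)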
+ .byte 0x67,0x67
+ mov $ap, $tmp
+ and \$4095, $tmp
+ add \$32*10, $tmp
+ shr \$12, $tmp
+ mov $ap, $tmp
+ cmovnz $bp, $ap
+ cmovnz $tmp, $bp
+
+ mov $np, $tmp
+ sub \$-128,$ap # size optimization
+ sub \$-128,$np
+ sub \$-128,$rp
+
+ and \$4095, $tmp # see if $np crosses page
+ add \$32*10, $tmp
+ .byte 0x67,0x67
+ shr \$12, $tmp
+ jz .Lmul_1024_no_n_copy
+
+	# an unaligned 256-bit load that crosses a page boundary can
+	# cause severe performance degradation here, so if $np does
+	# cross a page boundary, copy it to the stack and make sure the
+	# stack frame doesn't...
+ sub \$32*10,%rsp
+ vmovdqu 32*0-128($np), $ACC0
+ and \$-512, %rsp
+ vmovdqu 32*1-128($np), $ACC1
+ vmovdqu 32*2-128($np), $ACC2
+ vmovdqu 32*3-128($np), $ACC3
+ vmovdqu 32*4-128($np), $ACC4
+ vmovdqu 32*5-128($np), $ACC5
+ vmovdqu 32*6-128($np), $ACC6
+ vmovdqu 32*7-128($np), $ACC7
+ vmovdqu 32*8-128($np), $ACC8
+ lea 64+128(%rsp),$np
+ vmovdqu $ACC0, 32*0-128($np)
+ vpxor $ACC0, $ACC0, $ACC0
+ vmovdqu $ACC1, 32*1-128($np)
+ vpxor $ACC1, $ACC1, $ACC1
+ vmovdqu $ACC2, 32*2-128($np)
+ vpxor $ACC2, $ACC2, $ACC2
+ vmovdqu $ACC3, 32*3-128($np)
+ vpxor $ACC3, $ACC3, $ACC3
+ vmovdqu $ACC4, 32*4-128($np)
+ vpxor $ACC4, $ACC4, $ACC4
+ vmovdqu $ACC5, 32*5-128($np)
+ vpxor $ACC5, $ACC5, $ACC5
+ vmovdqu $ACC6, 32*6-128($np)
+ vpxor $ACC6, $ACC6, $ACC6
+ vmovdqu $ACC7, 32*7-128($np)
+ vpxor $ACC7, $ACC7, $ACC7
+ vmovdqu $ACC8, 32*8-128($np)
+ vmovdqa $ACC0, $ACC8
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
+.Lmul_1024_no_n_copy:
+ and \$-64,%rsp
+
+ mov ($bp), %rbx
+ vpbroadcastq ($bp), $Bi
+ vmovdqu $ACC0, (%rsp) # clear top of stack
+ xor $r0, $r0
+ .byte 0x67
+ xor $r1, $r1
+ xor $r2, $r2
+ xor $r3, $r3
+
+ vmovdqu .Land_mask(%rip), $AND_MASK
+ mov \$9, $i
+ vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024:
+ vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
+ mov %rbx, %rax
+ imulq -128($ap), %rax
+ add $r0, %rax
+ mov %rbx, $r1
+ imulq 8-128($ap), $r1
+ add 8(%rsp), $r1
+
+ mov %rax, $r0
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ mov %rbx, $r2
+ imulq 16-128($ap), $r2
+ add 16(%rsp), $r2
+
+ mov %rbx, $r3
+ imulq 24-128($ap), $r3
+ add 24(%rsp), $r3
+ vpmuludq 32*1-128($ap),$Bi,$TEMP0
+ vmovd %eax, $Yi
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq 32*2-128($ap),$Bi,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq 32*3-128($ap),$Bi,$TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq 32*4-128($ap),$Bi,$TEMP0
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq 32*5-128($ap),$Bi,$TEMP1
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq 32*6-128($ap),$Bi,$TEMP2
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq 32*7-128($ap),$Bi,$TEMP0
+ vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq 32*8-128($ap),$Bi,$TEMP1
+ vpbroadcastq 8($bp), $Bi
+ vpaddq $TEMP1,$ACC8,$ACC8
+
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r0
+ mov %rdx,%rax
+ imulq 8-128($np),%rax
+ add %rax,$r1
+ mov %rdx,%rax
+ imulq 16-128($np),%rax
+ add %rax,$r2
+ shr \$29, $r0
+ imulq 24-128($np),%rdx
+ add %rdx,$r3
+ add $r0, $r1
+
+ vpmuludq 32*1-128($np),$Yi,$TEMP2
+ vmovq $Bi, %rbx
+ vpaddq $TEMP2,$ACC1,$ACC1
+ vpmuludq 32*2-128($np),$Yi,$TEMP0
+ vpaddq $TEMP0,$ACC2,$ACC2
+ vpmuludq 32*3-128($np),$Yi,$TEMP1
+ vpaddq $TEMP1,$ACC3,$ACC3
+ vpmuludq 32*4-128($np),$Yi,$TEMP2
+ vpaddq $TEMP2,$ACC4,$ACC4
+ vpmuludq 32*5-128($np),$Yi,$TEMP0
+ vpaddq $TEMP0,$ACC5,$ACC5
+ vpmuludq 32*6-128($np),$Yi,$TEMP1
+ vpaddq $TEMP1,$ACC6,$ACC6
+ vpmuludq 32*7-128($np),$Yi,$TEMP2
+ vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
+ vpaddq $TEMP2,$ACC7,$ACC7
+ vpmuludq 32*8-128($np),$Yi,$TEMP0
+ vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP0,$ACC8,$ACC8
+
+ mov %rbx, %rax
+ imulq -128($ap),%rax
+ add %rax,$r1
+ vmovdqu -8+32*1-128($ap),$TEMP1
+ mov %rbx, %rax
+ imulq 8-128($ap),%rax
+ add %rax,$r2
+ vmovdqu -8+32*2-128($ap),$TEMP2
+
+ mov $r1, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ imulq 16-128($ap),%rbx
+ add %rbx,$r3
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovd %eax, $Yi
+ vmovdqu -8+32*3-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -8+32*4-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -8+32*5-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -8+32*6-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -8+32*7-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -8+32*8-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -8+32*9-128($ap),$ACC9
+ vpaddq $TEMP1,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpaddq $TEMP2,$ACC8,$ACC8
+ vpmuludq $Bi,$ACC9,$ACC9
+ vpbroadcastq 16($bp), $Bi
+
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r1
+ vmovdqu -8+32*1-128($np),$TEMP0
+ mov %rdx,%rax
+ imulq 8-128($np),%rax
+ add %rax,$r2
+ vmovdqu -8+32*2-128($np),$TEMP1
+ shr \$29, $r1
+ imulq 16-128($np),%rdx
+ add %rdx,$r3
+ add $r1, $r2
+
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -8+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -8+32*4-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -8+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -8+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -8+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -8+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -8+32*9-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ vmovdqu -16+32*1-128($ap),$TEMP0
+ mov %rbx,%rax
+ imulq -128($ap),%rax
+ add $r2,%rax
+
+ vmovdqu -16+32*2-128($ap),$TEMP1
+ mov %rax,$r2
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ imulq 8-128($ap),%rbx
+ add %rbx,$r3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovd %eax, $Yi
+ vmovdqu -16+32*3-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -16+32*4-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -16+32*5-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -16+32*6-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -16+32*7-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -16+32*8-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -16+32*9-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq 24($bp), $Bi
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ vmovdqu -16+32*1-128($np),$TEMP0
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r2
+ vmovdqu -16+32*2-128($np),$TEMP1
+ imulq 8-128($np),%rdx
+ add %rdx,$r3
+ shr \$29, $r2
+
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -16+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -16+32*4-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -16+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -16+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -16+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -16+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -16+32*9-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -24+32*1-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*2-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ add $r2, $r3
+ imulq -128($ap),%rbx
+ add %rbx,$r3
+
+ mov $r3, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovd %eax, $Yi
+ vmovdqu -24+32*3-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -24+32*4-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -24+32*5-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -24+32*6-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -24+32*7-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -24+32*8-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -24+32*9-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq 32($bp), $Bi
+ vpaddq $TEMP2,$ACC9,$ACC9
+ add \$32, $bp # $bp++
+
+ vmovdqu -24+32*1-128($np),$TEMP0
+ imulq -128($np),%rax
+ add %rax,$r3
+ shr \$29, $r3
+
+ vmovdqu -24+32*2-128($np),$TEMP1
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -24+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
+ vpaddq $TEMP1,$ACC2,$ACC1
+ vmovdqu -24+32*4-128($np),$TEMP0
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC2
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -24+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC3
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -24+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC4
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC5
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -24+32*9-128($np),$TEMP2
+ mov $r3, $r0
+ vpaddq $TEMP0,$ACC7,$ACC6
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ add (%rsp), $r0
+ vpaddq $TEMP1,$ACC8,$ACC7
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovq $r3, $TEMP1
+ vpaddq $TEMP2,$ACC9,$ACC8
+
+ dec $i
+ jnz .Loop_mul_1024
+___
+
+# (*) The original implementation corrected $ACC1-$ACC3 for overflow
+#	after 7 loop runs, i.e. after 28 iterations, or 56 additions.
+#	Since we underutilize resources, it's possible to correct in
+#	each iteration with only marginal performance loss; and because
+#	the correction then happens every iteration, fewer digits need
+#	correcting, which avoids the performance penalty completely.
+#	Also note that we correct only three digits out of four; this
+#	works because the most significant digit undergoes fewer
+#	additions.
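+#
+#	The correction uses the same pattern as the normalization code
+#	that follows: split every 64-bit lane into a 29-bit digit and
+#	its overflow, rotate the overflows up by one lane across the
+#	registers (vpsrlq/vpand/vpermq/vpblendd), and add them into the
+#	next-higher digits, roughly:
+#
+#		carry   = acc >> 29		# per-lane overflow
+#		acc    &= 0x1fffffff		# keep the 29-bit digit
+#		acc[i] += carry[i-1]		# fold carries one digit up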
+
+$TEMP0 = $ACC9;
+$TEMP3 = $Bi;
+$TEMP4 = $Yi;
+$code.=<<___;
+	vpermq	\$0, $AND_MASK, $AND_MASK	# broadcast lane 0: mask all four digits from here on
+ vpaddq (%rsp), $TEMP1, $ACC0
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vmovdqu $ACC0, 0-128($rp)
+ vmovdqu $ACC1, 32-128($rp)
+ vmovdqu $ACC2, 64-128($rp)
+ vmovdqu $ACC3, 96-128($rp)
+___
+
+$TEMP5=$ACC0;
+$code.=<<___;
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vmovdqu $ACC4, 128-128($rp)
+ vmovdqu $ACC5, 160-128($rp)
+ vmovdqu $ACC6, 192-128($rp)
+ vmovdqu $ACC7, 224-128($rp)
+ vmovdqu $ACC8, 256-128($rp)
+ vzeroupper
+
+ mov %rbp, %rax
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lmul_1024_epilogue:
+ ret
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+___
+}
+{
+my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
+my @T = map("%r$_",(8..11));
+
+$code.=<<___;
+.globl rsaz_1024_red2norm_avx2
+.type rsaz_1024_red2norm_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_red2norm_avx2:
+ sub \$-128,$inp # size optimization
+ xor %rax,%rax
+___
+
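+# The unrolled code generated below converts 36 29-bit digits back into
+# sixteen 64-bit words: for output word $i it loads every digit that
+# overlaps bit range [64*$i, 64*($i+1)), shifts each into position (the
+# digit straddling the upper boundary is split with a shl/shr pair), and
+# sums them with carry, roughly out[i] = sum(limb[j] << (29*j - 64*i)).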
+for ($j=0,$i=0; $i<16; $i++) {
+ my $k=0;
+ while (29*$j<64*($i+1)) { # load data till boundary
+ $code.=" mov `8*$j-128`($inp), @T[0]\n";
+ $j++; $k++; push(@T,shift(@T));
+ }
+ $l=$k;
+ while ($k>1) { # shift loaded data but last value
+ $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
+ $k--;
+ }
+ $code.=<<___; # shift last value
+ mov @T[-1], @T[0]
+ shl \$`29*($j-1)`, @T[-1]
+ shr \$`-29*($j-1)`, @T[0]
+___
+ while ($l) { # accumulate all values
+ $code.=" add @T[-$l], %rax\n";
+ $l--;
+ }
+ $code.=<<___;
+ adc \$0, @T[0] # consume eventual carry
+ mov %rax, 8*$i($out)
+ mov @T[0], %rax
+___
+ push(@T,shift(@T));
+}
+$code.=<<___;
+ ret
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2
+.type rsaz_1024_norm2red_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_norm2red_avx2:
+ sub \$-128,$out # size optimization
+ mov ($inp),@T[0]
+ mov \$0x1fffffff,%eax
+___
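+# The converse conversion: slice sixteen 64-bit words into 36 29-bit
+# digits, roughly limb[j] = (A >> (29*j)) & 0x1fffffff. A digit lying
+# within one word is extracted with shr/and; one that straddles a word
+# boundary is pulled out of the adjacent word pair with shrd.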
+for ($j=0,$i=0; $i<16; $i++) {
+ $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
+ $code.=" xor @T[1],@T[1]\n" if ($i==15);
+ my $k=1;
+ while (29*($j+1)<64*($i+1)) {
+ $code.=<<___;
+ mov @T[0],@T[-$k]
+ shr \$`29*$j`,@T[-$k]
+ and %rax,@T[-$k] # &0x1fffffff
+ mov @T[-$k],`8*$j-128`($out)
+___
+ $j++; $k++;
+ }
+ $code.=<<___;
+ shrd \$`29*$j`,@T[1],@T[0]
+ and %rax,@T[0]
+ mov @T[0],`8*$j-128`($out)
+___
+ $j++;
+ push(@T,shift(@T));
+}
+$code.=<<___;
+ mov @T[0],`8*$j-128`($out) # zero
+ mov @T[0],`8*($j+1)-128`($out)
+ mov @T[0],`8*($j+2)-128`($out)
+ mov @T[0],`8*($j+3)-128`($out)
+ ret
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+___
+}
+{
+my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+
+$code.=<<___;
+.globl rsaz_1024_scatter5_avx2
+.type rsaz_1024_scatter5_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_scatter5_avx2:
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5
+ shl \$4,$power
+ lea ($out,$power),$out
+ mov \$9,%eax
+ jmp .Loop_scatter_1024
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu ($inp),%ymm0
+ lea 32($inp),$inp
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,($out)
+ lea 16*32($out),$out
+ dec %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ ret
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
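+# The gather below touches all candidate cache lines on every iteration:
+# $power selects one of 32 table entries and is split into a cache-line
+# number (bits 2-4) and an offset within the line (bits 0-1).
+# .Lgather_table supplies all-zero or all-ones byte masks, so unwanted
+# lines are masked out with vpand/vpor rather than skipped, which keeps
+# the table index from being observable through the cache.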
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_gather5_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_gather5_avx2:
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+ vzeroupper
+.LSEH_begin_rsaz_1024_gather5:
+	# I can't trust the assembler to use a specific encoding :-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ lea .Lgather_table(%rip),%r11
+ mov $power,%eax
+ and \$3,$power
+ shr \$2,%eax # cache line number
+ shl \$4,$power # offset within cache line
+
+ vmovdqu -32(%r11),%ymm7 # .Lgather_permd
+ vpbroadcastb 8(%r11,%rax), %xmm8
+ vpbroadcastb 7(%r11,%rax), %xmm9
+ vpbroadcastb 6(%r11,%rax), %xmm10
+ vpbroadcastb 5(%r11,%rax), %xmm11
+ vpbroadcastb 4(%r11,%rax), %xmm12
+ vpbroadcastb 3(%r11,%rax), %xmm13
+ vpbroadcastb 2(%r11,%rax), %xmm14
+ vpbroadcastb 1(%r11,%rax), %xmm15
+
+ lea 64($inp,$power),$inp
+ mov \$64,%r11 # size optimization
+ mov \$9,%eax
+ jmp .Loop_gather_1024
+
+.align 32
+.Loop_gather_1024:
+ vpand -64($inp), %xmm8,%xmm0
+ vpand ($inp), %xmm9,%xmm1
+ vpand 64($inp), %xmm10,%xmm2
+ vpand ($inp,%r11,2), %xmm11,%xmm3
+ vpor %xmm0,%xmm1,%xmm1
+ vpand 64($inp,%r11,2), %xmm12,%xmm4
+ vpor %xmm2,%xmm3,%xmm3
+ vpand ($inp,%r11,4), %xmm13,%xmm5
+ vpor %xmm1,%xmm3,%xmm3
+ vpand 64($inp,%r11,4), %xmm14,%xmm6
+ vpor %xmm4,%xmm5,%xmm5
+ vpand -128($inp,%r11,8), %xmm15,%xmm2
+ lea ($inp,%r11,8),$inp
+ vpor %xmm3,%xmm5,%xmm5
+ vpor %xmm2,%xmm6,%xmm6
+ vpor %xmm5,%xmm6,%xmm6
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqu %ymm6,($out)
+ lea 32($out),$out
+ dec %eax
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,($out)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_rsaz_1024_gather5:
+___
+$code.=<<___;
+ ret
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+___
+}
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,\@abi-omnipotent
+.align 32
+rsaz_avx2_eligible:
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
+___
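+# When the assembler supports ADCX/ADOX and the CPU advertises BMI2+ADX,
+# the MULX/ADX-based Montgomery code is presumed preferable, so the AVX2
+# path reports itself ineligible (%eax is zeroed by the cmove below).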
+$code.=<<___ if ($addx);
+ mov \$`1<<8|1<<19`,%ecx
+ mov \$0,%edx
+ and %eax,%ecx
+	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2 and ADX (ADCX/ADOX)
+ cmove %edx,%eax
+___
+$code.=<<___;
+ and \$`1<<5`,%eax
+ shr \$5,%eax
+ ret
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align 64
+.Land_mask:
+ .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
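+	# the all-ones top lane deliberately leaves the most significant
+	# digit of each group of four unmasked during in-loop correction;
+	# see the (*) note in rsaz_1024_mul_avx2 above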
+.Lscatter_permd:
+ .long 0,2,4,6,7,7,7,7
+.Lgather_permd:
+ .long 0,7,1,7,2,7,3,7
+.Lgather_table:
+ .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.align 64
+___
+
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern __imp_RtlVirtualUnwind
+.type rsaz_se_handler,\@abi-omnipotent
+.align 16
+rsaz_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 160($context),%rax # pull context->Rbp
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size rsaz_se_handler,.-rsaz_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_rsaz_1024_sqr_avx2
+ .rva .LSEH_end_rsaz_1024_sqr_avx2
+ .rva .LSEH_info_rsaz_1024_sqr_avx2
+
+ .rva .LSEH_begin_rsaz_1024_mul_avx2
+ .rva .LSEH_end_rsaz_1024_mul_avx2
+ .rva .LSEH_info_rsaz_1024_mul_avx2
+
+ .rva .LSEH_begin_rsaz_1024_gather5
+ .rva .LSEH_end_rsaz_1024_gather5
+ .rva .LSEH_info_rsaz_1024_gather5
+.section .xdata
+.align 8
+.LSEH_info_rsaz_1024_sqr_avx2:
+ .byte 9,0,0,0
+ .rva rsaz_se_handler
+ .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
+.LSEH_info_rsaz_1024_mul_avx2:
+ .byte 9,0,0,0
+ .rva rsaz_se_handler
+ .rva .Lmul_1024_body,.Lmul_1024_epilogue
+.LSEH_info_rsaz_1024_gather5:
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+___
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
+
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
+ print $_,"\n";
+}
+
+}}} else {{{
+print <<___; # assembler is too old
+.text
+
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,\@abi-omnipotent
+rsaz_avx2_eligible:
+ xor %eax,%eax
+ ret
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.globl rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.globl rsaz_1024_norm2red_avx2
+.globl rsaz_1024_red2norm_avx2
+.globl rsaz_1024_scatter5_avx2
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_sqr_avx2,\@abi-omnipotent
+rsaz_1024_sqr_avx2:
+rsaz_1024_mul_avx2:
+rsaz_1024_norm2red_avx2:
+rsaz_1024_red2norm_avx2:
+rsaz_1024_scatter5_avx2:
+rsaz_1024_gather5_avx2:
+ .byte 0x0f,0x0b # ud2
+ ret
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}}}
+
+close STDOUT;
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
new file mode 100755
index 000000000000..3bd45dbac01d
--- /dev/null
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -0,0 +1,2144 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright (c) 2012, Intel Corporation #
+# #
+# All rights reserved. #
+# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# * Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# * Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# * Neither the name of the Intel Corporation nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Architecture Group, Microprocessor and Chipset Development, #
+# Israel Development Center, Haifa, Israel #
+# (2) University of Haifa #
+##############################################################################
+# Reference: #
+# [1] S. Gueron, "Efficient Software Implementations of Modular #
+# Exponentiation", http://eprint.iacr.org/2011/239 #
+# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
+# IEEE Proceedings of 9th International Conference on Information #
+# Technology: New Generations (ITNG 2012), 821-823 (2012). #
+# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
+# Journal of Cryptographic Engineering 2:31-43 (2012). #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+# resistant 512-bit and 1024-bit modular exponentiation for optimizing #
+# RSA1024 and RSA2048 on x86_64 platforms", #
+# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
+##############################################################################
+
+# While the original submission covers 512- and 1024-bit exponentiation,
+# this module is limited to the 512-bit version only (and as such
+# accelerates RSA1024 sign). This is because the improvement for longer
+# keys is not high enough to justify the effort; the highest measured
+# gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, not yet
+# released at the time of this writing!] Nor does this module implement
+# a "monolithic" complete-exponentiation jumbo-subroutine; it adheres
+# to a more modular mixture of C and assembly. And it's optimized even
+# for processors other than the Intel Core family (see the table below
+# for improvement coefficients).
+# <appro@openssl.org>
+#
+# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
+# ----------------+---------------------------
+# Opteron +13% |+5% +20%
+# Bulldozer -0% |-1% +10%
+# P4 +11% |+7% +8%
+# Westmere +5% |+14% +17%
+# Sandy Bridge +2% |+12% +29%
+# Ivy Bridge +1% |+11% +35%
+# Haswell(**) -0% |+12% +39%
+# Atom +13% |+11% +4%
+# VIA Nano +70% |+9% +25%
+#
+# (*)	rsax engine and fips numbers are presented for reference
+#	purposes only;
+# (**)	MULX was attempted, but was found to give only a marginal
+#	improvement.
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
+($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
+{
+my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl rsaz_512_sqr
+.type rsaz_512_sqr,\@function,5
+.align 32
+rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lsqr_body:
+ movq $mod, %rbp # common argument
+ movq ($inp), %rdx
+ movq 8($inp), %rax
+ movq $n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+ je .Loop_sqrx
+___
+$code.=<<___;
+ jmp .Loop_sqr
+
+.align 32
+.Loop_sqr:
+ movl $times,128+8(%rsp)
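+# Schoolbook squaring: each off-diagonal product a[i]*a[j], i<j, is
+# computed once and doubled (the add/adc and lea pairs standing in for
+# shld), then the diagonal square a[i]*a[i] is added into that column.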
+#first iteration
+ movq %rdx, %rbx
+ mulq %rdx
+ movq %rax, %r8
+ movq 16($inp), %rax
+ movq %rdx, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($inp), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($inp), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($inp), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($inp), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($inp), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq %rbx, %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ addq %r8, %r8 #shlq \$1, %r8
+ movq %r9, %rcx
+ adcq %r9, %r9 #shld \$1, %r8, %r9
+
+ mulq %rax
+ movq %rax, (%rsp)
+ addq %rdx, %r8
+ adcq \$0, %r9
+
+ movq %r8, 8(%rsp)
+ shrq \$63, %rcx
+
+#second iteration
+ movq 8($inp), %r8
+ movq 16($inp), %rax
+ mulq %r8
+ addq %rax, %r10
+ movq 24($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r11
+ movq 32($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r11
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r12
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r12
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r13
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r13
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r14
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r14
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r15
+ movq %r8, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+ movq %rdx, %r8
+ movq %r10, %rdx
+ adcq \$0, %r8
+
+ add %rdx, %rdx
+ lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ movq %r11, %rbx
+ adcq %r11, %r11 #shld \$1, %r10, %r11
+
+ mulq %rax
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq \$0, %r11
+
+ movq %r9, 16(%rsp)
+ movq %r10, 24(%rsp)
+ shrq \$63, %rbx
+
+#third iteration
+ movq 16($inp), %r9
+ movq 24($inp), %rax
+ mulq %r9
+ addq %rax, %r12
+ movq 32($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ addq %rax, %r13
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r13
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ addq %rax, %r14
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r14
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ movq %r12, %r10
+ lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
+ addq %rax, %r15
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r15
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ shrq \$63, %r10
+ addq %rax, %r8
+ movq %r9, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ movq %r13, %rcx
+ leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
+
+ mulq %rax
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq \$0, %r13
+
+ movq %r11, 32(%rsp)
+ movq %r12, 40(%rsp)
+ shrq \$63, %rcx
+
+#fourth iteration
+ movq 24($inp), %r10
+ movq 32($inp), %rax
+ mulq %r10
+ addq %rax, %r14
+ movq 40($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ addq %rax, %r15
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ movq %r14, %r12
+ leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
+ addq %rax, %r8
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r8
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ shrq \$63, %r12
+ addq %rax, %r9
+ movq %r10, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ movq %r15, %rbx
+ leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
+
+ mulq %rax
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq \$0, %r15
+
+ movq %r13, 48(%rsp)
+ movq %r14, 56(%rsp)
+ shrq \$63, %rbx
+
+#fifth iteration
+ movq 32($inp), %r11
+ movq 40($inp), %rax
+ mulq %r11
+ addq %rax, %r8
+ movq 48($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r11
+ addq %rax, %r9
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ movq %r8, %r12
+ leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
+ addq %rcx, %r9
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r11
+ shrq \$63, %r12
+ addq %rax, %r10
+ movq %r11, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ movq %r9, %rcx
+ leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
+
+ mulq %rax
+ addq %rax, %r15
+ adcq %rdx, %r8
+ adcq \$0, %r9
+
+ movq %r15, 64(%rsp)
+ movq %r8, 72(%rsp)
+ shrq \$63, %rcx
+
+#sixth iteration
+ movq 40($inp), %r12
+ movq 48($inp), %rax
+ mulq %r12
+ addq %rax, %r10
+ movq 56($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r12
+ addq %rax, %r11
+ movq %r12, %rax
+ movq %r10, %r15
+ leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ adcq \$0, %rdx
+ shrq \$63, %r15
+ addq %rbx, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ movq %r11, %rbx
+ leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
+
+ mulq %rax
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq \$0, %r11
+
+ movq %r9, 80(%rsp)
+ movq %r10, 88(%rsp)
+
+#seventh iteration
+ movq 48($inp), %r13
+ movq 56($inp), %rax
+ mulq %r13
+ addq %rax, %r12
+ movq %r13, %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ xorq %r14, %r14
+ shlq \$1, %rbx
+ adcq %r12, %r12 #shld \$1, %rbx, %r12
+ adcq %r13, %r13 #shld \$1, %r12, %r13
+ adcq %r14, %r14 #shld \$1, %r13, %r14
+
+ mulq %rax
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq \$0, %r13
+
+ movq %r11, 96(%rsp)
+ movq %r12, 104(%rsp)
+
+#eighth iteration
+ movq 56($inp), %rax
+ mulq %rax
+ addq %rax, %r13
+ adcq \$0, %rdx
+
+ addq %rdx, %r14
+
+ movq %r13, 112(%rsp)
+ movq %r14, 120(%rsp)
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, %rdx
+ movq %r9, %rax
+ movl 128+8(%rsp), $times
+ movq $out, $inp
+
+ decl $times
+ jnz .Loop_sqr
+___
+if ($addx) {
+$code.=<<___;
+ jmp .Lsqr_tail
+
+.align 32
+.Loop_sqrx:
+ movl $times,128+8(%rsp)
+ movq $out, %xmm0 # off-load
+ movq %rbp, %xmm1 # off-load
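+# mulx leaves the flags untouched, adcx carries through CF only and
+# adox through OF only, so two independent carry chains run in
+# parallel; "xor %rbp, %rbp" below clears both at once (cf=0, of=0).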
+#first iteration
+ mulx %rax, %r8, %r9
+
+ mulx 16($inp), %rcx, %r10
+ xor %rbp, %rbp # cf=0, of=0
+
+ mulx 24($inp), %rax, %r11
+ adcx %rcx, %r9
+
+ mulx 32($inp), %rcx, %r12
+ adcx %rax, %r10
+
+ mulx 40($inp), %rax, %r13
+ adcx %rcx, %r11
+
+ .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
+ adcx %rax, %r12
+ adcx %rcx, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
+ adcx %rax, %r14
+ adcx %rbp, %r15 # %rbp is 0
+
+ mov %r9, %rcx
+ shld \$1, %r8, %r9
+ shl \$1, %r8
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rdx, %r8
+ mov 8($inp), %rdx
+ adcx %rbp, %r9
+
+ mov %rax, (%rsp)
+ mov %r8, 8(%rsp)
+
+#second iteration
+ mulx 16($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
+ adox $out, %r11
+ adcx %r8, %r12
+
+ mulx 32($inp), %rax, %rbx
+ adox %rax, %r12
+ adcx %rbx, %r13
+
+ mulx 40($inp), $out, %r8
+ adox $out, %r13
+ adcx %r8, %r14
+
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
+ adox $out, %r15
+ adcx %rbp, %r8
+ adox %rbp, %r8
+
+ mov %r11, %rbx
+ shld \$1, %r10, %r11
+ shld \$1, %rcx, %r10
+
+ xor %ebp,%ebp
+ mulx %rdx, %rax, %rcx
+ mov 16($inp), %rdx
+ adcx %rax, %r9
+ adcx %rcx, %r10
+ adcx %rbp, %r11
+
+ mov %r9, 16(%rsp)
+ .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
+
+#third iteration
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
+ adox $out, %r12
+ adcx %r9, %r13
+
+ mulx 32($inp), %rax, %rcx
+ adox %rax, %r13
+ adcx %rcx, %r14
+
+ mulx 40($inp), $out, %r9
+ adox $out, %r14
+ adcx %r9, %r15
+
+ .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
+ adox %rax, %r15
+ adcx %rcx, %r8
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
+ adox $out, %r8
+ adcx %rbp, %r9
+ adox %rbp, %r9
+
+ mov %r13, %rcx
+ shld \$1, %r12, %r13
+ shld \$1, %rbx, %r12
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r11
+ adcx %rdx, %r12
+ mov 24($inp), %rdx
+ adcx %rbp, %r13
+
+ mov %r11, 32(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
+
+#fourth iteration
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+ mulx 40($inp), $out, %r10
+ adox $out, %r15
+ adcx %r10, %r8
+
+ mulx 48($inp), %rax, %rbx
+ adox %rax, %r8
+ adcx %rbx, %r9
+
+ mulx 56($inp), $out, %r10
+ adox $out, %r9
+ adcx %rbp, %r10
+ adox %rbp, %r10
+
+ .byte 0x66
+ mov %r15, %rbx
+ shld \$1, %r14, %r15
+ shld \$1, %rcx, %r14
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r13
+ adcx %rdx, %r14
+ mov 32($inp), %rdx
+ adcx %rbp, %r15
+
+ mov %r13, 48(%rsp)
+ mov %r14, 56(%rsp)
+
+#fifth iteration
+ .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
+ adox $out, %r8
+ adcx %r11, %r9
+
+ mulx 48($inp), %rax, %rcx
+ adox %rax, %r9
+ adcx %rcx, %r10
+
+ mulx 56($inp), $out, %r11
+ adox $out, %r10
+ adcx %rbp, %r11
+ adox %rbp, %r11
+
+ mov %r9, %rcx
+ shld \$1, %r8, %r9
+ shld \$1, %rbx, %r8
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r15
+ adcx %rdx, %r8
+ mov 40($inp), %rdx
+ adcx %rbp, %r9
+
+ mov %r15, 64(%rsp)
+ mov %r8, 72(%rsp)
+
+#sixth iteration
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+ .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
+ adox $out, %r11
+ adcx %rbp, %r12
+ adox %rbp, %r12
+
+ mov %r11, %rbx
+ shld \$1, %r10, %r11
+ shld \$1, %rcx, %r10
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r9
+ adcx %rdx, %r10
+ mov 48($inp), %rdx
+ adcx %rbp, %r11
+
+ mov %r9, 80(%rsp)
+ mov %r10, 88(%rsp)
+
+#seventh iteration
+ .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
+ adox %rax, %r12
+ adox %rbp, %r13
+
+ xor %r14, %r14
+ shld \$1, %r13, %r14
+ shld \$1, %r12, %r13
+ shld \$1, %rbx, %r12
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r11
+ adcx %rdx, %r12
+ mov 56($inp), %rdx
+ adcx %rbp, %r13
+
+ .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
+
+#eighth iteration
+ mulx %rdx, %rax, %rdx
+ adox %rax, %r13
+ adox %rbp, %rdx
+
+ .byte 0x66
+ add %rdx, %r14
+
+ movq %r13, 112(%rsp)
+ movq %r14, 120(%rsp)
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, %rdx
+ movq %r9, %rax
+ movl 128+8(%rsp), $times
+ movq $out, $inp
+
+ decl $times
+ jnz .Loop_sqrx
+
+.Lsqr_tail:
+___
+}
+$code.=<<___;
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lsqr_epilogue:
+ ret
+.size rsaz_512_sqr,.-rsaz_512_sqr
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$code.=<<___;
+.globl rsaz_512_mul
+.type rsaz_512_mul,\@function,5
+.align 32
+rsaz_512_mul:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lmul_body:
+ movq $out, %xmm0 # off-load arguments
+ movq $mod, %xmm1
+ movq $n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+ je .Lmulx
+___
+$code.=<<___;
+ movq ($bp), %rbx # pass b[0]
+ movq $bp, %rbp # pass argument
+ call __rsaz_512_mul
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_tail
+
+.align 32
+.Lmulx:
+ movq $bp, %rbp # pass argument
+ movq ($bp), %rdx # pass b[0]
+ call __rsaz_512_mulx
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+.Lmul_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_epilogue:
+ ret
+.size rsaz_512_mul,.-rsaz_512_mul
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl rsaz_512_mul_gather4
+.type rsaz_512_mul_gather4,\@function,6
+.align 32
+rsaz_512_mul_gather4:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $pwr, $pwr
+ subq \$128+24, %rsp
+.Lmul_gather4_body:
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+ je .Lmulx_gather
+___
+$code.=<<___;
+ movl 64($bp,$pwr,4), %eax
+ movq $out, %xmm0 # off-load arguments
+ movl ($bp,$pwr,4), %ebx
+ movq $mod, %xmm1
+ movq $n0, 128(%rsp)
+
+ shlq \$32, %rax
+ or %rax, %rbx
+ movq ($ap), %rax
+ movq 8($ap), %rcx
+ leaq 128($bp,$pwr,4), %rbp
+ mulq %rbx # 0 iteration
+ movq %rax, (%rsp)
+ movq %rcx, %rax
+ movq %rdx, %r8
+
+ mulq %rbx
+ movd (%rbp), %xmm4
+ addq %rax, %r8
+ movq 16($ap), %rax
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ movd 64(%rbp), %xmm5
+ addq %rax, %r9
+ movq 24($ap), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ pslldq \$4, %xmm5
+ addq %rax, %r10
+ movq 32($ap), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ por %xmm5, %xmm4
+ addq %rax, %r11
+ movq 40($ap), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($ap), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ leaq 128(%rbp), %rbp
+ addq %rax, %r13
+ movq 56($ap), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %xmm4, %rbx
+ addq %rax, %r14
+ movq ($ap), %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rsp), %rdi
+ movl \$7, %ecx
+ jmp .Loop_mul_gather
+
+.align 32
+.Loop_mul_gather:
+ mulq %rbx
+ addq %rax, %r8
+ movq 8($ap), %rax
+ movq %r8, (%rdi)
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ movd (%rbp), %xmm4
+ addq %rax, %r9
+ movq 16($ap), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ movd 64(%rbp), %xmm5
+ addq %rax, %r10
+ movq 24($ap), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ pslldq \$4, %xmm5
+ addq %rax, %r11
+ movq 32($ap), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ por %xmm5, %xmm4
+ addq %rax, %r12
+ movq 40($ap), %rax
+ adcq \$0, %rdx
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48($ap), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56($ap), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %xmm4, %rbx
+ addq %rax, %r15
+ movq ($ap), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 128(%rbp), %rbp
+ leaq 8(%rdi), %rdi
+
+ decl %ecx
+ jnz .Loop_mul_gather
+
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 32(%rdi)
+ movq %r13, 40(%rdi)
+ movq %r14, 48(%rdi)
+ movq %r15, 56(%rdi)
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_gather_tail
+
+.align 32
+.Lmulx_gather:
+ mov 64($bp,$pwr,4), %eax
+ movq $out, %xmm0 # off-load arguments
+ lea 128($bp,$pwr,4), %rbp
+ mov ($bp,$pwr,4), %edx
+ movq $mod, %xmm1
+ mov $n0, 128(%rsp)
+
+ shl \$32, %rax
+ or %rax, %rdx
+ mulx ($ap), %rbx, %r8 # 0 iteration
+ mov %rbx, (%rsp)
+ xor %edi, %edi # cf=0, of=0
+
+ mulx 8($ap), %rax, %r9
+ movd (%rbp), %xmm4
+
+ mulx 16($ap), %rbx, %r10
+ movd 64(%rbp), %xmm5
+ adcx %rax, %r8
+
+ mulx 24($ap), %rax, %r11
+ pslldq \$4, %xmm5
+ adcx %rbx, %r9
+
+ mulx 32($ap), %rbx, %r12
+ por %xmm5, %xmm4
+ adcx %rax, %r10
+
+ mulx 40($ap), %rax, %r13
+ adcx %rbx, %r11
+
+ mulx 48($ap), %rbx, %r14
+ lea 128(%rbp), %rbp
+ adcx %rax, %r12
+
+ mulx 56($ap), %rax, %r15
+ movq %xmm4, %rdx
+ adcx %rbx, %r13
+ adcx %rax, %r14
+ mov %r8, %rbx
+ adcx %rdi, %r15 # %rdi is 0
+
+ mov \$-7, %rcx
+ jmp .Loop_mulx_gather
+
+.align 32
+.Loop_mulx_gather:
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ mulx 8($ap), %rax, %r9
+ .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16($ap), %rax, %r10
+ movd 64(%rbp), %xmm5
+ lea 128(%rbp), %rbp
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
+ pslldq \$4, %xmm5
+ por %xmm5, %xmm4
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56($ap), %rax, %r15
+ movq %xmm4, %rdx
+ mov %rbx, 64(%rsp,%rcx,8)
+ adcx %rax, %r14
+ adox %rdi, %r15
+ mov %r8, %rbx
+ adcx %rdi, %r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Loop_mulx_gather
+
+ mov %r8, 64(%rsp)
+ mov %r9, 64+8(%rsp)
+ mov %r10, 64+16(%rsp)
+ mov %r11, 64+24(%rsp)
+ mov %r12, 64+32(%rsp)
+ mov %r13, 64+40(%rsp)
+ mov %r14, 64+48(%rsp)
+ mov %r15, 64+56(%rsp)
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ mov 128(%rsp), %rdx # pull $n0
+ mov (%rsp), %r8
+ mov 8(%rsp), %r9
+ mov 16(%rsp), %r10
+ mov 24(%rsp), %r11
+ mov 32(%rsp), %r12
+ mov 40(%rsp), %r13
+ mov 48(%rsp), %r14
+ mov 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+.Lmul_gather_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_gather4_epilogue:
+ ret
+.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
+___
+}
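Both multiply loops above overlap the integer multiplications with the gather of the next b limb through the SSE unit (the movd/pslldq/por triple). A minimal C sketch of just that gather step follows; the helper name is made up, and the 64-byte distance between the two halves comes from the table layout modelled after rsaz_512_scatter4/rsaz_512_gather4 further below:

```c
#include <stdint.h>

/* Illustrative only, not OpenSSL API: assemble the next 64-bit b limb
 * from two 32-bit table entries sitting 64 bytes apart, as the
 * movd (%rbp) / movd 64(%rbp) / pslldq $4 / por sequence does. */
static uint64_t next_b_limb(const uint32_t *cursor)
{
    uint64_t lo = cursor[0];       /* movd (%rbp), %xmm4   */
    uint64_t hi = cursor[64 / 4];  /* movd 64(%rbp), %xmm5 */
    return lo | (hi << 32);        /* pslldq $4; por; movq */
}
```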
+{
+my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl rsaz_512_mul_scatter4
+.type rsaz_512_mul_scatter4,\@function,6
+.align 32
+rsaz_512_mul_scatter4:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $pwr, $pwr
+ subq \$128+24, %rsp
+.Lmul_scatter4_body:
+ leaq ($tbl,$pwr,4), $tbl
+ movq $out, %xmm0 # off-load arguments
+ movq $mod, %xmm1
+ movq $tbl, %xmm2
+ movq $n0, 128(%rsp)
+
+ movq $out, %rbp
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+ je .Lmulx_scatter
+___
+$code.=<<___;
+ movq ($out),%rbx # pass b[0]
+ call __rsaz_512_mul
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_scatter_tail
+
+.align 32
+.Lmulx_scatter:
+ movq ($out), %rdx # pass b[0]
+ call __rsaz_512_mulx
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+.Lmul_scatter_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ movq %xmm2, $inp
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movl %r8d, 64*0($inp) # scatter
+ shrq \$32, %r8
+ movl %r9d, 64*2($inp)
+ shrq \$32, %r9
+ movl %r10d, 64*4($inp)
+ shrq \$32, %r10
+ movl %r11d, 64*6($inp)
+ shrq \$32, %r11
+ movl %r12d, 64*8($inp)
+ shrq \$32, %r12
+ movl %r13d, 64*10($inp)
+ shrq \$32, %r13
+ movl %r14d, 64*12($inp)
+ shrq \$32, %r14
+ movl %r15d, 64*14($inp)
+ shrq \$32, %r15
+ movl %r8d, 64*1($inp)
+ movl %r9d, 64*3($inp)
+ movl %r10d, 64*5($inp)
+ movl %r11d, 64*7($inp)
+ movl %r12d, 64*9($inp)
+ movl %r13d, 64*11($inp)
+ movl %r14d, 64*13($inp)
+ movl %r15d, 64*15($inp)
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_scatter4_epilogue:
+ ret
+.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
+___
+}
+{
+my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
+$code.=<<___;
+.globl rsaz_512_mul_by_one
+.type rsaz_512_mul_by_one,\@function,4
+.align 32
+rsaz_512_mul_by_one:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lmul_by_one_body:
+___
+$code.=<<___ if ($addx);
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+___
+$code.=<<___;
+ movq $mod, %rbp # reassign argument
+ movq $n0, 128(%rsp)
+
+ movq ($inp), %r8
+ pxor %xmm0, %xmm0
+ movq 8($inp), %r9
+ movq 16($inp), %r10
+ movq 24($inp), %r11
+ movq 32($inp), %r12
+ movq 40($inp), %r13
+ movq 48($inp), %r14
+ movq 56($inp), %r15
+
+ movdqa %xmm0, (%rsp)
+ movdqa %xmm0, 16(%rsp)
+ movdqa %xmm0, 32(%rsp)
+ movdqa %xmm0, 48(%rsp)
+ movdqa %xmm0, 64(%rsp)
+ movdqa %xmm0, 80(%rsp)
+ movdqa %xmm0, 96(%rsp)
+___
+$code.=<<___ if ($addx);
+ andl \$0x80100,%eax
+	cmpl	\$0x80100,%eax		# check for MULX and ADCX/ADOX
+ je .Lby_one_callx
+___
+$code.=<<___;
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lby_one_tail
+.align 32
+.Lby_one_callx:
+ movq 128(%rsp), %rdx # pull $n0
+ call __rsaz_512_reducex
+.Lby_one_tail:
+___
+$code.=<<___;
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_by_one_epilogue:
+ ret
+.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
+___
+}
+{ # __rsaz_512_reduce
+ #
+ # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
+ # output: %r8-%r15
+ # clobbers: everything except %rbp and %rdi
+$code.=<<___;
+.type __rsaz_512_reduce,\@abi-omnipotent
+.align 32
+__rsaz_512_reduce:
+ movq %r8, %rbx
+ imulq 128+8(%rsp), %rbx
+ movq 0(%rbp), %rax
+ movl \$8, %ecx
+ jmp .Lreduction_loop
+
+.align 32
+.Lreduction_loop:
+ mulq %rbx
+ movq 8(%rbp), %rax
+ negq %r8
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq 128+8(%rsp), %rsi
+ #movq %rdx, %r11
+ #adcq \$0, %r11
+ adcq \$0, %rdx
+ movq %rdx, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40(%rbp), %rax
+ adcq \$0, %rdx
+ imulq %r8, %rsi
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %rsi, %rbx
+ addq %rax, %r15
+ movq 0(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ decl %ecx
+ jne .Lreduction_loop
+
+ ret
+.size __rsaz_512_reduce,.-__rsaz_512_reduce
+___
+}
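For readers who prefer C, here is a minimal model of the routine above; the name and the use of unsigned __int128 are illustrative assumptions, not OpenSSL code. Each of the eight passes picks m = r[0]*n0 mod 2^64, adds m*n so the bottom limb cancels, and shifts the window down one limb; the caller then folds in the stored high half of the 16-limb product and calls __rsaz_512_subtract.

```c
#include <stdint.h>

/* Illustrative model of __rsaz_512_reduce. r holds the low 8 limbs of
 * a 16-limb product, n is the modulus, n0 = -n^-1 mod 2^64. The caller
 * still has to add the high 8 limbs and conditionally subtract n. */
static void rsaz_512_reduce_ref(uint64_t r[8], const uint64_t n[8],
                                uint64_t n0)
{
    for (int i = 0; i < 8; i++) {
        uint64_t m = r[0] * n0;   /* makes limb 0 of r + m*n vanish */
        uint64_t c = 0;
        for (int j = 0; j < 8; j++) {
            unsigned __int128 t = (unsigned __int128)m * n[j] + r[j] + c;
            if (j > 0)
                r[j - 1] = (uint64_t)t;  /* shift window down a limb */
            c = (uint64_t)(t >> 64);
        }
        r[7] = c;
    }
}
```

__rsaz_512_reducex below computes the same thing, only with MULX feeding the two independent ADCX/ADOX carry chains; and rsaz_512_mul_by_one above is effectively this reduction applied to an input zero-extended to 16 limbs, i.e. conversion out of Montgomery form.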
+if ($addx) {
+ # __rsaz_512_reducex
+ #
+ # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
+ # output: %r8-%r15
+ # clobbers: everything except %rbp and %rdi
+$code.=<<___;
+.type __rsaz_512_reducex,\@abi-omnipotent
+.align 32
+__rsaz_512_reducex:
+ #movq 128+8(%rsp), %rdx # pull $n0
+ imulq %r8, %rdx
+ xorq %rsi, %rsi # cf=0,of=0
+ movl \$8, %ecx
+ jmp .Lreduction_loopx
+
+.align 32
+.Lreduction_loopx:
+ mov %r8, %rbx
+ mulx 0(%rbp), %rax, %r8
+ adcx %rbx, %rax
+ adox %r9, %r8
+
+ mulx 8(%rbp), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16(%rbp), %rbx, %r10
+ adcx %rbx, %r9
+ adox %r11, %r10
+
+ mulx 24(%rbp), %rbx, %r11
+ adcx %rbx, %r10
+ adox %r12, %r11
+
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
+ mov %rdx, %rax
+ mov %r8, %rdx
+ adcx %rbx, %r11
+ adox %r13, %r12
+
+ mulx 128+8(%rsp), %rbx, %rdx
+ mov %rax, %rdx
+
+ mulx 40(%rbp), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56(%rbp), %rax, %r15
+ mov %rbx, %rdx
+ adcx %rax, %r14
+ adox %rsi, %r15 # %rsi is 0
+ adcx %rsi, %r15 # cf=0
+
+ decl %ecx # of=0
+ jne .Lreduction_loopx
+
+ ret
+.size __rsaz_512_reducex,.-__rsaz_512_reducex
+___
+}
+{ # __rsaz_512_subtract
+ # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
+ # output:
+ # clobbers: everything but %rdi, %rsi and %rbp
+$code.=<<___;
+.type __rsaz_512_subtract,\@abi-omnipotent
+.align 32
+__rsaz_512_subtract:
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ movq 0($mod), %r8
+ movq 8($mod), %r9
+ negq %r8
+ notq %r9
+ andq %rcx, %r8
+ movq 16($mod), %r10
+ andq %rcx, %r9
+ notq %r10
+ movq 24($mod), %r11
+ andq %rcx, %r10
+ notq %r11
+ movq 32($mod), %r12
+ andq %rcx, %r11
+ notq %r12
+ movq 40($mod), %r13
+ andq %rcx, %r12
+ notq %r13
+ movq 48($mod), %r14
+ andq %rcx, %r13
+ notq %r14
+ movq 56($mod), %r15
+ andq %rcx, %r14
+ notq %r15
+ andq %rcx, %r15
+
+ addq ($out), %r8
+ adcq 8($out), %r9
+ adcq 16($out), %r10
+ adcq 24($out), %r11
+ adcq 32($out), %r12
+ adcq 40($out), %r13
+ adcq 48($out), %r14
+ adcq 56($out), %r15
+
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ ret
+.size __rsaz_512_subtract,.-__rsaz_512_subtract
+___
+}
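A C model of the masked subtraction may help. It is a sketch under two assumptions: mask (the caller's sbbq output) is all-ones iff the preceding carry chain overflowed bit 512, and n[0] is odd, as it always is for an RSA modulus — the latter is what lets the code build -n limb-wise as (-n[0], ~n[1], ..., ~n[7]) without propagating the two's-complement +1.

```c
#include <stdint.h>

/* Illustrative model of __rsaz_512_subtract. out[] already holds the
 * low 512 bits of the result; adding mask & -n (and dropping the final
 * carry) brings an overflowed value back below 2^512. Assumes n[0] is
 * odd, so -n is (-n[0], ~n[1], ..., ~n[7]) limb-wise. */
static void rsaz_512_subtract_ref(uint64_t out[8], const uint64_t n[8],
                                  uint64_t mask)
{
    unsigned __int128 acc = 0;
    for (int i = 0; i < 8; i++) {
        uint64_t negn = (i == 0 ? (uint64_t)0 - n[0] : ~n[i]) & mask;
        acc += (unsigned __int128)out[i] + negn;  /* add -n or 0      */
        out[i] = (uint64_t)acc;
        acc >>= 64;                               /* carry to next limb */
    }
}
```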
+{ # __rsaz_512_mul
+ #
+ # input: %rsi - ap, %rbp - bp
+	# output:
+ # clobbers: everything
+my ($ap,$bp) = ("%rsi","%rbp");
+$code.=<<___;
+.type __rsaz_512_mul,\@abi-omnipotent
+.align 32
+__rsaz_512_mul:
+ leaq 8(%rsp), %rdi
+
+ movq ($ap), %rax
+ mulq %rbx
+ movq %rax, (%rdi)
+ movq 8($ap), %rax
+ movq %rdx, %r8
+
+ mulq %rbx
+ addq %rax, %r8
+ movq 16($ap), %rax
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($ap), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($ap), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($ap), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($ap), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($ap), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq ($ap), %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8($bp), $bp
+ leaq 8(%rdi), %rdi
+
+ movl \$7, %ecx
+ jmp .Loop_mul
+
+.align 32
+.Loop_mul:
+ movq ($bp), %rbx
+ mulq %rbx
+ addq %rax, %r8
+ movq 8($ap), %rax
+ movq %r8, (%rdi)
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16($ap), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24($ap), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32($ap), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40($ap), %rax
+ adcq \$0, %rdx
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48($ap), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56($ap), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ leaq 8($bp), $bp
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r15
+ movq ($ap), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rdi), %rdi
+
+ decl %ecx
+ jnz .Loop_mul
+
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 32(%rdi)
+ movq %r13, 40(%rdi)
+ movq %r14, 48(%rdi)
+ movq %r15, 56(%rdi)
+
+ ret
+.size __rsaz_512_mul,.-__rsaz_512_mul
+___
+}
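The structure above is plain 8x8-limb schoolbook multiplication, consuming one b limb per outer pass and writing one result limb per pass. A compact C equivalent (illustrative names, not OpenSSL API):

```c
#include <stdint.h>
#include <string.h>

/* Illustrative model of __rsaz_512_mul: 8x8 limb schoolbook product
 * into a 16-limb result, accumulating one row a[]*b[i] per pass. */
static void rsaz_512_mul_ref(uint64_t r[16], const uint64_t a[8],
                             const uint64_t b[8])
{
    memset(r, 0, 16 * sizeof(r[0]));
    for (int i = 0; i < 8; i++) {
        uint64_t c = 0;
        for (int j = 0; j < 8; j++) {
            unsigned __int128 t =
                (unsigned __int128)a[j] * b[i] + r[i + j] + c;
            r[i + j] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        r[i + 8] = c;   /* carry word of the row */
    }
}
```

__rsaz_512_mulx below computes the same product, using MULX with the ADCX/ADOX dual carry chains instead of the widening MUL.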
+if ($addx) {
+ # __rsaz_512_mulx
+ #
+ # input: %rsi - ap, %rbp - bp
+	# output:
+ # clobbers: everything
+my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
+$code.=<<___;
+.type __rsaz_512_mulx,\@abi-omnipotent
+.align 32
+__rsaz_512_mulx:
+ mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
+ mov \$-6, %rcx
+
+ mulx 8($ap), %rax, %r9
+ movq %rbx, 8(%rsp)
+
+ mulx 16($ap), %rbx, %r10
+ adc %rax, %r8
+
+ mulx 24($ap), %rax, %r11
+ adc %rbx, %r9
+
+ mulx 32($ap), %rbx, %r12
+ adc %rax, %r10
+
+ mulx 40($ap), %rax, %r13
+ adc %rbx, %r11
+
+ mulx 48($ap), %rbx, %r14
+ adc %rax, %r12
+
+ mulx 56($ap), %rax, %r15
+ mov 8($bp), %rdx
+ adc %rbx, %r13
+ adc %rax, %r14
+ adc \$0, %r15
+
+ xor $zero, $zero # cf=0,of=0
+ jmp .Loop_mulx
+
+.align 32
+.Loop_mulx:
+ movq %r8, %rbx
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56($ap), %rax, %r15
+ movq 64($bp,%rcx,8), %rdx
+ movq %rbx, 8+64-8(%rsp,%rcx,8)
+ adcx %rax, %r14
+ adox $zero, %r15
+ adcx $zero, %r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Loop_mulx
+
+ movq %r8, %rbx
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
+ adcx %rax, %r14
+ adox $zero, %r15
+ adcx $zero, %r15
+
+ mov %rbx, 8+64-8(%rsp)
+ mov %r8, 8+64(%rsp)
+ mov %r9, 8+64+8(%rsp)
+ mov %r10, 8+64+16(%rsp)
+ mov %r11, 8+64+24(%rsp)
+ mov %r12, 8+64+32(%rsp)
+ mov %r13, 8+64+40(%rsp)
+ mov %r14, 8+64+48(%rsp)
+ mov %r15, 8+64+56(%rsp)
+
+ ret
+.size __rsaz_512_mulx,.-__rsaz_512_mulx
+___
+}
+{
+my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+$code.=<<___;
+.globl rsaz_512_scatter4
+.type rsaz_512_scatter4,\@abi-omnipotent
+.align 16
+rsaz_512_scatter4:
+ leaq ($out,$power,4), $out
+ movl \$8, %r9d
+ jmp .Loop_scatter
+.align 16
+.Loop_scatter:
+ movq ($inp), %rax
+ leaq 8($inp), $inp
+ movl %eax, ($out)
+ shrq \$32, %rax
+ movl %eax, 64($out)
+ leaq 128($out), $out
+ decl %r9d
+ jnz .Loop_scatter
+ ret
+.size rsaz_512_scatter4,.-rsaz_512_scatter4
+
+.globl rsaz_512_gather4
+.type rsaz_512_gather4,\@abi-omnipotent
+.align 16
+rsaz_512_gather4:
+ leaq ($inp,$power,4), $inp
+ movl \$8, %r9d
+ jmp .Loop_gather
+.align 16
+.Loop_gather:
+ movl ($inp), %eax
+ movl 64($inp), %r8d
+ leaq 128($inp), $inp
+ shlq \$32, %r8
+ or %r8, %rax
+ movq %rax, ($out)
+ leaq 8($out), $out
+ decl %r9d
+ jnz .Loop_gather
+ ret
+.size rsaz_512_gather4,.-rsaz_512_gather4
+___
+}
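The table layout that these two routines (and the mul_gather4/mul_scatter4 paths above) agree on can be modelled in C — a sketch, with tbl viewed as a flat uint32_t array. As read from the addressing above, the table appears to hold 16 values of 512 bits each: word k of all 16 values is stored contiguously in one 64-byte group, so each 512-bit entry is spread across the whole 1KB table at a 4-byte stride.

```c
#include <stdint.h>

/* Illustrative model of the rsaz_512_scatter4/rsaz_512_gather4 layout.
 * tbl[32*i + power] holds the low half of limb i of entry "power",
 * tbl[32*i + 16 + power] the high half (the +64-byte slot in asm). */
static void scatter4_ref(uint32_t *tbl, const uint64_t val[8], int power)
{
    for (int i = 0; i < 8; i++) {
        tbl[32 * i + power]      = (uint32_t)val[i];         /* low  */
        tbl[32 * i + 16 + power] = (uint32_t)(val[i] >> 32); /* high */
    }
}

static void gather4_ref(uint64_t val[8], const uint32_t *tbl, int power)
{
    for (int i = 0; i < 8; i++)
        val[i] = (uint64_t)tbl[32 * i + power]
               | ((uint64_t)tbl[32 * i + 16 + power] << 32);
}
```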
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 128+24+48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+	mov	\$154,%ecx		# sizeof(CONTEXT)/8, in quadwords
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size	se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_rsaz_512_sqr
+ .rva .LSEH_end_rsaz_512_sqr
+ .rva .LSEH_info_rsaz_512_sqr
+
+ .rva .LSEH_begin_rsaz_512_mul
+ .rva .LSEH_end_rsaz_512_mul
+ .rva .LSEH_info_rsaz_512_mul
+
+ .rva .LSEH_begin_rsaz_512_mul_gather4
+ .rva .LSEH_end_rsaz_512_mul_gather4
+ .rva .LSEH_info_rsaz_512_mul_gather4
+
+ .rva .LSEH_begin_rsaz_512_mul_scatter4
+ .rva .LSEH_end_rsaz_512_mul_scatter4
+ .rva .LSEH_info_rsaz_512_mul_scatter4
+
+ .rva .LSEH_begin_rsaz_512_mul_by_one
+ .rva .LSEH_end_rsaz_512_mul_by_one
+ .rva .LSEH_info_rsaz_512_mul_by_one
+
+.section .xdata
+.align 8
+.LSEH_info_rsaz_512_sqr:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_gather4:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_scatter4:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_by_one:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/sparct4-mont.pl b/crypto/bn/asm/sparct4-mont.pl
new file mode 100755
index 000000000000..71b45002a42f
--- /dev/null
+++ b/crypto/bn/asm/sparct4-mont.pl
@@ -0,0 +1,1222 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. November 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# Montgomery squaring-n-multiplication module for SPARC T4.
+#
+# The module consists of three parts:
+#
+# 1) collection of "single-op" subroutines that perform single
+# operation, Montgomery squaring or multiplication, on 512-,
+# 1024-, 1536- and 2048-bit operands;
+# 2) collection of "multi-op" subroutines that perform 5 squaring and
+# 1 multiplication operations on operands of above lengths;
+# 3) fall-back and helper VIS3 subroutines.
+#
+# RSA sign is dominated by the multi-op subroutines, while RSA verify
+# and DSA are dominated by the single-op ones. A special note about the
+# 4096-bit RSA verify result: those operands are too long for the
+# dedicated hardware, so they are handled by the VIS3 code instead,
+# which is why no improvement shows there. It is surely possible to
+# improve it [by deploying the 'mpmul' instruction], maybe in the
+# future...
+#
+# Performance improvement.
+#
+# 64-bit process, VIS3:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
+# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
+# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
+# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
+# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
+#
+# 64-bit process, this module:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
+# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
+# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
+# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
+# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
+#
+######################################################################
+# 32-bit process, VIS3:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
+# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
+# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
+# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
+# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
+#
+# 32-bit process, this module:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
+# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
+# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
+# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
+# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
+#
+# The 32-bit code is prone to performance degradation as the interrupt
+# rate dispatched to the CPU executing it grows. This is because in the
+# standard handling of an interrupt in a 32-bit process context the
+# upper halves of most integer registers used as input or output are
+# zeroed. That renders the result invalid, and the operation has to be
+# re-run. If the CPU is "bothered" with timer interrupts only, the
+# penalty is hardly measurable. But to mitigate this problem at higher
+# interrupt rates, contemporary Linux kernels recognize the biased
+# stack even in a 32-bit process context and preserve full register
+# contents. See
+# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
+# for details.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.section ".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+___
+
+########################################################################
+# Register layout for mont[mul|sqr] instructions.
+# For details see "Oracle SPARC Architecture 2011" manual at
+# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
+#
+my @R=map("%f".2*$_,(0..11,30,31,12..29));
+my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
+my @A=(@N[0..13],@R[14..31]);
+my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
+
+########################################################################
+# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
+# const u64 *np,const BN_ULONG *n0);
+#
+sub generate_bn_mul_mont_t4() {
+my $NUM=shift;
+my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
+
+$code.=<<___;
+.globl bn_mul_mont_t4_$NUM
+.align 32
+bn_mul_mont_t4_$NUM:
+#ifdef __arch64__
+ mov 0,$sentinel
+ mov -128,%g4
+#elif defined(SPARCV9_64BIT_STACK)
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
+ mov -2047,%g4
+ and %g1,SPARCV9_64BIT_STACK,%g1
+ movrz %g1,0,%g4
+ mov -1,$sentinel
+ add %g4,-128,%g4
+#else
+ mov -1,$sentinel
+ mov -128,%g4
+#endif
+ sllx $sentinel,32,$sentinel
+ save %sp,%g4,%sp
+#ifndef __arch64__
+ save %sp,-128,%sp ! warm it up
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ restore
+ restore
+ restore
+ restore
+ restore
+ restore
+#endif
+ and %sp,1,%g4
+ or $sentinel,%fp,%fp
+ or %g4,$sentinel,$sentinel
+
+ ! copy arguments to global registers
+ mov %i0,$rp
+ mov %i1,$ap
+ mov %i2,$bp
+ mov %i3,$np
+ ld [%i4+0],%f1 ! load *n0
+ ld [%i4+4],%f0
+ fsrc2 %f0,%f60
+___
+
+# load ap[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@A[$i+1]:"%o7";
+$code.=<<___;
+ ld [$ap+$i*8+0],$lo
+ ld [$ap+$i*8+4],@A[$i]
+ sllx @A[$i],32,@A[$i]
+ or $lo,@A[$i],@A[$i]
+___
+}
+for(; $i<$NUM; $i++) {
+my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
+$code.=<<___;
+ ld [$ap+$i*8+0],$lo
+ ld [$ap+$i*8+4],$hi
+ fsrc2 $hi,@A[$i]
+___
+}
+# load np[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@N[$i+1]:"%o7";
+$code.=<<___;
+ ld [$np+$i*8+0],$lo
+ ld [$np+$i*8+4],@N[$i]
+ sllx @N[$i],32,@N[$i]
+ or $lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<28 && $i<$NUM; $i++) {
+my $lo=$i<27?@N[$i+1]:"%o7";
+$code.=<<___;
+ ld [$np+$i*8+0],$lo
+ ld [$np+$i*8+4],@N[$i]
+ sllx @N[$i],32,@N[$i]
+ or $lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) {
+my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
+$code.=<<___;
+ ld [$np+$i*8+0],$lo
+ ld [$np+$i*8+4],@N[$i]
+ sllx @N[$i],32,@N[$i]
+ or $lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+ cmp $ap,$bp
+ be SIZE_T_CC,.Lmsquare_$NUM
+ nop
+___
+
+# load bp[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@B[$i+1]:"%o7";
+$code.=<<___;
+ ld [$bp+$i*8+0],$lo
+ ld [$bp+$i*8+4],@B[$i]
+ sllx @B[$i],32,@B[$i]
+ or $lo,@B[$i],@B[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) {
+my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
+$code.=<<___;
+ ld [$bp+$i*8+0],$lo
+ ld [$bp+$i*8+4],@B[$i]
+ sllx @B[$i],32,@B[$i]
+ or $lo,@B[$i],@B[$i]
+___
+}
+# magic ################################################################
+$code.=<<___;
+ .word 0x81b02920+$NUM-1 ! montmul $NUM-1
+.Lmresume_$NUM:
+ fbu,pn %fcc3,.Lmabort_$NUM
+#ifndef __arch64__
+ and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Lmabort_$NUM
+#endif
+ nop
+#ifdef __arch64__
+ restore
+ restore
+ restore
+ restore
+ restore
+#else
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Lmabort1_$NUM
+ restore
+#endif
+___
+
+# save tp[$NUM] ########################################################
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+ movxtod @A[$i],@R[$i]
+___
+}
+$code.=<<___;
+#ifdef __arch64__
+ restore
+#else
+ and %fp,$sentinel,$sentinel
+ restore
+ and $sentinel,1,%o7
+ and %fp,$sentinel,$sentinel
+ srl %fp,0,%fp ! just in case?
+ or %o7,$sentinel,$sentinel
+ brz,a,pn $sentinel,.Lmdone_$NUM
+ mov 0,%i0 ! return failure
+#endif
+___
+for($i=0; $i<12 && $i<$NUM; $i++) {
+@R[$i] =~ /%f([0-9]+)/;
+my $lo = "%f".($1+1);
+$code.=<<___;
+ st $lo,[$rp+$i*8+0]
+ st @R[$i],[$rp+$i*8+4]
+___
+}
+for(; $i<$NUM; $i++) {
+my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
+$code.=<<___;
+ fsrc2 @R[$i],$hi
+ st $lo,[$rp+$i*8+0]
+ st $hi,[$rp+$i*8+4]
+___
+}
+$code.=<<___;
+ mov 1,%i0 ! return success
+.Lmdone_$NUM:
+ ret
+ restore
+
+.Lmabort_$NUM:
+ restore
+ restore
+ restore
+ restore
+ restore
+.Lmabort1_$NUM:
+ restore
+
+ mov 0,%i0 ! return failure
+ ret
+ restore
+
+.align 32
+.Lmsquare_$NUM:
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+ .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
+ ba .Lmresume_$NUM
+ nop
+.type bn_mul_mont_t4_$NUM, #function
+.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
+___
+}
+
+for ($i=8;$i<=32;$i+=8) {
+ &generate_bn_mul_mont_t4($i);
+}
+
+########################################################################
+#
+sub load_ccr {
+my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
+$code.=<<___;
+ srl $pwr, 2, %o4
+ and $pwr, 3, %o5
+ and %o4, 7, %o4
+ sll %o5, 3, %o5 ! offset within first cache line
+ add %o5, $ptbl, $ptbl ! of the pwrtbl
+ or %g0, 1, %o5
+ sll %o5, %o4, $ccr
+___
+$code.=<<___ if (!$skip_wr);
+ wr $ccr, %g0, %ccr
+___
+}
+sub load_b_pair {
+my ($pwrtbl,$B0,$B1)=@_;
+
+$code.=<<___;
+ ldx [$pwrtbl+0*32], $B0
+ ldx [$pwrtbl+8*32], $B1
+ ldx [$pwrtbl+1*32], %o4
+ ldx [$pwrtbl+9*32], %o5
+ movvs %icc, %o4, $B0
+ ldx [$pwrtbl+2*32], %o4
+ movvs %icc, %o5, $B1
+ ldx [$pwrtbl+10*32],%o5
+ move %icc, %o4, $B0
+ ldx [$pwrtbl+3*32], %o4
+ move %icc, %o5, $B1
+ ldx [$pwrtbl+11*32],%o5
+ movneg %icc, %o4, $B0
+ ldx [$pwrtbl+4*32], %o4
+ movneg %icc, %o5, $B1
+ ldx [$pwrtbl+12*32],%o5
+ movcs %xcc, %o4, $B0
+ ldx [$pwrtbl+5*32],%o4
+ movcs %xcc, %o5, $B1
+ ldx [$pwrtbl+13*32],%o5
+ movvs %xcc, %o4, $B0
+ ldx [$pwrtbl+6*32], %o4
+ movvs %xcc, %o5, $B1
+ ldx [$pwrtbl+14*32],%o5
+ move %xcc, %o4, $B0
+ ldx [$pwrtbl+7*32], %o4
+ move %xcc, %o5, $B1
+ ldx [$pwrtbl+15*32],%o5
+ movneg %xcc, %o4, $B0
+ add $pwrtbl,16*32, $pwrtbl
+ movneg %xcc, %o5, $B1
+___
+}
+sub load_b {
+my ($pwrtbl,$Bi)=@_;
+
+$code.=<<___;
+ ldx [$pwrtbl+0*32], $Bi
+ ldx [$pwrtbl+1*32], %o4
+ ldx [$pwrtbl+2*32], %o5
+ movvs %icc, %o4, $Bi
+ ldx [$pwrtbl+3*32], %o4
+ move %icc, %o5, $Bi
+ ldx [$pwrtbl+4*32], %o5
+ movneg %icc, %o4, $Bi
+ ldx [$pwrtbl+5*32], %o4
+ movcs %xcc, %o5, $Bi
+ ldx [$pwrtbl+6*32], %o5
+ movvs %xcc, %o4, $Bi
+ ldx [$pwrtbl+7*32], %o4
+ move %xcc, %o5, $Bi
+ add $pwrtbl,8*32, $pwrtbl
+ movneg %xcc, %o4, $Bi
+___
+}
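What load_ccr and load_b implement is a branch-free gather: the low two bits of pwr pick a 64-bit slot within a 32-byte group, load_ccr sets exactly one of the eight icc/xcc flag bits (ccr = 1 << sel), and load_b loads all eight candidates, keeping the one whose flag is set via conditional moves — tbl[0] survives when the selected bit is icc.C, which, as read from the assembly above, none of the moves tests. A C sketch under those assumptions, with an illustrative name:

```c
#include <stdint.h>

/* Illustrative model of the load_ccr/load_b pair. All eight candidate
 * loads are performed regardless of sel, so the memory access pattern
 * does not depend on the secret index. */
static uint64_t load_b_ref(const uint64_t *pwrtbl, unsigned pwr)
{
    const uint64_t *p = pwrtbl + (pwr & 3); /* slot within 32-byte line */
    unsigned sel = (pwr >> 2) & 7;          /* which candidate          */
    uint64_t b = p[0];
    for (unsigned k = 1; k < 8; k++) {
        uint64_t take = 0 - (uint64_t)(k == sel); /* the movcc mask */
        b = (p[4 * k] & take) | (b & ~take);      /* stride = 32 bytes */
    }
    return b;
}
```

The real code also advances the table pointer past the eight candidates after each call, and load_b_pair applies the same selection to two table columns at once.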
+
+########################################################################
+# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
+# const u64 *pwrtbl,int pwr,int stride);
+#
+sub generate_bn_pwr5_mont_t4() {
+my $NUM=shift;
+my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
+
+$code.=<<___;
+.globl bn_pwr5_mont_t4_$NUM
+.align 32
+bn_pwr5_mont_t4_$NUM:
+#ifdef __arch64__
+ mov 0,$sentinel
+ mov -128,%g4
+#elif defined(SPARCV9_64BIT_STACK)
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
+ mov -2047,%g4
+ and %g1,SPARCV9_64BIT_STACK,%g1
+ movrz %g1,0,%g4
+ mov -1,$sentinel
+ add %g4,-128,%g4
+#else
+ mov -1,$sentinel
+ mov -128,%g4
+#endif
+ sllx $sentinel,32,$sentinel
+ save %sp,%g4,%sp
+#ifndef __arch64__
+ save %sp,-128,%sp ! warm it up
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ restore
+ restore
+ restore
+ restore
+ restore
+ restore
+#endif
+ and %sp,1,%g4
+ or $sentinel,%fp,%fp
+ or %g4,$sentinel,$sentinel
+
+ ! copy arguments to global registers
+ mov %i0,$tp
+ mov %i1,$np
+ ld [%i2+0],%f1 ! load *n0
+ ld [%i2+4],%f0
+ mov %i3,$pwrtbl
+ srl %i4,%g0,%i4 ! pack last arguments
+ sllx %i5,32,$pwr
+ or %i4,$pwr,$pwr
+ fsrc2 %f0,%f60
+___
+
+# load tp[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+ ldx [$tp+$i*8],@A[$i]
+___
+}
+for(; $i<$NUM; $i++) {
+$code.=<<___;
+ ldd [$tp+$i*8],@A[$i]
+___
+}
+# load np[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+ ldx [$np+$i*8],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<28 && $i<$NUM; $i++) {
+$code.=<<___;
+ ldx [$np+$i*8],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) {
+$code.=<<___;
+ ldx [$np+$i*8],@N[$i]
+___
+}
+# load pwrtbl[pwr] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+
+ srlx $pwr, 32, %o4 ! unpack $pwr
+ srl $pwr, %g0, %o5
+ sub %o4, 5, %o4
+ mov $pwrtbl, %o7
+ sllx %o4, 32, $pwr ! re-pack $pwr
+ or %o5, $pwr, $pwr
+ srl %o5, %o4, %o5
+___
+ &load_ccr("%o7","%o5","%o4");
+$code.=<<___;
+ b .Lstride_$NUM
+ nop
+.align 16
+.Lstride_$NUM:
+___
+for($i=0; $i<14 && $i<$NUM; $i+=2) {
+ &load_b_pair("%o7",@B[$i],@B[$i+1]);
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i+=2) {
+ &load_b_pair("%i7",@B[$i],@B[$i+1]);
+}
+$code.=<<___;
+ srax $pwr, 32, %o4 ! unpack $pwr
+ srl $pwr, %g0, %o5
+ sub %o4, 5, %o4
+ mov $pwrtbl, %i7
+ sllx %o4, 32, $pwr ! re-pack $pwr
+ or %o5, $pwr, $pwr
+ srl %o5, %o4, %o5
+___
+ &load_ccr("%i7","%o5","%o4",1);
+
+# magic ################################################################
+for($i=0; $i<5; $i++) {
+$code.=<<___;
+ .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
+ fbu,pn %fcc3,.Labort_$NUM
+#ifndef __arch64__
+ and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Labort_$NUM
+#endif
+ nop
+___
+}
+$code.=<<___;
+ wr %o4, %g0, %ccr
+ .word 0x81b02920+$NUM-1 ! montmul $NUM-1
+ fbu,pn %fcc3,.Labort_$NUM
+#ifndef __arch64__
+ and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Labort_$NUM
+#endif
+
+ srax $pwr, 32, %o4
+#ifdef __arch64__
+ brgez %o4,.Lstride_$NUM
+ restore
+ restore
+ restore
+ restore
+ restore
+#else
+ brgez %o4,.Lstride_$NUM
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Labort1_$NUM
+ restore
+#endif
+___
+
+# save tp[$NUM] ########################################################
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+ movxtod @A[$i],@R[$i]
+___
+}
+$code.=<<___;
+#ifdef __arch64__
+ restore
+#else
+ and %fp,$sentinel,$sentinel
+ restore
+ and $sentinel,1,%o7
+ and %fp,$sentinel,$sentinel
+ srl %fp,0,%fp ! just in case?
+ or %o7,$sentinel,$sentinel
+ brz,a,pn $sentinel,.Ldone_$NUM
+ mov 0,%i0 ! return failure
+#endif
+___
+for($i=0; $i<$NUM; $i++) {
+$code.=<<___;
+ std @R[$i],[$tp+$i*8]
+___
+}
+$code.=<<___;
+ mov 1,%i0 ! return success
+.Ldone_$NUM:
+ ret
+ restore
+
+.Labort_$NUM:
+ restore
+ restore
+ restore
+ restore
+ restore
+.Labort1_$NUM:
+ restore
+
+ mov 0,%i0 ! return failure
+ ret
+ restore
+.type bn_pwr5_mont_t4_$NUM, #function
+.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
+___
+}
+
+for ($i=8;$i<=32;$i+=8) {
+ &generate_bn_pwr5_mont_t4($i);
+}
+
+{
+########################################################################
+# Fall-back subroutines
+#
+# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
+#
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+ (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
+
+# int bn_mul_mont(
+$rp="%o0"; # u64 *rp,
+$ap="%o1"; # const u64 *ap,
+$bp="%o2"; # const u64 *bp,
+$np="%o3"; # const u64 *np,
+$n0p="%o4"; # const BN_ULONG *n0,
+$num="%o5"; # int num); # caller ensures that num is >=3
+$code.=<<___;
+.globl bn_mul_mont_t4
+.align 32
+bn_mul_mont_t4:
+ add %sp, STACK_BIAS, %g4 ! real top of stack
+ sll $num, 3, $num ! size in bytes
+ add $num, 63, %g1
+ andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
+ sub %g4, %g1, %g1
+ andn %g1, 63, %g1 ! align at 64 byte
+ sub %g1, STACK_FRAME, %g1 ! new top of stack
+ sub %g1, %g4, %g1
+
+ save %sp, %g1, %sp
+___
+# +-------------------------------+<----- %sp
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 tmp[0] |
+# +-------------------------------+
+# . .
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# . .
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+$code.=<<___;
+ ld [$n0p+0], $t0 ! pull n0[0..1] value
+ ld [$n0p+4], $t1
+ add %sp, STACK_BIAS+STACK_FRAME, $tp
+ ldx [$bp+0], $m0 ! m0=bp[0]
+ sllx $t1, 32, $n0
+ add $bp, 8, $bp
+ or $t0, $n0, $n0
+
+ ldx [$ap+0], $aj ! ap[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
+ umulxhi $aj, $m0, $hi0
+
+ ldx [$ap+8], $aj ! ap[1]
+ add $ap, 16, $ap
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
+
+ mulx $aj, $m0, $alo ! ap[1]*bp[0]
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+
+ ldx [$np+8], $nj ! np[1]
+
+ addcc $lo0, $lo1, $lo1
+ add $np, 16, $np
+ addxc %g0, $hi1, $hi1
+
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .L1st
+ sub $num, 24, $cnt ! cnt=num-3
+
+.align 16
+.L1st:
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0
+
+ ldx [$ap+0], $aj ! ap[j]
+ addcc $nlo, $hi1, $lo1
+ add $ap, 8, $ap
+ addxc $nj, %g0, $hi1 ! nhi=nj
+
+ ldx [$np+0], $nj ! np[j]
+ mulx $aj, $m0, $alo ! ap[j]*bp[0]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp ! tp++
+
+ brnz,pt $cnt, .L1st
+ sub $cnt, 8, $cnt ! j--
+!.L1st
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp
+
+ addcc $hi0, $hi1, $hi1
+ addxc %g0, %g0, $ovf ! upmost overflow bit
+ stxa $hi1, [$tp]0xe2
+ add $tp, 8, $tp
+
+ ba .Louter
+ sub $num, 16, $i ! i=num-2
+
+.align 16
+.Louter:
+ ldx [$bp+0], $m0 ! m0=bp[i]
+ add $bp, 8, $bp
+
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+
+ ldx [$ap+0], $aj ! ap[0]
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
+ ldx [$tp], $tj ! tp[0]
+ umulxhi $aj, $m0, $hi0
+ ldx [$ap+8], $aj ! ap[1]
+ addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
+ mulx $aj, $m0, $alo ! ap[1]*bp[i]
+ addxc %g0, $hi0, $hi0
+ mulx $lo0, $n0, $m1 ! tp[0]*n0
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ add $ap, 16, $ap
+ umulxhi $nj, $m1, $hi1
+ ldx [$np+8], $nj ! np[1]
+ add $np, 16, $np
+ addcc $lo1, $lo0, $lo1
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ addxc %g0, $hi1, $hi1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .Linner
+ sub $num, 24, $cnt ! cnt=num-3
+.align 16
+.Linner:
+ addcc $alo, $hi0, $lo0
+ ldx [$tp+8], $tj ! tp[j]
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ ldx [$ap+0], $aj ! ap[j]
+ add $ap, 8, $ap
+ addcc $nlo, $hi1, $lo1
+ mulx $aj, $m0, $alo ! ap[j]*bp[i]
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ ldx [$np+0], $nj ! np[j]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addxc %g0, $hi0, $hi0
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+ brnz,pt $cnt, .Linner
+ sub $cnt, 8, $cnt
+!.Linner
+ ldx [$tp+8], $tj ! tp[j]
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi0, $hi0
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+
+ subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
+ addxccc $hi1, $hi0, $hi1
+ addxc %g0, %g0, $ovf
+ stx $hi1, [$tp+8]
+ add $tp, 16, $tp
+
+ brnz,pt $i, .Louter
+ sub $i, 8, $i
+
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+ ba .Lsub
+ subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
+
+.align 16
+.Lsub:
+ ldx [$tp], $tj
+ add $tp, 8, $tp
+ ldx [$np+0], $nj
+ add $np, 8, $np
+ subccc $tj, $nj, $t2 ! tp[j]-np[j]
+ srlx $tj, 32, $tj
+ srlx $nj, 32, $nj
+ subccc $tj, $nj, $t3
+ add $rp, 8, $rp
+ st $t2, [$rp-4] ! reverse order
+ st $t3, [$rp-8]
+ brnz,pt $cnt, .Lsub
+ sub $cnt, 8, $cnt
+
+ sub $np, $num, $np ! rewind
+ sub $tp, $num, $tp
+ sub $rp, $num, $rp
+
+ subc $ovf, %g0, $ovf ! handle upmost overflow bit
+ and $tp, $ovf, $ap
+ andn $rp, $ovf, $np
+ or $np, $ap, $ap ! ap=borrow?tp:rp
+ ba .Lcopy
+ sub $num, 8, $cnt
+
+.align 16
+.Lcopy: ! copy or in-place refresh
+ ldx [$ap+0], $t2
+ add $ap, 8, $ap
+ stx %g0, [$tp] ! zap
+ add $tp, 8, $tp
+ stx $t2, [$rp+0]
+ add $rp, 8, $rp
+ brnz $cnt, .Lcopy
+ sub $cnt, 8, $cnt
+
+ mov 1, %o0
+ ret
+ restore
+.type bn_mul_mont_t4, #function
+.size bn_mul_mont_t4, .-bn_mul_mont_t4
+___
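The .L1st/.Louter/.Linner flow above is the classic word-wise (CIOS) Montgomery multiplication with the multiply and reduction passes interleaved, and .Lsub/.Lcopy perform the final subtraction with a data-independent select. A minimal C model — mont_mul_ref and its locals are illustrative, with unsigned __int128 standing in for mulx/umulxhi:

```c
#include <stdint.h>
#include <string.h>

/* Illustrative CIOS model of bn_mul_mont_t4's fallback path.
 * num is the limb count (>= 3); n0 = -np[0]^-1 mod 2^64. */
static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                         const uint64_t *bp, const uint64_t *np,
                         uint64_t n0, int num)
{
    uint64_t tp[num], d[num], hi = 0;       /* hi = "upmost overflow bit" */
    memset(tp, 0, sizeof(tp));

    for (int i = 0; i < num; i++) {
        unsigned __int128 t, top;
        uint64_t c = 0;
        for (int j = 0; j < num; j++) {     /* tp += ap[] * bp[i] */
            t = (unsigned __int128)ap[j] * bp[i] + tp[j] + c;
            tp[j] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        top = (unsigned __int128)hi + c;    /* 65-bit top word */

        uint64_t m = tp[0] * n0;            /* "tp[0]"*n0 */
        t = (unsigned __int128)np[0] * m + tp[0];
        c = (uint64_t)(t >> 64);
        for (int j = 1; j < num; j++) {     /* tp = (tp + m*np) / 2^64 */
            t = (unsigned __int128)np[j] * m + tp[j] + c;
            tp[j - 1] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        top += c;
        tp[num - 1] = (uint64_t)top;
        hi = (uint64_t)(top >> 64);         /* 0 or 1 */
    }

    uint64_t borrow = 0;                    /* cf. .Lsub */
    for (int j = 0; j < num; j++) {
        unsigned __int128 s = (unsigned __int128)tp[j] - np[j] - borrow;
        d[j] = (uint64_t)s;
        borrow = (uint64_t)(s >> 64) & 1;
    }
    uint64_t mask = 0 - (uint64_t)(hi | (borrow ^ 1ULL));
    for (int j = 0; j < num; j++)           /* cf. .Lcopy */
        rp[j] = (d[j] & mask) | (tp[j] & ~mask);
}
```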
+
+# int bn_mul_mont_gather5(
+$rp="%o0"; # u64 *rp,
+$ap="%o1"; # const u64 *ap,
+$bp="%o2"; # const u64 *pwrtbl,
+$np="%o3"; # const u64 *np,
+$n0p="%o4"; # const BN_ULONG *n0,
+$num="%o5"; # int num, # caller ensures that num is >=3
+ # int power);
+$code.=<<___;
+.globl bn_mul_mont_gather5_t4
+.align 32
+bn_mul_mont_gather5_t4:
+ add %sp, STACK_BIAS, %g4 ! real top of stack
+ sll $num, 3, $num ! size in bytes
+ add $num, 63, %g1
+ andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
+ sub %g4, %g1, %g1
+ andn %g1, 63, %g1 ! align at 64 byte
+ sub %g1, STACK_FRAME, %g1 ! new top of stack
+ sub %g1, %g4, %g1
+ LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
+
+ save %sp, %g1, %sp
+___
+# +-------------------------------+<----- %sp
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 tmp[0] |
+# +-------------------------------+
+# . .
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# . .
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+ &load_ccr($bp,"%g4",$ccr);
+ &load_b($bp,$m0,"%o7"); # m0=bp[0]
+
+$code.=<<___;
+ ld [$n0p+0], $t0 ! pull n0[0..1] value
+ ld [$n0p+4], $t1
+ add %sp, STACK_BIAS+STACK_FRAME, $tp
+ sllx $t1, 32, $n0
+ or $t0, $n0, $n0
+
+ ldx [$ap+0], $aj ! ap[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
+ umulxhi $aj, $m0, $hi0
+
+ ldx [$ap+8], $aj ! ap[1]
+ add $ap, 16, $ap
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
+
+ mulx $aj, $m0, $alo ! ap[1]*bp[0]
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+
+ ldx [$np+8], $nj ! np[1]
+
+ addcc $lo0, $lo1, $lo1
+ add $np, 16, $np
+ addxc %g0, $hi1, $hi1
+
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .L1st_g5
+ sub $num, 24, $cnt ! cnt=num-3
+
+.align 16
+.L1st_g5:
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0
+
+ ldx [$ap+0], $aj ! ap[j]
+ addcc $nlo, $hi1, $lo1
+ add $ap, 8, $ap
+ addxc $nj, %g0, $hi1 ! nhi=nj
+
+ ldx [$np+0], $nj ! np[j]
+ mulx $aj, $m0, $alo ! ap[j]*bp[0]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp ! tp++
+
+ brnz,pt $cnt, .L1st_g5
+ sub $cnt, 8, $cnt ! j--
+!.L1st_g5
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp
+
+ addcc $hi0, $hi1, $hi1
+ addxc %g0, %g0, $ovf ! upmost overflow bit
+ stxa $hi1, [$tp]0xe2
+ add $tp, 8, $tp
+
+ ba .Louter_g5
+ sub $num, 16, $i ! i=num-2
+
+.align 16
+.Louter_g5:
+ wr $ccr, %g0, %ccr
+___
+ &load_b($bp,$m0); # m0=bp[i]
+$code.=<<___;
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+
+ ldx [$ap+0], $aj ! ap[0]
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
+ ldx [$tp], $tj ! tp[0]
+ umulxhi $aj, $m0, $hi0
+ ldx [$ap+8], $aj ! ap[1]
+ addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
+ mulx $aj, $m0, $alo ! ap[1]*bp[i]
+ addxc %g0, $hi0, $hi0
+ mulx $lo0, $n0, $m1 ! tp[0]*n0
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ add $ap, 16, $ap
+ umulxhi $nj, $m1, $hi1
+ ldx [$np+8], $nj ! np[1]
+ add $np, 16, $np
+ addcc $lo1, $lo0, $lo1
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ addxc %g0, $hi1, $hi1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .Linner_g5
+ sub $num, 24, $cnt ! cnt=num-3
+.align 16
+.Linner_g5:
+ addcc $alo, $hi0, $lo0
+ ldx [$tp+8], $tj ! tp[j]
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ ldx [$ap+0], $aj ! ap[j]
+ add $ap, 8, $ap
+ addcc $nlo, $hi1, $lo1
+ mulx $aj, $m0, $alo ! ap[j]*bp[i]
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ ldx [$np+0], $nj ! np[j]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addxc %g0, $hi0, $hi0
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+ brnz,pt $cnt, .Linner_g5
+ sub $cnt, 8, $cnt
+!.Linner_g5
+ ldx [$tp+8], $tj ! tp[j]
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi0, $hi0
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+
+ subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
+ addxccc $hi1, $hi0, $hi1
+ addxc %g0, %g0, $ovf
+ stx $hi1, [$tp+8]
+ add $tp, 16, $tp
+
+ brnz,pt $i, .Louter_g5
+ sub $i, 8, $i
+
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+ ba .Lsub_g5
+ subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
+
+.align 16
+.Lsub_g5:
+ ldx [$tp], $tj
+ add $tp, 8, $tp
+ ldx [$np+0], $nj
+ add $np, 8, $np
+ subccc $tj, $nj, $t2 ! tp[j]-np[j]
+ srlx $tj, 32, $tj
+ srlx $nj, 32, $nj
+ subccc $tj, $nj, $t3
+ add $rp, 8, $rp
+ st $t2, [$rp-4] ! reverse order
+ st $t3, [$rp-8]
+ brnz,pt $cnt, .Lsub_g5
+ sub $cnt, 8, $cnt
+
+ sub $np, $num, $np ! rewind
+ sub $tp, $num, $tp
+ sub $rp, $num, $rp
+
+ subc $ovf, %g0, $ovf ! handle upmost overflow bit
+ and $tp, $ovf, $ap
+ andn $rp, $ovf, $np
+ or $np, $ap, $ap ! ap=borrow?tp:rp
+ ba .Lcopy_g5
+ sub $num, 8, $cnt
+
+.align 16
+.Lcopy_g5: ! copy or in-place refresh
+ ldx [$ap+0], $t2
+ add $ap, 8, $ap
+ stx %g0, [$tp] ! zap
+ add $tp, 8, $tp
+ stx $t2, [$rp+0]
+ add $rp, 8, $rp
+ brnz $cnt, .Lcopy_g5
+ sub $cnt, 8, $cnt
+
+ mov 1, %o0
+ ret
+ restore
+.type bn_mul_mont_gather5_t4, #function
+.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
+___
+}
+
+$code.=<<___;
+.globl bn_flip_t4
+.align 32
+bn_flip_t4:
+.Loop_flip:
+ ld [%o1+0], %o4
+ sub %o2, 1, %o2
+ ld [%o1+4], %o5
+ add %o1, 8, %o1
+ st %o5, [%o0+0]
+ st %o4, [%o0+4]
+ brnz %o2, .Loop_flip
+ add %o0, 8, %o0
+ retl
+ nop
+.type bn_flip_t4, #function
+.size bn_flip_t4, .-bn_flip_t4
+
+.globl bn_flip_n_scatter5_t4
+.align 32
+bn_flip_n_scatter5_t4:
+ sll %o3, 3, %o3
+ srl %o1, 1, %o1
+ add %o3, %o2, %o2 ! &pwrtbl[pwr]
+ sub %o1, 1, %o1
+.Loop_flip_n_scatter5:
+ ld [%o0+0], %o4 ! inp[i]
+ ld [%o0+4], %o5
+ add %o0, 8, %o0
+ sllx %o5, 32, %o5
+ or %o4, %o5, %o5
+ stx %o5, [%o2]
+ add %o2, 32*8, %o2
+ brnz %o1, .Loop_flip_n_scatter5
+ sub %o1, 1, %o1
+ retl
+ nop
+.type bn_flip_n_scatter5_t4, #function
+.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
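bn_flip_n_scatter5_t4 fuses the 32-bit-to-64-bit limb conversion with the scatter into the power table. In C terms — names illustrative, num counting 32-bit words as in the assembly:

```c
#include <stdint.h>

/* Illustrative model of bn_flip_n_scatter5_t4: pairs of 32-bit limbs
 * are fused into 64-bit words, and entry "pwr" occupies every 32nd
 * u64, so the 32 table entries are interleaved at a 256-byte stride
 * per limb (the layout load_b/bn_gather5_t4 read back). */
static void flip_n_scatter5_ref(const uint32_t *inp, int num,
                                uint64_t *pwrtbl, int pwr)
{
    for (int i = 0; i < num / 2; i++)
        pwrtbl[32 * i + pwr] =
            (uint64_t)inp[2 * i] | ((uint64_t)inp[2 * i + 1] << 32);
}
```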
+
+.globl bn_gather5_t4
+.align 32
+bn_gather5_t4:
+___
+ &load_ccr("%o2","%o3","%g1");
+$code.=<<___;
+ sub %o1, 1, %o1
+.Loop_gather5:
+___
+ &load_b("%o2","%g1");
+$code.=<<___;
+ stx %g1, [%o0]
+ add %o0, 8, %o0
+ brnz %o1, .Loop_gather5
+ sub %o1, 1, %o1
+
+ retl
+ nop
+.type bn_gather5_t4, #function
+.size bn_gather5_t4, .-bn_gather5_t4
+
+.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler();
+
+close STDOUT;
diff --git a/crypto/bn/asm/sparcv9-gf2m.pl b/crypto/bn/asm/sparcv9-gf2m.pl
new file mode 100755
index 000000000000..ab94cd917c41
--- /dev/null
+++ b/crypto/bn/asm/sparcv9-gf2m.pl
@@ -0,0 +1,190 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# October 2012
+#
+# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. For the time being it is a fairly mechanical,
+# low-hanging-fruit port from C, except that it has two code paths: one
+# suitable for all SPARCv9 processors and one for VIS3-capable ones.
+# The former delivers ~25-45% improvement, more for longer keys, with
+# DH and DSA verify operations on the venerable UltraSPARC II
+# benefiting most. On T4 the VIS3 code is ~100-230% faster than
+# gcc-generated code and ~35-90% faster than the pure SPARCv9 code
+# path.
+
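Since the comment above calls this a port from C, a condensed sketch of the underlying 1x1 (64x64 -> 128-bit) carry-less multiply may help. It follows the shape of bn_GF2m_mul_1x1 in bn_gf2m.c; the function name and use of uint64_t locals are assumptions of this sketch:

```c
#include <stdint.h>

/* 4-bit-window GF(2)[x] multiply: build a 16-entry table of multiples
 * of a's low 61 bits, index it by b's nibbles, then fix up the top
 * three bits of a separately (they would shift out of table entries). */
static void gf2m_mul_1x1_ref(uint64_t *r1, uint64_t *r0,
                             uint64_t a, uint64_t b)
{
    uint64_t tab[16], l, h = 0, s;
    uint64_t top3 = a >> 61, a1 = a & 0x1FFFFFFFFFFFFFFFULL;

    tab[0] = 0;
    for (int i = 1; i < 16; i++)          /* tab[i] = a1 * i in GF(2) */
        tab[i] = (tab[i >> 1] << 1) ^ ((i & 1) ? a1 : 0);

    l = tab[b & 0xF];
    for (int n = 4; n < 64; n += 4) {
        s = tab[(b >> n) & 0xF];
        l ^= s << n;
        h ^= s >> (64 - n);
    }
    /* compensate for bits 61..63 of a */
    if (top3 & 1) { l ^= b << 61; h ^= b >> 3; }
    if (top3 & 2) { l ^= b << 62; h ^= b >> 2; }
    if (top3 & 4) { l ^= b << 63; h ^= b >> 1; }

    *r1 = h; *r0 = l;
}
```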
+$locals=16*8;
+
+$tab="%l0";
+
+@T=("%g2","%g3");
+@i=("%g4","%g5");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
+($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
+
+$code.=<<___;
+#include <sparc_arch.h>
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl bn_GF2m_mul_2x2
+.align 16
+bn_GF2m_mul_2x2:
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
+
+ andcc %g1, SPARCV9_VIS3, %g0
+ bz,pn %icc,.Lsoftware
+ nop
+
+ sllx %o1, 32, %o1
+ sllx %o3, 32, %o3
+ or %o2, %o1, %o1
+ or %o4, %o3, %o3
+ .word 0x95b262ab ! xmulx %o1, %o3, %o2
+ .word 0x99b262cb ! xmulxhi %o1, %o3, %o4
+ srlx %o2, 32, %o1 ! 13 cycles later
+ st %o2, [%o0+0]
+ st %o1, [%o0+4]
+ srlx %o4, 32, %o3
+ st %o4, [%o0+8]
+ retl
+ st %o3, [%o0+12]
+
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME-$locals,%sp
+
+ sllx %i1,32,$a
+ mov -1,$a12
+ sllx %i3,32,$b
+ or %i2,$a,$a
+ srlx $a12,1,$a48 ! 0x7fff...
+ or %i4,$b,$b
+ srlx $a12,2,$a12 ! 0x3fff...
+ add %sp,STACK_BIAS+STACK_FRAME,$tab
+
+ sllx $a,2,$a4
+ mov $a,$a1
+ sllx $a,1,$a2
+
+ srax $a4,63,@i[1] ! broadcast 61st bit
+ and $a48,$a4,$a4 ! (a<<2)&0x7fff...
+ srlx $a48,2,$a48
+ srax $a2,63,@i[0] ! broadcast 62nd bit
+ and $a12,$a2,$a2 ! (a<<1)&0x3fff...
+ srax $a1,63,$lo ! broadcast 63rd bit
+ and $a48,$a1,$a1 ! (a<<0)&0x1fff...
+
+ sllx $a1,3,$a8
+ and $b,$lo,$lo
+ and $b,@i[0],@i[0]
+ and $b,@i[1],@i[1]
+
+ stx %g0,[$tab+0*8] ! tab[0]=0
+ xor $a1,$a2,$a12
+ stx $a1,[$tab+1*8] ! tab[1]=a1
+ stx $a2,[$tab+2*8] ! tab[2]=a2
+ xor $a4,$a8,$a48
+ stx $a12,[$tab+3*8] ! tab[3]=a1^a2
+ xor $a4,$a1,$a1
+
+ stx $a4,[$tab+4*8] ! tab[4]=a4
+ xor $a4,$a2,$a2
+ stx $a1,[$tab+5*8] ! tab[5]=a1^a4
+ xor $a4,$a12,$a12
+ stx $a2,[$tab+6*8] ! tab[6]=a2^a4
+ xor $a48,$a1,$a1
+ stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
+ xor $a48,$a2,$a2
+
+ stx $a8,[$tab+8*8] ! tab[8]=a8
+ xor $a48,$a12,$a12
+ stx $a1,[$tab+9*8] ! tab[9]=a1^a8
+ xor $a4,$a1,$a1
+ stx $a2,[$tab+10*8] ! tab[10]=a2^a8
+ xor $a4,$a2,$a2
+ stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
+
+ xor $a4,$a12,$a12
+ stx $a48,[$tab+12*8] ! tab[12]=a4^a8
+ srlx $lo,1,$hi
+ stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
+ sllx $lo,63,$lo
+ stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
+ srlx @i[0],2,@T[0]
+ stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
+
+ sllx @i[0],62,$a1
+ sllx $b,3,@i[0]
+ srlx @i[1],3,@T[1]
+ and @i[0],`0xf<<3`,@i[0]
+ sllx @i[1],61,$a2
+ ldx [$tab+@i[0]],@i[0]
+ srlx $b,4-3,@i[1]
+ xor @T[0],$hi,$hi
+ and @i[1],`0xf<<3`,@i[1]
+ xor $a1,$lo,$lo
+ ldx [$tab+@i[1]],@i[1]
+ xor @T[1],$hi,$hi
+
+ xor @i[0],$lo,$lo
+ srlx $b,8-3,@i[0]
+ xor $a2,$lo,$lo
+ and @i[0],`0xf<<3`,@i[0]
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+ sllx @i[1],`$n*4`,@T[0]
+ ldx [$tab+@i[0]],@i[0]
+ srlx @i[1],`64-$n*4`,@T[1]
+ xor @T[0],$lo,$lo
+ srlx $b,`($n+2)*4`-3,@i[1]
+ xor @T[1],$hi,$hi
+ and @i[1],`0xf<<3`,@i[1]
+___
+ push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+ sllx @i[1],`$n*4`,@T[0]
+ ldx [$tab+@i[0]],@i[0]
+ srlx @i[1],`64-$n*4`,@T[1]
+ xor @T[0],$lo,$lo
+
+ sllx @i[0],`($n+1)*4`,@T[0]
+ xor @T[1],$hi,$hi
+ srlx @i[0],`64-($n+1)*4`,@T[1]
+ xor @T[0],$lo,$lo
+ xor @T[1],$hi,$hi
+
+ srlx $lo,32,%i1
+ st $lo,[%i0+0]
+ st %i1,[%i0+4]
+ srlx $hi,32,%i2
+ st $hi,[%i0+8]
+ st %i2,[%i0+12]
+
+ ret
+ restore
+.type bn_GF2m_mul_2x2,#function
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl
new file mode 100755
index 000000000000..263ac02b6f45
--- /dev/null
+++ b/crypto/bn/asm/vis3-mont.pl
@@ -0,0 +1,373 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2012.
+#
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
+# onward. There are three new instructions used here: umulxhi,
+# addxc[cc] and initializing store. On T3 RSA private key operations
+# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
+# lengths. This is without a dedicated squaring procedure. On T4
+# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
+# for reference purposes, because T4 has dedicated Montgomery
+# multiplication and squaring *instructions* that deliver even more.
+
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else { $bias=0; $frame=112; }
+
+$code.=<<___ if ($bits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+___
+$code.=<<___;
+.section ".text",#alloc,#execinstr
+___
+
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+ (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
+
+# int bn_mul_mont(
+$rp="%o0"; # BN_ULONG *rp,
+$ap="%o1"; # const BN_ULONG *ap,
+$bp="%o2"; # const BN_ULONG *bp,
+$np="%o3"; # const BN_ULONG *np,
+$n0p="%o4"; # const BN_ULONG *n0,
+$num="%o5"; # int num); # caller ensures that num is even
+ # and >=6
+$code.=<<___;
+.globl bn_mul_mont_vis3
+.align 32
+bn_mul_mont_vis3:
+ add %sp, $bias, %g4 ! real top of stack
+ sll $num, 2, $num ! size in bytes
+ add $num, 63, %g5
+ andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
+ add %g5, %g5, %g1
+ add %g5, %g1, %g1 ! 3*buffer size
+ sub %g4, %g1, %g1
+ andn %g1, 63, %g1 ! align at 64 byte
+ sub %g1, $frame, %g1 ! new top of stack
+ sub %g1, %g4, %g1
+
+ save %sp, %g1, %sp
+___
+
+# +-------------------------------+<----- %sp
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 tmp[0] |
+# +-------------------------------+
+# . .
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 ap[1..0] | converted ap[]
+# +-------------------------------+
+# | __int64 np[1..0] | converted np[]
+# +-------------------------------+
+# | __int64 ap[3..2] |
+# . .
+# . .
+# +-------------------------------+
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+$code.=<<___;
+ ld [$n0p+0], $t0 ! pull n0[0..1] value
+ add %sp, $bias+$frame, $tp
+ ld [$n0p+4], $t1
+ add $tp, %g5, $anp
+ ld [$bp+0], $t2 ! m0=bp[0]
+ sllx $t1, 32, $n0
+ ld [$bp+4], $t3
+ or $t0, $n0, $n0
+ add $bp, 8, $bp
+
+ ld [$ap+0], $t0 ! ap[0]
+ sllx $t3, 32, $m0
+ ld [$ap+4], $t1
+ or $t2, $m0, $m0
+
+ ld [$ap+8], $t2 ! ap[1]
+ sllx $t1, 32, $aj
+ ld [$ap+12], $t3
+ or $t0, $aj, $aj
+ add $ap, 16, $ap
+ stx $aj, [$anp] ! converted ap[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
+ umulxhi $aj, $m0, $hi0
+
+ ld [$np+0], $t0 ! np[0]
+ sllx $t3, 32, $aj
+ ld [$np+4], $t1
+ or $t2, $aj, $aj
+
+ ld [$np+8], $t2 ! np[1]
+ sllx $t1, 32, $nj
+ ld [$np+12], $t3
+ or $t0, $nj, $nj
+ add $np, 16, $np
+ stx $nj, [$anp+8] ! converted np[0]
+
+ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
+ stx $aj, [$anp+16] ! converted ap[1]
+
+ mulx $aj, $m0, $alo ! ap[1]*bp[0]
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+
+ sllx $t3, 32, $nj
+ or $t2, $nj, $nj
+ stx $nj, [$anp+24] ! converted np[1]
+ add $anp, 32, $anp
+
+ addcc $lo0, $lo1, $lo1
+ addxc %g0, $hi1, $hi1
+
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .L1st
+ sub $num, 24, $cnt ! cnt=num-3
+
+.align 16
+.L1st:
+ ld [$ap+0], $t0 ! ap[j]
+ addcc $alo, $hi0, $lo0
+ ld [$ap+4], $t1
+ addxc $aj, %g0, $hi0
+
+ sllx $t1, 32, $aj
+ add $ap, 8, $ap
+ or $t0, $aj, $aj
+ stx $aj, [$anp] ! converted ap[j]
+
+ ld [$np+0], $t2 ! np[j]
+ addcc $nlo, $hi1, $lo1
+ ld [$np+4], $t3
+ addxc $nj, %g0, $hi1 ! nhi=nj
+
+ sllx $t3, 32, $nj
+ add $np, 8, $np
+ mulx $aj, $m0, $alo ! ap[j]*bp[0]
+ or $t2, $nj, $nj
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ stx $nj, [$anp+8] ! converted np[j]
+ add $anp, 16, $anp ! anp++
+
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp ! tp++
+
+ brnz,pt $cnt, .L1st
+ sub $cnt, 8, $cnt ! j--
+!.L1st
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+
+ addcc $hi0, $hi1, $hi1
+ addxc %g0, %g0, $ovf ! upmost overflow bit
+ stx $hi1, [$tp]
+ add $tp, 8, $tp
+
+ ba .Louter
+ sub $num, 16, $i ! i=num-2
+
+.align 16
+.Louter:
+ ld [$bp+0], $t2 ! m0=bp[i]
+ ld [$bp+4], $t3
+
+ sub $anp, $num, $anp ! rewind
+ sub $tp, $num, $tp
+ sub $anp, $num, $anp
+
+ add $bp, 8, $bp
+ sllx $t3, 32, $m0
+ ldx [$anp+0], $aj ! ap[0]
+ or $t2, $m0, $m0
+ ldx [$anp+8], $nj ! np[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
+ ldx [$tp], $tj ! tp[0]
+ umulxhi $aj, $m0, $hi0
+ ldx [$anp+16], $aj ! ap[1]
+ addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
+ mulx $aj, $m0, $alo ! ap[1]*bp[i]
+ addxc %g0, $hi0, $hi0
+ mulx $lo0, $n0, $m1 ! tp[0]*n0
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+ ldx [$anp+24], $nj ! np[1]
+ add $anp, 32, $anp
+ addcc $lo1, $lo0, $lo1
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ addxc %g0, $hi1, $hi1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .Linner
+ sub $num, 24, $cnt ! cnt=num-3
+.align 16
+.Linner:
+ addcc $alo, $hi0, $lo0
+ ldx [$tp+8], $tj ! tp[j]
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ ldx [$anp+0], $aj ! ap[j]
+ addcc $nlo, $hi1, $lo1
+ mulx $aj, $m0, $alo ! ap[j]*bp[i]
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ ldx [$anp+8], $nj ! np[j]
+ add $anp, 16, $anp
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addxc %g0, $hi0, $hi0
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+ brnz,pt $cnt, .Linner
+ sub $cnt, 8, $cnt
+!.Linner
+ ldx [$tp+8], $tj ! tp[j]
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi0, $hi0
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+
+ subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
+ addxccc $hi1, $hi0, $hi1
+ addxc %g0, %g0, $ovf
+ stx $hi1, [$tp+8]
+ add $tp, 16, $tp
+
+ brnz,pt $i, .Louter
+ sub $i, 8, $i
+
+ sub $anp, $num, $anp ! rewind
+ sub $tp, $num, $tp
+ sub $anp, $num, $anp
+ ba .Lsub
+ subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
+
+.align 16
+.Lsub:
+ ldx [$tp], $tj
+ add $tp, 8, $tp
+ ldx [$anp+8], $nj
+ add $anp, 16, $anp
+ subccc $tj, $nj, $t2 ! tp[j]-np[j]
+ srlx $tj, 32, $tj
+ srlx $nj, 32, $nj
+ subccc $tj, $nj, $t3
+ add $rp, 8, $rp
+ st $t2, [$rp-4] ! reverse order
+ st $t3, [$rp-8]
+ brnz,pt $cnt, .Lsub
+ sub $cnt, 8, $cnt
+
+ sub $anp, $num, $anp ! rewind
+ sub $tp, $num, $tp
+ sub $anp, $num, $anp
+ sub $rp, $num, $rp
+
+ subc $ovf, %g0, $ovf ! handle upmost overflow bit
+ and $tp, $ovf, $ap
+ andn $rp, $ovf, $np
+ or $np, $ap, $ap ! ap=borrow?tp:rp
+ ba .Lcopy
+ sub $num, 8, $cnt
+
+.align 16
+.Lcopy: ! copy or in-place refresh
+ ld [$ap+0], $t2
+ ld [$ap+4], $t3
+ add $ap, 8, $ap
+ stx %g0, [$tp] ! zap
+ add $tp, 8, $tp
+ stx %g0, [$anp] ! zap
+ stx %g0, [$anp+8]
+ add $anp, 16, $anp
+ st $t3, [$rp+0] ! flip order
+ st $t2, [$rp+4]
+ add $rp, 8, $rp
+ brnz $cnt, .Lcopy
+ sub $cnt, 8, $cnt
+
+ mov 1, %o0
+ ret
+ restore
+.type bn_mul_mont_vis3, #function
+.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
+.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+# The purpose of these subroutines is to explicitly encode VIS
+# instructions, so that one can compile the module without having to
+# specify VIS extensions on the compiler command line, e.g. -xarch=v9
+# vs. -xarch=v9a. The idea is to reserve the option to produce a
+# "universal" binary and let the programmer detect at run-time whether
+# the current CPU is VIS capable.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "umulxhi" => 0x016 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
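+
+# For example (hand-computed against the V9 IMPDEP1 format, so worth
+# double-checking), umulxhi with %o0, %o1 and %o2 as operands would be
+# rewritten by unvis3() to:
+#
+#	.word	0x95b202c9 !umulxhi	%o0,%o1,%o2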
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index 9c5074b30858..d5488866e0cf 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -55,7 +55,7 @@
* machine.
*/
-# ifdef _WIN64
+# if defined(_WIN64) || !defined(__LP64__)
# define BN_ULONG unsigned long long
# else
# define BN_ULONG unsigned long
@@ -63,7 +63,6 @@
# undef mul
# undef mul_add
-# undef sqr
/*-
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
@@ -99,8 +98,8 @@
: "cc"); \
(r)=carry, carry=high; \
} while (0)
-
-# define sqr(r0,r1,a) \
+# undef sqr
+# define sqr(r0,r1,a) \
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(a) \
@@ -204,20 +203,22 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
- BN_ULONG ret = 0, i = 0;
+ BN_ULONG ret;
+ size_t i = 0;
if (n <= 0)
return 0;
- asm volatile (" subq %2,%2 \n"
+ asm volatile (" subq %0,%0 \n" /* clear carry */
+ " jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
- " leaq 1(%2),%2 \n"
+ " lea 1(%2),%2 \n"
" loop 1b \n"
- " sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
- "=&r"(i)
+ " sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
+ "+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
@@ -228,20 +229,22 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
- BN_ULONG ret = 0, i = 0;
+ BN_ULONG ret;
+ size_t i = 0;
if (n <= 0)
return 0;
- asm volatile (" subq %2,%2 \n"
+ asm volatile (" subq %0,%0 \n" /* clear borrow */
+ " jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
- " leaq 1(%2),%2 \n"
+ " lea 1(%2),%2 \n"
" loop 1b \n"
- " sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
- "=&r"(i)
+ " sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
+ "+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
@@ -313,55 +316,58 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
*/
# if 0
/* original macros are kept for reference purposes */
-# define mul_add_c(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b); \
- t1 = ta * tb; \
- t2 = BN_UMULT_HIGH(ta,tb); \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
-
-# define mul_add_c2(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b),t0; \
- t1 = BN_UMULT_HIGH(ta,tb); \
- t0 = ta * tb; \
- c0 += t0; t2 = t1+((c0<t0)?1:0);\
- c1 += t2; c2 += (c1<t2)?1:0; \
- c0 += t0; t1 += (c0<t0)?1:0; \
- c1 += t1; c2 += (c1<t1)?1:0; \
- }
+# define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo,hi,ta,tb); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+# define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi, tt; \
+ BN_UMULT_LOHI(lo,hi,ta,tb); \
+ c0 += lo; tt = hi+((c0<lo)?1:0); \
+ c1 += tt; c2 += (c1<tt)?1:0; \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+# define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG ta = (a)[i]; \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo,hi,ta,ta); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
# else
-# define mul_add_c(a,b,c0,c1,c2) do { \
+# define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c0),"+d"(t2) \
- : "a"(t1),"g"(0) \
- : "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c1),"+r"(c2) \
- : "d"(t2),"g"(0) \
- : "cc"); \
+ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0),"+r"(c1),"+r"(c2) \
+ : "r"(t1),"r"(t2),"g"(0) \
+ : "cc"); \
} while (0)
-# define sqr_add_c(a,i,c0,c1,c2) do { \
+# define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG t1,t2; \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c0),"+d"(t2) \
- : "a"(t1),"g"(0) \
- : "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c1),"+r"(c2) \
- : "d"(t2),"g"(0) \
- : "cc"); \
+ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0),"+r"(c1),"+r"(c2) \
+ : "r"(t1),"r"(t2),"g"(0) \
+ : "cc"); \
} while (0)
-# define mul_add_c2(a,b,c0,c1,c2) do { \
+# define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
@@ -382,7 +388,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
@@ -486,7 +491,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
@@ -526,7 +530,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
@@ -602,7 +605,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 17fb94c84c9f..2989b58f256e 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -29,6 +29,16 @@
# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+# June 2013.
+#
+# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
+# sign performance by 10-16% on Intel Sandy Bridge and later
+# (virtually the same on non-Intel processors).
+
+# August 2013.
+#
+# Add MULX/ADOX/ADCX code path.
+
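+# MULX computes a full 64x64-bit product without touching the flags,
+# while ADCX and ADOX carry through CF and OF respectively, so two
+# carry chains can run interleaved. A sketch with compiler intrinsics
+# (illustrative; assumes a BMI2/ADX-capable toolchain, and madd_step is
+# a made-up name):
+#
+#	#include <immintrin.h>
+#	static inline void madd_step(unsigned long long a, unsigned long long b,
+#	                             unsigned long long *acc_lo,
+#	                             unsigned long long *acc_hi,
+#	                             unsigned char *cf, unsigned char *of)
+#	{
+#	    unsigned long long hi, lo = _mulx_u64(a, b, &hi); /* flags intact */
+#	    *cf = _addcarryx_u64(*cf, *acc_lo, lo, acc_lo);   /* CF chain */
+#	    *of = _addcarryx_u64(*of, *acc_hi, hi, acc_hi);   /* OF chain */
+#	}
+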
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -43,6 +53,21 @@ die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
@@ -61,6 +86,8 @@ $m1="%rbp";
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
+
.globl bn_mul_mont
.type bn_mul_mont,\@function,6
.align 16
@@ -69,9 +96,16 @@ bn_mul_mont:
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+___
+$code.=<<___;
cmp $ap,$bp
jne .Lmul4x_enter
- jmp .Lsqr4x_enter
+ test \$7,${num}d
+ jz .Lsqr8x_enter
+ jmp .Lmul4x_enter
.align 16
.Lmul_enter:
@@ -227,7 +261,7 @@ $code.=<<___;
lea 1($i),$i # i++
cmp $num,$i
- jl .Louter
+ jb .Louter
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
@@ -280,6 +314,13 @@ $code.=<<___;
.align 16
bn_mul4x_mont:
.Lmul4x_enter:
+___
+$code.=<<___ if ($addx);
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lmulx4x_enter
+___
+$code.=<<___;
push %rbx
push %rbp
push %r12
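
The 0x80100 mask tested just above against the OPENSSL_ia32cap_P word
loaded earlier corresponds, roughly, to the CPUID(7).EBX feature bits for
BMI2 (bit 8, MULX) and ADX (bit 19, ADCX/ADOX); both must be set before
the mulx path is taken. In C terms, as a sketch:

    unsigned int cap = OPENSSL_ia32cap_P[2];   /* image of CPUID(7).EBX */
    int use_mulx = (cap & 0x80100) == 0x80100; /* need BMI2 and ADX */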
@@ -401,7 +442,7 @@ $code.=<<___;
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
- jl .L1st4x
+ jb .L1st4x
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
@@ -549,7 +590,7 @@ $code.=<<___;
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
- jl .Linner4x
+ jb .Linner4x
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
@@ -595,7 +636,7 @@ $code.=<<___;
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
cmp $num,$i
- jl .Louter4x
+ jb .Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
@@ -688,25 +729,30 @@ ___
}}}
{{{
######################################################################
-# void bn_sqr4x_mont(
+# void bn_sqr8x_mont(
my $rptr="%rdi"; # const BN_ULONG *rptr,
my $aptr="%rsi"; # const BN_ULONG *aptr,
my $bptr="%rdx"; # not used
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
-my $num ="%r9"; # int num, has to be divisible by 4 and
- # not less than 8
+my $num ="%r9"; # int num, has to be divisible by 8
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+$code.=<<___ if ($addx);
+.extern bn_sqrx8x_internal # see x86_64-mont5 module
+___
$code.=<<___;
-.type bn_sqr4x_mont,\@function,6
-.align 16
-bn_sqr4x_mont:
-.Lsqr4x_enter:
+.extern bn_sqr8x_internal # see x86_64-mont5 module
+
+.type bn_sqr8x_mont,\@function,6
+.align 32
+bn_sqr8x_mont:
+.Lsqr8x_enter:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -714,787 +760,445 @@ bn_sqr4x_mont:
push %r14
push %r15
+ mov ${num}d,%r10d
shl \$3,${num}d # convert $num to bytes
- xor %r10,%r10
- mov %rsp,%r11 # put aside %rsp
- sub $num,%r10 # -$num
- mov ($n0),$n0 # *n0
- lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
- and \$-1024,%rsp # minimize TLB usage
- ##############################################################
- # Stack layout
- #
- # +0 saved $num, used in reduction section
- # +8 &t[2*$num], used in reduction section
- # +32 saved $rptr
- # +40 saved $nptr
- # +48 saved *n0
- # +56 saved %rsp
- # +64 t[2*$num]
- #
- mov $rptr,32(%rsp) # save $rptr
- mov $nptr,40(%rsp)
- mov $n0, 48(%rsp)
- mov %r11, 56(%rsp) # save original %rsp
-.Lsqr4x_body:
+ shl \$3+2,%r10 # 4*$num
+ neg $num
+
##############################################################
- # Squaring part:
- #
- # a) multiply-n-add everything but a[i]*a[i];
- # b) shift result of a) by 1 to the left and accumulate
- # a[i]*a[i] products;
+	# ensure that the stack frame doesn't alias with $aptr modulo
+	# 4096. This is done to allow the memory disambiguation logic
+	# to do its job.
#
- lea 32(%r10),$i # $i=-($num-32)
- lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
-
- mov $num,$j # $j=$num
-
- # comments apply to $num==8 case
- mov -32($aptr,$i),$a0 # a[0]
- lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
- mov -24($aptr,$i),%rax # a[1]
- lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
- mov -16($aptr,$i),$ai # a[2]
- mov %rax,$a1
-
- mul $a0 # a[1]*a[0]
- mov %rax,$A0[0] # a[1]*a[0]
- mov $ai,%rax # a[2]
- mov %rdx,$A0[1]
- mov $A0[0],-24($tptr,$i) # t[1]
-
- xor $A0[0],$A0[0]
- mul $a0 # a[2]*a[0]
- add %rax,$A0[1]
- mov $ai,%rax
- adc %rdx,$A0[0]
- mov $A0[1],-16($tptr,$i) # t[2]
-
- lea -16($i),$j # j=-16
-
-
- mov 8($aptr,$j),$ai # a[3]
- mul $a1 # a[2]*a[1]
- mov %rax,$A1[0] # a[2]*a[1]+t[3]
- mov $ai,%rax
- mov %rdx,$A1[1]
-
- xor $A0[1],$A0[1]
- add $A1[0],$A0[0]
- lea 16($j),$j
- adc \$0,$A0[1]
- mul $a0 # a[3]*a[0]
- add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
- mov $ai,%rax
- adc %rdx,$A0[1]
- mov $A0[0],-8($tptr,$j) # t[3]
- jmp .Lsqr4x_1st
+ lea -64(%rsp,$num,4),%r11
+ mov ($n0),$n0 # *n0
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lsqr8x_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ jmp .Lsqr8x_sp_done
+
+.align 32
+.Lsqr8x_sp_alt:
+ lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
+ lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lsqr8x_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
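+	# The hazard being dodged is the classic 4K-aliasing false
+	# dependency: a load is conservatively stalled behind an
+	# in-flight store whenever the two addresses agree in their
+	# low 12 bits. A minimal sketch of the condition in C (my
+	# formulation, not code from this patch):
+	#
+	#	/* true when a tp[] store would falsely appear to
+	#	   alias an ap[] load */
+	#	static int false_4k_alias(uintptr_t st, uintptr_t ld)
+	#	{
+	#	    return st != ld && ((st ^ ld) & 4095) == 0;
+	#	}
+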
+ lea 64(%rsp,$num,2),%r11 # copy of modulus
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lsqr8x_body:
+
+ mov $num,$i
+ movq %r11, %xmm2 # save pointer to modulus copy
+ shr \$3+2,$i
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
+ jmp .Lsqr8x_copy_n
+
+.align 32
+.Lsqr8x_copy_n:
+ movq 8*0($nptr),%xmm0
+ movq 8*1($nptr),%xmm1
+ movq 8*2($nptr),%xmm3
+ movq 8*3($nptr),%xmm4
+ lea 8*4($nptr),$nptr
+ movdqa %xmm0,16*0(%r11)
+ movdqa %xmm1,16*1(%r11)
+ movdqa %xmm3,16*2(%r11)
+ movdqa %xmm4,16*3(%r11)
+ lea 16*4(%r11),%r11
+ dec $i
+ jnz .Lsqr8x_copy_n
-.align 16
-.Lsqr4x_1st:
- mov ($aptr,$j),$ai # a[4]
- xor $A1[0],$A1[0]
- mul $a1 # a[3]*a[1]
- add %rax,$A1[1] # a[3]*a[1]+t[4]
- mov $ai,%rax
- adc %rdx,$A1[0]
-
- xor $A0[0],$A0[0]
- add $A1[1],$A0[1]
- adc \$0,$A0[0]
- mul $a0 # a[4]*a[0]
- add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
- mov $ai,%rax # a[3]
- adc %rdx,$A0[0]
- mov $A0[1],($tptr,$j) # t[4]
-
-
- mov 8($aptr,$j),$ai # a[5]
- xor $A1[1],$A1[1]
- mul $a1 # a[4]*a[3]
- add %rax,$A1[0] # a[4]*a[3]+t[5]
- mov $ai,%rax
- adc %rdx,$A1[1]
-
- xor $A0[1],$A0[1]
- add $A1[0],$A0[0]
- adc \$0,$A0[1]
- mul $a0 # a[5]*a[2]
- add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
- mov $ai,%rax
- adc %rdx,$A0[1]
- mov $A0[0],8($tptr,$j) # t[5]
-
- mov 16($aptr,$j),$ai # a[6]
- xor $A1[0],$A1[0]
- mul $a1 # a[5]*a[3]
- add %rax,$A1[1] # a[5]*a[3]+t[6]
- mov $ai,%rax
- adc %rdx,$A1[0]
-
- xor $A0[0],$A0[0]
- add $A1[1],$A0[1]
- adc \$0,$A0[0]
- mul $a0 # a[6]*a[2]
- add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
- mov $ai,%rax # a[3]
- adc %rdx,$A0[0]
- mov $A0[1],16($tptr,$j) # t[6]
-
-
- mov 24($aptr,$j),$ai # a[7]
- xor $A1[1],$A1[1]
- mul $a1 # a[6]*a[5]
- add %rax,$A1[0] # a[6]*a[5]+t[7]
- mov $ai,%rax
- adc %rdx,$A1[1]
-
- xor $A0[1],$A0[1]
- add $A1[0],$A0[0]
- lea 32($j),$j
- adc \$0,$A0[1]
- mul $a0 # a[7]*a[4]
- add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
- mov $ai,%rax
- adc %rdx,$A0[1]
- mov $A0[0],-8($tptr,$j) # t[7]
-
- cmp \$0,$j
- jne .Lsqr4x_1st
-
- xor $A1[0],$A1[0]
- add $A0[1],$A1[1]
- adc \$0,$A1[0]
- mul $a1 # a[7]*a[5]
- add %rax,$A1[1]
- adc %rdx,$A1[0]
-
- mov $A1[1],($tptr) # t[8]
- lea 16($i),$i
- mov $A1[0],8($tptr) # t[9]
- jmp .Lsqr4x_outer
+ pxor %xmm0,%xmm0
+ movq $rptr,%xmm1 # save $rptr
+ movq %r10, %xmm3 # -$num
+___
+$code.=<<___ if ($addx);
+ and \$0x80100,%eax
+ cmp \$0x80100,%eax
+ jne .Lsqr8x_nox
-.align 16
-.Lsqr4x_outer: # comments apply to $num==6 case
- mov -32($aptr,$i),$a0 # a[0]
- lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
- mov -24($aptr,$i),%rax # a[1]
- lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
- mov -16($aptr,$i),$ai # a[2]
- mov %rax,$a1
-
- mov -24($tptr,$i),$A0[0] # t[1]
- xor $A0[1],$A0[1]
- mul $a0 # a[1]*a[0]
- add %rax,$A0[0] # a[1]*a[0]+t[1]
- mov $ai,%rax # a[2]
- adc %rdx,$A0[1]
- mov $A0[0],-24($tptr,$i) # t[1]
-
- xor $A0[0],$A0[0]
- add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
- adc \$0,$A0[0]
- mul $a0 # a[2]*a[0]
- add %rax,$A0[1]
- mov $ai,%rax
- adc %rdx,$A0[0]
- mov $A0[1],-16($tptr,$i) # t[2]
-
- lea -16($i),$j # j=-16
- xor $A1[0],$A1[0]
-
-
- mov 8($aptr,$j),$ai # a[3]
- xor $A1[1],$A1[1]
- add 8($tptr,$j),$A1[0]
- adc \$0,$A1[1]
- mul $a1 # a[2]*a[1]
- add %rax,$A1[0] # a[2]*a[1]+t[3]
- mov $ai,%rax
- adc %rdx,$A1[1]
-
- xor $A0[1],$A0[1]
- add $A1[0],$A0[0]
- adc \$0,$A0[1]
- mul $a0 # a[3]*a[0]
- add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
- mov $ai,%rax
- adc %rdx,$A0[1]
- mov $A0[0],8($tptr,$j) # t[3]
-
- lea 16($j),$j
- jmp .Lsqr4x_inner
+ call bn_sqrx8x_internal # see x86_64-mont5 module
-.align 16
-.Lsqr4x_inner:
- mov ($aptr,$j),$ai # a[4]
- xor $A1[0],$A1[0]
- add ($tptr,$j),$A1[1]
- adc \$0,$A1[0]
- mul $a1 # a[3]*a[1]
- add %rax,$A1[1] # a[3]*a[1]+t[4]
- mov $ai,%rax
- adc %rdx,$A1[0]
-
- xor $A0[0],$A0[0]
- add $A1[1],$A0[1]
- adc \$0,$A0[0]
- mul $a0 # a[4]*a[0]
- add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
- mov $ai,%rax # a[3]
- adc %rdx,$A0[0]
- mov $A0[1],($tptr,$j) # t[4]
-
- mov 8($aptr,$j),$ai # a[5]
- xor $A1[1],$A1[1]
- add 8($tptr,$j),$A1[0]
- adc \$0,$A1[1]
- mul $a1 # a[4]*a[3]
- add %rax,$A1[0] # a[4]*a[3]+t[5]
- mov $ai,%rax
- adc %rdx,$A1[1]
-
- xor $A0[1],$A0[1]
- add $A1[0],$A0[0]
- lea 16($j),$j # j++
- adc \$0,$A0[1]
- mul $a0 # a[5]*a[2]
- add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
- mov $ai,%rax
- adc %rdx,$A0[1]
- mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
-
- cmp \$0,$j
- jne .Lsqr4x_inner
-
- xor $A1[0],$A1[0]
- add $A0[1],$A1[1]
- adc \$0,$A1[0]
- mul $a1 # a[5]*a[3]
- add %rax,$A1[1]
- adc %rdx,$A1[0]
-
- mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
- mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
-
- add \$16,$i
- jnz .Lsqr4x_outer
-
- # comments apply to $num==4 case
- mov -32($aptr),$a0 # a[0]
- lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
- mov -24($aptr),%rax # a[1]
- lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
- mov -16($aptr),$ai # a[2]
- mov %rax,$a1
-
- xor $A0[1],$A0[1]
- mul $a0 # a[1]*a[0]
- add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
- mov $ai,%rax # a[2]
- adc %rdx,$A0[1]
- mov $A0[0],-24($tptr) # t[1]
-
- xor $A0[0],$A0[0]
- add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
- adc \$0,$A0[0]
- mul $a0 # a[2]*a[0]
- add %rax,$A0[1]
- mov $ai,%rax
- adc %rdx,$A0[0]
- mov $A0[1],-16($tptr) # t[2]
-
- mov -8($aptr),$ai # a[3]
- mul $a1 # a[2]*a[1]
- add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
- mov $ai,%rax
- adc \$0,%rdx
-
- xor $A0[1],$A0[1]
- add $A1[0],$A0[0]
- mov %rdx,$A1[1]
- adc \$0,$A0[1]
- mul $a0 # a[3]*a[0]
- add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
- mov $ai,%rax
- adc %rdx,$A0[1]
- mov $A0[0],-8($tptr) # t[3]
-
- xor $A1[0],$A1[0]
- add $A0[1],$A1[1]
- adc \$0,$A1[0]
- mul $a1 # a[3]*a[1]
- add %rax,$A1[1]
- mov -16($aptr),%rax # a[2]
- adc %rdx,$A1[0]
-
- mov $A1[1],($tptr) # t[4]
- mov $A1[0],8($tptr) # t[5]
-
- mul $ai # a[2]*a[3]
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ lea 64(%rsp,$num,2),%rdx
+ shr \$3+2,$num
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lsqr8x_zero
+
+.align 32
+.Lsqr8x_nox:
___
-{
-my ($shift,$carry)=($a0,$a1);
-my @S=(@A1,$ai,$n0);
$code.=<<___;
- add \$16,$i
- xor $shift,$shift
- sub $num,$i # $i=16-$num
- xor $carry,$carry
-
- add $A1[0],%rax # t[5]
- adc \$0,%rdx
- mov %rax,8($tptr) # t[5]
- mov %rdx,16($tptr) # t[6]
- mov $carry,24($tptr) # t[7]
-
- mov -16($aptr,$i),%rax # a[0]
- lea 64(%rsp,$num,2),$tptr
- xor $A0[0],$A0[0] # t[0]
- mov -24($tptr,$i,2),$A0[1] # t[1]
-
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov -8($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[0],-32($tptr,$i,2)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
- mov $S[1],-24($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[2]
- mov 0($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[2],-16($tptr,$i,2)
- adc %rdx,$S[3]
- lea 16($i),$i
- mov $S[3],-40($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- jmp .Lsqr4x_shift_n_add
+ call bn_sqr8x_internal # see x86_64-mont5 module
-.align 16
-.Lsqr4x_shift_n_add:
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov -8($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[0],-32($tptr,$i,2)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
- mov $S[1],-24($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[2]
- mov 0($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[2],-16($tptr,$i,2)
- adc %rdx,$S[3]
-
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- mov $S[3],-8($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov 8($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[0],0($tptr,$i,2)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
- mov $S[1],8($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[2]
- mov 16($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[2],16($tptr,$i,2)
- adc %rdx,$S[3]
- mov $S[3],24($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- add \$32,$i
- jnz .Lsqr4x_shift_n_add
-
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov -8($aptr),%rax # a[i+1] # prefetch
- mov $S[0],-32($tptr)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
- mov $S[1],-24($tptr)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- adc %rax,$S[2]
- adc %rdx,$S[3]
- mov $S[2],-16($tptr)
- mov $S[3],-8($tptr)
-___
-}
-##############################################################
-# Montgomery reduction part, "word-by-word" algorithm.
-#
-{
-my ($topbit,$nptr)=("%rbp",$aptr);
-my ($m0,$m1)=($a0,$a1);
-my @Ni=("%rbx","%r9");
-$code.=<<___;
- mov 40(%rsp),$nptr # restore $nptr
- mov 48(%rsp),$n0 # restore *n0
- xor $j,$j
- mov $num,0(%rsp) # save $num
- sub $num,$j # $j=-$num
- mov 64(%rsp),$A0[0] # t[0] # modsched #
- mov $n0,$m0 # # modsched #
- lea 64(%rsp,$num,2),%rax # end of t[] buffer
- lea 64(%rsp,$num),$tptr # end of t[] window
- mov %rax,8(%rsp) # save end of t[] buffer
- lea ($nptr,$num),$nptr # end of n[] buffer
- xor $topbit,$topbit # $topbit=0
-
- mov 0($nptr,$j),%rax # n[0] # modsched #
- mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
- imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
- mov %rax,$Ni[0] # # modsched #
- jmp .Lsqr4x_mont_outer
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ lea 64(%rsp,$num,2),%rdx
+ shr \$3+2,$num
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lsqr8x_zero
+
+.align 32
+.Lsqr8x_zero:
+ movdqa %xmm0,16*0(%rax) # wipe t
+ movdqa %xmm0,16*1(%rax)
+ movdqa %xmm0,16*2(%rax)
+ movdqa %xmm0,16*3(%rax)
+ lea 16*4(%rax),%rax
+ movdqa %xmm0,16*0(%rdx) # wipe n
+ movdqa %xmm0,16*1(%rdx)
+ movdqa %xmm0,16*2(%rdx)
+ movdqa %xmm0,16*3(%rdx)
+ lea 16*4(%rdx),%rdx
+ dec $num
+ jnz .Lsqr8x_zero
-.align 16
-.Lsqr4x_mont_outer:
- xor $A0[1],$A0[1]
- mul $m0 # n[0]*m0
- add %rax,$A0[0] # n[0]*m0+t[0]
- mov $Ni[1],%rax
- adc %rdx,$A0[1]
- mov $n0,$m1
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lsqr8x_epilogue:
+ ret
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
+___
+}}}
+
+if ($addx) {{{
+my $bp="%rdx"; # original value
- xor $A0[0],$A0[0]
- add 8($tptr,$j),$A0[1]
- adc \$0,$A0[0]
- mul $m0 # n[1]*m0
- add %rax,$A0[1] # n[1]*m0+t[1]
- mov $Ni[0],%rax
- adc %rdx,$A0[0]
-
- imulq $A0[1],$m1
-
- mov 16($nptr,$j),$Ni[0] # n[2]
- xor $A1[1],$A1[1]
- add $A0[1],$A1[0]
- adc \$0,$A1[1]
- mul $m1 # n[0]*m1
- add %rax,$A1[0] # n[0]*m1+"t[1]"
- mov $Ni[0],%rax
- adc %rdx,$A1[1]
- mov $A1[0],8($tptr,$j) # "t[1]"
-
- xor $A0[1],$A0[1]
- add 16($tptr,$j),$A0[0]
- adc \$0,$A0[1]
- mul $m0 # n[2]*m0
- add %rax,$A0[0] # n[2]*m0+t[2]
- mov $Ni[1],%rax
- adc %rdx,$A0[1]
-
- mov 24($nptr,$j),$Ni[1] # n[3]
- xor $A1[0],$A1[0]
- add $A0[0],$A1[1]
- adc \$0,$A1[0]
- mul $m1 # n[1]*m1
- add %rax,$A1[1] # n[1]*m1+"t[2]"
- mov $Ni[1],%rax
- adc %rdx,$A1[0]
- mov $A1[1],16($tptr,$j) # "t[2]"
-
- xor $A0[0],$A0[0]
- add 24($tptr,$j),$A0[1]
- lea 32($j),$j
- adc \$0,$A0[0]
- mul $m0 # n[3]*m0
- add %rax,$A0[1] # n[3]*m0+t[3]
- mov $Ni[0],%rax
- adc %rdx,$A0[0]
- jmp .Lsqr4x_mont_inner
+$code.=<<___;
+.type bn_mulx4x_mont,\@function,6
+.align 32
+bn_mulx4x_mont:
+.Lmulx4x_enter:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
-.align 16
-.Lsqr4x_mont_inner:
- mov ($nptr,$j),$Ni[0] # n[4]
- xor $A1[1],$A1[1]
- add $A0[1],$A1[0]
- adc \$0,$A1[1]
- mul $m1 # n[2]*m1
- add %rax,$A1[0] # n[2]*m1+"t[3]"
- mov $Ni[0],%rax
- adc %rdx,$A1[1]
- mov $A1[0],-8($tptr,$j) # "t[3]"
-
- xor $A0[1],$A0[1]
- add ($tptr,$j),$A0[0]
- adc \$0,$A0[1]
- mul $m0 # n[4]*m0
- add %rax,$A0[0] # n[4]*m0+t[4]
- mov $Ni[1],%rax
- adc %rdx,$A0[1]
-
- mov 8($nptr,$j),$Ni[1] # n[5]
- xor $A1[0],$A1[0]
- add $A0[0],$A1[1]
- adc \$0,$A1[0]
- mul $m1 # n[3]*m1
- add %rax,$A1[1] # n[3]*m1+"t[4]"
- mov $Ni[1],%rax
- adc %rdx,$A1[0]
- mov $A1[1],($tptr,$j) # "t[4]"
-
- xor $A0[0],$A0[0]
- add 8($tptr,$j),$A0[1]
- adc \$0,$A0[0]
- mul $m0 # n[5]*m0
- add %rax,$A0[1] # n[5]*m0+t[5]
- mov $Ni[0],%rax
- adc %rdx,$A0[0]
-
-
- mov 16($nptr,$j),$Ni[0] # n[6]
- xor $A1[1],$A1[1]
- add $A0[1],$A1[0]
- adc \$0,$A1[1]
- mul $m1 # n[4]*m1
- add %rax,$A1[0] # n[4]*m1+"t[5]"
- mov $Ni[0],%rax
- adc %rdx,$A1[1]
- mov $A1[0],8($tptr,$j) # "t[5]"
-
- xor $A0[1],$A0[1]
- add 16($tptr,$j),$A0[0]
- adc \$0,$A0[1]
- mul $m0 # n[6]*m0
- add %rax,$A0[0] # n[6]*m0+t[6]
- mov $Ni[1],%rax
- adc %rdx,$A0[1]
-
- mov 24($nptr,$j),$Ni[1] # n[7]
- xor $A1[0],$A1[0]
- add $A0[0],$A1[1]
- adc \$0,$A1[0]
- mul $m1 # n[5]*m1
- add %rax,$A1[1] # n[5]*m1+"t[6]"
- mov $Ni[1],%rax
- adc %rdx,$A1[0]
- mov $A1[1],16($tptr,$j) # "t[6]"
-
- xor $A0[0],$A0[0]
- add 24($tptr,$j),$A0[1]
- lea 32($j),$j
- adc \$0,$A0[0]
- mul $m0 # n[7]*m0
- add %rax,$A0[1] # n[7]*m0+t[7]
- mov $Ni[0],%rax
- adc %rdx,$A0[0]
- cmp \$0,$j
- jne .Lsqr4x_mont_inner
-
- sub 0(%rsp),$j # $j=-$num # modsched #
- mov $n0,$m0 # # modsched #
-
- xor $A1[1],$A1[1]
- add $A0[1],$A1[0]
- adc \$0,$A1[1]
- mul $m1 # n[6]*m1
- add %rax,$A1[0] # n[6]*m1+"t[7]"
- mov $Ni[1],%rax
- adc %rdx,$A1[1]
- mov $A1[0],-8($tptr) # "t[7]"
-
- xor $A0[1],$A0[1]
- add ($tptr),$A0[0] # +t[8]
- adc \$0,$A0[1]
- mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
- add $topbit,$A0[0]
- adc \$0,$A0[1]
-
- imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
- xor $A1[0],$A1[0]
- mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
- add $A0[0],$A1[1]
- mov 16($tptr,$j),$A0[0] # t[0] # modsched #
- adc \$0,$A1[0]
- mul $m1 # n[7]*m1
- add %rax,$A1[1] # n[7]*m1+"t[8]"
- mov $Ni[0],%rax # # modsched #
- adc %rdx,$A1[0]
- mov $A1[1],($tptr) # "t[8]"
-
- xor $topbit,$topbit
- add 8($tptr),$A1[0] # +t[9]
- adc $topbit,$topbit
- add $A0[1],$A1[0]
- lea 16($tptr),$tptr # "t[$num]>>128"
- adc \$0,$topbit
- mov $A1[0],-8($tptr) # "t[9]"
- cmp 8(%rsp),$tptr # are we done?
- jb .Lsqr4x_mont_outer
-
- mov 0(%rsp),$num # restore $num
- mov $topbit,($tptr) # save $topbit
+ shl \$3,${num}d # convert $num to bytes
+ .byte 0x67
+ xor %r10,%r10
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
+ lea ($bp,$num),%r10
+ and \$-128,%rsp
+ ##############################################################
+ # Stack layout
+ # +0 num
+ # +8 off-loaded &b[i]
+ # +16 end of b[num]
+ # +24 saved n0
+ # +32 saved rp
+ # +40 saved %rsp
+ # +48 inner counter
+ # +56
+ # +64 tmp[num+1]
+ #
+ mov $num,0(%rsp) # save $num
+ shr \$5,$num
+ mov %r10,16(%rsp) # end of b[num]
+ sub \$1,$num
+ mov $n0, 24(%rsp) # save *n0
+ mov $rp, 32(%rsp) # save $rp
+ mov %rax,40(%rsp) # save original %rsp
+ mov $num,48(%rsp) # inner counter
+ jmp .Lmulx4x_body
+
+.align 32
+.Lmulx4x_body:
___
-}
-##############################################################
-# Post-condition, 4x unrolled copy from bn_mul_mont
-#
-{
-my ($tptr,$nptr)=("%rbx",$aptr);
-my @ri=("%rax","%rdx","%r10","%r11");
+my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
+ ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+my $rptr=$bptr;
$code.=<<___;
- mov 64(%rsp,$num),@ri[0] # tp[0]
- lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
- mov 40(%rsp),$nptr # restore $nptr
- shr \$5,$num # num/4
- mov 8($tptr),@ri[1] # t[1]
- xor $i,$i # i=0 and clear CF!
-
- mov 32(%rsp),$rptr # restore $rptr
- sub 0($nptr),@ri[0]
- mov 16($tptr),@ri[2] # t[2]
- mov 24($tptr),@ri[3] # t[3]
- sbb 8($nptr),@ri[1]
- lea -1($num),$j # j=num/4-1
- jmp .Lsqr4x_sub
-.align 16
-.Lsqr4x_sub:
- mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
- mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
- sbb 16($nptr,$i,8),@ri[2]
- mov 32($tptr,$i,8),@ri[0] # tp[i+1]
- mov 40($tptr,$i,8),@ri[1]
- sbb 24($nptr,$i,8),@ri[3]
- mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
- mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
- sbb 32($nptr,$i,8),@ri[0]
- mov 48($tptr,$i,8),@ri[2]
- mov 56($tptr,$i,8),@ri[3]
- sbb 40($nptr,$i,8),@ri[1]
- lea 4($i),$i # i++
- dec $j # doesn't affect CF!
- jnz .Lsqr4x_sub
-
- mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
- mov 32($tptr,$i,8),@ri[0] # load overflow bit
- sbb 16($nptr,$i,8),@ri[2]
- mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
- sbb 24($nptr,$i,8),@ri[3]
- mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
-
- sbb \$0,@ri[0] # handle upmost overflow bit
- mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
- xor $i,$i # i=0
- and @ri[0],$tptr
- not @ri[0]
- mov $rptr,$nptr
- and @ri[0],$nptr
- lea -1($num),$j
- or $nptr,$tptr # tp=borrow?tp:rp
+ lea 8($bp),$bptr
+ mov ($bp),%rdx # b[0], $bp==%rdx actually
+ lea 64+32(%rsp),$tptr
+ mov %rdx,$bi
+
+ mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
+ mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
+ add %rax,%r11
+ mov $bptr,8(%rsp) # off-load &b[i]
+ mulx 2*8($aptr),%r12,%r13 # ...
+ adc %r14,%r12
+ adc \$0,%r13
+
+ mov $mi,$bptr # borrow $bptr
+ imulq 24(%rsp),$mi # "t[0]"*n0
+ xor $zero,$zero # cf=0, of=0
+
+ mulx 3*8($aptr),%rax,%r14
+ mov $mi,%rdx
+ lea 4*8($aptr),$aptr
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+
+ mulx 0*8($nptr),%rax,%r10
+ adcx %rax,$bptr # discarded
+ adox %r11,%r10
+ mulx 1*8($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
+ mov 48(%rsp),$bptr # counter value
+ mov %r10,-4*8($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-3*8($tptr)
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ lea 4*8($nptr),$nptr
+ mov %r12,-2*8($tptr)
+
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
+ adcx %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
+ adcx %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+
+ adox %r15,%r10
+ mulx 0*8($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*8($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*8($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ mov %r11,-4*8($tptr)
+ adox %r15,%r13
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*8($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_1st
+
+ mov 0(%rsp),$num # load num
+ mov 8(%rsp),$bptr # re-load &b[i]
+ adc $zero,%r15 # modulo-scheduled
+ add %r15,%r14
+ sbb %r15,%r15 # top-most carry
+ mov %r14,-1*8($tptr)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ mov ($bptr),%rdx # b[i]
+ lea 8($bptr),$bptr # b++
+ sub $num,$aptr # rewind $aptr
+ mov %r15,($tptr) # save top-most carry
+ lea 64+4*8(%rsp),$tptr
+ sub $num,$nptr # rewind $nptr
+
+ mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
+ xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
+ mov %rdx,$bi
+ mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
+ adox -4*8($tptr),$mi
+ adcx %r14,%r11
+ mulx 2*8($aptr),%r15,%r13 # ...
+ adox -3*8($tptr),%r11
+ adcx %r15,%r12
+ adox $zero,%r12
+ adcx $zero,%r13
+
+ mov $bptr,8(%rsp) # off-load &b[i]
+ .byte 0x67
+ mov $mi,%r15
+ imulq 24(%rsp),$mi # "t[0]"*n0
+ xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
+
+ mulx 3*8($aptr),%rax,%r14
+ mov $mi,%rdx
+ adox -2*8($tptr),%r12
+ adcx %rax,%r13
+ adox -1*8($tptr),%r13
+ adcx $zero,%r14
+ lea 4*8($aptr),$aptr
+ adox $zero,%r14
+
+ mulx 0*8($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*8($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*8($nptr),%rax,%r12
+ mov %r10,-4*8($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-3*8($tptr)
+ lea 4*8($nptr),$nptr
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ mov 48(%rsp),$bptr # counter value
+ mov %r12,-2*8($tptr)
+
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ adox %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
+ adcx 0*8($tptr),%r10
+ adox %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx 1*8($tptr),%r11
+ adox %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ mov $mi,%rdx
+ adcx 2*8($tptr),%r12
+ adox %rax,%r13
+ adcx 3*8($tptr),%r13
+ adox $zero,%r14 # of=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+ adcx $zero,%r14 # cf=0
+
+ adox %r15,%r10
+ mulx 0*8($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*8($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*8($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ adox %r15,%r13
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-4*8($tptr)
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*8($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_inner
+
+ mov 0(%rsp),$num # load num
+ mov 8(%rsp),$bptr # re-load &b[i]
+ adc $zero,%r15 # modulo-scheduled
+ sub 0*8($tptr),$zero # pull top-most carry
+ adc %r15,%r14
+ mov -8($nptr),$mi
+ sbb %r15,%r15 # top-most carry
+ mov %r14,-1*8($tptr)
+
+ cmp 16(%rsp),$bptr
+ jne .Lmulx4x_outer
+
+ sub %r14,$mi # compare top-most words
+ sbb $mi,$mi
+ or $mi,%r15
+
+ neg $num
+ xor %rdx,%rdx
+ mov 32(%rsp),$rptr # restore rp
+ lea 64(%rsp),$tptr
pxor %xmm0,%xmm0
- lea 64(%rsp,$num,8),$nptr
- movdqu ($tptr),%xmm1
- lea ($nptr,$num,8),$nptr
- movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
- movdqa %xmm0,($nptr) # zap upper half of temporary vector
- movdqu %xmm1,($rptr)
- jmp .Lsqr4x_copy
-.align 16
-.Lsqr4x_copy: # copy or in-place refresh
- movdqu 16($tptr,$i),%xmm2
- movdqu 32($tptr,$i),%xmm1
- movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
- movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
- movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
- movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
- movdqu %xmm2,16($rptr,$i)
- movdqu %xmm1,32($rptr,$i)
- lea 32($i),$i
- dec $j
- jnz .Lsqr4x_copy
-
- movdqu 16($tptr,$i),%xmm2
- movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
- movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
- movdqu %xmm2,16($rptr,$i)
-___
-}
-$code.=<<___;
- mov 56(%rsp),%rsi # restore %rsp
+ mov 0*8($nptr,$num),%r8
+ mov 1*8($nptr,$num),%r9
+ neg %r8
+ jmp .Lmulx4x_sub_entry
+
+.align 32
+.Lmulx4x_sub:
+ mov 0*8($nptr,$num),%r8
+ mov 1*8($nptr,$num),%r9
+ not %r8
+.Lmulx4x_sub_entry:
+ mov 2*8($nptr,$num),%r10
+ not %r9
+ and %r15,%r8
+ mov 3*8($nptr,$num),%r11
+ not %r10
+ and %r15,%r9
+ not %r11
+ and %r15,%r10
+ and %r15,%r11
+
+ neg %rdx # mov %rdx,%cf
+ adc 0*8($tptr),%r8
+ adc 1*8($tptr),%r9
+ movdqa %xmm0,($tptr)
+ adc 2*8($tptr),%r10
+ adc 3*8($tptr),%r11
+ movdqa %xmm0,16($tptr)
+ lea 4*8($tptr),$tptr
+ sbb %rdx,%rdx # mov %cf,%rdx
+
+ mov %r8,0*8($rptr)
+ mov %r9,1*8($rptr)
+ mov %r10,2*8($rptr)
+ mov %r11,3*8($rptr)
+ lea 4*8($rptr),$rptr
+
+ add \$32,$num
+ jnz .Lmulx4x_sub
+
+ mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
-.Lsqr4x_epilogue:
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmulx4x_epilogue:
ret
-.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.size bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
@@ -1581,18 +1285,22 @@ sqr_handler:
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lsqr4x_body(%rip),%r10
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lsqr_body
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
- lea .Lsqr4x_epilogue(%rip),%r10
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jae .Lcommon_seh_tail
- mov 56(%rax),%rax # pull saved stack pointer
- lea 48(%rax),%rax
+ mov 40(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
@@ -1657,10 +1365,16 @@ sqr_handler:
.rva .LSEH_end_bn_mul4x_mont
.rva .LSEH_info_bn_mul4x_mont
- .rva .LSEH_begin_bn_sqr4x_mont
- .rva .LSEH_end_bn_sqr4x_mont
- .rva .LSEH_info_bn_sqr4x_mont
-
+ .rva .LSEH_begin_bn_sqr8x_mont
+ .rva .LSEH_end_bn_sqr8x_mont
+ .rva .LSEH_info_bn_sqr8x_mont
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont
+ .rva .LSEH_end_bn_mulx4x_mont
+ .rva .LSEH_info_bn_mulx4x_mont
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
@@ -1671,9 +1385,16 @@ sqr_handler:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
-.LSEH_info_bn_sqr4x_mont:
+.LSEH_info_bn_sqr8x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
+ .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+___
+$code.=<<___ if ($addx);
+.LSEH_info_bn_mulx4x_mont:
.byte 9,0,0,0
.rva sqr_handler
+ .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
___
}
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index dae0fe245318..820de3d6f627 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -17,6 +17,13 @@
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.
+# August 2013.
+#
+# Add MULX/AD*X code paths and additional interfaces to optimize for
+# the branch prediction unit. For input lengths that are multiples of 8
+# the np argument is not just the modulus value, but one interleaved
+# with 0. This is to optimize the post-condition...
+
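+# A hypothetical helper showing that layout, which the "interleaved
+# with 0, therefore 16*n" addressing further down walks over:
+#
+#	/* illustrative only: callers with num % 8 == 0 pass the
+#	   modulus spread out as {np[0], 0, np[1], 0, ...} */
+#	static void interleave_np(BN_ULONG *np_x, const BN_ULONG *np, int num)
+#	{
+#	    int i;
+#	    for (i = 0; i < num; i++) {
+#	        np_x[2*i]   = np[i];
+#	        np_x[2*i+1] = 0;
+#	    }
+#	}
+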
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -31,6 +38,21 @@ die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
# int bn_mul_mont_gather5(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
@@ -53,19 +75,25 @@ $m1="%rbp";
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
+
.globl bn_mul_mont_gather5
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
- test \$3,${num}d
+ test \$7,${num}d
jnz .Lmul_enter
- cmp \$8,${num}d
- jb .Lmul_enter
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+___
+$code.=<<___;
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
mov ${num}d,${num}d
+ mov %rsp,%rax
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
push %rbp
@@ -78,10 +106,8 @@ $code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
-.Lmul_alloca:
___
$code.=<<___;
- mov %rsp,%rax
lea 2($num),%r11
neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
@@ -287,7 +313,7 @@ $code.=<<___;
lea 1($i),$i # i++
cmp $num,$i
- jl .Louter
+ jb .Louter
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
@@ -323,18 +349,17 @@ $code.=<<___;
mov \$1,%rax
___
$code.=<<___ if ($win64);
- movaps (%rsi),%xmm6
- movaps 0x10(%rsi),%xmm7
- lea 0x28(%rsi),%rsi
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
@@ -344,11 +369,18 @@ my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
-.align 16
+.align 32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
- mov ${num}d,${num}d
- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+___
+$code.=<<___ if ($addx);
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lmulx4x_enter
+___
+$code.=<<___;
+ .byte 0x67
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -360,23 +392,78 @@ $code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
-.Lmul4x_alloca:
___
$code.=<<___;
- mov %rsp,%rax
- lea 4($num),%r11
- neg %r11
- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
- and \$-1024,%rsp # minimize TLB usage
-
- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d
+ shl \$3+2,%r10d # 4*$num
+ neg $num # -$num
+
+ ##############################################################
+	# ensure that the stack frame doesn't alias with $aptr+4*$num
+	# modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). This is done to allow the memory disambiguation
+	# logic to do its magic. [An excessive frame is allocated in
+	# order to allow bn_from_mont8x to clear it.]
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $ap,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lmul4xsp_alt
+ sub %r11,%rsp # align with $ap
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ jmp .Lmul4xsp_done
+
+.align 32
+.Lmul4xsp_alt:
+ lea 4096-64(,$num,2),%r10
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lmul4xsp_done:
+ and \$-64,%rsp
+ neg $num
+
+ mov %rax,40(%rsp)
.Lmul4x_body:
- mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
- mov %rdx,%r12 # reassign $bp
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type mul4x_internal,\@abi-omnipotent
+.align 32
+mul4x_internal:
+ shl \$5,$num
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ lea 256(%rdx,$num),%r13
+ shr \$5,$num # restore $num
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
+ $tp=$i;
$code.=<<___;
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
@@ -384,459 +471,2776 @@ $code.=<<___;
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ add \$7,%r11
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7
+ and \$7,%r11
movq `0*$STRIDE/4-96`($bp),%xmm0
+ lea $STRIDE($bp),$tp # borrow $tp
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
+ .byte 0x67
por %xmm1,%xmm0
+ movq `0*$STRIDE/4-96`($tp),%xmm1
+ .byte 0x67
pand %xmm7,%xmm3
+ .byte 0x67
por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
+ movq `1*$STRIDE/4-96`($tp),%xmm2
+ .byte 0x67
+ pand %xmm4,%xmm1
+ .byte 0x67
por %xmm3,%xmm0
+ movq `2*$STRIDE/4-96`($tp),%xmm3
movq %xmm0,$m0 # m0=bp[0]
+ movq `3*$STRIDE/4-96`($tp),%xmm0
+ mov %r13,16+8(%rsp) # save end of b[num]
+ mov $rp, 56+8(%rsp) # save $rp
+
mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax
-
- xor $i,$i # i=0
- xor $j,$j # j=0
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
+ lea ($ap,$num),$ap # end of a[num]
+ neg $num
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ pand %xmm5,%xmm2
+ pand %xmm6,%xmm3
+ por %xmm2,%xmm1
imulq $A[0],$m1 # "tp[0]"*n0
+ ##############################################################
+	# $tp is chosen so that writing to the top-most element of the
+	# vector occurs just "above" references to the powers table,
+	# "above" modulo cache-line size, which effectively precludes
+	# the possibility of a memory disambiguation failure when
+	# accessing the table.
+ #
+ lea 64+8(%rsp,%r11,8),$tp
mov %rdx,$A[1]
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ lea 2*$STRIDE($bp),$bp
+ por %xmm1,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
- mov 8($ap),%rax
+ mov 8($ap,$num),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0
add %rax,$A[1]
- mov 8($np),%rax
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1
add %rax,$N[1]
- mov 16($ap),%rax
+ mov 16($ap,$num),%rax
adc \$0,%rdx
add $A[1],$N[1]
- lea 4($j),$j # j++
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
adc \$0,%rdx
- mov $N[1],(%rsp)
+ mov $N[1],($tp)
mov %rdx,$N[0]
jmp .L1st4x
-.align 16
+
+.align 32
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov -16*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap,$j,8),%rax
+ mov ($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov ($np,$j,8),%rax
+ mov 16*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov 8($ap,$j,8),%rax
+ mov 8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov 8($np,$j,8),%rax
+ mov 16*1($np),%rax
adc \$0,%rdx
- lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov -16($ap,$j,8),%rax
+ mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ lea 16*4($np),$np
adc \$0,%rdx
- mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
- cmp $num,$j
- jl .L1st4x
+
+ add \$32,$j # j+=4
+ jnz .L1st4x
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov -16*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap),%rax # ap[0]
+ mov ($ap,$num),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
movq %xmm0,$m0 # bp[1]
+ lea ($np,$num,2),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
- mov $N[0],-8(%rsp,$j,8)
- mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+ mov $N[0],-8($tp)
- lea 1($i),$i # i++
-.align 4
-.Louter4x:
- xor $j,$j # j=0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
+ jmp .Louter4x
- mov (%rsp),$A[0]
+.align 32
+.Louter4x:
+ mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
imulq $A[0],$m1 # tp[0]*n0
+ .byte 0x67
mov %rdx,$A[1]
+ mov $N[1],($tp) # store upmost overflow bit
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
por %xmm2,%xmm0
+ lea ($tp,$num),$tp # rewind $tp
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
- mov 8($ap),%rax
+ mov 8($ap,$num),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 8($np),%rax
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
adc \$0,%rdx
- add 8(%rsp),$A[1] # +tp[1]
+ add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov 16($ap),%rax
+ mov 16($ap,$num),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
- lea 4($j),$j # j+=2
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
-.align 16
+
+.align 32
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
adc \$0,%rdx
- add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
- mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-32($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov -16*1($np),%rax
adc \$0,%rdx
- add -8(%rsp,$j,8),$A[1]
+ add -8($tp),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap,$j,8),%rax
+ mov ($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov ($np,$j,8),%rax
+ mov 16*0($np),%rax
adc \$0,%rdx
- add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov 8($ap,$j,8),%rax
+ mov 8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 8($np,$j,8),%rax
+ mov 16*1($np),%rax
adc \$0,%rdx
- add 8(%rsp,$j,8),$A[1]
+ add 8($tp),$A[1]
adc \$0,%rdx
- lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov -16($ap,$j,8),%rax
+ mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
+ lea 16*4($np),$np
adc \$0,%rdx
- mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
- cmp $num,$j
- jl .Linner4x
+
+ add \$32,$j # j+=4
+ jnz .Linner4x
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
adc \$0,%rdx
- add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
- mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-32($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov $m1,%rax
+ mov -16*1($np),$m1
adc \$0,%rdx
- add -8(%rsp,$j,8),$A[1]
+ add -8($tp),$A[1]
adc \$0,%rdx
- lea 1($i),$i # i++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap),%rax # ap[0]
+ mov ($ap,$num),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
movq %xmm0,$m0 # bp[i+1]
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
+ lea ($np,$num,2),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
- add (%rsp,$num,8),$N[0] # pull upmost overflow bit
- adc \$0,$N[1]
- mov $N[0],-8(%rsp,$j,8)
- mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+ add ($tp),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1] # upmost overflow bit
+ mov $N[0],-8($tp)
- cmp $num,$i
- jl .Louter4x
+ cmp 16+8(%rsp),$bp
+ jb .Louter4x
___
-{
-my @ri=("%rax","%rdx",$m0,$m1);
+if (1) {
$code.=<<___;
- mov 16(%rsp,$num,8),$rp # restore $rp
- mov 0(%rsp),@ri[0] # tp[0]
- pxor %xmm0,%xmm0
- mov 8(%rsp),@ri[1] # tp[1]
- shr \$2,$num # num/=4
- lea (%rsp),$ap # borrow ap for tp
- xor $i,$i # i=0 and clear CF!
-
- sub 0($np),@ri[0]
- mov 16($ap),@ri[2] # tp[2]
- mov 24($ap),@ri[3] # tp[3]
- sbb 8($np),@ri[1]
- lea -1($num),$j # j=num/4-1
+ sub $N[0],$m1 # compare top-most words
+ adc $j,$j # $j is zero
+ or $j,$N[1]
+ xor \$1,$N[1]
+ lea ($tp,$num),%rbx # tptr in .sqr4x_sub
+ lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov %r9,%rcx
+ sar \$3+2,%rcx # cf=0
+ mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
+ jmp .Lsqr4x_sub
+___
+} else {
+my @ri=("%rax",$bp,$m0,$m1);
+my $rp="%rdx";
+$code.=<<___
+ xor \$1,$N[1]
+ lea ($tp,$num),$tp # rewind $tp
+ sar \$5,$num # cf=0
+ lea ($np,$N[1],8),$np
+ mov 56+8(%rsp),$rp # restore $rp
jmp .Lsub4x
-.align 16
+
+.align 32
.Lsub4x:
- mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
- sbb 16($np,$i,8),@ri[2]
- mov 32($ap,$i,8),@ri[0] # tp[i+1]
- mov 40($ap,$i,8),@ri[1]
- sbb 24($np,$i,8),@ri[3]
- mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
- sbb 32($np,$i,8),@ri[0]
- mov 48($ap,$i,8),@ri[2]
- mov 56($ap,$i,8),@ri[3]
- sbb 40($np,$i,8),@ri[1]
- lea 4($i),$i # i++
-	dec	$j			# doesn't affect CF!
+ .byte 0x66
+ mov 8*0($tp),@ri[0]
+ mov 8*1($tp),@ri[1]
+ .byte 0x66
+ sbb 16*0($np),@ri[0]
+ mov 8*2($tp),@ri[2]
+ sbb 16*1($np),@ri[1]
+ mov 3*8($tp),@ri[3]
+ lea 4*8($tp),$tp
+ sbb 16*2($np),@ri[2]
+ mov @ri[0],8*0($rp)
+ sbb 16*3($np),@ri[3]
+ lea 16*4($np),$np
+ mov @ri[1],8*1($rp)
+ mov @ri[2],8*2($rp)
+ mov @ri[3],8*3($rp)
+ lea 8*4($rp),$rp
+
+ inc $num
jnz .Lsub4x
- mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov 32($ap,$i,8),@ri[0] # load overflow bit
- sbb 16($np,$i,8),@ri[2]
- mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
- sbb 24($np,$i,8),@ri[3]
- mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ ret
+___
+}
+$code.=<<___;
+.size mul4x_internal,.-mul4x_internal
+___
+}}}
+ {{{
+######################################################################
+# void bn_power5(
+my $rptr="%rdi"; # BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # const void *table,
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 8
+ # int pwr
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
- sbb \$0,@ri[0] # handle upmost overflow bit
- mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
- xor $i,$i # i=0
- and @ri[0],$ap
- not @ri[0]
- mov $rp,$np
- and @ri[0],$np
- lea -1($num),$j
- or $np,$ap # ap=borrow?tp:rp
+$code.=<<___;
+.globl bn_power5
+.type bn_power5,\@function,6
+.align 32
+bn_power5:
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lpowerx5_enter
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+	# logic to do its magic.
+ #
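+	# A loose C model of the pointer dance below (all names are
+	# illustrative, not taken from bn_exp.c; num is in bytes here):
+	#
+	#	size_t frame = 64 + 2*num;		/* t[2*num] + slack  */
+	#	size_t dist  = ((sp - frame) - ap) & 4095;
+	#	if (dist <= 4*num) {			/* window overlaps:  */
+	#		sp -= dist + frame;		/* match ap's offset */
+	#	} else {
+	#		sp -= frame;
+	#		if (dist > 4096 - frame)
+	#			sp -= dist - (4096 - frame);
+	#	}
+	#	sp &= -64;				/* 64-byte alignment */
+	#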
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lpwr_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lpwr_sp_done
+
+.align 32
+.Lpwr_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lpwr_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lpower5_body:
+ movq $rptr,%xmm1 # save $rptr
+ movq $nptr,%xmm2 # save $nptr
+ movq %r10, %xmm3 # -$num
+ movq $bptr,%xmm4
+
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+
+ movq %xmm2,$nptr
+ movq %xmm4,$bptr
+ mov $aptr,$rptr
+ mov 40(%rsp),%rax
+ lea 32(%rsp),$n0
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lpower5_epilogue:
+ ret
+.size bn_power5,.-bn_power5
+
+.globl bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.type bn_sqr8x_internal,\@abi-omnipotent
+.align 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ ##############################################################
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[2]a[1]
+ # a[4]a[0]
+ # a[3]a[1]
+ # a[5]a[0]
+ # a[4]a[1]
+ # a[3]a[2]
+ # a[6]a[0]
+ # a[5]a[1]
+ # a[4]a[2]
+ # a[7]a[0]
+ # a[6]a[1]
+ # a[5]a[2]
+ # a[4]a[3]
+ # a[7]a[1]
+ # a[6]a[2]
+ # a[5]a[3]
+ # a[7]a[2]
+ # a[6]a[3]
+ # a[5]a[4]
+ # a[7]a[3]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[2]a[1]
+ # a[3]a[1]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[3]a[2]
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3]
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ # a[5]a[4]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[0]a[0]
+ # a[1]a[1]
+ # a[2]a[2]
+ # a[3]a[3]
+ # a[4]a[4]
+ # a[5]a[5]
+ # a[6]a[6]
+ # a[7]a[7]
+
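+	# For reference, a compact C model of the whole square (a sketch
+	# only; the asm below interleaves steps a) and b) and is 4x
+	# unrolled; stdint.h/string.h and a 64-bit compiler assumed):
+	#
+	#	void sqr_ref(uint64_t t[], const uint64_t a[], int n)
+	#	{
+	#		unsigned __int128 w;
+	#		memset(t, 0, 2*n*sizeof(t[0]));
+	#		for (int i = 0; i < n; i++) {	/* a) cross products */
+	#			w = 0;
+	#			for (int j = i+1; j < n; j++) {
+	#				w += (unsigned __int128)a[i]*a[j] + t[i+j];
+	#				t[i+j] = (uint64_t)w;	w >>= 64;
+	#			}
+	#			t[i+n] = (uint64_t)w;
+	#		}
+	#		for (int i = 2*n-1; i > 0; i--)	/* b) double ...     */
+	#			t[i] = (t[i]<<1) | (t[i-1]>>63);
+	#		t[0] <<= 1;
+	#		w = 0;
+	#		for (int i = 0; i < n; i++) {	/* ... add a[i]*a[i] */
+	#			unsigned __int128 s = (unsigned __int128)a[i]*a[i];
+	#			w += (unsigned __int128)t[2*i] + (uint64_t)s;
+	#			t[2*i] = (uint64_t)w;	w >>= 64;
+	#			w += (unsigned __int128)t[2*i+1] + (uint64_t)(s>>64);
+	#			t[2*i+1] = (uint64_t)w;	w >>= 64;
+	#		}
+	#	}
+	#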
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ mov $A0[1],-16($tptr,$i) # t[2]
+ mov %rdx,$A0[0]
+
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ lea ($i),$j
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 32
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov $A0[1],($tptr,$j) # t[4]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov 16($aptr,$j),$ai # a[6]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1] # a[5]*a[3]+t[6]
+ mov $ai,%rax
+ mov $A0[0],8($tptr,$j) # t[5]
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[6]*a[2]
+ add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
+ mov $ai,%rax # a[3]
+ mov 24($aptr,$j),$ai # a[7]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[6]*a[5]
+ add %rax,$A1[0] # a[6]*a[5]+t[7]
+ mov $ai,%rax
+ mov $A0[1],16($tptr,$j) # t[6]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ lea 32($j),$j
+
+ mul $a0 # a[7]*a[4]
+ add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ mul $a1 # a[7]*a[5]
+ add %rax,$A1[1]
+ lea 16($i),$i
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[8]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 32
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov -24($tptr,$i),$A0[0] # t[1]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc \$0,%rdx
+ mov $A0[0],-24($tptr,$i) # t[1]
+ mov %rdx,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ xor $A1[0],$A1[0]
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -8($tptr,$i),$A1[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$i) # t[3]
+
+ lea ($i),$j
+ jmp .Lsqr4x_inner
+
+.align 32
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+
+ .byte 0x67
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $A0[1],($tptr,$j) # t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ add 8($tptr,$j),$A1[0]
+ lea 16($j),$j # j++
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ .byte 0x67
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ mov $A0[0],-24($tptr) # t[1]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ mov -8($aptr),$ai # a[3]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ mov $A0[1],-16($tptr) # t[2]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[4]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 48+8(%rsp),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov 8($tptr),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ lea 64($tptr),$tptr
+ jmp .Lsqr4x_shift_n_add
+
+.align 32
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ lea 64($tptr),$tptr
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ .byte 0x67
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}
+######################################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+# This new path is inspired by multiple submissions from Intel, by
+# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
+# Vinodh Gopal...
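+#
+# A plain C model of one reduction pass (reference only; the code
+# below works on the modulus in 8-word windows, and the n[] words are
+# stored interleaved with zeros, hence the 16-byte strides):
+#
+#	void redc_ref(uint64_t t[], const uint64_t n[], uint64_t n0, int num)
+#	{
+#		uint64_t carry = 0;
+#		for (int i = 0; i < num; i++) {
+#			uint64_t m = t[i] * n0;		/* mod 2^64 */
+#			unsigned __int128 w = 0;
+#			for (int j = 0; j < num; j++) {
+#				w += (unsigned __int128)m * n[j] + t[i+j];
+#				t[i+j] = (uint64_t)w;	/* t[i] becomes 0 */
+#				w >>= 64;
+#			}
+#			w += (unsigned __int128)t[i+num] + carry;
+#			t[i+num] = (uint64_t)w;
+#			carry = (uint64_t)(w >> 64);
+#		}
+#		/* t[num..2*num-1] (plus carry) is the result; one
+#		   conditional subtraction of n[] finishes the job */
+#	}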
+{
+my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
- movdqu ($ap),%xmm1
- movdqa %xmm0,(%rsp)
- movdqu %xmm1,($rp)
- jmp .Lcopy4x
-.align 16
-.Lcopy4x: # copy or in-place refresh
- movdqu 16($ap,$i),%xmm2
- movdqu 32($ap,$i),%xmm1
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
- movdqa %xmm0,32(%rsp,$i)
- movdqu %xmm1,32($rp,$i)
- lea 32($i),$i
- dec $j
- jnz .Lcopy4x
-
- shl \$2,$num
- movdqu 16($ap,$i),%xmm2
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
+$code.=<<___;
+ movq %xmm2,$nptr
+sqr8x_reduction:
+ xor %rax,%rax
+ lea ($nptr,$num,2),%rcx # end of n[]
+ lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
+ mov %rcx,0+8(%rsp)
+ lea 48+8(%rsp,$num),$tptr # end of initial t[] window
+ mov %rdx,8+8(%rsp)
+ neg $num
+ jmp .L8x_reduction_loop
+
+.align 32
+.L8x_reduction_loop:
+ lea ($tptr,$num),$tptr # start of current t[] window
+ .byte 0x66
+ mov 8*0($tptr),$m0
+ mov 8*1($tptr),%r9
+ mov 8*2($tptr),%r10
+ mov 8*3($tptr),%r11
+ mov 8*4($tptr),%r12
+ mov 8*5($tptr),%r13
+ mov 8*6($tptr),%r14
+ mov 8*7($tptr),%r15
+ mov %rax,(%rdx) # store top-most carry bit
+ lea 8*8($tptr),$tptr
+
+ .byte 0x67
+ mov $m0,%r8
+ imulq 32+8(%rsp),$m0 # n0*a[0]
+ mov 16*0($nptr),%rax # n[0]
+ mov \$8,%ecx
+ jmp .L8x_reduce
+
+.align 32
+.L8x_reduce:
+ mulq $m0
+ mov 16*1($nptr),%rax # n[1]
+ neg %r8
+ mov %rdx,%r8
+ adc \$0,%r8
+
+ mulq $m0
+ add %rax,%r9
+ mov 16*2($nptr),%rax
+ adc \$0,%rdx
+ add %r9,%r8
+ mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
+ mov %rdx,%r9
+ adc \$0,%r9
+
+ mulq $m0
+ add %rax,%r10
+ mov 16*3($nptr),%rax
+ adc \$0,%rdx
+ add %r10,%r9
+ mov 32+8(%rsp),$carry # pull n0, borrow $carry
+ mov %rdx,%r10
+ adc \$0,%r10
+
+ mulq $m0
+ add %rax,%r11
+ mov 16*4($nptr),%rax
+ adc \$0,%rdx
+ imulq %r8,$carry # modulo-scheduled
+ add %r11,%r10
+ mov %rdx,%r11
+ adc \$0,%r11
+
+ mulq $m0
+ add %rax,%r12
+ mov 16*5($nptr),%rax
+ adc \$0,%rdx
+ add %r12,%r11
+ mov %rdx,%r12
+ adc \$0,%r12
+
+ mulq $m0
+ add %rax,%r13
+ mov 16*6($nptr),%rax
+ adc \$0,%rdx
+ add %r13,%r12
+ mov %rdx,%r13
+ adc \$0,%r13
+
+ mulq $m0
+ add %rax,%r14
+ mov 16*7($nptr),%rax
+ adc \$0,%rdx
+ add %r14,%r13
+ mov %rdx,%r14
+ adc \$0,%r14
+
+ mulq $m0
+ mov $carry,$m0 # n0*a[i]
+ add %rax,%r15
+ mov 16*0($nptr),%rax # n[0]
+ adc \$0,%rdx
+ add %r15,%r14
+ mov %rdx,%r15
+ adc \$0,%r15
+
+ dec %ecx
+ jnz .L8x_reduce
+
+ lea 16*8($nptr),$nptr
+ xor %rax,%rax
+ mov 8+8(%rsp),%rdx # pull end of t[]
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .L8x_no_tail
+
+ .byte 0x66
+ add 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ sbb $carry,$carry # top carry
+
+ mov 48+56+8(%rsp),$m0 # pull n0*a[0]
+ mov \$8,%ecx
+ mov 16*0($nptr),%rax
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail:
+ mulq $m0
+ add %rax,%r8
+ mov 16*1($nptr),%rax
+ mov %r8,($tptr) # save result
+ mov %rdx,%r8
+ adc \$0,%r8
+
+ mulq $m0
+ add %rax,%r9
+ mov 16*2($nptr),%rax
+ adc \$0,%rdx
+ add %r9,%r8
+ lea 8($tptr),$tptr # $tptr++
+ mov %rdx,%r9
+ adc \$0,%r9
+
+ mulq $m0
+ add %rax,%r10
+ mov 16*3($nptr),%rax
+ adc \$0,%rdx
+ add %r10,%r9
+ mov %rdx,%r10
+ adc \$0,%r10
+
+ mulq $m0
+ add %rax,%r11
+ mov 16*4($nptr),%rax
+ adc \$0,%rdx
+ add %r11,%r10
+ mov %rdx,%r11
+ adc \$0,%r11
+
+ mulq $m0
+ add %rax,%r12
+ mov 16*5($nptr),%rax
+ adc \$0,%rdx
+ add %r12,%r11
+ mov %rdx,%r12
+ adc \$0,%r12
+
+ mulq $m0
+ add %rax,%r13
+ mov 16*6($nptr),%rax
+ adc \$0,%rdx
+ add %r13,%r12
+ mov %rdx,%r13
+ adc \$0,%r13
+
+ mulq $m0
+ add %rax,%r14
+ mov 16*7($nptr),%rax
+ adc \$0,%rdx
+ add %r14,%r13
+ mov %rdx,%r14
+ adc \$0,%r14
+
+ mulq $m0
+ mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
+ add %rax,%r15
+ adc \$0,%rdx
+ add %r15,%r14
+ mov 16*0($nptr),%rax # pull n[0]
+ mov %rdx,%r15
+ adc \$0,%r15
+
+ dec %ecx
+ jnz .L8x_tail
+
+ lea 16*8($nptr),$nptr
+ mov 8+8(%rsp),%rdx # pull end of t[]
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .L8x_tail_done # break out of loop
+
+ mov 48+56+8(%rsp),$m0 # pull n0*a[0]
+ neg $carry
+ mov 8*0($nptr),%rax # pull n[0]
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ sbb $carry,$carry # top carry
+
+ mov \$8,%ecx
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail_done:
+ add (%rdx),%r8 # can this overflow?
+ xor %rax,%rax
+
+ neg $carry
+.L8x_no_tail:
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ adc \$0,%rax # top-most carry
+ mov -16($nptr),%rcx # np[num-1]
+ xor $carry,$carry
+
+ movq %xmm2,$nptr # restore $nptr
+
+ mov %r8,8*0($tptr) # store top 512 bits
+ mov %r9,8*1($tptr)
+ movq %xmm3,$num # $num is %r9, can't be moved upwards
+ mov %r10,8*2($tptr)
+ mov %r11,8*3($tptr)
+ mov %r12,8*4($tptr)
+ mov %r13,8*5($tptr)
+ mov %r14,8*6($tptr)
+ mov %r15,8*7($tptr)
+ lea 8*8($tptr),$tptr
+
+ cmp %rdx,$tptr # end of t[]?
+ jb .L8x_reduction_loop
+___
+}
+##############################################################
+# Post-condition, 4x unrolled
+#
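+# Since n[] is stored interleaved with zero words (n[0],0,n[1],0,...),
+# the conditional subtraction below is branch-free: when no subtraction
+# is needed, $nptr is simply nudged 8 bytes up so the sbb chain
+# subtracts the zero words instead.  Roughly, in C (names made up):
+#
+#	const uint64_t *src = n2x + need_copy;	/* need_copy is 0 or 1 */
+#	uint64_t borrow = 0;
+#	for (int i = 0; i < num; i++) {
+#		uint64_t s = src[2*i], d = t[i] - s - borrow;
+#		borrow = (t[i] < s) | ((t[i] - s) < borrow);
+#		r[i] = d;
+#	}
+#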
+{
+my ($tptr,$nptr)=("%rbx","%rbp");
+$code.=<<___;
+ #xor %rsi,%rsi # %rsi was $carry above
+ sub %r15,%rcx # compare top-most words
+ lea (%rdi,$num),$tptr # %rdi was $tptr above
+ adc %rsi,%rsi
+ mov $num,%rcx
+ or %rsi,%rax
+ movq %xmm1,$rptr # restore $rptr
+ xor \$1,%rax
+ movq %xmm1,$aptr # prepare for back-to-back call
+ lea ($nptr,%rax,8),$nptr
+ sar \$3+2,%rcx # cf=0
+ jmp .Lsqr4x_sub
+
+.align 32
+.Lsqr4x_sub:
+ .byte 0x66
+ mov 8*0($tptr),%r12
+ mov 8*1($tptr),%r13
+ sbb 16*0($nptr),%r12
+ mov 8*2($tptr),%r14
+ sbb 16*1($nptr),%r13
+ mov 8*3($tptr),%r15
+ lea 8*4($tptr),$tptr
+ sbb 16*2($nptr),%r14
+ mov %r12,8*0($rptr)
+ sbb 16*3($nptr),%r15
+ lea 16*4($nptr),$nptr
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+
+ inc %rcx # pass %cf
+ jnz .Lsqr4x_sub
___
}
$code.=<<___;
- mov 8(%rsp,$num,8),%rsi # restore %rsp
- mov \$1,%rax
+ mov $num,%r10 # prepare for back-to-back call
+ neg $num # restore $num
+ ret
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+___
+{
+$code.=<<___;
+.globl bn_from_montgomery
+.type bn_from_montgomery,\@abi-omnipotent
+.align 32
+bn_from_montgomery:
+ testl \$7,`($win64?"48(%rsp)":"%r9d")`
+ jz bn_from_mont8x
+ xor %eax,%eax
+ ret
+.size bn_from_montgomery,.-bn_from_montgomery
+
+.type bn_from_mont8x,\@function,6
+.align 32
+bn_from_mont8x:
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
___
$code.=<<___ if ($win64);
- movaps (%rsi),%xmm6
- movaps 0x10(%rsi),%xmm7
- lea 0x28(%rsi),%rsi
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
-.Lmul4x_epilogue:
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+	# logic to do its magic.
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lfrom_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lfrom_sp_done
+
+.align 32
+.Lfrom_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lfrom_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lfrom_body:
+ mov $num,%r11
+ lea 48(%rsp),%rax
+ pxor %xmm0,%xmm0
+ jmp .Lmul_by_1
+
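+	# copy a[] into t[] and clear t[num..2*num-1]: "multiplying" by 1
+	# and then Montgomery-reducing t yields a*R^-1 mod n, i.e. the
+	# conversion out of Montgomery form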
+.align 32
+.Lmul_by_1:
+ movdqu ($aptr),%xmm1
+ movdqu 16($aptr),%xmm2
+ movdqu 32($aptr),%xmm3
+ movdqa %xmm0,(%rax,$num)
+ movdqu 48($aptr),%xmm4
+ movdqa %xmm0,16(%rax,$num)
+ .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
+ movdqa %xmm1,(%rax)
+ movdqa %xmm0,32(%rax,$num)
+ movdqa %xmm2,16(%rax)
+ movdqa %xmm0,48(%rax,$num)
+ movdqa %xmm3,32(%rax)
+ movdqa %xmm4,48(%rax)
+ lea 64(%rax),%rax
+ sub \$64,%r11
+ jnz .Lmul_by_1
+
+ movq $rptr,%xmm1
+ movq $nptr,%xmm2
+ .byte 0x67
+ mov $nptr,%rbp
+ movq %r10, %xmm3 # -num
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ jne .Lfrom_mont_nox
+
+ lea (%rax,$num),$rptr
+ call sqrx8x_reduction
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_nox:
+___
+$code.=<<___;
+ call sqr8x_reduction
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_zero:
+ movdqa %xmm0,16*0(%rax)
+ movdqa %xmm0,16*1(%rax)
+ movdqa %xmm0,16*2(%rax)
+ movdqa %xmm0,16*3(%rax)
+ lea 16*4(%rax),%rax
+ sub \$32,$num
+ jnz .Lfrom_mont_zero
+
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lfrom_epilogue:
ret
-.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.size bn_from_mont8x,.-bn_from_mont8x
___
+}
}}}
+
+if ($addx) {{{
+my $bp="%rdx"; # restore original value
+
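+# This path requires BMI2+ADX parts: mulx yields a full 128-bit product
+# without touching the flags, while adcx and adox update only CF and OF
+# respectively, so two independent carry chains can run interleaved in
+# a single loop.  An intrinsic-level model of one step might look like
+# this (hypothetical variable names, immintrin.h assumed):
+#
+#	unsigned long long lo, hi;
+#	unsigned char cf = 0, of = 0;
+#	lo = _mulx_u64(a[j], bi, &hi);			/* flags untouched */
+#	cf = _addcarryx_u64(cf, lo, t[j],   &t[j]);	/* CF chain */
+#	of = _addcarryx_u64(of, hi, t[j+1], &t[j+1]);	/* OF chain */
+#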
+$code.=<<___;
+.type bn_mulx4x_mont_gather5,\@function,6
+.align 32
+bn_mulx4x_mont_gather5:
+.Lmulx4x_enter:
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num # -$num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers a[num], ret[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+	# logic to do its magic. [excessive frame is allocated in order
+ # to allow bn_from_mont8x to clear it.]
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $ap,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lmulx4xsp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ jmp .Lmulx4xsp_done
+
+.align 32
+.Lmulx4xsp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lmulx4xsp_done:
+ and \$-64,%rsp # ensure alignment
+ ##############################################################
+ # Stack layout
+ # +0 -num
+ # +8 off-loaded &b[i]
+ # +16 end of b[num]
+ # +24 inner counter
+ # +32 saved n0
+ # +40 saved %rsp
+ # +48
+ # +56 saved rp
+ # +64 tmp[num+1]
+ #
+ mov $n0, 32(%rsp) # save *n0
+ mov %rax,40(%rsp) # save original %rsp
+.Lmulx4x_body:
+ call mulx4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmulx4x_epilogue:
+ ret
+.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type mulx4x_internal,\@abi-omnipotent
+.align 32
+mulx4x_internal:
+ .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
+ .byte 0x67
+ neg $num # restore $num
+ shl \$5,$num
+ lea 256($bp,$num),%r13
+ shr \$5+5,$num
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ sub \$1,$num
+ mov %r13,16+8(%rsp) # end of b[num]
+ mov $num,24+8(%rsp) # inner counter
+ mov $rp, 56+8(%rsp) # save $rp
+___
+my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
+ ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+my $rptr=$bptr;
+my $STRIDE=2**5*8; # 5 is "window size"
+my $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ add \$7,%r11
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+ and \$7,%r11
+
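+	# The gather below is cache-timing neutral: all four candidate
+	# lanes are always loaded and then masked, so the memory access
+	# pattern does not depend on which table element is selected.
+	# Roughly, in C (hypothetical names; exactly one of the four
+	# masks from .Lmagic_masks is all-ones, the rest are zero):
+	#
+	#	uint64_t lane(const uint8_t *tbl, const uint64_t mask[4])
+	#	{
+	#		uint64_t v = 0;
+	#		for (int k = 0; k < 4; k++)
+	#			v |= *(const uint64_t *)(tbl + k*STRIDE/4) & mask[k];
+	#		return v;
+	#	}
+	#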
+ movq `0*$STRIDE/4-96`($bptr),%xmm0
+ lea $STRIDE($bptr),$tptr # borrow $tptr
+ movq `1*$STRIDE/4-96`($bptr),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bptr),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bptr),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ movq `0*$STRIDE/4-96`($tptr),%xmm1
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ movq `1*$STRIDE/4-96`($tptr),%xmm2
+ por %xmm3,%xmm0
+ .byte 0x67,0x67
+ pand %xmm4,%xmm1
+ movq `2*$STRIDE/4-96`($tptr),%xmm3
+
+ movq %xmm0,%rdx # bp[0]
+ movq `3*$STRIDE/4-96`($tptr),%xmm0
+ lea 2*$STRIDE($bptr),$bptr # next &b[i]
+ pand %xmm5,%xmm2
+ .byte 0x67,0x67
+ pand %xmm6,%xmm3
+ ##############################################################
+ # $tptr is chosen so that writing to top-most element of the
+ # vector occurs just "above" references to powers table,
+ # "above" modulo cache-line size, which effectively precludes
+	# the possibility of memory disambiguation logic failure when
+ # accessing the table.
+ #
+ lea 64+8*4+8(%rsp,%r11,8),$tptr
+
+ mov %rdx,$bi
+ mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
+ mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
+ add %rax,%r11
+ mulx 2*8($aptr),%rax,%r13 # ...
+ adc %rax,%r12
+ adc \$0,%r13
+ mulx 3*8($aptr),%rax,%r14
+
+ mov $mi,%r15
+ imulq 32+8(%rsp),$mi # "t[0]"*n0
+ xor $zero,$zero # cf=0, of=0
+ mov $mi,%rdx
+
+ por %xmm2,%xmm1
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ mov $bptr,8+8(%rsp) # off-load &b[i]
+ por %xmm1,%xmm0
+
+ .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+
+ mulx 0*16($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*16($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*16($nptr),%rax,%r12
+ mov 24+8(%rsp),$bptr # counter value
+ .byte 0x66
+ mov %r10,-8*4($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*16($nptr),%rax,%r15
+ .byte 0x67,0x67
+ mov $bi,%rdx
+ mov %r11,-8*3($tptr)
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
+ mov %r12,-8*2($tptr)
+ #jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
+ adcx %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
+ adcx %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+
+ adox %r15,%r10
+ mulx 0*16($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*16($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*16($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ mov %r11,-4*8($tptr)
+ adox %r15,%r13
+ mulx 3*16($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*16($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_1st
+
+ mov 8(%rsp),$num # load -num
+ movq %xmm0,%rdx # bp[1]
+ adc $zero,%r15 # modulo-scheduled
+ lea ($aptr,$num),$aptr # rewind $aptr
+ add %r15,%r14
+ mov 8+8(%rsp),$bptr # re-load &b[i]
+ adc $zero,$zero # top-most carry
+ mov %r14,-1*8($tptr)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ mov $zero,($tptr) # save top-most carry
+ lea 4*8($tptr,$num),$tptr # rewind $tptr
+ mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
+ xor $zero,$zero # cf=0, of=0
+ mov %rdx,$bi
+ mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
+ adox -4*8($tptr),$mi # +t[0]
+ adcx %r14,%r11
+ mulx 2*8($aptr),%r15,%r13 # ...
+ adox -3*8($tptr),%r11
+ adcx %r15,%r12
+ mulx 3*8($aptr),%rdx,%r14
+ adox -2*8($tptr),%r12
+ adcx %rdx,%r13
+ lea ($nptr,$num,2),$nptr # rewind $nptr
+ lea 4*8($aptr),$aptr
+ adox -1*8($tptr),%r13
+ adcx $zero,%r14
+ adox $zero,%r14
+
+ .byte 0x67
+ mov $mi,%r15
+ imulq 32+8(%rsp),$mi # "t[0]"*n0
+
+ movq `0*$STRIDE/4-96`($bptr),%xmm0
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ movq `1*$STRIDE/4-96`($bptr),%xmm1
+ .byte 0x67
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bptr),%xmm2
+ .byte 0x67
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bptr),%xmm3
+ add \$$STRIDE,$bptr # next &b[i]
+ .byte 0x67
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ xor $zero,$zero # cf=0, of=0
+ mov $bptr,8+8(%rsp) # off-load &b[i]
+
+ mulx 0*16($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*16($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*16($nptr),%rax,%r12
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*16($nptr),%rax,%r15
+ mov $bi,%rdx
+ por %xmm2,%xmm0
+ mov 24+8(%rsp),$bptr # counter value
+ mov %r10,-8*4($tptr)
+ por %xmm3,%xmm0
+ adcx %rax,%r12
+ mov %r11,-8*3($tptr)
+ adox $zero,%r15 # of=0
+ mov %r12,-8*2($tptr)
+ lea 4*16($nptr),$nptr
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ adox %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
+ adcx 0*8($tptr),%r10
+ adox %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx 1*8($tptr),%r11
+ adox %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ mov $mi,%rdx
+ adcx 2*8($tptr),%r12
+ adox %rax,%r13
+ adcx 3*8($tptr),%r13
+ adox $zero,%r14 # of=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+ adcx $zero,%r14 # cf=0
+
+ adox %r15,%r10
+ mulx 0*16($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*16($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*16($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ adox %r15,%r13
+ mov %r11,-4*8($tptr)
+ mulx 3*16($nptr),%rax,%r15
+ mov $bi,%rdx
+ lea 4*16($nptr),$nptr
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_inner
+
+ mov 0+8(%rsp),$num # load -num
+ movq %xmm0,%rdx # bp[i+1]
+ adc $zero,%r15 # modulo-scheduled
+ sub 0*8($tptr),$bptr # pull top-most carry to %cf
+ mov 8+8(%rsp),$bptr # re-load &b[i]
+ mov 16+8(%rsp),%r10
+ adc %r15,%r14
+ lea ($aptr,$num),$aptr # rewind $aptr
+ adc $zero,$zero # top-most carry
+ mov %r14,-1*8($tptr)
+
+ cmp %r10,$bptr
+ jb .Lmulx4x_outer
+
+ mov -16($nptr),%r10
+ xor %r15,%r15
+ sub %r14,%r10 # compare top-most words
+ adc %r15,%r15
+ or %r15,$zero
+ xor \$1,$zero
+ lea ($tptr,$num),%rdi # rewind $tptr
+ lea ($nptr,$num,2),$nptr # rewind $nptr
+ .byte 0x67,0x67
+ sar \$3+2,$num # cf=0
+ lea ($nptr,$zero,8),%rbp
+ mov 56+8(%rsp),%rdx # restore rp
+ mov $num,%rcx
+ jmp .Lsqrx4x_sub # common post-condition
+.size mulx4x_internal,.-mulx4x_internal
+___
+} {
+######################################################################
+# void bn_power5(
+my $rptr="%rdi"; # BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # const void *table,
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 8
+ # int pwr);
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type bn_powerx5,\@function,6
+.align 32
+bn_powerx5:
+.Lpowerx5_enter:
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+	# logic to do its magic.
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lpwrx_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lpwrx_sp_done
+
+.align 32
+.Lpwrx_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lpwrx_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +16 intermediate carry bit
+ # +24 top-most carry bit, used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ pxor %xmm0,%xmm0
+ movq $rptr,%xmm1 # save $rptr
+ movq $nptr,%xmm2 # save $nptr
+ movq %r10, %xmm3 # -$num
+ movq $bptr,%xmm4
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lpowerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+
+ mov %r10,$num # -num
+ mov $aptr,$rptr
+ movq %xmm2,$nptr
+ movq %xmm4,$bptr
+ mov 40(%rsp),%rax
+
+ call mulx4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lpowerx5_epilogue:
+ ret
+.size bn_powerx5,.-bn_powerx5
+
+.globl bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.type bn_sqrx8x_internal,\@abi-omnipotent
+.align 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+ ##################################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ ##################################################################
+ # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[2]a[1]
+ # a[3]a[1]
+ # a[3]a[2]
+ #
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3]
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ #
+ # a[5]a[4]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
+___
+{
+my ($zero,$carry)=("%rbp","%rcx");
+my $aaptr=$zero;
+$code.=<<___;
+ lea 48+8(%rsp),$tptr
+ lea ($aptr,$num),$aaptr
+ mov $num,0+8(%rsp) # save $num
+ mov $aaptr,8+8(%rsp) # save end of $aptr
+ jmp .Lsqr8x_zero_start
+
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+ .byte 0x3e
+ movdqa %xmm0,0*8($tptr)
+ movdqa %xmm0,2*8($tptr)
+ movdqa %xmm0,4*8($tptr)
+ movdqa %xmm0,6*8($tptr)
+.Lsqr8x_zero_start: # aligned at 32
+ movdqa %xmm0,8*8($tptr)
+ movdqa %xmm0,10*8($tptr)
+ movdqa %xmm0,12*8($tptr)
+ movdqa %xmm0,14*8($tptr)
+ lea 16*8($tptr),$tptr
+ sub \$64,$num
+ jnz .Lsqrx8x_zero
+
+ mov 0*8($aptr),%rdx # a[0], modulo-scheduled
+ #xor %r9,%r9 # t[1], ex-$num, zero already
+ xor %r10,%r10
+ xor %r11,%r11
+ xor %r12,%r12
+ xor %r13,%r13
+ xor %r14,%r14
+ xor %r15,%r15
+ lea 48+8(%rsp),$tptr
+	xor	$zero,$zero		# cf=0, of=0
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_loop:
+ mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
+ adcx %r9,%r8 # a[1]*a[0]+=t[1]
+ adox %rax,%r10
+ mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
+ adcx %r10,%r9
+ adox %rax,%r11
+ .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
+ adcx %r11,%r10
+ adox %rax,%r12
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
+ adcx %r12,%r11
+ adox %rax,%r13
+ mulx 5*8($aptr),%r12,%rax
+ adcx %r13,%r12
+ adox %rax,%r14
+ mulx 6*8($aptr),%r13,%rax
+ adcx %r14,%r13
+ adox %r15,%rax
+ mulx 7*8($aptr),%r14,%r15
+ mov 1*8($aptr),%rdx # a[1]
+ adcx %rax,%r14
+ adox $zero,%r15
+ adc 8*8($tptr),%r15
+ mov %r8,1*8($tptr) # t[1]
+ mov %r9,2*8($tptr) # t[2]
+ sbb $carry,$carry # mov %cf,$carry
+ xor $zero,$zero # cf=0, of=0
+
+
+ mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
+ mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
+ adcx %r10,%r8
+ adox %rbx,%r9
+ mulx 4*8($aptr),%r10,%rbx # ...
+ adcx %r11,%r9
+ adox %rax,%r10
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
+ adcx %r12,%r10
+ adox %rbx,%r11
+ .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
+ adcx %r13,%r11
+ adox %r14,%r12
+ .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
+ mov 2*8($aptr),%rdx # a[2]
+ adcx %rax,%r12
+ adox %rbx,%r13
+ adcx %r15,%r13
+ adox $zero,%r14 # of=0
+ adcx $zero,%r14 # cf=0
+
+ mov %r8,3*8($tptr) # t[3]
+ mov %r9,4*8($tptr) # t[4]
+
+ mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
+ mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
+ adcx %r10,%r8
+ adox %rbx,%r9
+ mulx 5*8($aptr),%r10,%rbx # ...
+ adcx %r11,%r9
+ adox %rax,%r10
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
+ adcx %r12,%r10
+ adox %r13,%r11
+ .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
+ .byte 0x3e
+ mov 3*8($aptr),%rdx # a[3]
+ adcx %rbx,%r11
+ adox %rax,%r12
+ adcx %r14,%r12
+ mov %r8,5*8($tptr) # t[5]
+ mov %r9,6*8($tptr) # t[6]
+ mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
+ adox $zero,%r13 # of=0
+ adcx $zero,%r13 # cf=0
+
+ mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
+ adcx %r10,%r8
+ adox %rax,%r9
+ mulx 6*8($aptr),%r10,%rax # ...
+ adcx %r11,%r9
+ adox %r12,%r10
+ mulx 7*8($aptr),%r11,%r12
+ mov 4*8($aptr),%rdx # a[4]
+ mov 5*8($aptr),%r14 # a[5]
+ adcx %rbx,%r10
+ adox %rax,%r11
+ mov 6*8($aptr),%r15 # a[6]
+ adcx %r13,%r11
+ adox $zero,%r12 # of=0
+ adcx $zero,%r12 # cf=0
+
+ mov %r8,7*8($tptr) # t[7]
+ mov %r9,8*8($tptr) # t[8]
+
+ mulx %r14,%r9,%rax # a[5]*a[4]
+ mov 7*8($aptr),%r8 # a[7]
+ adcx %r10,%r9
+ mulx %r15,%r10,%rbx # a[6]*a[4]
+ adox %rax,%r10
+ adcx %r11,%r10
+ mulx %r8,%r11,%rax # a[7]*a[4]
+ mov %r14,%rdx # a[5]
+ adox %rbx,%r11
+ adcx %r12,%r11
+ #adox $zero,%rax # of=0
+ adcx $zero,%rax # cf=0
+
+ mulx %r15,%r14,%rbx # a[6]*a[5]
+ mulx %r8,%r12,%r13 # a[7]*a[5]
+ mov %r15,%rdx # a[6]
+ lea 8*8($aptr),$aptr
+ adcx %r14,%r11
+ adox %rbx,%r12
+ adcx %rax,%r12
+ adox $zero,%r13
+
+ .byte 0x67,0x67
+ mulx %r8,%r8,%r14 # a[7]*a[6]
+ adcx %r8,%r13
+ adcx $zero,%r14
+
+ cmp 8+8(%rsp),$aptr
+ je .Lsqrx8x_outer_break
+
+ neg $carry # mov $carry,%cf
+ mov \$-8,%rcx
+ mov $zero,%r15
+ mov 8*8($tptr),%r8
+ adcx 9*8($tptr),%r9 # +=t[9]
+ adcx 10*8($tptr),%r10 # ...
+ adcx 11*8($tptr),%r11
+ adc 12*8($tptr),%r12
+ adc 13*8($tptr),%r13
+ adc 14*8($tptr),%r14
+ adc 15*8($tptr),%r15
+ lea ($aptr),$aaptr
+ lea 2*64($tptr),$tptr
+ sbb %rax,%rax # mov %cf,$carry
+
+ mov -64($aptr),%rdx # a[0]
+ mov %rax,16+8(%rsp) # offload $carry
+ mov $tptr,24+8(%rsp)
+
+ #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
+ xor %eax,%eax # cf=0, of=0
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_loop:
+ mov %r8,%rbx
+ mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
+ adcx %rax,%rbx # +=t[8]
+ adox %r9,%r8
+
+ mulx 1*8($aaptr),%rax,%r9 # ...
+ adcx %rax,%r8
+ adox %r10,%r9
+
+ mulx 2*8($aaptr),%rax,%r10
+ adcx %rax,%r9
+ adox %r11,%r10
+
+ mulx 3*8($aaptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
+ adcx %rax,%r11
+ adox %r13,%r12
+
+ mulx 5*8($aaptr),%rax,%r13
+ adcx %rax,%r12
+ adox %r14,%r13
+
+ mulx 6*8($aaptr),%rax,%r14
+ mov %rbx,($tptr,%rcx,8) # store t[8+i]
+ mov \$0,%ebx
+ adcx %rax,%r13
+ adox %r15,%r14
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
+ mov 8($aptr,%rcx,8),%rdx # a[i]
+ adcx %rax,%r14
+ adox %rbx,%r15 # %rbx is 0, of=0
+ adcx %rbx,%r15 # cf=0
+
+ .byte 0x67
+ inc %rcx # of=0
+ jnz .Lsqrx8x_loop
+
+ lea 8*8($aaptr),$aaptr
+ mov \$-8,%rcx
+ cmp 8+8(%rsp),$aaptr # done?
+ je .Lsqrx8x_break
+
+ sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
+ .byte 0x66
+ mov -64($aptr),%rdx
+ adcx 0*8($tptr),%r8
+ adcx 1*8($tptr),%r9
+ adc 2*8($tptr),%r10
+ adc 3*8($tptr),%r11
+ adc 4*8($tptr),%r12
+ adc 5*8($tptr),%r13
+ adc 6*8($tptr),%r14
+ adc 7*8($tptr),%r15
+ lea 8*8($tptr),$tptr
+ .byte 0x67
+ sbb %rax,%rax # mov %cf,%rax
+ xor %ebx,%ebx # cf=0, of=0
+ mov %rax,16+8(%rsp) # offload carry
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_break:
+ sub 16+8(%rsp),%r8 # consume last carry
+ mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
+ mov 0*8($aptr),%rdx # a[8], modulo-scheduled
+ xor %ebp,%ebp # xor $zero,$zero
+ mov %r8,0*8($tptr)
+ cmp $carry,$tptr # cf=0, of=0
+ je .Lsqrx8x_outer_loop
+
+ mov %r9,1*8($tptr)
+ mov 1*8($carry),%r9
+ mov %r10,2*8($tptr)
+ mov 2*8($carry),%r10
+ mov %r11,3*8($tptr)
+ mov 3*8($carry),%r11
+ mov %r12,4*8($tptr)
+ mov 4*8($carry),%r12
+ mov %r13,5*8($tptr)
+ mov 5*8($carry),%r13
+ mov %r14,6*8($tptr)
+ mov 6*8($carry),%r14
+ mov %r15,7*8($tptr)
+ mov 7*8($carry),%r15
+ mov $carry,$tptr
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_break:
+ mov %r9,9*8($tptr) # t[9]
+ movq %xmm3,%rcx # -$num
+ mov %r10,10*8($tptr) # ...
+ mov %r11,11*8($tptr)
+ mov %r12,12*8($tptr)
+ mov %r13,13*8($tptr)
+ mov %r14,14*8($tptr)
+___
+} {
+my $i="%rcx";
+$code.=<<___;
+ lea 48+8(%rsp),$tptr
+ mov ($aptr,$i),%rdx # a[0]
+
+ mov 8($tptr),$A0[1] # t[1]
+ xor $A0[0],$A0[0] # t[0], of=0, cf=0
+ mov 0+8(%rsp),$num # restore $num
+ adox $A0[1],$A0[1]
+ mov 16($tptr),$A1[0] # t[2] # prefetch
+ mov 24($tptr),$A1[1] # t[3] # prefetch
+ #jmp .Lsqrx4x_shift_n_add # happens to be aligned
+
+.align 32
+.Lsqrx4x_shift_n_add:
+ mulx %rdx,%rax,%rbx
+ adox $A1[0],$A1[0]
+ adcx $A0[0],%rax
+ .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
+ .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
+ adox $A1[1],$A1[1]
+ adcx $A0[1],%rbx
+ mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
+ mov %rax,0($tptr)
+ mov %rbx,8($tptr)
+
+ mulx %rdx,%rax,%rbx
+ adox $A0[0],$A0[0]
+ adcx $A1[0],%rax
+ mov 16($aptr,$i),%rdx # a[i+2] # prefetch
+ mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
+ adox $A0[1],$A0[1]
+ adcx $A1[1],%rbx
+ mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
+ mov %rax,16($tptr)
+ mov %rbx,24($tptr)
+
+ mulx %rdx,%rax,%rbx
+ adox $A1[0],$A1[0]
+ adcx $A0[0],%rax
+ mov 24($aptr,$i),%rdx # a[i+3] # prefetch
+ lea 32($i),$i
+ mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
+ adox $A1[1],$A1[1]
+ adcx $A0[1],%rbx
+ mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
+ mov %rax,32($tptr)
+ mov %rbx,40($tptr)
+
+ mulx %rdx,%rax,%rbx
+ adox $A0[0],$A0[0]
+ adcx $A1[0],%rax
+ jrcxz .Lsqrx4x_shift_n_add_break
+ .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
+ adox $A0[1],$A0[1]
+ adcx $A1[1],%rbx
+ mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
+ mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
+ mov %rax,48($tptr)
+ mov %rbx,56($tptr)
+ lea 64($tptr),$tptr
+ nop
+ jmp .Lsqrx4x_shift_n_add
+
+.align 32
+.Lsqrx4x_shift_n_add_break:
+ adcx $A1[1],%rbx
+ mov %rax,48($tptr)
+ mov %rbx,56($tptr)
+ lea 64($tptr),$tptr # end of t[] buffer
+___
+}
+######################################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+# This new path is inspired by multiple submissions from Intel, by
+# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
+# Vinodh Gopal...
+{
+my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
+
+$code.=<<___;
+ movq %xmm2,$nptr
+sqrx8x_reduction:
+ xor %eax,%eax # initial top-most carry bit
+ mov 32+8(%rsp),%rbx # n0
+ mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
+ lea -128($nptr,$num,2),%rcx # end of n[]
+ #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
+ mov %rcx, 0+8(%rsp) # save end of n[]
+ mov $tptr,8+8(%rsp) # save end of t[]
+
+ lea 48+8(%rsp),$tptr # initial t[] window
+ jmp .Lsqrx8x_reduction_loop
+
+.align 32
+.Lsqrx8x_reduction_loop:
+ mov 8*1($tptr),%r9
+ mov 8*2($tptr),%r10
+ mov 8*3($tptr),%r11
+ mov 8*4($tptr),%r12
+ mov %rdx,%r8
+ imulq %rbx,%rdx # n0*a[i]
+ mov 8*5($tptr),%r13
+ mov 8*6($tptr),%r14
+ mov 8*7($tptr),%r15
+ mov %rax,24+8(%rsp) # store top-most carry bit
+
+ lea 8*8($tptr),$tptr
+ xor $carry,$carry # cf=0,of=0
+ mov \$-8,%rcx
+ jmp .Lsqrx8x_reduce
+
+.align 32
+.Lsqrx8x_reduce:
+ mov %r8, %rbx
+ mulx 16*0($nptr),%rax,%r8 # n[0]
+ adcx %rbx,%rax # discarded
+ adox %r9,%r8
+
+ mulx 16*1($nptr),%rbx,%r9 # n[1]
+ adcx %rbx,%r8
+ adox %r10,%r9
+
+ mulx 16*2($nptr),%rbx,%r10
+ adcx %rbx,%r9
+ adox %r11,%r10
+
+ mulx 16*3($nptr),%rbx,%r11
+ adcx %rbx,%r10
+ adox %r12,%r11
+
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
+ mov %rdx,%rax
+ mov %r8,%rdx
+ adcx %rbx,%r11
+ adox %r13,%r12
+
+ mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
+ mov %rax,%rdx
+ mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
+
+ mulx 16*5($nptr),%rax,%r13
+ adcx %rax,%r12
+ adox %r14,%r13
+
+ mulx 16*6($nptr),%rax,%r14
+ adcx %rax,%r13
+ adox %r15,%r14
+
+ mulx 16*7($nptr),%rax,%r15
+ mov %rbx,%rdx
+ adcx %rax,%r14
+ adox $carry,%r15 # $carry is 0
+ adcx $carry,%r15 # cf=0
+
+ .byte 0x67,0x67,0x67
+ inc %rcx # of=0
+ jnz .Lsqrx8x_reduce
+
+ mov $carry,%rax # xor %rax,%rax
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .Lsqrx8x_no_tail
+
+ mov 48+8(%rsp),%rdx # pull n0*a[0]
+ add 8*0($tptr),%r8
+ lea 16*8($nptr),$nptr
+ mov \$-8,%rcx
+ adcx 8*1($tptr),%r9
+ adcx 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ lea 8*8($tptr),$tptr
+ sbb %rax,%rax # top carry
+
+ xor $carry,$carry # of=0, cf=0
+ mov %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail:
+ mov %r8,%rbx
+ mulx 16*0($nptr),%rax,%r8
+ adcx %rax,%rbx
+ adox %r9,%r8
+
+ mulx 16*1($nptr),%rax,%r9
+ adcx %rax,%r8
+ adox %r10,%r9
+
+ mulx 16*2($nptr),%rax,%r10
+ adcx %rax,%r9
+ adox %r11,%r10
+
+ mulx 16*3($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
+ adcx %rax,%r11
+ adox %r13,%r12
+
+ mulx 16*5($nptr),%rax,%r13
+ adcx %rax,%r12
+ adox %r14,%r13
+
+ mulx 16*6($nptr),%rax,%r14
+ adcx %rax,%r13
+ adox %r15,%r14
+
+ mulx 16*7($nptr),%rax,%r15
+ mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
+ adcx %rax,%r14
+ adox $carry,%r15
+ mov %rbx,($tptr,%rcx,8) # save result
+ mov %r8,%rbx
+ adcx $carry,%r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Lsqrx8x_tail
+
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .Lsqrx8x_tail_done # break out of loop
+
+ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
+ mov 48+8(%rsp),%rdx # pull n0*a[0]
+ lea 16*8($nptr),$nptr
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ lea 8*8($tptr),$tptr
+ sbb %rax,%rax
+ sub \$8,%rcx # mov \$-8,%rcx
+
+ xor $carry,$carry # of=0, cf=0
+ mov %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail_done:
+ add 24+8(%rsp),%r8 # can this overflow?
+ mov $carry,%rax # xor %rax,%rax
+
+ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
+.Lsqrx8x_no_tail: # %cf is 0 if jumped here
+ adc 8*0($tptr),%r8
+ movq %xmm3,%rcx
+ adc 8*1($tptr),%r9
+ mov 16*7($nptr),$carry
+ movq %xmm2,$nptr # restore $nptr
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ adc %rax,%rax # top-most carry
+
+ mov 32+8(%rsp),%rbx # n0
+ mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
+
+ mov %r8,8*0($tptr) # store top 512 bits
+ lea 8*8($tptr),%r8 # borrow %r8
+ mov %r9,8*1($tptr)
+ mov %r10,8*2($tptr)
+ mov %r11,8*3($tptr)
+ mov %r12,8*4($tptr)
+ mov %r13,8*5($tptr)
+ mov %r14,8*6($tptr)
+ mov %r15,8*7($tptr)
+
+ lea 8*8($tptr,%rcx),$tptr # start of current t[] window
+ cmp 8+8(%rsp),%r8 # end of t[]?
+ jb .Lsqrx8x_reduction_loop
+___
+}
+##############################################################
+# Post-condition, 4x unrolled
+#
+{
+my ($rptr,$nptr)=("%rdx","%rbp");
+my @ri=map("%r$_",(10..13));
+my @ni=map("%r$_",(14..15));
+$code.=<<___;
+ xor %rbx,%rbx
+ sub %r15,%rsi # compare top-most words
+ adc %rbx,%rbx
+ mov %rcx,%r10 # -$num
+ .byte 0x67
+ or %rbx,%rax
+ .byte 0x67
+ mov %rcx,%r9 # -$num
+ xor \$1,%rax
+ sar \$3+2,%rcx # cf=0
+ #lea 48+8(%rsp,%r9),$tptr
+ lea ($nptr,%rax,8),$nptr
+ movq %xmm1,$rptr # restore $rptr
+ movq %xmm1,$aptr # prepare for back-to-back call
+ jmp .Lsqrx4x_sub
+
+.align 32
+.Lsqrx4x_sub:
+ .byte 0x66
+ mov 8*0($tptr),%r12
+ mov 8*1($tptr),%r13
+ sbb 16*0($nptr),%r12
+ mov 8*2($tptr),%r14
+ sbb 16*1($nptr),%r13
+ mov 8*3($tptr),%r15
+ lea 8*4($tptr),$tptr
+ sbb 16*2($nptr),%r14
+ mov %r12,8*0($rptr)
+ sbb 16*3($nptr),%r15
+ lea 16*4($nptr),$nptr
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+
+ inc %rcx
+ jnz .Lsqrx4x_sub
+___
+}
+$code.=<<___;
+ neg %r9 # restore $num
+ ret
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+___
+}}}
{
-my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
+ ("%rdi","%esi","%rdx","%ecx"); # Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;
$code.=<<___;
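+# bn_get_bits5 returns the 5-bit window of the little-endian buffer at
+# $inp that starts at bit number $num.  It loads 16 bits at a time;
+# whenever the window would straddle the loaded word (bit offset within
+# the word above 11), it re-loads from one byte further in and shifts
+# by 8 less, so a single 16-bit load always covers the whole window.
+#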
+.globl bn_get_bits5
+.type bn_get_bits5,\@abi-omnipotent
+.align 16
+bn_get_bits5:
+ lea 0($inp),%r10
+ lea 1($inp),%r11
+ mov $num,%ecx
+ shr \$4,$num
+ and \$15,%ecx
+ lea -8(%ecx),%eax
+ cmp \$11,%ecx
+ cmova %r11,%r10
+ cmova %eax,%ecx
+ movzw (%r10,$num,2),%eax
+ shrl %cl,%eax
+ and \$31,%eax
+ ret
+.size bn_get_bits5,.-bn_get_bits5
+
.globl bn_scatter5
.type bn_scatter5,\@abi-omnipotent
.align 16
@@ -868,13 +3272,13 @@ $code.=<<___ if ($win64);
.byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
___
$code.=<<___;
- mov $idx,%r11
+ mov $idx,%r11d
shr \$`log($N/8)/log(2)`,$idx
and \$`$N/8-1`,%r11
not $idx
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
- lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
+ lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
movq 8(%rax,$idx,8),%xmm5 # cache line contains element
movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
@@ -882,15 +3286,16 @@ $code.=<<___;
jmp .Lgather
.align 16
.Lgather:
- movq `0*$STRIDE/4-96`($tbl),%xmm0
- movq `1*$STRIDE/4-96`($tbl),%xmm1
+ movq `0*$STRIDE/4-128`($tbl),%xmm0
+ movq `1*$STRIDE/4-128`($tbl),%xmm1
pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($tbl),%xmm2
+ movq `2*$STRIDE/4-128`($tbl),%xmm2
pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($tbl),%xmm3
+ movq `3*$STRIDE/4-128`($tbl),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
+ .byte 0x67,0x67
por %xmm2,%xmm0
lea $STRIDE($tbl),$tbl
por %xmm3,%xmm0
@@ -954,26 +3359,27 @@ mul_handler:
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
- lea `40+48`(%rax),%rax
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # end of alloca label
- cmp %r10,%rbx # context->Rip<end of alloca label
- jb .Lcommon_seh_tail
-
mov 152($context),%rax # pull context->Rsp
- mov 8(%r11),%r10d # HandlerData[2]
+ mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
+ lea .Lmul_epilogue(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lbody_40
+
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+ jmp .Lbody_proceed
+
+.Lbody_40:
+ mov 40(%rax),%rax # pull saved stack pointer
+.Lbody_proceed:
- movaps (%rax),%xmm0
- movaps 16(%rax),%xmm1
- lea `40+48`(%rax),%rax
+ movaps -88(%rax),%xmm0
+ movaps -72(%rax),%xmm1
mov -8(%rax),%rbx
mov -16(%rax),%rbp
@@ -1040,6 +3446,24 @@ mul_handler:
.rva .LSEH_end_bn_mul4x_mont_gather5
.rva .LSEH_info_bn_mul4x_mont_gather5
+ .rva .LSEH_begin_bn_power5
+ .rva .LSEH_end_bn_power5
+ .rva .LSEH_info_bn_power5
+
+ .rva .LSEH_begin_bn_from_mont8x
+ .rva .LSEH_end_bn_from_mont8x
+ .rva .LSEH_info_bn_from_mont8x
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont_gather5
+ .rva .LSEH_end_bn_mulx4x_mont_gather5
+ .rva .LSEH_info_bn_mulx4x_mont_gather5
+
+ .rva .LSEH_begin_bn_powerx5
+ .rva .LSEH_end_bn_powerx5
+ .rva .LSEH_info_bn_powerx5
+___
+$code.=<<___;
.rva .LSEH_begin_bn_gather5
.rva .LSEH_end_bn_gather5
.rva .LSEH_info_bn_gather5
@@ -1049,12 +3473,36 @@ mul_handler:
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_power5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_from_mont8x:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+___
+$code.=<<___ if ($addx);
+.align 8
+.LSEH_info_bn_mulx4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_powerx5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+___
+$code.=<<___;
.align 8
.LSEH_info_bn_gather5:
.byte 0x01,0x0d,0x05,0x00
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 47d8c71d9ed1..5696965e9a09 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -256,24 +256,6 @@ extern "C" {
# define BN_HEX_FMT2 "%08X"
# endif
-/*
- * 2011-02-22 SMS. In various places, a size_t variable or a type cast to
- * size_t was used to perform integer-only operations on pointers. This
- * failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
- * is still only 32 bits. What's needed in these cases is an integer type
- * with the same size as a pointer, which size_t is not certain to be. The
- * only fix here is VMS-specific.
- */
-# if defined(OPENSSL_SYS_VMS)
-# if __INITIAL_POINTER_SIZE == 64
-# define PTR_SIZE_INT long long
-# else /* __INITIAL_POINTER_SIZE == 64 */
-# define PTR_SIZE_INT int
-# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
-# else /* defined(OPENSSL_SYS_VMS) */
-# define PTR_SIZE_INT size_t
-# endif /* defined(OPENSSL_SYS_VMS) [else] */
-
# define BN_DEFAULT_BITS 1280
# define BN_FLG_MALLOCED 0x01
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
index 114acf3dc248..03a33cffe5ee 100644
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -489,121 +489,144 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
* c=(c2,c1,c0)
*/
+# ifdef BN_LLONG
/*
- * Keep in mind that carrying into high part of multiplication result
- * can not overflow, because it cannot be all-ones.
+ * Keep in mind that additions to multiplication result can not
+ * overflow, because its high half cannot be all-ones.
*/
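+/*
+ * Concretely, with w-bit words a*b <= (2^w-1)^2 = 2^(2w) - 2^(w+1) + 1,
+ * so the high half of the product is at most 2^w - 2 and can absorb a
+ * carry without wrapping.
+ */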
-# ifdef BN_LLONG
-# define mul_add_c(a,b,c0,c1,c2) \
- t=(BN_ULLONG)a*b; \
- t1=(BN_ULONG)Lw(t); \
- t2=(BN_ULONG)Hw(t); \
- c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
- c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
-
-# define mul_add_c2(a,b,c0,c1,c2) \
- t=(BN_ULLONG)a*b; \
- tt=(t+t)&BN_MASK; \
- if (tt < t) c2++; \
- t1=(BN_ULONG)Lw(tt); \
- t2=(BN_ULONG)Hw(tt); \
- c0=(c0+t1)&BN_MASK2; \
- if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
- c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
-
-# define sqr_add_c(a,i,c0,c1,c2) \
- t=(BN_ULLONG)a[i]*a[i]; \
- t1=(BN_ULONG)Lw(t); \
- t2=(BN_ULONG)Hw(t); \
- c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
- c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+# define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG hi; \
+ BN_ULLONG t = (BN_ULLONG)(a)*(b); \
+ t += c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(t); \
+ hi = (BN_ULONG)Hw(t); \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ } while(0)
+
+# define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG hi; \
+ BN_ULLONG t = (BN_ULLONG)(a)*(b); \
+ BN_ULLONG tt = t+c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(tt); \
+ hi = (BN_ULONG)Hw(tt); \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ t += c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(t); \
+ hi = (BN_ULONG)Hw(t); \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ } while(0)
+
+# define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG hi; \
+ BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
+ t += c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(t); \
+ hi = (BN_ULONG)Hw(t); \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ } while(0)
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# elif defined(BN_UMULT_LOHI)
-
-# define mul_add_c(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b); \
- BN_UMULT_LOHI(t1,t2,ta,tb); \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
-
-# define mul_add_c2(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b),t0; \
- BN_UMULT_LOHI(t0,t1,ta,tb); \
- c0 += t0; t2 = t1+((c0<t0)?1:0);\
- c1 += t2; c2 += (c1<t2)?1:0; \
- c0 += t0; t1 += (c0<t0)?1:0; \
- c1 += t1; c2 += (c1<t1)?1:0; \
- }
-
-# define sqr_add_c(a,i,c0,c1,c2) { \
- BN_ULONG ta=(a)[i]; \
- BN_UMULT_LOHI(t1,t2,ta,ta); \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
+/*
+ * Keep in mind that additions to hi can not overflow, because
+ * the high word of a multiplication result cannot be all-ones.
+ */
+# define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo,hi,ta,tb); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+# define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi, tt; \
+ BN_UMULT_LOHI(lo,hi,ta,tb); \
+ c0 += lo; tt = hi+((c0<lo)?1:0); \
+ c1 += tt; c2 += (c1<tt)?1:0; \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+# define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG ta = (a)[i]; \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo,hi,ta,ta); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# elif defined(BN_UMULT_HIGH)
-
-# define mul_add_c(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b); \
- t1 = ta * tb; \
- t2 = BN_UMULT_HIGH(ta,tb); \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
-
-# define mul_add_c2(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b),t0; \
- t1 = BN_UMULT_HIGH(ta,tb); \
- t0 = ta * tb; \
- c0 += t0; t2 = t1+((c0<t0)?1:0);\
- c1 += t2; c2 += (c1<t2)?1:0; \
- c0 += t0; t1 += (c0<t0)?1:0; \
- c1 += t1; c2 += (c1<t1)?1:0; \
- }
-
-# define sqr_add_c(a,i,c0,c1,c2) { \
- BN_ULONG ta=(a)[i]; \
- t1 = ta * ta; \
- t2 = BN_UMULT_HIGH(ta,ta); \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
+/*
+ * Keep in mind that additions to hi can not overflow, because
+ * the high word of a multiplication result cannot be all-ones.
+ */
+# define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo = ta * tb; \
+ BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+# define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b), tt; \
+ BN_ULONG lo = ta * tb; \
+ BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
+ c0 += lo; tt = hi + ((c0<lo)?1:0); \
+ c1 += tt; c2 += (c1<tt)?1:0; \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+# define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG ta = (a)[i]; \
+ BN_ULONG lo = ta * ta; \
+ BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# else /* !BN_LLONG */
-# define mul_add_c(a,b,c0,c1,c2) \
- t1=LBITS(a); t2=HBITS(a); \
- bl=LBITS(b); bh=HBITS(b); \
- mul64(t1,t2,bl,bh); \
- c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
- c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
-
-# define mul_add_c2(a,b,c0,c1,c2) \
- t1=LBITS(a); t2=HBITS(a); \
- bl=LBITS(b); bh=HBITS(b); \
- mul64(t1,t2,bl,bh); \
- if (t2 & BN_TBIT) c2++; \
- t2=(t2+t2)&BN_MASK2; \
- if (t1 & BN_TBIT) t2++; \
- t1=(t1+t1)&BN_MASK2; \
- c0=(c0+t1)&BN_MASK2; \
- if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
- c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
-
-# define sqr_add_c(a,i,c0,c1,c2) \
- sqr64(t1,t2,(a)[i]); \
- c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
- c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+/*
+ * Keep in mind that additions to hi can not overflow, because
+ * the high word of a multiplication result cannot be all-ones.
+ */
+# define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG lo = LBITS(a), hi = HBITS(a); \
+ BN_ULONG bl = LBITS(b), bh = HBITS(b); \
+ mul64(lo,hi,bl,bh); \
+ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ } while(0)
+
+# define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG tt; \
+ BN_ULONG lo = LBITS(a), hi = HBITS(a); \
+ BN_ULONG bl = LBITS(b), bh = HBITS(b); \
+ mul64(lo,hi,bl,bh); \
+ tt = hi; \
+ c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
+ c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
+ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ } while(0)
+
+# define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG lo, hi; \
+ sqr64(lo,hi,(a)[i]); \
+ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
+ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
+ } while(0)
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
@@ -611,12 +634,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
-# ifdef BN_LLONG
- BN_ULLONG t;
-# else
- BN_ULONG bl, bh;
-# endif
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
@@ -720,12 +737,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
-# ifdef BN_LLONG
- BN_ULLONG t;
-# else
- BN_ULONG bl, bh;
-# endif
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
@@ -765,12 +776,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
-# ifdef BN_LLONG
- BN_ULLONG t, tt;
-# else
- BN_ULONG bl, bh;
-# endif
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
@@ -846,12 +851,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
-# ifdef BN_LLONG
- BN_ULLONG t, tt;
-# else
- BN_ULONG bl, bh;
-# endif
- BN_ULONG t1, t2;
BN_ULONG c1, c2, c3;
c1 = 0;
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 27146c89e740..24afdd60a227 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -122,6 +122,17 @@
# ifndef alloca
# define alloca(s) __builtin_alloca((s))
# endif
+#elif defined(__sun)
+# include <alloca.h>
+#endif
+
+#include "rsaz_exp.h"
+
+#undef SPARC_T4_MONT
+#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
+# include "sparc_arch.h"
+extern unsigned int OPENSSL_sparcv9cap_P[];
+# define SPARC_T4_MONT
#endif
/* maximum precomputation table size for *variable* sliding windows */
@@ -464,6 +475,23 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
wstart = bits - 1; /* The top bit of the window */
wend = 0; /* The bottom bit of the window */
+#if 1 /* by Shay Gueron's suggestion */
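+    /*
+     * If the top bit of m is set, then R/2 <= m < R for
+     * R = 2^(top*BN_BITS2), hence R mod m = R - m and 1 in Montgomery
+     * form can be computed with plain word complements instead of a
+     * call to BN_to_montgomery().
+     */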
+ j = m->top; /* borrow j */
+ if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+ if (bn_wexpand(r, j) == NULL)
+ goto err;
+ /* 2^(top*BN_BITS2) - m */
+ r->d[0] = (0 - m->d[0]) & BN_MASK2;
+ for (i = 1; i < j; i++)
+ r->d[i] = (~m->d[i]) & BN_MASK2;
+ r->top = j;
+ /*
+ * Upper words will be zero if the corresponding words of 'm' were
+ * 0xfff[...], so decrement r->top accordingly.
+ */
+ bn_correct_top(r);
+ } else
+#endif
if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
goto err;
for (;;) {
@@ -515,6 +543,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (wstart < 0)
break;
}
+#if defined(SPARC_T4_MONT)
+ if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
+ j = mont->N.top; /* borrow j */
+ val[0]->d[0] = 1; /* borrow val[0] */
+ for (i = 1; i < j; i++)
+ val[0]->d[i] = 0;
+ val[0]->top = j;
+ if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
+ goto err;
+ } else
+#endif
if (!BN_from_montgomery(rr, r, mont, ctx))
goto err;
ret = 1;
@@ -526,6 +565,27 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return (ret);
}
+#if defined(SPARC_T4_MONT)
+static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
+{
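+    /*
+     * Return BN_BITS2 bits of |a| starting at bit position |bitpos|;
+     * bits beyond the top of |a| read as zero.
+     */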
+ BN_ULONG ret = 0;
+ int wordpos;
+
+ wordpos = bitpos / BN_BITS2;
+ bitpos %= BN_BITS2;
+ if (wordpos >= 0 && wordpos < a->top) {
+ ret = a->d[wordpos] & BN_MASK2;
+ if (bitpos) {
+ ret >>= bitpos;
+ if (++wordpos < a->top)
+ ret |= a->d[wordpos] << (BN_BITS2 - bitpos);
+ }
+ }
+
+ return ret & BN_MASK2;
+}
+#endif
+
/*
* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
* layout so that accessing any of these table values shows the same access
@@ -594,6 +654,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
int powerbufLen = 0;
unsigned char *powerbuf = NULL;
BIGNUM tmp, am;
+#if defined(SPARC_T4_MONT)
+ unsigned int t4 = 0;
+#endif
bn_check_top(a);
bn_check_top(p);
@@ -626,21 +689,62 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
goto err;
}
+#ifdef RSAZ_ENABLED
+ /*
+     * If the sizes of the operands allow it, perform the optimized
+ * RSAZ exponentiation. For further information see
+ * crypto/bn/rsaz_exp.c and accompanying assembly modules.
+ */
+ if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
+ && rsaz_avx2_eligible()) {
+ if (NULL == bn_wexpand(rr, 16))
+ goto err;
+ RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,
+ mont->n0[0]);
+ rr->top = 16;
+ rr->neg = 0;
+ bn_correct_top(rr);
+ ret = 1;
+ goto err;
+ } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
+ if (NULL == bn_wexpand(rr, 8))
+ goto err;
+ RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
+ rr->top = 8;
+ rr->neg = 0;
+ bn_correct_top(rr);
+ ret = 1;
+ goto err;
+ }
+#endif
+
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(SPARC_T4_MONT)
+ if (window >= 5 && (top & 15) == 0 && top <= 64 &&
+ (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
+ (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))
+ window = 5;
+ else
+#endif
#if defined(OPENSSL_BN_ASM_MONT5)
- if (window == 6 && bits <= 1024)
- window = 5; /* ~5% improvement of 2048-bit RSA sign */
+ if (window >= 5) {
+ window = 5; /* ~5% improvement for RSA2048 sign, and even
+ * for RSA4096 */
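+        /*
+         * reserve room for the zero-interleaved copy of the modulus
+         * (np2) that the power5 code path below relies on
+         */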
+ if ((top & 7) == 0)
+ powerbufLen += 2 * top * sizeof(m->d[0]);
+ }
#endif
+ (void)0;
/*
* Allocate a buffer large enough to hold all of the pre-computed powers
* of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0]) * (top * numPowers +
- ((2 * top) >
- numPowers ? (2 * top) : numPowers));
+ powerbufLen += sizeof(m->d[0]) * (top * numPowers +
+ ((2 * top) >
+ numPowers ? (2 * top) : numPowers));
#ifdef alloca
if (powerbufLen < 3072)
powerbufFree =
@@ -670,15 +774,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
tmp.flags = am.flags = BN_FLG_STATIC_DATA;
/* prepare a^0 in Montgomery domain */
-#if 1
+#if 1 /* by Shay Gueron's suggestion */
+ if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+ /* 2^(top*BN_BITS2) - m */
+ tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
+ for (i = 1; i < top; i++)
+ tmp.d[i] = (~m->d[i]) & BN_MASK2;
+ tmp.top = top;
+ } else
+#endif
if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
goto err;
-#else
- tmp.d[0] = (0 - m->d[0]) & BN_MASK2; /* 2^(top*BN_BITS2) - m */
- for (i = 1; i < top; i++)
- tmp.d[i] = (~m->d[i]) & BN_MASK2;
- tmp.top = top;
-#endif
/* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a, m) >= 0) {
@@ -689,6 +795,138 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} else if (!BN_to_montgomery(&am, a, mont, ctx))
goto err;
+#if defined(SPARC_T4_MONT)
+ if (t4) {
+ typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ static const bn_pwr5_mont_f pwr5_funcs[4] = {
+ bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
+ bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
+ };
+ bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];
+
+ typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
+ const BN_ULONG *np, const BN_ULONG *n0);
+ int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ static const bn_mul_mont_f mul_funcs[4] = {
+ bn_mul_mont_t4_8, bn_mul_mont_t4_16,
+ bn_mul_mont_t4_24, bn_mul_mont_t4_32
+ };
+ bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];
+
+ void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
+ void *table, size_t power);
+ void bn_gather5_t4(BN_ULONG *out, size_t num,
+ void *table, size_t power);
+ void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);
+
+ BN_ULONG *np = mont->N.d, *n0 = mont->n0;
+ int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less
+ * than 32 */
+
+ /*
+ * BN_to_montgomery can contaminate words above .top [in
+ * BN_DEBUG[_DEBUG] build]...
+ */
+ for (i = am.top; i < top; i++)
+ am.d[i] = 0;
+ for (i = tmp.top; i < top; i++)
+ tmp.d[i] = 0;
+
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
+ bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
+ if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
+ !(*mul_worker) (tmp.d, am.d, am.d, np, n0))
+ bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);
+
+ for (i = 3; i < 32; i++) {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
+ !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
+ bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
+ }
+
+ /* switch to 64-bit domain */
+ np = alloca(top * sizeof(BN_ULONG));
+ top /= 2;
+ bn_flip_t4(np, mont->N.d, top);
+
+ bits--;
+ for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ bn_gather5_t4(tmp.d, top, powerbuf, wvalue);
+
+ /*
+ * Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0) {
+ if (bits < stride)
+ stride = bits + 1;
+ bits -= stride;
+ wvalue = bn_get_bits(p, bits + 1);
+
+ if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
+ continue;
+ /* retry once and fall back */
+ if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
+ continue;
+
+ bits += stride - 5;
+ wvalue >>= stride - 5;
+ wvalue &= 31;
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
+ wvalue);
+ }
+
+ bn_flip_t4(tmp.d, tmp.d, top);
+ top *= 2;
+ /* back to 32-bit domain */
+ tmp.top = top;
+ bn_correct_top(&tmp);
+ OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
+ } else
+#endif
#if defined(OPENSSL_BN_ASM_MONT5)
if (window == 5 && top > 1) {
/*
@@ -707,8 +945,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
void bn_scatter5(const BN_ULONG *inp, size_t num,
void *table, size_t power);
void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
+ void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ int bn_get_bits5(const BN_ULONG *ap, int off);
+ int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *not_used, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
- BN_ULONG *np = mont->N.d, *n0 = mont->n0;
+ BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
/*
* BN_to_montgomery can contaminate words above .top [in
@@ -719,6 +964,12 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
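+            /*
+             * When top is a multiple of 8, build a copy of the modulus
+             * interleaved with zero words right after am in powerbuf;
+             * bn_power5 and the sqrx8x reduction address n[] with a
+             * 16-byte stride and expect this layout.
+             */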
+ if (top & 7)
+ np2 = np;
+ else
+ for (np2 = am.d + top, i = 0; i < top; i++)
+ np2[2 * i] = np[i];
+
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
@@ -727,7 +978,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
@@ -738,7 +989,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -746,13 +997,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
@@ -765,20 +1016,34 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
* Scan the exponent one window at a time starting from the most
* significant bits.
*/
- while (bits >= 0) {
- for (wvalue = 0, i = 0; i < 5; i++, bits--)
- wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ if (top & 7)
+ while (bits >= 0) {
+ for (wvalue = 0, i = 0; i < 5; i++, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
+ wvalue);
+ } else {
+ while (bits >= 0) {
+ wvalue = bn_get_bits5(p->d, bits - 4);
+ bits -= 5;
+ bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ }
}
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
+ if (ret) {
+ if (!BN_copy(rr, &tmp))
+ ret = 0;
+                goto err;       /* non-zero ret means it is not an error */
+ }
} else
#endif
{
@@ -844,6 +1109,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
/* Convert the final result from montgomery to standard format */
+#if defined(SPARC_T4_MONT)
+ if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
+ am.d[0] = 1; /* borrow am */
+ for (i = 1; i < top; i++)
+ am.d[i] = 0;
+ if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))
+ goto err;
+ } else
+#endif
if (!BN_from_montgomery(rr, &tmp, mont, ctx))
goto err;
ret = 1;
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index a0ba8de31ad4..cfa1c7ce1499 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -450,8 +450,7 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
d0 = p[k] % BN_BITS2;
d1 = BN_BITS2 - d0;
z[n] ^= (zz << d0);
- tmp_ulong = zz >> d1;
- if (d0 && tmp_ulong)
+ if (d0 && (tmp_ulong = zz >> d1))
z[n + 1] ^= tmp_ulong;
}
diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index 904a723497d1..00f4f09945b3 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -204,6 +204,24 @@ extern "C" {
# define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32)/* 32 */
# define BN_MONT_CTX_SET_SIZE_WORD (64)/* 32 */
+/*
+ * 2011-02-22 SMS. In various places, a size_t variable or a type cast to
+ * size_t was used to perform integer-only operations on pointers. This
+ * failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
+ * is still only 32 bits. What's needed in these cases is an integer type
+ * with the same size as a pointer, which size_t is not certain to be. The
+ * only fix here is VMS-specific.
+ */
+# if defined(OPENSSL_SYS_VMS)
+# if __INITIAL_POINTER_SIZE == 64
+# define PTR_SIZE_INT long long
+# else /* __INITIAL_POINTER_SIZE == 64 */
+# define PTR_SIZE_INT int
+# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
+# elif !defined(PTR_SIZE_INT) /* defined(OPENSSL_SYS_VMS) */
+# define PTR_SIZE_INT size_t
+# endif /* defined(OPENSSL_SYS_VMS) [else] */
+
# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
/*
* BN_UMULT_HIGH section.
@@ -295,6 +313,15 @@ unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
: "r"(a), "r"(b));
# endif
# endif
+# elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG)
+# if defined(__GNUC__) && __GNUC__>=2
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("umulh %0,%1,%2" \
+ : "=r"(ret) \
+ : "r"(a), "r"(b)); \
+ ret; })
+# endif
# endif /* cpu */
# endif /* OPENSSL_NO_ASM */
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 06662c58b32f..470d5dabf1ec 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -1042,7 +1042,6 @@ int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx)
int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx)
{
BIGNUM *a, *p, *m, *d, *e;
-
BN_MONT_CTX *mont;
a = BN_new();
@@ -1050,7 +1049,6 @@ int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx)
m = BN_new();
d = BN_new();
e = BN_new();
-
mont = BN_MONT_CTX_new();
BN_bntest_rand(m, 1024, 0, 1); /* must be odd for montgomery */
@@ -1099,6 +1097,7 @@ int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx)
fprintf(stderr, "Modular exponentiation test failed!\n");
return 0;
}
+ BN_MONT_CTX_free(mont);
BN_free(a);
BN_free(p);
BN_free(m);
diff --git a/crypto/bn/rsaz_exp.c b/crypto/bn/rsaz_exp.c
new file mode 100644
index 000000000000..c54c6feb51b5
--- /dev/null
+++ b/crypto/bn/rsaz_exp.c
@@ -0,0 +1,346 @@
+/*****************************************************************************
+* *
+* Copyright (c) 2012, Intel Corporation *
+* *
+* All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without *
+* modification, are permitted provided that the following conditions are *
+* met: *
+* *
+* * Redistributions of source code must retain the above copyright *
+* notice, this list of conditions and the following disclaimer. *
+* *
+* * Redistributions in binary form must reproduce the above copyright *
+* notice, this list of conditions and the following disclaimer in the *
+* documentation and/or other materials provided with the *
+* distribution. *
+* *
+* * Neither the name of the Intel Corporation nor the names of its *
+* contributors may be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* *
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY *
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR *
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+******************************************************************************
+* Developers and authors: *
+* Shay Gueron (1, 2), and Vlad Krasnov (1) *
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel *
+* (2) University of Haifa, Israel *
+*****************************************************************************/
+
+#include "rsaz_exp.h"
+
+#ifdef RSAZ_ENABLED
+
+/*
+ * See crypto/bn/asm/rsaz-avx2.pl for further details.
+ */
+void rsaz_1024_norm2red_avx2(void *red, const void *norm);
+void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b,
+ const void *n, BN_ULONG k);
+void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
+ int cnt);
+void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
+void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
+void rsaz_1024_red2norm_avx2(void *norm, const void *red);
+
+#if defined(__GNUC__)
+# define ALIGN64 __attribute__((aligned(64)))
+#elif defined(_MSC_VER)
+# define ALIGN64 __declspec(align(64))
+#elif defined(__SUNPRO_C)
+# define ALIGN64
+# pragma align 64(one,two80)
+#else
+/* not fatal, might hurt performance a little */
+# define ALIGN64
+#endif
+
+ALIGN64 static const BN_ULONG one[40] = {
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+ALIGN64 static const BN_ULONG two80[40] = {
+ 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
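+/*
+ * The AVX2 code (see asm/rsaz-avx2.pl) keeps 1024-bit values in 29-bit
+ * limbs, so limb 2 carries weight 2^(2*29) = 2^58 and the 1 << 22 above
+ * places this constant at 2^80.
+ */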
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
+ const BN_ULONG base_norm[16],
+ const BN_ULONG exponent[16],
+ const BN_ULONG m_norm[16], const BN_ULONG RR[16],
+ BN_ULONG k0)
+{
+ unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */
+ unsigned char *p_str = storage + (64 - ((size_t)storage % 64));
+ unsigned char *a_inv, *m, *result;
+ unsigned char *table_s = p_str + 320 * 3;
+ unsigned char *R2 = table_s; /* borrow */
+ int index;
+ int wvalue;
+
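+    /*
+     * m is read by every multiplication below; choose the layout in
+     * which its 320-byte buffer does not cross a 4KB page boundary
+     * (the test checks whether the first 320 bytes of p_str do),
+     * presumably to avoid split-page access penalties.
+     */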
+ if ((((size_t)p_str & 4095) + 320) >> 12) {
+ result = p_str;
+ a_inv = p_str + 320;
+ m = p_str + 320 * 2; /* should not cross page */
+ } else {
+ m = p_str; /* should not cross page */
+ result = p_str + 320;
+ a_inv = p_str + 320 * 2;
+ }
+
+ rsaz_1024_norm2red_avx2(m, m_norm);
+ rsaz_1024_norm2red_avx2(a_inv, base_norm);
+ rsaz_1024_norm2red_avx2(R2, RR);
+
+ rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
+ rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
+
+ /* table[0] = 1 */
+ rsaz_1024_mul_avx2(result, R2, one, m, k0);
+ /* table[1] = a_inv^1 */
+ rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
+
+ rsaz_1024_scatter5_avx2(table_s, result, 0);
+ rsaz_1024_scatter5_avx2(table_s, a_inv, 1);
+
+ /* table[2] = a_inv^2 */
+ rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 2);
+#if 0
+ /* this is almost 2x smaller and less than 1% slower */
+ for (index = 3; index < 32; index++) {
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, index);
+ }
+#else
+ /* table[4] = a_inv^4 */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 4);
+ /* table[8] = a_inv^8 */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 8);
+ /* table[16] = a_inv^16 */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 16);
+ /* table[17] = a_inv^17 */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 17);
+
+ /* table[3] */
+ rsaz_1024_gather5_avx2(result, table_s, 2);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 3);
+ /* table[6] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 6);
+ /* table[12] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 12);
+ /* table[24] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 24);
+ /* table[25] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 25);
+
+ /* table[5] */
+ rsaz_1024_gather5_avx2(result, table_s, 4);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 5);
+ /* table[10] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 10);
+ /* table[20] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 20);
+ /* table[21] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 21);
+
+ /* table[7] */
+ rsaz_1024_gather5_avx2(result, table_s, 6);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 7);
+ /* table[14] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 14);
+ /* table[28] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 28);
+ /* table[29] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 29);
+
+ /* table[9] */
+ rsaz_1024_gather5_avx2(result, table_s, 8);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 9);
+ /* table[18] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 18);
+ /* table[19] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 19);
+
+ /* table[11] */
+ rsaz_1024_gather5_avx2(result, table_s, 10);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 11);
+ /* table[22] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 22);
+ /* table[23] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 23);
+
+ /* table[13] */
+ rsaz_1024_gather5_avx2(result, table_s, 12);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 13);
+ /* table[26] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 26);
+ /* table[27] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 27);
+
+ /* table[15] */
+ rsaz_1024_gather5_avx2(result, table_s, 14);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 15);
+ /* table[30] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, 30);
+ /* table[31] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, 31);
+#endif
+
+ /* load first window */
+ p_str = (unsigned char *)exponent;
+ wvalue = p_str[127] >> 3;
+ rsaz_1024_gather5_avx2(result, table_s, wvalue);
+
+ index = 1014;
+
+    while (index > -1) {        /* remaining 203 5-bit windows: 5 + 203*5 + 4 = 1024 bits */
+
+ rsaz_1024_sqr_avx2(result, result, m, k0, 5);
+
+ wvalue = *((unsigned short *)&p_str[index / 8]);
+ wvalue = (wvalue >> (index % 8)) & 31;
+ index -= 5;
+
+ rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ }
+
+ /* square four times */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 4);
+
+ wvalue = p_str[0] & 15;
+
+ rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+
+ /* from Montgomery */
+ rsaz_1024_mul_avx2(result, result, one, m, k0);
+
+ rsaz_1024_red2norm_avx2(result_norm, result);
+
+ OPENSSL_cleanse(storage, sizeof(storage));
+}
+
+/*
+ * See crypto/bn/rsaz-x86_64.pl for further details.
+ */
+void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
+ BN_ULONG k);
+void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n,
+ BN_ULONG k, const void *tbl, unsigned int power);
+void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
+ const void *n, BN_ULONG k, unsigned int power);
+void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
+void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,
+ int cnt);
+void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
+void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+ const BN_ULONG base[8], const BN_ULONG exponent[8],
+ const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
+{
+ unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */
+ unsigned char *table = storage + (64 - ((size_t)storage % 64));
+ BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8);
+ BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8);
+ unsigned char *p_str = (unsigned char *)exponent;
+ int index;
+ unsigned int wvalue;
+
+    /*
+     * table[0] = 1_inv, i.e. 1 in Montgomery form: since m is exactly
+     * 512 bits its top bit is set, so R mod m = 2^512 - m.
+     */
+ temp[0] = 0 - m[0];
+ temp[1] = ~m[1];
+ temp[2] = ~m[2];
+ temp[3] = ~m[3];
+ temp[4] = ~m[4];
+ temp[5] = ~m[5];
+ temp[6] = ~m[6];
+ temp[7] = ~m[7];
+ rsaz_512_scatter4(table, temp, 0);
+
+ /* table [1] = a_inv^1 */
+ rsaz_512_mul(a_inv, base, RR, m, k0);
+ rsaz_512_scatter4(table, a_inv, 1);
+
+ /* table [2] = a_inv^2 */
+ rsaz_512_sqr(temp, a_inv, m, k0, 1);
+ rsaz_512_scatter4(table, temp, 2);
+
+ for (index = 3; index < 16; index++)
+ rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
+
+ /* load first window */
+ wvalue = p_str[63];
+
+ rsaz_512_gather4(temp, table, wvalue >> 4);
+ rsaz_512_sqr(temp, temp, m, k0, 4);
+ rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf);
+
+ for (index = 62; index >= 0; index--) {
+ wvalue = p_str[index];
+
+ rsaz_512_sqr(temp, temp, m, k0, 4);
+ rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4);
+
+ rsaz_512_sqr(temp, temp, m, k0, 4);
+ rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f);
+ }
+
+ /* from Montgomery */
+ rsaz_512_mul_by_one(result, temp, m, k0);
+
+ OPENSSL_cleanse(storage, sizeof(storage));
+}
+
+#else
+
+# if defined(PEDANTIC) || defined(__DECC) || defined(__clang__)
+static void *dummy = &dummy;
+# endif
+
+#endif
diff --git a/crypto/bn/rsaz_exp.h b/crypto/bn/rsaz_exp.h
new file mode 100644
index 000000000000..33361de99572
--- /dev/null
+++ b/crypto/bn/rsaz_exp.h
@@ -0,0 +1,56 @@
+/******************************************************************************
+* Copyright(c) 2012, Intel Corp.
+* Developers and authors:
+* Shay Gueron (1, 2), and Vlad Krasnov (1)
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel
+* (2) University of Haifa, Israel
+******************************************************************************
+* LICENSE:
+* This submission to OpenSSL is to be made available under the OpenSSL
+* license, and only to the OpenSSL project, in order to allow integration
+* into the publicly distributed code.
+* The use of this code, or portions of this code, or concepts embedded in
+* this code, or modification of this code and/or algorithm(s) in it, or the
+* use of this code for any other purpose than stated above, requires special
+* licensing.
+******************************************************************************
+* DISCLAIMER:
+* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS
+* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
+* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************/
+
+#ifndef RSAZ_EXP_H
+# define RSAZ_EXP_H
+
+# undef RSAZ_ENABLED
+# if defined(OPENSSL_BN_ASM_MONT) && \
+ (defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64))
+# define RSAZ_ENABLED
+
+# include <openssl/bn.h>
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
+ const BN_ULONG base_norm[16],
+ const BN_ULONG exponent[16],
+ const BN_ULONG m_norm[16], const BN_ULONG RR[16],
+ BN_ULONG k0);
+int rsaz_avx2_eligible();
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+ const BN_ULONG base_norm[8], const BN_ULONG exponent[8],
+ const BN_ULONG m_norm[8], BN_ULONG k0,
+ const BN_ULONG RR[8]);
+
+# endif
+
+#endif
diff --git a/crypto/buffer/buf_str.c b/crypto/buffer/buf_str.c
index fdde3d7db4ba..ebc5ab4646ce 100644
--- a/crypto/buffer/buf_str.c
+++ b/crypto/buffer/buf_str.c
@@ -60,6 +60,15 @@
#include "cryptlib.h"
#include <openssl/buffer.h>
+size_t BUF_strnlen(const char *str, size_t maxlen)
+{
+ const char *p;
+
+ for (p = str; maxlen-- != 0 && *p != '\0'; ++p) ;
+
+ return p - str;
+}
+
char *BUF_strdup(const char *str)
{
if (str == NULL)
@@ -74,6 +83,8 @@ char *BUF_strndup(const char *str, size_t siz)
if (str == NULL)
return (NULL);
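+    /*
+     * clamp siz to the actual string length, so the allocation below
+     * is sized to the string rather than to the caller's bound
+     */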
+ siz = BUF_strnlen(str, siz);
+
ret = OPENSSL_malloc(siz + 1);
if (ret == NULL) {
BUFerr(BUF_F_BUF_STRNDUP, ERR_R_MALLOC_FAILURE);
diff --git a/crypto/buffer/buffer.h b/crypto/buffer/buffer.h
index 632df93c657c..c343dd772f1e 100644
--- a/crypto/buffer/buffer.h
+++ b/crypto/buffer/buffer.h
@@ -84,6 +84,7 @@ BUF_MEM *BUF_MEM_new(void);
void BUF_MEM_free(BUF_MEM *a);
int BUF_MEM_grow(BUF_MEM *str, size_t len);
int BUF_MEM_grow_clean(BUF_MEM *str, size_t len);
+size_t BUF_strnlen(const char *str, size_t maxlen);
char *BUF_strdup(const char *str);
char *BUF_strndup(const char *str, size_t siz);
void *BUF_memdup(const void *data, size_t siz);
diff --git a/crypto/camellia/Makefile b/crypto/camellia/Makefile
index 228f1dc713d5..ab1225e7d902 100644
--- a/crypto/camellia/Makefile
+++ b/crypto/camellia/Makefile
@@ -48,6 +48,8 @@ cmll-x86.s: asm/cmll-x86.pl ../perlasm/x86asm.pl
$(PERL) asm/cmll-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
cmll-x86_64.s: asm/cmll-x86_64.pl
$(PERL) asm/cmll-x86_64.pl $(PERLASM_SCHEME) > $@
+cmllt4-sparcv9.s: asm/cmllt4-sparcv9.pl ../perlasm/sparcv9_modes.pl
+ $(PERL) asm/cmllt4-sparcv9.pl $(CFLAGS) > $@
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/camellia/asm/cmll-x86_64.pl b/crypto/camellia/asm/cmll-x86_64.pl
index 9f4b82fa4821..d94f46b887e5 100755
--- a/crypto/camellia/asm/cmll-x86_64.pl
+++ b/crypto/camellia/asm/cmll-x86_64.pl
@@ -72,7 +72,7 @@ my $i=@_[0];
my $seed=defined(@_[1])?@_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
-my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4];
+my ($s0,$s1,$s2,$s3)=(@S[($j)%4],@S[($j+1)%4],@S[($j+2)%4],@S[($j+3)%4]);
$code.=<<___;
xor $s0,$t0 # t0^=key[0]
@@ -409,7 +409,7 @@ Camellia_Ekeygen:
push %r15
.Lkey_prologue:
- mov %rdi,$keyend # put away arguments, keyBitLength
+ mov %edi,${keyend}d # put away arguments, keyBitLength
mov %rdx,$out # keyTable
mov 0(%rsi),@S[0] # load 0-127 bits
diff --git a/crypto/camellia/asm/cmllt4-sparcv9.pl b/crypto/camellia/asm/cmllt4-sparcv9.pl
new file mode 100755
index 000000000000..a813168b42ec
--- /dev/null
+++ b/crypto/camellia/asm/cmllt4-sparcv9.pl
@@ -0,0 +1,929 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. October 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# Camellia for SPARC T4.
+#
+# As with the AES module below, results [for aligned data] are
+# virtually identical to critical path lengths for 3-cycle instruction
+# latency:
+#
+#			128-bit key	192/256-bit key
+# CBC encrypt		4.14/4.21(*)	5.46/5.52
+#			(*) numbers after slash are for
+#			    misaligned data.
+#
+# As with Intel AES-NI, the question is whether it's possible to
+# improve performance of parallelizable modes by interleaving round
+# instructions. In Camellia every instruction depends on the previous
+# one, which leaves room for 2 additional instructions between each
+# dependent pair. Can we expect a 3x performance improvement? One can
+# at least argue that it should be possible to break the 2x barrier...
+# Yet for some reason not even 2x appears to be possible:
+#
+#			128-bit key	192/256-bit key
+# CBC decrypt		2.21/2.74	2.99/3.40
+# CTR			2.15/2.68(*)	2.93/3.34
+#			(*) numbers after slash are for
+#			    misaligned data.
+#
+# This is for 2x interleave. Compared to 1x interleave, CBC decrypt
+# improved by ... 0% for a 128-bit key and by 11% for 192/256-bit
+# keys. So the out-of-order execution logic can take the
+# non-interleaved code to 1.87x, but can't take the 2x interleaved one
+# any further. There surely is some explanation... As a result, 3x
+# interleave was not even attempted. Instead an effort was made to
+# share the mode-specific implementations with the AES module (hence
+# sparcv9_modes.pl).
+#
+# For a reference point, a software C implementation processes one
+# byte in 38 cycles with a 128-bit key on the same processor.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$::evp=1; # if $evp is set to 0, script generates module with
+# Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt
+# entry points. These are fully compatible with openssl/camellia.h.
+
+######################################################################
+# single-round subroutines
+#
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
+
+$code=<<___;
+.text
+
+.globl cmll_t4_encrypt
+.align 32
+cmll_t4_encrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
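+	! input is misaligned: shift the three doublewords by the byte
+	! offset (%g1) and merge them into an aligned 16-byte block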
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 272], $rounds ! grandRounds, 3 or 4
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ ldd [$key + 32], %f16
+ ldd [$key + 40], %f18
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ ldd [$key + 48], %f20
+ ldd [$key + 56], %f22
+ sub $rounds, 1, $rounds
+ ldd [$key + 64], %f24
+ ldd [$key + 72], %f26
+ add $key, 80, $key
+
+.Lenc:
+ camellia_f %f12, %f2, %f0, %f2
+ ldd [$key + 0], %f12
+ sub $rounds,1,$rounds
+ camellia_f %f14, %f0, %f2, %f0
+ ldd [$key + 8], %f14
+ camellia_f %f16, %f2, %f0, %f2
+ ldd [$key + 16], %f16
+ camellia_f %f18, %f0, %f2, %f0
+ ldd [$key + 24], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ ldd [$key + 32], %f20
+ camellia_f %f22, %f0, %f2, %f0
+ ldd [$key + 40], %f22
+ camellia_fl %f24, %f0, %f0
+ ldd [$key + 48], %f24
+ camellia_fli %f26, %f2, %f2
+ ldd [$key + 56], %f26
+ brnz,pt $rounds, .Lenc
+ add $key, 64, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ camellia_f %f12, %f2, %f0, %f2
+ camellia_f %f14, %f0, %f2, %f0
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ camellia_f %f20, %f2, %f0, %f4
+ camellia_f %f22, %f0, %f4, %f2
+ fxor %f24, %f4, %f0
+ fxor %f26, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type cmll_t4_encrypt,#function
+.size cmll_t4_encrypt,.-cmll_t4_encrypt
+
+.globl cmll_t4_decrypt
+.align 32
+cmll_t4_decrypt:
+ ld [$key + 272], $rounds ! grandRounds, 3 or 4
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ sll $rounds, 6, $rounds
+ add $rounds, $key, $key
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %g4
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %g4, %o5
+ srlx $inp, %g4, %g4
+ or %o5, %o4, %o4
+ or %g4, %g1, %o5
+1:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+ ldd [$key - 8], %f12
+ ldd [$key - 16], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ ldd [$key - 24], %f16
+ ldd [$key - 32], %f18
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ ldd [$key - 40], %f20
+ ldd [$key - 48], %f22
+ sub $rounds, 64, $rounds
+ ldd [$key - 56], %f24
+ ldd [$key - 64], %f26
+ sub $key, 64, $key
+
+.Ldec:
+ camellia_f %f12, %f2, %f0, %f2
+ ldd [$key - 8], %f12
+ sub $rounds, 64, $rounds
+ camellia_f %f14, %f0, %f2, %f0
+ ldd [$key - 16], %f14
+ camellia_f %f16, %f2, %f0, %f2
+ ldd [$key - 24], %f16
+ camellia_f %f18, %f0, %f2, %f0
+ ldd [$key - 32], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ ldd [$key - 40], %f20
+ camellia_f %f22, %f0, %f2, %f0
+ ldd [$key - 48], %f22
+ camellia_fl %f24, %f0, %f0
+ ldd [$key - 56], %f24
+ camellia_fli %f26, %f2, %f2
+ ldd [$key - 64], %f26
+ brnz,pt $rounds, .Ldec
+ sub $key, 64, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ camellia_f %f12, %f2, %f0, %f2
+ camellia_f %f14, %f0, %f2, %f0
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ camellia_f %f20, %f2, %f0, %f4
+ camellia_f %f22, %f0, %f4, %f2
+ fxor %f26, %f4, %f0
+ fxor %f24, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type cmll_t4_decrypt,#function
+.size cmll_t4_decrypt,.-cmll_t4_decrypt
+___
+}
+
+######################################################################
+# key setup subroutines
+#
+{
+sub ROTL128 {
+ my $rot = shift;
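+    # emit code that rotates the 128-bit value in %o4:%o5 (%o4 holds
+    # the high half) left by $rot bits, clobbering %g4 and %g5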
+
+ "srlx %o4, 64-$rot, %g4\n\t".
+ "sllx %o4, $rot, %o4\n\t".
+ "srlx %o5, 64-$rot, %g5\n\t".
+ "sllx %o5, $rot, %o5\n\t".
+ "or %o4, %g5, %o4\n\t".
+ "or %o5, %g4, %o5";
+}
+
+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
+$code.=<<___;
+.globl cmll_t4_set_key
+.align 32
+cmll_t4_set_key:
+ and $inp, 7, $tmp
+ alignaddr $inp, %g0, $inp
+ cmp $bits, 192
+ ldd [$inp + 0], %f0
+ bl,pt %icc,.L128
+ ldd [$inp + 8], %f2
+
+ be,pt %icc,.L192
+ ldd [$inp + 16], %f4
+
+ brz,pt $tmp, .L256aligned
+ ldd [$inp + 24], %f6
+
+ ldd [$inp + 32], %f8
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ b .L256aligned
+ faligndata %f6, %f8, %f6
+
+.align 16
+.L192:
+ brz,a,pt $tmp, .L256aligned
+ fnot2 %f4, %f6
+
+ ldd [$inp + 24], %f6
+ nop
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ fnot2 %f4, %f6
+
+.L256aligned:
+ std %f0, [$out + 0] ! k[0, 1]
+ fsrc2 %f0, %f28
+ std %f2, [$out + 8] ! k[2, 3]
+ fsrc2 %f2, %f30
+ fxor %f4, %f0, %f0
+ b .L128key
+ fxor %f6, %f2, %f2
+
+.align 16
+.L128:
+ brz,pt $tmp, .L128aligned
+ nop
+
+ ldd [$inp + 16], %f4
+ nop
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+
+.L128aligned:
+ std %f0, [$out + 0] ! k[0, 1]
+ fsrc2 %f0, %f28
+ std %f2, [$out + 8] ! k[2, 3]
+ fsrc2 %f2, %f30
+
+.L128key:
+ mov %o7, %o5
+1: call .+8
+ add %o7, SIGMA-1b, %o4
+ mov %o5, %o7
+
+ ldd [%o4 + 0], %f16
+ ldd [%o4 + 8], %f18
+ ldd [%o4 + 16], %f20
+ ldd [%o4 + 24], %f22
+
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ fxor %f28, %f0, %f0
+ fxor %f30, %f2, %f2
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f22, %f0, %f2, %f0
+
+ bge,pn %icc, .L256key
+ nop
+ std %f0, [$out + 0x10] ! k[ 4, 5]
+ std %f2, [$out + 0x18] ! k[ 6, 7]
+
+ movdtox %f0, %o4
+ movdtox %f2, %o5
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x30] ! k[12, 13]
+ stx %o5, [$out + 0x38] ! k[14, 15]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x40] ! k[16, 17]
+ stx %o5, [$out + 0x48] ! k[18, 19]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x60] ! k[24, 25]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x70] ! k[28, 29]
+ stx %o5, [$out + 0x78] ! k[30, 31]
+ `&ROTL128(34)`
+ stx %o4, [$out + 0xa0] ! k[40, 41]
+ stx %o5, [$out + 0xa8] ! k[42, 43]
+ `&ROTL128(17)`
+ stx %o4, [$out + 0xc0] ! k[48, 49]
+ stx %o5, [$out + 0xc8] ! k[50, 51]
+
+ movdtox %f28, %o4 ! k[ 0, 1]
+ movdtox %f30, %o5 ! k[ 2, 3]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x20] ! k[ 8, 9]
+ stx %o5, [$out + 0x28] ! k[10, 11]
+ `&ROTL128(30)`
+ stx %o4, [$out + 0x50] ! k[20, 21]
+ stx %o5, [$out + 0x58] ! k[22, 23]
+ `&ROTL128(15)`
+ stx %o5, [$out + 0x68] ! k[26, 27]
+ `&ROTL128(17)`
+ stx %o4, [$out + 0x80] ! k[32, 33]
+ stx %o5, [$out + 0x88] ! k[34, 35]
+ `&ROTL128(17)`
+ stx %o4, [$out + 0x90] ! k[36, 37]
+ stx %o5, [$out + 0x98] ! k[38, 39]
+ `&ROTL128(17)`
+ stx %o4, [$out + 0xb0] ! k[44, 45]
+ stx %o5, [$out + 0xb8] ! k[46, 47]
+
+ mov 3, $tmp
+ st $tmp, [$out + 0x110]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L256key:
+ ldd [%o4 + 32], %f24
+ ldd [%o4 + 40], %f26
+
+ std %f0, [$out + 0x30] ! k[12, 13]
+ std %f2, [$out + 0x38] ! k[14, 15]
+
+ fxor %f4, %f0, %f0
+ fxor %f6, %f2, %f2
+ camellia_f %f24, %f2, %f0, %f2
+ camellia_f %f26, %f0, %f2, %f0
+
+ std %f0, [$out + 0x10] ! k[ 4, 5]
+ std %f2, [$out + 0x18] ! k[ 6, 7]
+
+ movdtox %f0, %o4
+ movdtox %f2, %o5
+ `&ROTL128(30)`
+ stx %o4, [$out + 0x50] ! k[20, 21]
+ stx %o5, [$out + 0x58] ! k[22, 23]
+ `&ROTL128(30)`
+ stx %o4, [$out + 0xa0] ! k[40, 41]
+ stx %o5, [$out + 0xa8] ! k[42, 43]
+ `&ROTL128(51)`
+ stx %o4, [$out + 0x100] ! k[64, 65]
+ stx %o5, [$out + 0x108] ! k[66, 67]
+
+ movdtox %f4, %o4 ! k[ 8, 9]
+ movdtox %f6, %o5 ! k[10, 11]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x20] ! k[ 8, 9]
+ stx %o5, [$out + 0x28] ! k[10, 11]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x40] ! k[16, 17]
+ stx %o5, [$out + 0x48] ! k[18, 19]
+ `&ROTL128(30)`
+ stx %o4, [$out + 0x90] ! k[36, 37]
+ stx %o5, [$out + 0x98] ! k[38, 39]
+ `&ROTL128(34)`
+ stx %o4, [$out + 0xd0] ! k[52, 53]
+ stx %o5, [$out + 0xd8] ! k[54, 55]
+ ldx [$out + 0x30], %o4 ! k[12, 13]
+ ldx [$out + 0x38], %o5 ! k[14, 15]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x30] ! k[12, 13]
+ stx %o5, [$out + 0x38] ! k[14, 15]
+ `&ROTL128(30)`
+ stx %o4, [$out + 0x70] ! k[28, 29]
+ stx %o5, [$out + 0x78] ! k[30, 31]
+ srlx %o4, 32, %g4
+ srlx %o5, 32, %g5
+ st %o4, [$out + 0xc0] ! k[48]
+ st %g5, [$out + 0xc4] ! k[49]
+ st %o5, [$out + 0xc8] ! k[50]
+ st %g4, [$out + 0xcc] ! k[51]
+ `&ROTL128(49)`
+ stx %o4, [$out + 0xe0] ! k[56, 57]
+ stx %o5, [$out + 0xe8] ! k[58, 59]
+
+ movdtox %f28, %o4 ! k[ 0, 1]
+ movdtox %f30, %o5 ! k[ 2, 3]
+ `&ROTL128(45)`
+ stx %o4, [$out + 0x60] ! k[24, 25]
+ stx %o5, [$out + 0x68] ! k[26, 27]
+ `&ROTL128(15)`
+ stx %o4, [$out + 0x80] ! k[32, 33]
+ stx %o5, [$out + 0x88] ! k[34, 35]
+ `&ROTL128(17)`
+ stx %o4, [$out + 0xb0] ! k[44, 45]
+ stx %o5, [$out + 0xb8] ! k[46, 47]
+ `&ROTL128(34)`
+ stx %o4, [$out + 0xf0] ! k[60, 61]
+ stx %o5, [$out + 0xf8] ! k[62, 63]
+
+ mov 4, $tmp
+ st $tmp, [$out + 0x110]
+ retl
+ xor %o0, %o0, %o0
+.type cmll_t4_set_key,#function
+.size cmll_t4_set_key,.-cmll_t4_set_key
+.align 32
+SIGMA:
+ .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2
+ .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c
+ .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd
+.type SIGMA,#object
+.size SIGMA,.-SIGMA
+.asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov"
+___
+}
+
+{{{
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
+
+$code.=<<___;
+.align 32
+_cmll128_load_enckey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<26;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _cmll128_load_enckey,#function
+.size _cmll128_load_enckey,.-_cmll128_load_enckey
+_cmll256_load_enckey=_cmll128_load_enckey
+
+.align 32
+_cmll256_load_deckey:
+ ldd [$key + 64], %f62
+ ldd [$key + 72], %f60
+ b .Load_deckey
+ add $key, 64, $key
+_cmll128_load_deckey:
+ ldd [$key + 0], %f60
+ ldd [$key + 8], %f62
+.Load_deckey:
+___
+for ($i=2; $i<24;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`62-2*$i`
+___
+}
+$code.=<<___;
+ ldx [$key + 192], %g4
+ retl
+ ldx [$key + 200], %g5
+.type _cmll256_load_deckey,#function
+.size _cmll256_load_deckey,.-_cmll256_load_deckey
+
+.align 32
+_cmll128_encrypt_1x:
+___
+for ($i=0; $i<3; $i++) {
+ $code.=<<___;
+ camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
+___
+$code.=<<___ if ($i<2);
+ camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
+ camellia_fl %f`16+16*$i+12`, %f0, %f0
+ camellia_fli %f`16+16*$i+14`, %f2, %f2
+___
+}
+$code.=<<___;
+ camellia_f %f56, %f2, %f0, %f4
+ camellia_f %f58, %f0, %f4, %f2
+ fxor %f60, %f4, %f0
+ retl
+ fxor %f62, %f2, %f2
+.type _cmll128_encrypt_1x,#function
+.size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x
+_cmll128_decrypt_1x=_cmll128_encrypt_1x
+
+.align 32
+_cmll128_encrypt_2x:
+___
+for ($i=0; $i<3; $i++) {
+ $code.=<<___;
+ camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
+ camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
+___
+$code.=<<___ if ($i<2);
+ camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
+ camellia_fl %f`16+16*$i+12`, %f0, %f0
+ camellia_fl %f`16+16*$i+12`, %f4, %f4
+ camellia_fli %f`16+16*$i+14`, %f2, %f2
+ camellia_fli %f`16+16*$i+14`, %f6, %f6
+___
+}
+$code.=<<___;
+ camellia_f %f56, %f2, %f0, %f8
+ camellia_f %f56, %f6, %f4, %f10
+ camellia_f %f58, %f0, %f8, %f2
+ camellia_f %f58, %f4, %f10, %f6
+ fxor %f60, %f8, %f0
+ fxor %f60, %f10, %f4
+ fxor %f62, %f2, %f2
+ retl
+ fxor %f62, %f6, %f6
+.type _cmll128_encrypt_2x,#function
+.size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x
+_cmll128_decrypt_2x=_cmll128_encrypt_2x
+
+.align 32
+_cmll256_encrypt_1x:
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f22, %f0, %f2, %f0
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+ camellia_f %f24, %f2, %f0, %f2
+ camellia_f %f26, %f0, %f2, %f0
+ ldd [$key + 240], %f24
+ ldd [$key + 248], %f26
+ camellia_fl %f28, %f0, %f0
+ camellia_fli %f30, %f2, %f2
+ ldd [$key + 256], %f28
+ ldd [$key + 264], %f30
+___
+for ($i=1; $i<3; $i++) {
+ $code.=<<___;
+ camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
+ camellia_fl %f`16+16*$i+12`, %f0, %f0
+ camellia_fli %f`16+16*$i+14`, %f2, %f2
+___
+}
+$code.=<<___;
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f22, %f0, %f2, %f0
+ ldd [$key + 32], %f20
+ ldd [$key + 40], %f22
+ camellia_f %f24, %f2, %f0, %f4
+ camellia_f %f26, %f0, %f4, %f2
+ ldd [$key + 48], %f24
+ ldd [$key + 56], %f26
+ fxor %f28, %f4, %f0
+ fxor %f30, %f2, %f2
+ ldd [$key + 64], %f28
+ retl
+ ldd [$key + 72], %f30
+.type _cmll256_encrypt_1x,#function
+.size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x
+
+.align 32
+_cmll256_encrypt_2x:
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f16, %f6, %f4, %f6
+ camellia_f %f18, %f0, %f2, %f0
+ camellia_f %f18, %f4, %f6, %f4
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f20, %f6, %f4, %f6
+ camellia_f %f22, %f0, %f2, %f0
+ camellia_f %f22, %f4, %f6, %f4
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+ camellia_f %f24, %f2, %f0, %f2
+ camellia_f %f24, %f6, %f4, %f6
+ camellia_f %f26, %f0, %f2, %f0
+ camellia_f %f26, %f4, %f6, %f4
+ ldd [$key + 240], %f24
+ ldd [$key + 248], %f26
+ camellia_fl %f28, %f0, %f0
+ camellia_fl %f28, %f4, %f4
+ camellia_fli %f30, %f2, %f2
+ camellia_fli %f30, %f6, %f6
+ ldd [$key + 256], %f28
+ ldd [$key + 264], %f30
+___
+for ($i=1; $i<3; $i++) {
+ $code.=<<___;
+ camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
+ camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
+ camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
+ camellia_fl %f`16+16*$i+12`, %f0, %f0
+ camellia_fl %f`16+16*$i+12`, %f4, %f4
+ camellia_fli %f`16+16*$i+14`, %f2, %f2
+ camellia_fli %f`16+16*$i+14`, %f6, %f6
+___
+}
+$code.=<<___;
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f16, %f6, %f4, %f6
+ camellia_f %f18, %f0, %f2, %f0
+ camellia_f %f18, %f4, %f6, %f4
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f20, %f6, %f4, %f6
+ camellia_f %f22, %f0, %f2, %f0
+ camellia_f %f22, %f4, %f6, %f4
+ ldd [$key + 32], %f20
+ ldd [$key + 40], %f22
+ camellia_f %f24, %f2, %f0, %f8
+ camellia_f %f24, %f6, %f4, %f10
+ camellia_f %f26, %f0, %f8, %f2
+ camellia_f %f26, %f4, %f10, %f6
+ ldd [$key + 48], %f24
+ ldd [$key + 56], %f26
+ fxor %f28, %f8, %f0
+ fxor %f28, %f10, %f4
+ fxor %f30, %f2, %f2
+ fxor %f30, %f6, %f6
+ ldd [$key + 64], %f28
+ retl
+ ldd [$key + 72], %f30
+.type _cmll256_encrypt_2x,#function
+.size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x
+
+.align 32
+_cmll256_decrypt_1x:
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ ldd [$key - 8], %f16
+ ldd [$key - 16], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f22, %f0, %f2, %f0
+ ldd [$key - 24], %f20
+ ldd [$key - 32], %f22
+ camellia_f %f24, %f2, %f0, %f2
+ camellia_f %f26, %f0, %f2, %f0
+ ldd [$key - 40], %f24
+ ldd [$key - 48], %f26
+ camellia_fl %f28, %f0, %f0
+ camellia_fli %f30, %f2, %f2
+ ldd [$key - 56], %f28
+ ldd [$key - 64], %f30
+___
+for ($i=1; $i<3; $i++) {
+ $code.=<<___;
+ camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
+ camellia_fl %f`16+16*$i+12`, %f0, %f0
+ camellia_fli %f`16+16*$i+14`, %f2, %f2
+___
+}
+$code.=<<___;
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f18, %f0, %f2, %f0
+ ldd [$key + 184], %f16
+ ldd [$key + 176], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f22, %f0, %f2, %f0
+ ldd [$key + 168], %f20
+ ldd [$key + 160], %f22
+ camellia_f %f24, %f2, %f0, %f4
+ camellia_f %f26, %f0, %f4, %f2
+ ldd [$key + 152], %f24
+ ldd [$key + 144], %f26
+ fxor %f30, %f4, %f0
+ fxor %f28, %f2, %f2
+ ldd [$key + 136], %f28
+ retl
+ ldd [$key + 128], %f30
+.type _cmll256_decrypt_1x,#function
+.size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x
+
+.align 32
+_cmll256_decrypt_2x:
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f16, %f6, %f4, %f6
+ camellia_f %f18, %f0, %f2, %f0
+ camellia_f %f18, %f4, %f6, %f4
+ ldd [$key - 8], %f16
+ ldd [$key - 16], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f20, %f6, %f4, %f6
+ camellia_f %f22, %f0, %f2, %f0
+ camellia_f %f22, %f4, %f6, %f4
+ ldd [$key - 24], %f20
+ ldd [$key - 32], %f22
+ camellia_f %f24, %f2, %f0, %f2
+ camellia_f %f24, %f6, %f4, %f6
+ camellia_f %f26, %f0, %f2, %f0
+ camellia_f %f26, %f4, %f6, %f4
+ ldd [$key - 40], %f24
+ ldd [$key - 48], %f26
+ camellia_fl %f28, %f0, %f0
+ camellia_fl %f28, %f4, %f4
+ camellia_fli %f30, %f2, %f2
+ camellia_fli %f30, %f6, %f6
+ ldd [$key - 56], %f28
+ ldd [$key - 64], %f30
+___
+for ($i=1; $i<3; $i++) {
+ $code.=<<___;
+ camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
+ camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
+ camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
+ camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
+ camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
+ camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
+ camellia_fl %f`16+16*$i+12`, %f0, %f0
+ camellia_fl %f`16+16*$i+12`, %f4, %f4
+ camellia_fli %f`16+16*$i+14`, %f2, %f2
+ camellia_fli %f`16+16*$i+14`, %f6, %f6
+___
+}
+$code.=<<___;
+ camellia_f %f16, %f2, %f0, %f2
+ camellia_f %f16, %f6, %f4, %f6
+ camellia_f %f18, %f0, %f2, %f0
+ camellia_f %f18, %f4, %f6, %f4
+ ldd [$key + 184], %f16
+ ldd [$key + 176], %f18
+ camellia_f %f20, %f2, %f0, %f2
+ camellia_f %f20, %f6, %f4, %f6
+ camellia_f %f22, %f0, %f2, %f0
+ camellia_f %f22, %f4, %f6, %f4
+ ldd [$key + 168], %f20
+ ldd [$key + 160], %f22
+ camellia_f %f24, %f2, %f0, %f8
+ camellia_f %f24, %f6, %f4, %f10
+ camellia_f %f26, %f0, %f8, %f2
+ camellia_f %f26, %f4, %f10, %f6
+ ldd [$key + 152], %f24
+ ldd [$key + 144], %f26
+ fxor %f30, %f8, %f0
+ fxor %f30, %f10, %f4
+ fxor %f28, %f2, %f2
+ fxor %f28, %f6, %f6
+ ldd [$key + 136], %f28
+ retl
+ ldd [$key + 128], %f30
+.type _cmll256_decrypt_2x,#function
+.size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x
+___
+
+&alg_cbc_encrypt_implement("cmll",128);
+&alg_cbc_encrypt_implement("cmll",256);
+
+&alg_cbc_decrypt_implement("cmll",128);
+&alg_cbc_decrypt_implement("cmll",256);
+
+if ($::evp) {
+ &alg_ctr32_implement("cmll",128);
+ &alg_ctr32_implement("cmll",256);
+}
+}}}
+
+if (!$::evp) {
+$code.=<<___;
+.global Camellia_encrypt
+Camellia_encrypt=cmll_t4_encrypt
+.global Camellia_decrypt
+Camellia_decrypt=cmll_t4_decrypt
+.global Camellia_set_key
+.align 32
+Camellia_set_key:
+ andcc %o2, 7, %g0 ! double-check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b cmll_t4_set_key
+ nop
+1: retl
+ nop
+.type Camellia_set_key,#function
+.size Camellia_set_key,.-Camellia_set_key
+___
+
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
+
+$code.=<<___;
+.globl Camellia_cbc_encrypt
+.align 32
+Camellia_cbc_encrypt:
+ ld [$key + 272], %g1
+ nop
+ brz $enc, .Lcbc_decrypt
+ cmp %g1, 3
+
+ be,pt %icc, cmll128_t4_cbc_encrypt
+ nop
+ ba cmll256_t4_cbc_encrypt
+ nop
+
+.Lcbc_decrypt:
+ be,pt %icc, cmll128_t4_cbc_decrypt
+ nop
+ ba cmll256_t4_cbc_decrypt
+ nop
+.type Camellia_cbc_encrypt,#function
+.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
+___
+}
+
+&emit_assembler();
+
+close STDOUT;
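A note on the dispatch above: cmll_t4_set_key stores a grand-rounds marker at byte offset 0x110 (272), immediately after the 272-byte subkey table, and Camellia_cbc_encrypt reloads it (`ld [$key + 272], %g1`) to pick the 128- or 256-bit path. A C sketch of the implied layout and dispatch; the struct and prototypes are assumptions inferred from the generated entry points:

    #include <stddef.h>
    #include <stdint.h>

    /* Layout written by cmll_t4_set_key: 34 eight-byte subkey slots, then
     * the grand-rounds marker (3 for 128-bit keys, 4 for 192/256-bit). */
    typedef struct {
        uint64_t rk[34];       /* 34 * 8 = 272 bytes, offsets 0x00-0x108 */
        uint32_t grand_rounds; /* stored at offset 0x110                 */
    } CMLL_T4_KEY;

    extern void cmll128_t4_cbc_encrypt(const uint8_t *, uint8_t *, size_t,
                                       const CMLL_T4_KEY *, uint8_t *);
    extern void cmll256_t4_cbc_encrypt(const uint8_t *, uint8_t *, size_t,
                                       const CMLL_T4_KEY *, uint8_t *);

    static void cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                            const CMLL_T4_KEY *key, uint8_t *ivec)
    {
        if (key->grand_rounds == 3)
            cmll128_t4_cbc_encrypt(in, out, len, key, ivec);
        else
            cmll256_t4_cbc_encrypt(in, out, len, key, ivec);
    }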
diff --git a/crypto/cast/cast_lcl.h b/crypto/cast/cast_lcl.h
index 7c4ad41b2157..b0f08294e373 100644
--- a/crypto/cast/cast_lcl.h
+++ b/crypto/cast/cast_lcl.h
@@ -152,6 +152,8 @@
#if defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER)
# define ROTL(a,n) (_lrotl(a,n))
+#elif defined(PEDANTIC)
+# define ROTL(a,n) ((((a)<<(n))&0xffffffffL)|((a)>>((32-(n))&31)))
#else
# define ROTL(a,n) ((((a)<<(n))&0xffffffffL)|((a)>>(32-(n))))
#endif
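The PEDANTIC branch sidesteps undefined behaviour: when n == 0 the unguarded form right-shifts a 32-bit value by 32, which C leaves undefined. Masking the count with `& 31` keeps every shift in range while producing identical results for 1 <= n <= 31. A standalone sketch of the defined-behaviour rotate (hypothetical helper):

    #include <stdint.h>

    /* For n == 0 the right-shift count becomes (32 - 0) & 31 == 0, so the
     * expression stays well defined and returns a unchanged. */
    static uint32_t rotl32(uint32_t a, unsigned n)   /* 0 <= n <= 31 */
    {
        return (uint32_t)(((a << n) & 0xffffffffUL) | (a >> ((32 - n) & 31)));
    }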
diff --git a/crypto/cms/Makefile b/crypto/cms/Makefile
index b124b5dbf212..6f3a83202638 100644
--- a/crypto/cms/Makefile
+++ b/crypto/cms/Makefile
@@ -19,10 +19,10 @@ APPS=
LIB=$(TOP)/libcrypto.a
LIBSRC= cms_lib.c cms_asn1.c cms_att.c cms_io.c cms_smime.c cms_err.c \
cms_sd.c cms_dd.c cms_cd.c cms_env.c cms_enc.c cms_ess.c \
- cms_pwri.c
+ cms_pwri.c cms_kari.c
LIBOBJ= cms_lib.o cms_asn1.o cms_att.o cms_io.o cms_smime.o cms_err.o \
cms_sd.o cms_dd.o cms_cd.o cms_env.o cms_enc.o cms_ess.o \
- cms_pwri.o
+ cms_pwri.o cms_kari.o
SRC= $(LIBSRC)
@@ -220,20 +220,39 @@ cms_io.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
cms_io.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
cms_io.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h cms.h
cms_io.o: cms_io.c cms_lcl.h
+cms_kari.o: ../../e_os.h ../../include/openssl/aes.h
+cms_kari.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h
+cms_kari.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
+cms_kari.o: ../../include/openssl/cms.h ../../include/openssl/conf.h
+cms_kari.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cms_kari.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
+cms_kari.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
+cms_kari.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
+cms_kari.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+cms_kari.o: ../../include/openssl/opensslconf.h
+cms_kari.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cms_kari.o: ../../include/openssl/pem.h ../../include/openssl/pem2.h
+cms_kari.o: ../../include/openssl/pkcs7.h ../../include/openssl/rand.h
+cms_kari.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
+cms_kari.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+cms_kari.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h
+cms_kari.o: ../../include/openssl/x509v3.h ../asn1/asn1_locl.h ../cryptlib.h
+cms_kari.o: cms_kari.c cms_lcl.h
cms_lib.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h
cms_lib.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
-cms_lib.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cms_lib.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
-cms_lib.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
-cms_lib.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
-cms_lib.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
-cms_lib.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-cms_lib.o: ../../include/openssl/ossl_typ.h ../../include/openssl/pem.h
-cms_lib.o: ../../include/openssl/pem2.h ../../include/openssl/pkcs7.h
-cms_lib.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
-cms_lib.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-cms_lib.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h cms.h
-cms_lib.o: cms_lcl.h cms_lib.c
+cms_lib.o: ../../include/openssl/conf.h ../../include/openssl/crypto.h
+cms_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+cms_lib.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+cms_lib.o: ../../include/openssl/err.h ../../include/openssl/evp.h
+cms_lib.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
+cms_lib.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
+cms_lib.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cms_lib.o: ../../include/openssl/pem.h ../../include/openssl/pem2.h
+cms_lib.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
+cms_lib.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
+cms_lib.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
+cms_lib.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
+cms_lib.o: cms.h cms_lcl.h cms_lib.c
cms_pwri.o: ../../e_os.h ../../include/openssl/aes.h
cms_pwri.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h
cms_pwri.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
@@ -283,4 +302,4 @@ cms_smime.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
cms_smime.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
cms_smime.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
cms_smime.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
-cms_smime.o: ../cryptlib.h cms_lcl.h cms_smime.c
+cms_smime.o: ../asn1/asn1_locl.h ../cryptlib.h cms_lcl.h cms_smime.c
diff --git a/crypto/cms/cms.h b/crypto/cms/cms.h
index a2281eda5b0a..e6c7f964bf48 100644
--- a/crypto/cms/cms.h
+++ b/crypto/cms/cms.h
@@ -72,9 +72,12 @@ typedef struct CMS_RevocationInfoChoice_st CMS_RevocationInfoChoice;
typedef struct CMS_RecipientInfo_st CMS_RecipientInfo;
typedef struct CMS_ReceiptRequest_st CMS_ReceiptRequest;
typedef struct CMS_Receipt_st CMS_Receipt;
+typedef struct CMS_RecipientEncryptedKey_st CMS_RecipientEncryptedKey;
+typedef struct CMS_OtherKeyAttribute_st CMS_OtherKeyAttribute;
DECLARE_STACK_OF(CMS_SignerInfo)
DECLARE_STACK_OF(GENERAL_NAMES)
+DECLARE_STACK_OF(CMS_RecipientEncryptedKey)
DECLARE_ASN1_FUNCTIONS(CMS_ContentInfo)
DECLARE_ASN1_FUNCTIONS(CMS_ReceiptRequest)
DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
@@ -82,6 +85,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
# define CMS_SIGNERINFO_ISSUER_SERIAL 0
# define CMS_SIGNERINFO_KEYIDENTIFIER 1
+# define CMS_RECIPINFO_NONE -1
# define CMS_RECIPINFO_TRANS 0
# define CMS_RECIPINFO_AGREE 1
# define CMS_RECIPINFO_KEK 2
@@ -111,6 +115,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
# define CMS_REUSE_DIGEST 0x8000
# define CMS_USE_KEYID 0x10000
# define CMS_DEBUG_DECRYPT 0x20000
+# define CMS_KEY_PARAM 0x40000
const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms);
@@ -189,6 +194,7 @@ int CMS_decrypt_set1_password(CMS_ContentInfo *cms,
STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms);
int CMS_RecipientInfo_type(CMS_RecipientInfo *ri);
+EVP_PKEY_CTX *CMS_RecipientInfo_get0_pkey_ctx(CMS_RecipientInfo *ri);
CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher);
CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
X509 *recip, unsigned int flags);
@@ -234,6 +240,7 @@ CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
const EVP_CIPHER *kekciph);
int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri);
+int CMS_RecipientInfo_encrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri);
int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
unsigned int flags);
@@ -256,6 +263,8 @@ int CMS_SignedData_init(CMS_ContentInfo *cms);
CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
X509 *signer, EVP_PKEY *pk, const EVP_MD *md,
unsigned int flags);
+EVP_PKEY_CTX *CMS_SignerInfo_get0_pkey_ctx(CMS_SignerInfo *si);
+EVP_MD_CTX *CMS_SignerInfo_get0_md_ctx(CMS_SignerInfo *si);
STACK_OF(CMS_SignerInfo) *CMS_get0_SignerInfos(CMS_ContentInfo *cms);
void CMS_SignerInfo_set1_signer_cert(CMS_SignerInfo *si, X509 *signer);
@@ -268,6 +277,7 @@ int CMS_set1_signers_certs(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
void CMS_SignerInfo_get0_algs(CMS_SignerInfo *si, EVP_PKEY **pk,
X509 **signer, X509_ALGOR **pdig,
X509_ALGOR **psig);
+ASN1_OCTET_STRING *CMS_SignerInfo_get0_signature(CMS_SignerInfo *si);
int CMS_SignerInfo_sign(CMS_SignerInfo *si);
int CMS_SignerInfo_verify(CMS_SignerInfo *si);
int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain);
@@ -331,8 +341,37 @@ void CMS_ReceiptRequest_get0_values(CMS_ReceiptRequest *rr,
int *pallorfirst,
STACK_OF(GENERAL_NAMES) **plist,
STACK_OF(GENERAL_NAMES) **prto);
-
# endif
+int CMS_RecipientInfo_kari_get0_alg(CMS_RecipientInfo *ri,
+ X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pukm);
+STACK_OF(CMS_RecipientEncryptedKey)
+*CMS_RecipientInfo_kari_get0_reks(CMS_RecipientInfo *ri);
+
+int CMS_RecipientInfo_kari_get0_orig_id(CMS_RecipientInfo *ri,
+ X509_ALGOR **pubalg,
+ ASN1_BIT_STRING **pubkey,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer,
+ ASN1_INTEGER **sno);
+
+int CMS_RecipientInfo_kari_orig_id_cmp(CMS_RecipientInfo *ri, X509 *cert);
+
+int CMS_RecipientEncryptedKey_get0_id(CMS_RecipientEncryptedKey *rek,
+ ASN1_OCTET_STRING **keyid,
+ ASN1_GENERALIZEDTIME **tm,
+ CMS_OtherKeyAttribute **other,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+int CMS_RecipientEncryptedKey_cert_cmp(CMS_RecipientEncryptedKey *rek,
+ X509 *cert);
+int CMS_RecipientInfo_kari_set0_pkey(CMS_RecipientInfo *ri, EVP_PKEY *pk);
+EVP_CIPHER_CTX *CMS_RecipientInfo_kari_get0_ctx(CMS_RecipientInfo *ri);
+int CMS_RecipientInfo_kari_decrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri,
+ CMS_RecipientEncryptedKey *rek);
+
+int CMS_SharedInfo_encode(unsigned char **pder, X509_ALGOR *kekalg,
+ ASN1_OCTET_STRING *ukm, int keylen);
/* BEGIN ERROR CODES */
/*
@@ -377,6 +416,7 @@ void ERR_load_CMS_strings(void);
# define CMS_F_CMS_ENVELOPEDDATA_CREATE 124
# define CMS_F_CMS_ENVELOPEDDATA_INIT_BIO 125
# define CMS_F_CMS_ENVELOPED_DATA_INIT 126
+# define CMS_F_CMS_ENV_ASN1_CTRL 171
# define CMS_F_CMS_FINAL 127
# define CMS_F_CMS_GET0_CERTIFICATE_CHOICES 128
# define CMS_F_CMS_GET0_CONTENT 129
@@ -388,6 +428,12 @@ void ERR_load_CMS_strings(void);
# define CMS_F_CMS_RECEIPTREQUEST_CREATE0 159
# define CMS_F_CMS_RECEIPT_VERIFY 160
# define CMS_F_CMS_RECIPIENTINFO_DECRYPT 134
+# define CMS_F_CMS_RECIPIENTINFO_ENCRYPT 169
+# define CMS_F_CMS_RECIPIENTINFO_KARI_ENCRYPT 178
+# define CMS_F_CMS_RECIPIENTINFO_KARI_GET0_ALG 175
+# define CMS_F_CMS_RECIPIENTINFO_KARI_GET0_ORIG_ID 173
+# define CMS_F_CMS_RECIPIENTINFO_KARI_GET0_REKS 172
+# define CMS_F_CMS_RECIPIENTINFO_KARI_ORIG_ID_CMP 174
# define CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT 135
# define CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT 136
# define CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID 137
@@ -401,6 +447,9 @@ void ERR_load_CMS_strings(void);
# define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144
# define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168
# define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145
+# define CMS_F_CMS_SD_ASN1_CTRL 170
+# define CMS_F_CMS_SET1_IAS 176
+# define CMS_F_CMS_SET1_KEYID 177
# define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146
# define CMS_F_CMS_SET_DETACHED 147
# define CMS_F_CMS_SIGN 148
@@ -452,6 +501,7 @@ void ERR_load_CMS_strings(void);
# define CMS_R_NOT_A_SIGNED_RECEIPT 165
# define CMS_R_NOT_ENCRYPTED_DATA 122
# define CMS_R_NOT_KEK 123
+# define CMS_R_NOT_KEY_AGREEMENT 181
# define CMS_R_NOT_KEY_TRANSPORT 124
# define CMS_R_NOT_PWRI 177
# define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125
diff --git a/crypto/cms/cms_asn1.c b/crypto/cms/cms_asn1.c
index f9f267afdcd7..81a3407f1249 100644
--- a/crypto/cms/cms_asn1.c
+++ b/crypto/cms/cms_asn1.c
@@ -97,6 +97,8 @@ static int cms_si_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
EVP_PKEY_free(si->pkey);
if (si->signer)
X509_free(si->signer);
+ if (si->pctx)
+ EVP_MD_CTX_cleanup(&si->mctx);
}
return 1;
}
@@ -164,10 +166,21 @@ ASN1_CHOICE(CMS_KeyAgreeRecipientIdentifier) = {
ASN1_IMP(CMS_KeyAgreeRecipientIdentifier, d.rKeyId, CMS_RecipientKeyIdentifier, 0)
} ASN1_CHOICE_END(CMS_KeyAgreeRecipientIdentifier)
-ASN1_SEQUENCE(CMS_RecipientEncryptedKey) = {
+static int cms_rek_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+{
+ CMS_RecipientEncryptedKey *rek = (CMS_RecipientEncryptedKey *)*pval;
+ if (operation == ASN1_OP_FREE_POST) {
+ if (rek->pkey)
+ EVP_PKEY_free(rek->pkey);
+ }
+ return 1;
+}
+
+ASN1_SEQUENCE_cb(CMS_RecipientEncryptedKey, cms_rek_cb) = {
ASN1_SIMPLE(CMS_RecipientEncryptedKey, rid, CMS_KeyAgreeRecipientIdentifier),
ASN1_SIMPLE(CMS_RecipientEncryptedKey, encryptedKey, ASN1_OCTET_STRING)
-} ASN1_SEQUENCE_END(CMS_RecipientEncryptedKey)
+} ASN1_SEQUENCE_END_cb(CMS_RecipientEncryptedKey, CMS_RecipientEncryptedKey)
ASN1_SEQUENCE(CMS_OriginatorPublicKey) = {
ASN1_SIMPLE(CMS_OriginatorPublicKey, algorithm, X509_ALGOR),
@@ -180,13 +193,29 @@ ASN1_CHOICE(CMS_OriginatorIdentifierOrKey) = {
ASN1_IMP(CMS_OriginatorIdentifierOrKey, d.originatorKey, CMS_OriginatorPublicKey, 1)
} ASN1_CHOICE_END(CMS_OriginatorIdentifierOrKey)
-ASN1_SEQUENCE(CMS_KeyAgreeRecipientInfo) = {
+static int cms_kari_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+{
+ CMS_KeyAgreeRecipientInfo *kari = (CMS_KeyAgreeRecipientInfo *)*pval;
+ if (operation == ASN1_OP_NEW_POST) {
+ EVP_CIPHER_CTX_init(&kari->ctx);
+ EVP_CIPHER_CTX_set_flags(&kari->ctx, EVP_CIPHER_CTX_FLAG_WRAP_ALLOW);
+ kari->pctx = NULL;
+ } else if (operation == ASN1_OP_FREE_POST) {
+ if (kari->pctx)
+ EVP_PKEY_CTX_free(kari->pctx);
+ EVP_CIPHER_CTX_cleanup(&kari->ctx);
+ }
+ return 1;
+}
+
+ASN1_SEQUENCE_cb(CMS_KeyAgreeRecipientInfo, cms_kari_cb) = {
ASN1_SIMPLE(CMS_KeyAgreeRecipientInfo, version, LONG),
ASN1_EXP(CMS_KeyAgreeRecipientInfo, originator, CMS_OriginatorIdentifierOrKey, 0),
ASN1_EXP_OPT(CMS_KeyAgreeRecipientInfo, ukm, ASN1_OCTET_STRING, 1),
ASN1_SIMPLE(CMS_KeyAgreeRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
ASN1_SEQUENCE_OF(CMS_KeyAgreeRecipientInfo, recipientEncryptedKeys, CMS_RecipientEncryptedKey)
-} ASN1_SEQUENCE_END(CMS_KeyAgreeRecipientInfo)
+} ASN1_SEQUENCE_END_cb(CMS_KeyAgreeRecipientInfo, CMS_KeyAgreeRecipientInfo)
ASN1_SEQUENCE(CMS_KEKIdentifier) = {
ASN1_SIMPLE(CMS_KEKIdentifier, keyIdentifier, ASN1_OCTET_STRING),
@@ -225,6 +254,8 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
EVP_PKEY_free(ktri->pkey);
if (ktri->recip)
X509_free(ktri->recip);
+ if (ktri->pctx)
+ EVP_PKEY_CTX_free(ktri->pctx);
} else if (ri->type == CMS_RECIPINFO_KEK) {
CMS_KEKRecipientInfo *kekri = ri->d.kekri;
if (kekri->key) {
@@ -379,3 +410,50 @@ ASN1_SEQUENCE(CMS_Receipt) = {
ASN1_SIMPLE(CMS_Receipt, signedContentIdentifier, ASN1_OCTET_STRING),
ASN1_SIMPLE(CMS_Receipt, originatorSignatureValue, ASN1_OCTET_STRING)
} ASN1_SEQUENCE_END(CMS_Receipt)
+
+/*
+ * Utilities to encode the CMS_SharedInfo structure used during key
+ * derivation.
+ */
+
+typedef struct {
+ X509_ALGOR *keyInfo;
+ ASN1_OCTET_STRING *entityUInfo;
+ ASN1_OCTET_STRING *suppPubInfo;
+} CMS_SharedInfo;
+
+ASN1_SEQUENCE(CMS_SharedInfo) = {
+ ASN1_SIMPLE(CMS_SharedInfo, keyInfo, X509_ALGOR),
+ ASN1_EXP_OPT(CMS_SharedInfo, entityUInfo, ASN1_OCTET_STRING, 0),
+ ASN1_EXP_OPT(CMS_SharedInfo, suppPubInfo, ASN1_OCTET_STRING, 2),
+} ASN1_SEQUENCE_END(CMS_SharedInfo)
+
+int CMS_SharedInfo_encode(unsigned char **pder, X509_ALGOR *kekalg,
+ ASN1_OCTET_STRING *ukm, int keylen)
+{
+ union {
+ CMS_SharedInfo *pecsi;
+ ASN1_VALUE *a;
+ } intsi = {
+ NULL
+ };
+
+ ASN1_OCTET_STRING oklen;
+ unsigned char kl[4];
+ CMS_SharedInfo ecsi;
+
+ keylen <<= 3;
+ kl[0] = (keylen >> 24) & 0xff;
+ kl[1] = (keylen >> 16) & 0xff;
+ kl[2] = (keylen >> 8) & 0xff;
+ kl[3] = keylen & 0xff;
+ oklen.length = 4;
+ oklen.data = kl;
+ oklen.type = V_ASN1_OCTET_STRING;
+ oklen.flags = 0;
+ ecsi.keyInfo = kekalg;
+ ecsi.entityUInfo = ukm;
+ ecsi.suppPubInfo = &oklen;
+ intsi.pecsi = &ecsi;
+ return ASN1_item_i2d(intsi.a, pder, ASN1_ITEM_rptr(CMS_SharedInfo));
+}
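CMS_SharedInfo_encode() above DER-encodes the SharedInfo structure used as KDF input: keylen arrives in bytes, is converted to bits (keylen <<= 3) and serialised big-endian into the four-byte suppPubInfo, so a 16-byte KEK encodes as 0x00000080 (128). A minimal usage sketch (kekalg and ukm are assumed to come from the recipient info):

    #include <openssl/cms.h>

    static int sharedinfo_der(X509_ALGOR *kekalg, ASN1_OCTET_STRING *ukm,
                              unsigned char **pder)
    {
        /* Returns the DER length, or <= 0 on error; the caller frees *pder
         * with OPENSSL_free() after feeding it to the KDF. */
        return CMS_SharedInfo_encode(pder, kekalg, ukm, 16);
    }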
diff --git a/crypto/cms/cms_env.c b/crypto/cms/cms_env.c
index 1c3046ce9238..93c06cb00a8f 100644
--- a/crypto/cms/cms_env.c
+++ b/crypto/cms/cms_env.c
@@ -100,6 +100,36 @@ static CMS_EnvelopedData *cms_enveloped_data_init(CMS_ContentInfo *cms)
return cms_get0_enveloped(cms);
}
+int cms_env_asn1_ctrl(CMS_RecipientInfo *ri, int cmd)
+{
+ EVP_PKEY *pkey;
+ int i;
+ if (ri->type == CMS_RECIPINFO_TRANS)
+ pkey = ri->d.ktri->pkey;
+ else if (ri->type == CMS_RECIPINFO_AGREE) {
+ EVP_PKEY_CTX *pctx = ri->d.kari->pctx;
+ if (!pctx)
+ return 0;
+ pkey = EVP_PKEY_CTX_get0_pkey(pctx);
+ if (!pkey)
+ return 0;
+ } else
+ return 0;
+ if (!pkey->ameth || !pkey->ameth->pkey_ctrl)
+ return 1;
+ i = pkey->ameth->pkey_ctrl(pkey, ASN1_PKEY_CTRL_CMS_ENVELOPE, cmd, ri);
+ if (i == -2) {
+ CMSerr(CMS_F_CMS_ENV_ASN1_CTRL,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ return 0;
+ }
+ if (i <= 0) {
+ CMSerr(CMS_F_CMS_ENV_ASN1_CTRL, CMS_R_CTRL_FAILURE);
+ return 0;
+ }
+ return 1;
+}
+
STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms)
{
CMS_EnvelopedData *env;
@@ -114,6 +144,15 @@ int CMS_RecipientInfo_type(CMS_RecipientInfo *ri)
return ri->type;
}
+EVP_PKEY_CTX *CMS_RecipientInfo_get0_pkey_ctx(CMS_RecipientInfo *ri)
+{
+ if (ri->type == CMS_RECIPINFO_TRANS)
+ return ri->d.ktri->pctx;
+ else if (ri->type == CMS_RECIPINFO_AGREE)
+ return ri->d.kari->pctx;
+ return NULL;
+}
+
CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher)
{
CMS_ContentInfo *cms;
@@ -137,19 +176,63 @@ CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher)
/* Key Transport Recipient Info (KTRI) routines */
+/* Initialise a ktri based on passed certificate and key */
+
+static int cms_RecipientInfo_ktri_init(CMS_RecipientInfo *ri, X509 *recip,
+ EVP_PKEY *pk, unsigned int flags)
+{
+ CMS_KeyTransRecipientInfo *ktri;
+ int idtype;
+
+ ri->d.ktri = M_ASN1_new_of(CMS_KeyTransRecipientInfo);
+ if (!ri->d.ktri)
+ return 0;
+ ri->type = CMS_RECIPINFO_TRANS;
+
+ ktri = ri->d.ktri;
+
+ if (flags & CMS_USE_KEYID) {
+ ktri->version = 2;
+ idtype = CMS_RECIPINFO_KEYIDENTIFIER;
+ } else {
+ ktri->version = 0;
+ idtype = CMS_RECIPINFO_ISSUER_SERIAL;
+ }
+
+ /*
+ * Not a typo: RecipientIdentifier and SignerIdentifier are the same
+ * structure.
+ */
+
+ if (!cms_set1_SignerIdentifier(ktri->rid, recip, idtype))
+ return 0;
+
+ CRYPTO_add(&recip->references, 1, CRYPTO_LOCK_X509);
+ CRYPTO_add(&pk->references, 1, CRYPTO_LOCK_EVP_PKEY);
+ ktri->pkey = pk;
+ ktri->recip = recip;
+
+ if (flags & CMS_KEY_PARAM) {
+ ktri->pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
+ if (!ktri->pctx)
+ return 0;
+ if (EVP_PKEY_encrypt_init(ktri->pctx) <= 0)
+ return 0;
+ } else if (!cms_env_asn1_ctrl(ri, 0))
+ return 0;
+ return 1;
+}
+
/*
- * Add a recipient certificate. For now only handle key transport. If we ever
- * handle key agreement will need updating.
+ * Add a recipient certificate using the appropriate type of RecipientInfo
*/
CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
X509 *recip, unsigned int flags)
{
CMS_RecipientInfo *ri = NULL;
- CMS_KeyTransRecipientInfo *ktri;
CMS_EnvelopedData *env;
EVP_PKEY *pk = NULL;
- int i, type;
env = cms_get0_enveloped(cms);
if (!env)
goto err;
@@ -159,59 +242,36 @@ CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
if (!ri)
goto merr;
- /* Initialize and add key transport recipient info */
-
- ri->d.ktri = M_ASN1_new_of(CMS_KeyTransRecipientInfo);
- if (!ri->d.ktri)
- goto merr;
- ri->type = CMS_RECIPINFO_TRANS;
-
- ktri = ri->d.ktri;
-
- X509_check_purpose(recip, -1, -1);
pk = X509_get_pubkey(recip);
if (!pk) {
CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT, CMS_R_ERROR_GETTING_PUBLIC_KEY);
goto err;
}
- CRYPTO_add(&recip->references, 1, CRYPTO_LOCK_X509);
- ktri->pkey = pk;
- ktri->recip = recip;
- if (flags & CMS_USE_KEYID) {
- ktri->version = 2;
- if (env->version < 2)
- env->version = 2;
- type = CMS_RECIPINFO_KEYIDENTIFIER;
- } else {
- ktri->version = 0;
- type = CMS_RECIPINFO_ISSUER_SERIAL;
- }
+ switch (cms_pkey_get_ri_type(pk)) {
- /*
- * Not a typo: RecipientIdentifier and SignerIdentifier are the same
- * structure.
- */
+ case CMS_RECIPINFO_TRANS:
+ if (!cms_RecipientInfo_ktri_init(ri, recip, pk, flags))
+ goto err;
+ break;
- if (!cms_set1_SignerIdentifier(ktri->rid, recip, type))
+ case CMS_RECIPINFO_AGREE:
+ if (!cms_RecipientInfo_kari_init(ri, recip, pk, flags))
+ goto err;
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
goto err;
- if (pk->ameth && pk->ameth->pkey_ctrl) {
- i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_ENVELOPE, 0, ri);
- if (i == -2) {
- CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
- CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
- goto err;
- }
- if (i <= 0) {
- CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT, CMS_R_CTRL_FAILURE);
- goto err;
- }
}
if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
goto merr;
+ EVP_PKEY_free(pk);
+
return ri;
merr:
@@ -219,6 +279,8 @@ CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
err:
if (ri)
M_ASN1_free_of(ri, CMS_RecipientInfo);
+ if (pk)
+ EVP_PKEY_free(pk);
return NULL;
}
@@ -288,7 +350,7 @@ static int cms_RecipientInfo_ktri_encrypt(CMS_ContentInfo *cms,
{
CMS_KeyTransRecipientInfo *ktri;
CMS_EncryptedContentInfo *ec;
- EVP_PKEY_CTX *pctx = NULL;
+ EVP_PKEY_CTX *pctx;
unsigned char *ek = NULL;
size_t eklen;
@@ -301,12 +363,19 @@ static int cms_RecipientInfo_ktri_encrypt(CMS_ContentInfo *cms,
ktri = ri->d.ktri;
ec = cms->d.envelopedData->encryptedContentInfo;
- pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
- if (!pctx)
- return 0;
+ pctx = ktri->pctx;
- if (EVP_PKEY_encrypt_init(pctx) <= 0)
- goto err;
+ if (pctx) {
+ if (!cms_env_asn1_ctrl(ri, 0))
+ goto err;
+ } else {
+ pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
+ if (!pctx)
+ return 0;
+
+ if (EVP_PKEY_encrypt_init(pctx) <= 0)
+ goto err;
+ }
if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_ENCRYPT,
EVP_PKEY_CTRL_CMS_ENCRYPT, 0, ri) <= 0) {
@@ -333,8 +402,10 @@ static int cms_RecipientInfo_ktri_encrypt(CMS_ContentInfo *cms,
ret = 1;
err:
- if (pctx)
+ if (pctx) {
EVP_PKEY_CTX_free(pctx);
+ ktri->pctx = NULL;
+ }
if (ek)
OPENSSL_free(ek);
return ret;
@@ -347,7 +418,7 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
CMS_RecipientInfo *ri)
{
CMS_KeyTransRecipientInfo *ktri = ri->d.ktri;
- EVP_PKEY_CTX *pctx = NULL;
+ EVP_PKEY *pkey = ktri->pkey;
unsigned char *ek = NULL;
size_t eklen;
int ret = 0;
@@ -359,20 +430,23 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
return 0;
}
- pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
- if (!pctx)
+ ktri->pctx = EVP_PKEY_CTX_new(pkey, NULL);
+ if (!ktri->pctx)
return 0;
- if (EVP_PKEY_decrypt_init(pctx) <= 0)
+ if (EVP_PKEY_decrypt_init(ktri->pctx) <= 0)
goto err;
- if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_DECRYPT,
+ if (!cms_env_asn1_ctrl(ri, 1))
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(ktri->pctx, -1, EVP_PKEY_OP_DECRYPT,
EVP_PKEY_CTRL_CMS_DECRYPT, 0, ri) <= 0) {
CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CTRL_ERROR);
goto err;
}
- if (EVP_PKEY_decrypt(pctx, NULL, &eklen,
+ if (EVP_PKEY_decrypt(ktri->pctx, NULL, &eklen,
ktri->encryptedKey->data,
ktri->encryptedKey->length) <= 0)
goto err;
@@ -384,7 +458,7 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
goto err;
}
- if (EVP_PKEY_decrypt(pctx, ek, &eklen,
+ if (EVP_PKEY_decrypt(ktri->pctx, ek, &eklen,
ktri->encryptedKey->data,
ktri->encryptedKey->length) <= 0) {
CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CMS_LIB);
@@ -402,8 +476,10 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
ec->keylen = eklen;
err:
- if (pctx)
- EVP_PKEY_CTX_free(pctx);
+ if (ktri->pctx) {
+ EVP_PKEY_CTX_free(ktri->pctx);
+ ktri->pctx = NULL;
+ }
if (!ret && ek)
OPENSSL_free(ek);
@@ -745,12 +821,99 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri)
}
}
+int CMS_RecipientInfo_encrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri)
+{
+ switch (ri->type) {
+ case CMS_RECIPINFO_TRANS:
+ return cms_RecipientInfo_ktri_encrypt(cms, ri);
+
+ case CMS_RECIPINFO_AGREE:
+ return cms_RecipientInfo_kari_encrypt(cms, ri);
+
+ case CMS_RECIPINFO_KEK:
+ return cms_RecipientInfo_kekri_encrypt(cms, ri);
+
+ case CMS_RECIPINFO_PASS:
+ return cms_RecipientInfo_pwri_crypt(cms, ri, 1);
+
+ default:
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_ENCRYPT,
+ CMS_R_UNSUPPORTED_RECIPIENT_TYPE);
+ return 0;
+ }
+}
+
+/* Check structures and fixup version numbers (if necessary) */
+
+static void cms_env_set_originfo_version(CMS_EnvelopedData *env)
+{
+ CMS_OriginatorInfo *org = env->originatorInfo;
+ int i;
+ if (org == NULL)
+ return;
+ for (i = 0; i < sk_CMS_CertificateChoices_num(org->certificates); i++) {
+ CMS_CertificateChoices *cch;
+ cch = sk_CMS_CertificateChoices_value(org->certificates, i);
+ if (cch->type == CMS_CERTCHOICE_OTHER) {
+ env->version = 4;
+ return;
+ } else if (cch->type == CMS_CERTCHOICE_V2ACERT) {
+ if (env->version < 3)
+ env->version = 3;
+ }
+ }
+
+ for (i = 0; i < sk_CMS_RevocationInfoChoice_num(org->crls); i++) {
+ CMS_RevocationInfoChoice *rch;
+ rch = sk_CMS_RevocationInfoChoice_value(org->crls, i);
+ if (rch->type == CMS_REVCHOICE_OTHER) {
+ env->version = 4;
+ return;
+ }
+ }
+}
+
+static void cms_env_set_version(CMS_EnvelopedData *env)
+{
+ int i;
+ CMS_RecipientInfo *ri;
+
+ /*
+ * The version can't be set higher than 4, so if it is already 4 or more
+ * there is nothing to do.
+ */
+ if (env->version >= 4)
+ return;
+
+ cms_env_set_originfo_version(env);
+
+ if (env->version >= 3)
+ return;
+
+ for (i = 0; i < sk_CMS_RecipientInfo_num(env->recipientInfos); i++) {
+ ri = sk_CMS_RecipientInfo_value(env->recipientInfos, i);
+ if (ri->type == CMS_RECIPINFO_PASS || ri->type == CMS_RECIPINFO_OTHER) {
+ env->version = 3;
+ return;
+ } else if (ri->type != CMS_RECIPINFO_TRANS
+ || ri->d.ktri->version != 0) {
+ env->version = 2;
+ }
+ }
+ if (env->version == 2)
+ return;
+ if (env->originatorInfo || env->unprotectedAttrs)
+ env->version = 2;
+ else
+ env->version = 0;
+}
+
BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms)
{
CMS_EncryptedContentInfo *ec;
STACK_OF(CMS_RecipientInfo) *rinfos;
CMS_RecipientInfo *ri;
- int i, r, ok = 0;
+ int i, ok = 0;
BIO *ret;
/* Get BIO first to set up key */
@@ -769,32 +932,13 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms)
for (i = 0; i < sk_CMS_RecipientInfo_num(rinfos); i++) {
ri = sk_CMS_RecipientInfo_value(rinfos, i);
-
- switch (ri->type) {
- case CMS_RECIPINFO_TRANS:
- r = cms_RecipientInfo_ktri_encrypt(cms, ri);
- break;
-
- case CMS_RECIPINFO_KEK:
- r = cms_RecipientInfo_kekri_encrypt(cms, ri);
- break;
-
- case CMS_RECIPINFO_PASS:
- r = cms_RecipientInfo_pwri_crypt(cms, ri, 1);
- break;
-
- default:
- CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
- CMS_R_UNSUPPORTED_RECIPIENT_TYPE);
- goto err;
- }
-
- if (r <= 0) {
+ if (CMS_RecipientInfo_encrypt(cms, ri) <= 0) {
CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
CMS_R_ERROR_SETTING_RECIPIENTINFO);
goto err;
}
}
+ cms_env_set_version(cms->d.envelopedData);
ok = 1;
@@ -812,3 +956,19 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms)
return NULL;
}
+
+/*
+ * Get the RecipientInfo type (if any) supported by a key (public or
+ * private). To retain compatibility with previous behaviour, if the ctrl
+ * value isn't supported we assume key transport.
+ */
+int cms_pkey_get_ri_type(EVP_PKEY *pk)
+{
+ if (pk->ameth && pk->ameth->pkey_ctrl) {
+ int i, r;
+ i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_RI_TYPE, 0, &r);
+ if (i > 0)
+ return r;
+ }
+ return CMS_RECIPINFO_TRANS;
+}
diff --git a/crypto/cms/cms_err.c b/crypto/cms/cms_err.c
index faf2fccd3b8c..15572ea348c8 100644
--- a/crypto/cms/cms_err.c
+++ b/crypto/cms/cms_err.c
@@ -1,6 +1,6 @@
/* crypto/cms/cms_err.c */
/* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2013 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -110,6 +110,7 @@ static ERR_STRING_DATA CMS_str_functs[] = {
{ERR_FUNC(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO),
"cms_EnvelopedData_init_bio"},
{ERR_FUNC(CMS_F_CMS_ENVELOPED_DATA_INIT), "CMS_ENVELOPED_DATA_INIT"},
+ {ERR_FUNC(CMS_F_CMS_ENV_ASN1_CTRL), "cms_env_asn1_ctrl"},
{ERR_FUNC(CMS_F_CMS_FINAL), "CMS_final"},
{ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES),
"CMS_GET0_CERTIFICATE_CHOICES"},
@@ -124,6 +125,17 @@ static ERR_STRING_DATA CMS_str_functs[] = {
"CMS_ReceiptRequest_create0"},
{ERR_FUNC(CMS_F_CMS_RECEIPT_VERIFY), "cms_Receipt_verify"},
{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_DECRYPT), "CMS_RecipientInfo_decrypt"},
+ {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_ENCRYPT), "CMS_RecipientInfo_encrypt"},
+ {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KARI_ENCRYPT),
+ "cms_RecipientInfo_kari_encrypt"},
+ {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KARI_GET0_ALG),
+ "CMS_RecipientInfo_kari_get0_alg"},
+ {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KARI_GET0_ORIG_ID),
+ "CMS_RecipientInfo_kari_get0_orig_id"},
+ {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KARI_GET0_REKS),
+ "CMS_RecipientInfo_kari_get0_reks"},
+ {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KARI_ORIG_ID_CMP),
+ "CMS_RecipientInfo_kari_orig_id_cmp"},
{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT),
"CMS_RECIPIENTINFO_KEKRI_DECRYPT"},
{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT),
@@ -150,6 +162,9 @@ static ERR_STRING_DATA CMS_str_functs[] = {
"CMS_RecipientInfo_set0_password"},
{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY),
"CMS_RecipientInfo_set0_pkey"},
+ {ERR_FUNC(CMS_F_CMS_SD_ASN1_CTRL), "CMS_SD_ASN1_CTRL"},
+ {ERR_FUNC(CMS_F_CMS_SET1_IAS), "cms_set1_ias"},
+ {ERR_FUNC(CMS_F_CMS_SET1_KEYID), "cms_set1_keyid"},
{ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"},
{ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"},
{ERR_FUNC(CMS_F_CMS_SIGN), "CMS_sign"},
@@ -221,6 +236,7 @@ static ERR_STRING_DATA CMS_str_reasons[] = {
{ERR_REASON(CMS_R_NOT_A_SIGNED_RECEIPT), "not a signed receipt"},
{ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA), "not encrypted data"},
{ERR_REASON(CMS_R_NOT_KEK), "not kek"},
+ {ERR_REASON(CMS_R_NOT_KEY_AGREEMENT), "not key agreement"},
{ERR_REASON(CMS_R_NOT_KEY_TRANSPORT), "not key transport"},
{ERR_REASON(CMS_R_NOT_PWRI), "not pwri"},
{ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),
diff --git a/crypto/cms/cms_kari.c b/crypto/cms/cms_kari.c
new file mode 100644
index 000000000000..2cfcdb29cd50
--- /dev/null
+++ b/crypto/cms/cms_kari.c
@@ -0,0 +1,465 @@
+/* crypto/cms/cms_kari.c */
+/*
+ * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include <openssl/aes.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+DECLARE_ASN1_ITEM(CMS_KeyAgreeRecipientInfo)
+DECLARE_ASN1_ITEM(CMS_RecipientEncryptedKey)
+DECLARE_ASN1_ITEM(CMS_OriginatorPublicKey)
+DECLARE_ASN1_ITEM(CMS_RecipientKeyIdentifier)
+
+/* Key Agreement Recipient Info (KARI) routines */
+
+int CMS_RecipientInfo_kari_get0_alg(CMS_RecipientInfo *ri,
+ X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pukm)
+{
+ if (ri->type != CMS_RECIPINFO_AGREE) {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KARI_GET0_ALG,
+ CMS_R_NOT_KEY_AGREEMENT);
+ return 0;
+ }
+ if (palg)
+ *palg = ri->d.kari->keyEncryptionAlgorithm;
+ if (pukm)
+ *pukm = ri->d.kari->ukm;
+ return 1;
+}
+
+/* Retrieve recipient encrypted keys from a kari */
+
+STACK_OF(CMS_RecipientEncryptedKey)
+*CMS_RecipientInfo_kari_get0_reks(CMS_RecipientInfo *ri)
+{
+ if (ri->type != CMS_RECIPINFO_AGREE) {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KARI_GET0_REKS,
+ CMS_R_NOT_KEY_AGREEMENT);
+ return NULL;
+ }
+ return ri->d.kari->recipientEncryptedKeys;
+}
+
+int CMS_RecipientInfo_kari_get0_orig_id(CMS_RecipientInfo *ri,
+ X509_ALGOR **pubalg,
+ ASN1_BIT_STRING **pubkey,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer,
+ ASN1_INTEGER **sno)
+{
+ CMS_OriginatorIdentifierOrKey *oik;
+ if (ri->type != CMS_RECIPINFO_AGREE) {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KARI_GET0_ORIG_ID,
+ CMS_R_NOT_KEY_AGREEMENT);
+ return 0;
+ }
+ oik = ri->d.kari->originator;
+ if (issuer)
+ *issuer = NULL;
+ if (sno)
+ *sno = NULL;
+ if (keyid)
+ *keyid = NULL;
+ if (pubalg)
+ *pubalg = NULL;
+ if (pubkey)
+ *pubkey = NULL;
+ if (oik->type == CMS_OIK_ISSUER_SERIAL) {
+ if (issuer)
+ *issuer = oik->d.issuerAndSerialNumber->issuer;
+ if (sno)
+ *sno = oik->d.issuerAndSerialNumber->serialNumber;
+ } else if (oik->type == CMS_OIK_KEYIDENTIFIER) {
+ if (keyid)
+ *keyid = oik->d.subjectKeyIdentifier;
+ } else if (oik->type == CMS_OIK_PUBKEY) {
+ if (pubalg)
+ *pubalg = oik->d.originatorKey->algorithm;
+ if (pubkey)
+ *pubkey = oik->d.originatorKey->publicKey;
+ } else
+ return 0;
+ return 1;
+}
+
+int CMS_RecipientInfo_kari_orig_id_cmp(CMS_RecipientInfo *ri, X509 *cert)
+{
+ CMS_OriginatorIdentifierOrKey *oik;
+ if (ri->type != CMS_RECIPINFO_AGREE) {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KARI_ORIG_ID_CMP,
+ CMS_R_NOT_KEY_AGREEMENT);
+ return -2;
+ }
+ oik = ri->d.kari->originator;
+ if (oik->type == CMS_OIK_ISSUER_SERIAL)
+ return cms_ias_cert_cmp(oik->d.issuerAndSerialNumber, cert);
+ else if (oik->type == CMS_OIK_KEYIDENTIFIER)
+ return cms_keyid_cert_cmp(oik->d.subjectKeyIdentifier, cert);
+ return -1;
+}
+
+int CMS_RecipientEncryptedKey_get0_id(CMS_RecipientEncryptedKey *rek,
+ ASN1_OCTET_STRING **keyid,
+ ASN1_GENERALIZEDTIME **tm,
+ CMS_OtherKeyAttribute **other,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+{
+ CMS_KeyAgreeRecipientIdentifier *rid = rek->rid;
+ if (rid->type == CMS_REK_ISSUER_SERIAL) {
+ if (issuer)
+ *issuer = rid->d.issuerAndSerialNumber->issuer;
+ if (sno)
+ *sno = rid->d.issuerAndSerialNumber->serialNumber;
+ if (keyid)
+ *keyid = NULL;
+ if (tm)
+ *tm = NULL;
+ if (other)
+ *other = NULL;
+ } else if (rid->type == CMS_REK_KEYIDENTIFIER) {
+ if (keyid)
+ *keyid = rid->d.rKeyId->subjectKeyIdentifier;
+ if (tm)
+ *tm = rid->d.rKeyId->date;
+ if (other)
+ *other = rid->d.rKeyId->other;
+ if (issuer)
+ *issuer = NULL;
+ if (sno)
+ *sno = NULL;
+ } else
+ return 0;
+ return 1;
+}
+
+int CMS_RecipientEncryptedKey_cert_cmp(CMS_RecipientEncryptedKey *rek,
+ X509 *cert)
+{
+ CMS_KeyAgreeRecipientIdentifier *rid = rek->rid;
+ if (rid->type == CMS_REK_ISSUER_SERIAL)
+ return cms_ias_cert_cmp(rid->d.issuerAndSerialNumber, cert);
+ else if (rid->type == CMS_REK_KEYIDENTIFIER)
+ return cms_keyid_cert_cmp(rid->d.rKeyId->subjectKeyIdentifier, cert);
+ else
+ return -1;
+}
+
+int CMS_RecipientInfo_kari_set0_pkey(CMS_RecipientInfo *ri, EVP_PKEY *pk)
+{
+ EVP_PKEY_CTX *pctx;
+ CMS_KeyAgreeRecipientInfo *kari = ri->d.kari;
+ if (kari->pctx) {
+ EVP_PKEY_CTX_free(kari->pctx);
+ kari->pctx = NULL;
+ }
+ if (!pk)
+ return 1;
+ pctx = EVP_PKEY_CTX_new(pk, NULL);
+ if (!pctx || EVP_PKEY_derive_init(pctx) <= 0)
+ goto err;
+ kari->pctx = pctx;
+ return 1;
+ err:
+ if (pctx)
+ EVP_PKEY_CTX_free(pctx);
+ return 0;
+}
+
+EVP_CIPHER_CTX *CMS_RecipientInfo_kari_get0_ctx(CMS_RecipientInfo *ri)
+{
+ if (ri->type == CMS_RECIPINFO_AGREE)
+ return &ri->d.kari->ctx;
+ return NULL;
+}
+
+/*
+ * Derive KEK and decrypt/encrypt with it to produce either the original CEK
+ * or the encrypted CEK.
+ */
+
+static int cms_kek_cipher(unsigned char **pout, size_t *poutlen,
+ const unsigned char *in, size_t inlen,
+ CMS_KeyAgreeRecipientInfo *kari, int enc)
+{
+ /* Key encryption key */
+ unsigned char kek[EVP_MAX_KEY_LENGTH];
+ size_t keklen;
+ int rv = 0;
+ unsigned char *out = NULL;
+ int outlen;
+ keklen = EVP_CIPHER_CTX_key_length(&kari->ctx);
+ if (keklen > EVP_MAX_KEY_LENGTH)
+ return 0;
+ /* Derive KEK */
+ if (EVP_PKEY_derive(kari->pctx, kek, &keklen) <= 0)
+ goto err;
+ /* Set KEK in context */
+ if (!EVP_CipherInit_ex(&kari->ctx, NULL, NULL, kek, NULL, enc))
+ goto err;
+ /* obtain output length of ciphered key */
+ if (!EVP_CipherUpdate(&kari->ctx, NULL, &outlen, in, inlen))
+ goto err;
+ out = OPENSSL_malloc(outlen);
+ if (!out)
+ goto err;
+ if (!EVP_CipherUpdate(&kari->ctx, out, &outlen, in, inlen))
+ goto err;
+ *pout = out;
+ *poutlen = (size_t)outlen;
+ rv = 1;
+
+ err:
+ OPENSSL_cleanse(kek, keklen);
+ if (!rv && out)
+ OPENSSL_free(out);
+ EVP_CIPHER_CTX_cleanup(&kari->ctx);
+ EVP_PKEY_CTX_free(kari->pctx);
+ kari->pctx = NULL;
+ return rv;
+}
+
+int CMS_RecipientInfo_kari_decrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri,
+ CMS_RecipientEncryptedKey *rek)
+{
+ int rv = 0;
+ unsigned char *enckey = NULL, *cek = NULL;
+ size_t enckeylen;
+ size_t ceklen;
+ CMS_EncryptedContentInfo *ec;
+ enckeylen = rek->encryptedKey->length;
+ enckey = rek->encryptedKey->data;
+ /* Setup all parameters to derive KEK */
+ if (!cms_env_asn1_ctrl(ri, 1))
+ goto err;
+ /* Attempt to decrypt CEK */
+ if (!cms_kek_cipher(&cek, &ceklen, enckey, enckeylen, ri->d.kari, 0))
+ goto err;
+ ec = cms->d.envelopedData->encryptedContentInfo;
+ if (ec->key) {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ }
+ ec->key = cek;
+ ec->keylen = ceklen;
+ cek = NULL;
+ rv = 1;
+ err:
+ if (cek)
+ OPENSSL_free(cek);
+ return rv;
+}
+
+/* Create ephemeral key and initialise context based on it */
+static int cms_kari_create_ephemeral_key(CMS_KeyAgreeRecipientInfo *kari,
+ EVP_PKEY *pk)
+{
+ EVP_PKEY_CTX *pctx = NULL;
+ EVP_PKEY *ekey = NULL;
+ int rv = 0;
+ pctx = EVP_PKEY_CTX_new(pk, NULL);
+ if (!pctx)
+ goto err;
+ if (EVP_PKEY_keygen_init(pctx) <= 0)
+ goto err;
+ if (EVP_PKEY_keygen(pctx, &ekey) <= 0)
+ goto err;
+ EVP_PKEY_CTX_free(pctx);
+ pctx = EVP_PKEY_CTX_new(ekey, NULL);
+ if (!pctx)
+ goto err;
+ if (EVP_PKEY_derive_init(pctx) <= 0)
+ goto err;
+ kari->pctx = pctx;
+ rv = 1;
+ err:
+ if (!rv && pctx)
+ EVP_PKEY_CTX_free(pctx);
+ if (ekey)
+ EVP_PKEY_free(ekey);
+ return rv;
+}
+
+/* Initialise a kari based on passed certificate and key */
+
+int cms_RecipientInfo_kari_init(CMS_RecipientInfo *ri, X509 *recip,
+ EVP_PKEY *pk, unsigned int flags)
+{
+ CMS_KeyAgreeRecipientInfo *kari;
+ CMS_RecipientEncryptedKey *rek = NULL;
+
+ ri->d.kari = M_ASN1_new_of(CMS_KeyAgreeRecipientInfo);
+ if (!ri->d.kari)
+ return 0;
+ ri->type = CMS_RECIPINFO_AGREE;
+
+ kari = ri->d.kari;
+ kari->version = 3;
+
+ rek = M_ASN1_new_of(CMS_RecipientEncryptedKey);
+ if (rek == NULL)
+ return 0;
+ if (!sk_CMS_RecipientEncryptedKey_push(kari->recipientEncryptedKeys, rek)) {
+ M_ASN1_free_of(rek, CMS_RecipientEncryptedKey);
+ return 0;
+ }
+
+ if (flags & CMS_USE_KEYID) {
+ rek->rid->type = CMS_REK_KEYIDENTIFIER;
+ rek->rid->d.rKeyId = M_ASN1_new_of(CMS_RecipientKeyIdentifier);
+ if (rek->rid->d.rKeyId == NULL)
+ return 0;
+ if (!cms_set1_keyid(&rek->rid->d.rKeyId->subjectKeyIdentifier, recip))
+ return 0;
+ } else {
+ rek->rid->type = CMS_REK_ISSUER_SERIAL;
+ if (!cms_set1_ias(&rek->rid->d.issuerAndSerialNumber, recip))
+ return 0;
+ }
+
+ /* Create ephemeral key */
+ if (!cms_kari_create_ephemeral_key(kari, pk))
+ return 0;
+
+ CRYPTO_add(&pk->references, 1, CRYPTO_LOCK_EVP_PKEY);
+ rek->pkey = pk;
+ return 1;
+}
+
+static int cms_wrap_init(CMS_KeyAgreeRecipientInfo *kari,
+ const EVP_CIPHER *cipher)
+{
+ EVP_CIPHER_CTX *ctx = &kari->ctx;
+ const EVP_CIPHER *kekcipher;
+ int keylen = EVP_CIPHER_key_length(cipher);
+ /* If a suitable wrap algorithm is already set there is nothing to do */
+ kekcipher = EVP_CIPHER_CTX_cipher(ctx);
+
+ if (kekcipher) {
+ if (EVP_CIPHER_CTX_mode(ctx) != EVP_CIPH_WRAP_MODE)
+ return 0;
+ return 1;
+ }
+ /*
+ * Pick a wrap cipher based on the content encryption cipher: if it is
+ * DES3, use DES3 wrap; otherwise use the AES wrap variant matching the
+ * key size.
+ */
+ if (EVP_CIPHER_type(cipher) == NID_des_ede3_cbc)
+ kekcipher = EVP_des_ede3_wrap();
+ else if (keylen <= 16)
+ kekcipher = EVP_aes_128_wrap();
+ else if (keylen <= 24)
+ kekcipher = EVP_aes_192_wrap();
+ else
+ kekcipher = EVP_aes_256_wrap();
+ return EVP_EncryptInit_ex(ctx, kekcipher, NULL, NULL, NULL);
+}
+
+/* Encrypt content key in key agreement recipient info */
+
+int cms_RecipientInfo_kari_encrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+{
+ CMS_KeyAgreeRecipientInfo *kari;
+ CMS_EncryptedContentInfo *ec;
+ CMS_RecipientEncryptedKey *rek;
+ STACK_OF(CMS_RecipientEncryptedKey) *reks;
+ int i;
+
+ if (ri->type != CMS_RECIPINFO_AGREE) {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KARI_ENCRYPT, CMS_R_NOT_KEY_AGREEMENT);
+ return 0;
+ }
+ kari = ri->d.kari;
+ reks = kari->recipientEncryptedKeys;
+ ec = cms->d.envelopedData->encryptedContentInfo;
+ /* Initialise wrap algorithm parameters */
+ if (!cms_wrap_init(kari, ec->cipher))
+ return 0;
+ /*
+ * If no originator key is set up, initialise for an ephemeral key; the
+ * public key ASN1 structure will set the actual public key value.
+ */
+ if (kari->originator->type == -1) {
+ CMS_OriginatorIdentifierOrKey *oik = kari->originator;
+ oik->type = CMS_OIK_PUBKEY;
+ oik->d.originatorKey = M_ASN1_new_of(CMS_OriginatorPublicKey);
+ if (!oik->d.originatorKey)
+ return 0;
+ }
+ /* Initialise KDF algorithm */
+ if (!cms_env_asn1_ctrl(ri, 0))
+ return 0;
+ /* For each rek, derive KEK, encrypt CEK */
+ for (i = 0; i < sk_CMS_RecipientEncryptedKey_num(reks); i++) {
+ unsigned char *enckey;
+ size_t enckeylen;
+ rek = sk_CMS_RecipientEncryptedKey_value(reks, i);
+ if (EVP_PKEY_derive_set_peer(kari->pctx, rek->pkey) <= 0)
+ return 0;
+ if (!cms_kek_cipher(&enckey, &enckeylen, ec->key, ec->keylen,
+ kari, 1))
+ return 0;
+ ASN1_STRING_set0(rek->encryptedKey, enckey, enckeylen);
+ }
+
+ return 1;
+
+}
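
Stepping back from the internals: the kari machinery above is driven through the public CMS envelope API. Below is a minimal sketch of enveloping for a recipient whose certificate carries a key-agreement-capable key (an EC certificate is assumed here), which is the case that reaches cms_RecipientInfo_kari_init() and cms_RecipientInfo_kari_encrypt(); the file name and the abbreviated cleanup are illustrative only, not part of the patch.

#include <stdio.h>
#include <openssl/cms.h>
#include <openssl/evp.h>
#include <openssl/pem.h>
#include <openssl/err.h>

int main(void)
{
    BIO *in = BIO_new_fd(0, BIO_NOCLOSE);          /* plaintext on stdin */
    BIO *out = BIO_new_fd(1, BIO_NOCLOSE);         /* S/MIME on stdout */
    BIO *cbio = BIO_new_file("ec-recip.pem", "r"); /* assumed path */
    X509 *recip;
    STACK_OF(X509) *certs;
    CMS_ContentInfo *cms;
    int ok = 0;

    OpenSSL_add_all_algorithms();
    ERR_load_crypto_strings();
    recip = cbio ? PEM_read_bio_X509(cbio, NULL, NULL, NULL) : NULL;
    certs = sk_X509_new_null();
    if (recip == NULL || certs == NULL || !sk_X509_push(certs, recip))
        goto err;
    /* An EC recipient certificate selects the key agreement path */
    cms = CMS_encrypt(certs, in, EVP_aes_128_cbc(), CMS_STREAM);
    if (cms == NULL || !SMIME_write_CMS(out, cms, in, CMS_STREAM))
        goto err;
    ok = 1;
 err:
    if (!ok)
        ERR_print_errors_fp(stderr);
    return ok ? 0 : 1;
}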
diff --git a/crypto/cms/cms_lcl.h b/crypto/cms/cms_lcl.h
index 4f4c4c7434f3..20f2c25f5ae9 100644
--- a/crypto/cms/cms_lcl.h
+++ b/crypto/cms/cms_lcl.h
@@ -84,11 +84,9 @@ typedef struct CMS_KeyTransRecipientInfo_st CMS_KeyTransRecipientInfo;
typedef struct CMS_OriginatorPublicKey_st CMS_OriginatorPublicKey;
typedef struct CMS_OriginatorIdentifierOrKey_st CMS_OriginatorIdentifierOrKey;
typedef struct CMS_KeyAgreeRecipientInfo_st CMS_KeyAgreeRecipientInfo;
-typedef struct CMS_OtherKeyAttribute_st CMS_OtherKeyAttribute;
typedef struct CMS_RecipientKeyIdentifier_st CMS_RecipientKeyIdentifier;
typedef struct CMS_KeyAgreeRecipientIdentifier_st
CMS_KeyAgreeRecipientIdentifier;
-typedef struct CMS_RecipientEncryptedKey_st CMS_RecipientEncryptedKey;
typedef struct CMS_KEKIdentifier_st CMS_KEKIdentifier;
typedef struct CMS_KEKRecipientInfo_st CMS_KEKRecipientInfo;
typedef struct CMS_PasswordRecipientInfo_st CMS_PasswordRecipientInfo;
@@ -138,6 +136,9 @@ struct CMS_SignerInfo_st {
/* Signing certificate and key */
X509 *signer;
EVP_PKEY *pkey;
+ /* Digest and public key context for alternative parameters */
+ EVP_MD_CTX mctx;
+ EVP_PKEY_CTX *pctx;
};
struct CMS_SignerIdentifier_st {
@@ -194,6 +195,8 @@ struct CMS_KeyTransRecipientInfo_st {
/* Recipient Key and cert */
X509 *recip;
EVP_PKEY *pkey;
+ /* Public key context for this operation */
+ EVP_PKEY_CTX *pctx;
};
struct CMS_KeyAgreeRecipientInfo_st {
@@ -202,6 +205,10 @@ struct CMS_KeyAgreeRecipientInfo_st {
ASN1_OCTET_STRING *ukm;
X509_ALGOR *keyEncryptionAlgorithm;
STACK_OF(CMS_RecipientEncryptedKey) *recipientEncryptedKeys;
+ /* Public key context associated with current operation */
+ EVP_PKEY_CTX *pctx;
+ /* Cipher context for CEK wrapping */
+ EVP_CIPHER_CTX ctx;
};
struct CMS_OriginatorIdentifierOrKey_st {
@@ -221,6 +228,8 @@ struct CMS_OriginatorPublicKey_st {
struct CMS_RecipientEncryptedKey_st {
CMS_KeyAgreeRecipientIdentifier *rid;
ASN1_OCTET_STRING *encryptedKey;
+ /* Public key associated with this recipient */
+ EVP_PKEY *pkey;
};
struct CMS_KeyAgreeRecipientIdentifier_st {
@@ -394,6 +403,13 @@ DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber)
# define CMS_RECIPINFO_ISSUER_SERIAL 0
# define CMS_RECIPINFO_KEYIDENTIFIER 1
+# define CMS_REK_ISSUER_SERIAL 0
+# define CMS_REK_KEYIDENTIFIER 1
+
+# define CMS_OIK_ISSUER_SERIAL 0
+# define CMS_OIK_KEYIDENTIFIER 1
+# define CMS_OIK_PUBKEY 2
+
BIO *cms_content_bio(CMS_ContentInfo *cms);
CMS_ContentInfo *cms_Data_create(void);
@@ -420,6 +436,11 @@ BIO *cms_DigestAlgorithm_init_bio(X509_ALGOR *digestAlgorithm);
int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain,
X509_ALGOR *mdalg);
+int cms_ias_cert_cmp(CMS_IssuerAndSerialNumber *ias, X509 *cert);
+int cms_keyid_cert_cmp(ASN1_OCTET_STRING *keyid, X509 *cert);
+int cms_set1_ias(CMS_IssuerAndSerialNumber **pias, X509 *cert);
+int cms_set1_keyid(ASN1_OCTET_STRING **pkeyid, X509 *cert);
+
BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec);
BIO *cms_EncryptedData_init_bio(CMS_ContentInfo *cms);
int cms_EncryptedContent_init(CMS_EncryptedContentInfo *ec,
@@ -432,6 +453,13 @@ ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si);
BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms);
CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms);
+int cms_env_asn1_ctrl(CMS_RecipientInfo *ri, int cmd);
+int cms_pkey_get_ri_type(EVP_PKEY *pk);
+/* KARI routines */
+int cms_RecipientInfo_kari_init(CMS_RecipientInfo *ri, X509 *recip,
+ EVP_PKEY *pk, unsigned int flags);
+int cms_RecipientInfo_kari_encrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri);
/* PWRI routines */
int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
diff --git a/crypto/cms/cms_lib.c b/crypto/cms/cms_lib.c
index e93846168038..d6cb60d02d15 100644
--- a/crypto/cms/cms_lib.c
+++ b/crypto/cms/cms_lib.c
@@ -53,7 +53,7 @@
*/
#include <openssl/asn1t.h>
-#include <openssl/x509.h>
+#include <openssl/x509v3.h>
#include <openssl/err.h>
#include <openssl/pem.h>
#include <openssl/bio.h>
@@ -593,3 +593,60 @@ STACK_OF(X509_CRL) *CMS_get1_crls(CMS_ContentInfo *cms)
}
return crls;
}
+
+int cms_ias_cert_cmp(CMS_IssuerAndSerialNumber *ias, X509 *cert)
+{
+ int ret;
+ ret = X509_NAME_cmp(ias->issuer, X509_get_issuer_name(cert));
+ if (ret)
+ return ret;
+ return ASN1_INTEGER_cmp(ias->serialNumber, X509_get_serialNumber(cert));
+}
+
+int cms_keyid_cert_cmp(ASN1_OCTET_STRING *keyid, X509 *cert)
+{
+ X509_check_purpose(cert, -1, -1);
+ if (!cert->skid)
+ return -1;
+ return ASN1_OCTET_STRING_cmp(keyid, cert->skid);
+}
+
+int cms_set1_ias(CMS_IssuerAndSerialNumber **pias, X509 *cert)
+{
+ CMS_IssuerAndSerialNumber *ias;
+ ias = M_ASN1_new_of(CMS_IssuerAndSerialNumber);
+ if (!ias)
+ goto err;
+ if (!X509_NAME_set(&ias->issuer, X509_get_issuer_name(cert)))
+ goto err;
+ if (!ASN1_STRING_copy(ias->serialNumber, X509_get_serialNumber(cert)))
+ goto err;
+ if (*pias)
+ M_ASN1_free_of(*pias, CMS_IssuerAndSerialNumber);
+ *pias = ias;
+ return 1;
+ err:
+ if (ias)
+ M_ASN1_free_of(ias, CMS_IssuerAndSerialNumber);
+ CMSerr(CMS_F_CMS_SET1_IAS, ERR_R_MALLOC_FAILURE);
+ return 0;
+}
+
+int cms_set1_keyid(ASN1_OCTET_STRING **pkeyid, X509 *cert)
+{
+ ASN1_OCTET_STRING *keyid = NULL;
+ X509_check_purpose(cert, -1, -1);
+ if (!cert->skid) {
+ CMSerr(CMS_F_CMS_SET1_KEYID, CMS_R_CERTIFICATE_HAS_NO_KEYID);
+ return 0;
+ }
+ keyid = ASN1_STRING_dup(cert->skid);
+ if (!keyid) {
+ CMSerr(CMS_F_CMS_SET1_KEYID, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ if (*pkeyid)
+ ASN1_OCTET_STRING_free(*pkeyid);
+ *pkeyid = keyid;
+ return 1;
+}
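
A side note on naming: the set1 suffix on these helpers is the usual libcrypto ownership convention, where the function stores its own copy and frees whatever it replaces, while the caller keeps ownership of the argument. A minimal illustration of the pattern (not part of the patch):

#include <openssl/asn1.h>

static int set1_octet_string(ASN1_OCTET_STRING **dst, ASN1_OCTET_STRING *src)
{
    ASN1_OCTET_STRING *cpy;

    cpy = ASN1_STRING_dup(src);         /* take our own copy */
    if (cpy == NULL)
        return 0;
    if (*dst != NULL)
        ASN1_OCTET_STRING_free(*dst);   /* release any previous value */
    *dst = cpy;
    return 1;
}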
diff --git a/crypto/cms/cms_sd.c b/crypto/cms/cms_sd.c
index 6daa2627dbab..721ffd5afb85 100644
--- a/crypto/cms/cms_sd.c
+++ b/crypto/cms/cms_sd.c
@@ -55,6 +55,7 @@
#include "cryptlib.h"
#include <openssl/asn1t.h>
#include <openssl/pem.h>
+#include <openssl/x509.h>
#include <openssl/x509v3.h>
#include <openssl/err.h>
#include <openssl/cms.h>
@@ -197,27 +198,13 @@ int cms_set1_SignerIdentifier(CMS_SignerIdentifier *sid, X509 *cert, int type)
{
switch (type) {
case CMS_SIGNERINFO_ISSUER_SERIAL:
- sid->d.issuerAndSerialNumber =
- M_ASN1_new_of(CMS_IssuerAndSerialNumber);
- if (!sid->d.issuerAndSerialNumber)
- goto merr;
- if (!X509_NAME_set(&sid->d.issuerAndSerialNumber->issuer,
- X509_get_issuer_name(cert)))
- goto merr;
- if (!ASN1_STRING_copy(sid->d.issuerAndSerialNumber->serialNumber,
- X509_get_serialNumber(cert)))
- goto merr;
+ if (!cms_set1_ias(&sid->d.issuerAndSerialNumber, cert))
+ return 0;
break;
case CMS_SIGNERINFO_KEYIDENTIFIER:
- if (!cert->skid) {
- CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER,
- CMS_R_CERTIFICATE_HAS_NO_KEYID);
+ if (!cms_set1_keyid(&sid->d.subjectKeyIdentifier, cert))
return 0;
- }
- sid->d.subjectKeyIdentifier = ASN1_STRING_dup(cert->skid);
- if (!sid->d.subjectKeyIdentifier)
- goto merr;
break;
default:
@@ -228,11 +215,6 @@ int cms_set1_SignerIdentifier(CMS_SignerIdentifier *sid, X509 *cert, int type)
sid->type = type;
return 1;
-
- merr:
- CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER, ERR_R_MALLOC_FAILURE);
- return 0;
-
}
int cms_SignerIdentifier_get0_signer_id(CMS_SignerIdentifier *sid,
@@ -255,23 +237,32 @@ int cms_SignerIdentifier_get0_signer_id(CMS_SignerIdentifier *sid,
int cms_SignerIdentifier_cert_cmp(CMS_SignerIdentifier *sid, X509 *cert)
{
- int ret;
- if (sid->type == CMS_SIGNERINFO_ISSUER_SERIAL) {
- ret = X509_NAME_cmp(sid->d.issuerAndSerialNumber->issuer,
- X509_get_issuer_name(cert));
- if (ret)
- return ret;
- return ASN1_INTEGER_cmp(sid->d.issuerAndSerialNumber->serialNumber,
- X509_get_serialNumber(cert));
- } else if (sid->type == CMS_SIGNERINFO_KEYIDENTIFIER) {
- X509_check_purpose(cert, -1, -1);
- if (!cert->skid)
- return -1;
- return ASN1_OCTET_STRING_cmp(sid->d.subjectKeyIdentifier, cert->skid);
- } else
+ if (sid->type == CMS_SIGNERINFO_ISSUER_SERIAL)
+ return cms_ias_cert_cmp(sid->d.issuerAndSerialNumber, cert);
+ else if (sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ return cms_keyid_cert_cmp(sid->d.subjectKeyIdentifier, cert);
+ else
return -1;
}
+static int cms_sd_asn1_ctrl(CMS_SignerInfo *si, int cmd)
+{
+ EVP_PKEY *pkey = si->pkey;
+ int i;
+ if (!pkey->ameth || !pkey->ameth->pkey_ctrl)
+ return 1;
+ i = pkey->ameth->pkey_ctrl(pkey, ASN1_PKEY_CTRL_CMS_SIGN, cmd, si);
+ if (i == -2) {
+ CMSerr(CMS_F_CMS_SD_ASN1_CTRL, CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ return 0;
+ }
+ if (i <= 0) {
+ CMSerr(CMS_F_CMS_SD_ASN1_CTRL, CMS_R_CTRL_FAILURE);
+ return 0;
+ }
+ return 1;
+}
+
CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
X509 *signer, EVP_PKEY *pk, const EVP_MD *md,
unsigned int flags)
@@ -298,6 +289,8 @@ CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
si->pkey = pk;
si->signer = signer;
+ EVP_MD_CTX_init(&si->mctx);
+ si->pctx = NULL;
if (flags & CMS_USE_KEYID) {
si->version = 3;
@@ -350,19 +343,8 @@ CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
}
}
- if (pk->ameth && pk->ameth->pkey_ctrl) {
- i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_SIGN, 0, si);
- if (i == -2) {
- CMSerr(CMS_F_CMS_ADD1_SIGNER,
- CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
- goto err;
- }
- if (i <= 0) {
- CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_CTRL_FAILURE);
- goto err;
- }
- }
-
+ if (!(flags & CMS_KEY_PARAM) && !cms_sd_asn1_ctrl(si, 0))
+ goto err;
if (!(flags & CMS_NOATTR)) {
/*
* Initialize signed attributes structure so other attributes
@@ -386,7 +368,8 @@ CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
if (flags & CMS_REUSE_DIGEST) {
if (!cms_copy_messageDigest(cms, si))
goto err;
- if (!(flags & CMS_PARTIAL) && !CMS_SignerInfo_sign(si))
+ if (!(flags & (CMS_PARTIAL | CMS_KEY_PARAM)) &&
+ !CMS_SignerInfo_sign(si))
goto err;
}
}
@@ -397,6 +380,20 @@ CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
goto merr;
}
+ if (flags & CMS_KEY_PARAM) {
+ if (flags & CMS_NOATTR) {
+ si->pctx = EVP_PKEY_CTX_new(si->pkey, NULL);
+ if (!si->pctx)
+ goto err;
+ if (EVP_PKEY_sign_init(si->pctx) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_set_signature_md(si->pctx, md) <= 0)
+ goto err;
+ } else if (EVP_DigestSignInit(&si->mctx, &si->pctx, md, NULL, pk) <=
+ 0)
+ goto err;
+ }
+
if (!sd->signerInfos)
sd->signerInfos = sk_CMS_SignerInfo_new_null();
if (!sd->signerInfos || !sk_CMS_SignerInfo_push(sd->signerInfos, si))
@@ -443,6 +440,16 @@ static int cms_add1_signingTime(CMS_SignerInfo *si, ASN1_TIME *t)
}
+EVP_PKEY_CTX *CMS_SignerInfo_get0_pkey_ctx(CMS_SignerInfo *si)
+{
+ return si->pctx;
+}
+
+EVP_MD_CTX *CMS_SignerInfo_get0_md_ctx(CMS_SignerInfo *si)
+{
+ return &si->mctx;
+}
+
STACK_OF(CMS_SignerInfo) *CMS_get0_SignerInfos(CMS_ContentInfo *cms)
{
CMS_SignedData *sd;
@@ -561,11 +568,17 @@ void CMS_SignerInfo_get0_algs(CMS_SignerInfo *si, EVP_PKEY **pk,
*psig = si->signatureAlgorithm;
}
+ASN1_OCTET_STRING *CMS_SignerInfo_get0_signature(CMS_SignerInfo *si)
+{
+ return si->signature;
+}
+
static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
CMS_SignerInfo *si, BIO *chain)
{
EVP_MD_CTX mctx;
int r = 0;
+ EVP_PKEY_CTX *pctx = NULL;
EVP_MD_CTX_init(&mctx);
if (!si->pkey) {
@@ -575,6 +588,9 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, si->digestAlgorithm))
goto err;
+ /* Set SignerInfo algorithm details if we used custom parameters */
+ if (si->pctx && !cms_sd_asn1_ctrl(si, 0))
+ goto err;
/*
* If any signed attributes calculate and add messageDigest attribute
@@ -596,6 +612,23 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
goto err;
if (!CMS_SignerInfo_sign(si))
goto err;
+ } else if (si->pctx) {
+ unsigned char *sig;
+ size_t siglen;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ unsigned int mdlen;
+ pctx = si->pctx;
+ if (!EVP_DigestFinal_ex(&mctx, md, &mdlen))
+ goto err;
+ siglen = EVP_PKEY_size(si->pkey);
+ sig = OPENSSL_malloc(siglen);
+ if (!sig) {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_PKEY_sign(pctx, sig, &siglen, md, mdlen) <= 0)
+ goto err;
+ ASN1_STRING_set0(si->signature, sig, siglen);
} else {
unsigned char *sig;
unsigned int siglen;
@@ -616,6 +649,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
err:
EVP_MD_CTX_cleanup(&mctx);
+ if (pctx)
+ EVP_PKEY_CTX_free(pctx);
return r;
}
@@ -637,7 +672,7 @@ int cms_SignedData_final(CMS_ContentInfo *cms, BIO *chain)
int CMS_SignerInfo_sign(CMS_SignerInfo *si)
{
- EVP_MD_CTX mctx;
+ EVP_MD_CTX *mctx = &si->mctx;
EVP_PKEY_CTX *pctx;
unsigned char *abuf = NULL;
int alen;
@@ -648,15 +683,18 @@ int CMS_SignerInfo_sign(CMS_SignerInfo *si)
if (md == NULL)
return 0;
- EVP_MD_CTX_init(&mctx);
-
if (CMS_signed_get_attr_by_NID(si, NID_pkcs9_signingTime, -1) < 0) {
if (!cms_add1_signingTime(si, NULL))
goto err;
}
- if (EVP_DigestSignInit(&mctx, &pctx, md, NULL, si->pkey) <= 0)
- goto err;
+ if (si->pctx)
+ pctx = si->pctx;
+ else {
+ EVP_MD_CTX_init(mctx);
+ if (EVP_DigestSignInit(mctx, &pctx, md, NULL, si->pkey) <= 0)
+ goto err;
+ }
if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_SIGN,
EVP_PKEY_CTRL_CMS_SIGN, 0, si) <= 0) {
@@ -668,15 +706,15 @@ int CMS_SignerInfo_sign(CMS_SignerInfo *si)
ASN1_ITEM_rptr(CMS_Attributes_Sign));
if (!abuf)
goto err;
- if (EVP_DigestSignUpdate(&mctx, abuf, alen) <= 0)
+ if (EVP_DigestSignUpdate(mctx, abuf, alen) <= 0)
goto err;
- if (EVP_DigestSignFinal(&mctx, NULL, &siglen) <= 0)
+ if (EVP_DigestSignFinal(mctx, NULL, &siglen) <= 0)
goto err;
OPENSSL_free(abuf);
abuf = OPENSSL_malloc(siglen);
if (!abuf)
goto err;
- if (EVP_DigestSignFinal(&mctx, abuf, &siglen) <= 0)
+ if (EVP_DigestSignFinal(mctx, abuf, &siglen) <= 0)
goto err;
if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_SIGN,
@@ -685,7 +723,7 @@ int CMS_SignerInfo_sign(CMS_SignerInfo *si)
goto err;
}
- EVP_MD_CTX_cleanup(&mctx);
+ EVP_MD_CTX_cleanup(mctx);
ASN1_STRING_set0(si->signature, abuf, siglen);
@@ -694,15 +732,14 @@ int CMS_SignerInfo_sign(CMS_SignerInfo *si)
err:
if (abuf)
OPENSSL_free(abuf);
- EVP_MD_CTX_cleanup(&mctx);
+ EVP_MD_CTX_cleanup(mctx);
return 0;
}
int CMS_SignerInfo_verify(CMS_SignerInfo *si)
{
- EVP_MD_CTX mctx;
- EVP_PKEY_CTX *pctx;
+ EVP_MD_CTX *mctx = &si->mctx;
unsigned char *abuf = NULL;
int alen, r = -1;
const EVP_MD *md = NULL;
@@ -715,26 +752,29 @@ int CMS_SignerInfo_verify(CMS_SignerInfo *si)
md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
if (md == NULL)
return -1;
- EVP_MD_CTX_init(&mctx);
- if (EVP_DigestVerifyInit(&mctx, &pctx, md, NULL, si->pkey) <= 0)
+ EVP_MD_CTX_init(mctx);
+ if (EVP_DigestVerifyInit(mctx, &si->pctx, md, NULL, si->pkey) <= 0)
+ goto err;
+
+ if (!cms_sd_asn1_ctrl(si, 1))
goto err;
alen = ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs, &abuf,
ASN1_ITEM_rptr(CMS_Attributes_Verify));
if (!abuf)
goto err;
- r = EVP_DigestVerifyUpdate(&mctx, abuf, alen);
+ r = EVP_DigestVerifyUpdate(mctx, abuf, alen);
OPENSSL_free(abuf);
if (r <= 0) {
r = -1;
goto err;
}
- r = EVP_DigestVerifyFinal(&mctx,
+ r = EVP_DigestVerifyFinal(mctx,
si->signature->data, si->signature->length);
if (r <= 0)
CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY, CMS_R_VERIFICATION_FAILURE);
err:
- EVP_MD_CTX_cleanup(&mctx);
+ EVP_MD_CTX_cleanup(mctx);
return r;
}
@@ -773,7 +813,10 @@ int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain)
{
ASN1_OCTET_STRING *os = NULL;
EVP_MD_CTX mctx;
+ EVP_PKEY_CTX *pkctx = NULL;
int r = -1;
+ unsigned char mval[EVP_MAX_MD_SIZE];
+ unsigned int mlen;
EVP_MD_CTX_init(&mctx);
/* If we have any signed attributes look for messageDigest value */
if (CMS_signed_get_attr_count(si) >= 0) {
@@ -790,16 +833,15 @@ int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain)
if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, si->digestAlgorithm))
goto err;
+ if (EVP_DigestFinal_ex(&mctx, mval, &mlen) <= 0) {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_UNABLE_TO_FINALIZE_CONTEXT);
+ goto err;
+ }
+
/* If messageDigest found compare it */
if (os) {
- unsigned char mval[EVP_MAX_MD_SIZE];
- unsigned int mlen;
- if (EVP_DigestFinal_ex(&mctx, mval, &mlen) <= 0) {
- CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
- CMS_R_UNABLE_TO_FINALIZE_CONTEXT);
- goto err;
- }
if (mlen != (unsigned int)os->length) {
CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH);
@@ -813,8 +855,17 @@ int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain)
} else
r = 1;
} else {
- r = EVP_VerifyFinal(&mctx, si->signature->data,
- si->signature->length, si->pkey);
+ const EVP_MD *md = EVP_MD_CTX_md(&mctx);
+ pkctx = EVP_PKEY_CTX_new(si->pkey, NULL);
+ if (pkctx == NULL)
+ goto err;
+ if (EVP_PKEY_verify_init(pkctx) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_set_signature_md(pkctx, md) <= 0)
+ goto err;
+ si->pctx = pkctx;
+ if (!cms_sd_asn1_ctrl(si, 1))
+ goto err;
+ r = EVP_PKEY_verify(pkctx, si->signature->data,
+ si->signature->length, mval, mlen);
if (r <= 0) {
CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
CMS_R_VERIFICATION_FAILURE);
@@ -823,6 +874,8 @@ int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain)
}
err:
+ if (pkctx)
+ EVP_PKEY_CTX_free(pkctx);
EVP_MD_CTX_cleanup(&mctx);
return r;
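
A note on the new flow: CMS_KEY_PARAM defers the asn1 ctrl and the signing step so the caller can adjust the signer's EVP_PKEY_CTX first, via the CMS_SignerInfo_get0_pkey_ctx() accessor added above. The following is a minimal sketch of that pattern, assuming an already-loaded certificate and key and using RSA-PSS purely as an example parameter; error paths are compressed.

#include <openssl/cms.h>
#include <openssl/evp.h>
#include <openssl/rsa.h>

static CMS_ContentInfo *sign_pss(X509 *signcert, EVP_PKEY *pkey, BIO *data)
{
    CMS_ContentInfo *cms;
    CMS_SignerInfo *si;
    EVP_PKEY_CTX *pctx;

    /* Create a partial structure: nothing is signed yet */
    cms = CMS_sign(NULL, NULL, NULL, NULL, CMS_PARTIAL);
    if (cms == NULL)
        return NULL;
    /* CMS_KEY_PARAM keeps the pkey context alive for tweaking */
    si = CMS_add1_signer(cms, signcert, pkey, EVP_sha256(),
                         CMS_PARTIAL | CMS_KEY_PARAM);
    if (si == NULL)
        goto err;
    pctx = CMS_SignerInfo_get0_pkey_ctx(si);
    if (pctx == NULL
        || EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) <= 0)
        goto err;
    /* Now finalise: the custom parameters are used for the signature */
    if (!CMS_final(cms, data, NULL, 0))
        goto err;
    return cms;
 err:
    CMS_ContentInfo_free(cms);
    return NULL;
}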
diff --git a/crypto/cms/cms_smime.c b/crypto/cms/cms_smime.c
index 8b37560821ad..5522a376acb6 100644
--- a/crypto/cms/cms_smime.c
+++ b/crypto/cms/cms_smime.c
@@ -59,6 +59,7 @@
#include <openssl/err.h>
#include <openssl/cms.h>
#include "cms_lcl.h"
+#include "asn1_locl.h"
static int cms_copy_content(BIO *out, BIO *in, unsigned int flags)
{
@@ -373,7 +374,7 @@ int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
tmpin = BIO_new_mem_buf(ptr, len);
if (tmpin == NULL) {
CMSerr(CMS_F_CMS_VERIFY, ERR_R_MALLOC_FAILURE);
- return 0;
+ goto err2;
}
} else
tmpin = dcont;
@@ -404,6 +405,7 @@ int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
else
BIO_free_all(cmsbio);
+ err2:
if (cms_certs)
sk_X509_pop_free(cms_certs, X509_free);
if (crls)
@@ -567,25 +569,63 @@ CMS_ContentInfo *CMS_encrypt(STACK_OF(X509) *certs, BIO *data,
return NULL;
}
+static int cms_kari_set1_pkey(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+ EVP_PKEY *pk, X509 *cert)
+{
+ int i;
+ STACK_OF(CMS_RecipientEncryptedKey) *reks;
+ CMS_RecipientEncryptedKey *rek;
+ reks = CMS_RecipientInfo_kari_get0_reks(ri);
+ if (!cert)
+ return 0;
+ for (i = 0; i < sk_CMS_RecipientEncryptedKey_num(reks); i++) {
+ int rv;
+ rek = sk_CMS_RecipientEncryptedKey_value(reks, i);
+ if (CMS_RecipientEncryptedKey_cert_cmp(rek, cert))
+ continue;
+ CMS_RecipientInfo_kari_set0_pkey(ri, pk);
+ rv = CMS_RecipientInfo_kari_decrypt(cms, ri, rek);
+ CMS_RecipientInfo_kari_set0_pkey(ri, NULL);
+ if (rv > 0)
+ return 1;
+ return -1;
+ }
+ return 0;
+}
+
int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert)
{
STACK_OF(CMS_RecipientInfo) *ris;
CMS_RecipientInfo *ri;
- int i, r;
- int debug = 0, ri_match = 0;
+ int i, r, ri_type;
+ int debug = 0, match_ri = 0;
ris = CMS_get0_RecipientInfos(cms);
if (ris)
debug = cms->d.envelopedData->encryptedContentInfo->debug;
+ ri_type = cms_pkey_get_ri_type(pk);
+ if (ri_type == CMS_RECIPINFO_NONE) {
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ return 0;
+ }
+
for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++) {
ri = sk_CMS_RecipientInfo_value(ris, i);
- if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_TRANS)
+ if (CMS_RecipientInfo_type(ri) != ri_type)
continue;
- ri_match = 1;
+ match_ri = 1;
+ if (ri_type == CMS_RECIPINFO_AGREE) {
+ r = cms_kari_set1_pkey(cms, ri, pk, cert);
+ if (r > 0)
+ return 1;
+ if (r < 0)
+ return 0;
+ }
/*
* If we have a cert try matching RecipientInfo otherwise try them
* all.
*/
- if (!cert || (CMS_RecipientInfo_ktri_cert_cmp(ri, cert) == 0)) {
+ else if (!cert || !CMS_RecipientInfo_ktri_cert_cmp(ri, cert)) {
CMS_RecipientInfo_set0_pkey(ri, pk);
r = CMS_RecipientInfo_decrypt(cms, ri);
CMS_RecipientInfo_set0_pkey(ri, NULL);
@@ -613,7 +653,7 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert)
}
}
/* If no cert and not debugging, always return success */
- if (ri_match && !cert && !debug) {
+ if (match_ri && !cert && !debug) {
ERR_clear_error();
return 1;
}
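
At the public API level, the revised loop supports anonymous trial decryption: with no certificate every type-compatible RecipientInfo is tried, and the unconditional-success path after the loop avoids revealing which attempt failed (the usual Bleichenbacher countermeasure). A hedged sketch of that usage, with cms, pkey and out assumed to be set up by the caller:

#include <openssl/cms.h>

static int decrypt_no_cert(CMS_ContentInfo *cms, EVP_PKEY *pkey, BIO *out)
{
    /* Try pkey against every compatible RecipientInfo */
    if (!CMS_decrypt_set1_pkey(cms, pkey, NULL))
        return 0;
    /* Key already set above, so pass NULL key and cert here */
    return CMS_decrypt(cms, NULL, NULL, NULL, out, 0);
}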
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index c654a5c84f2c..ca0e3ccc0c7a 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -653,7 +653,7 @@ const char *CRYPTO_get_lock_name(int type)
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_AMD64) || defined(_M_X64)
-unsigned int OPENSSL_ia32cap_P[2];
+extern unsigned int OPENSSL_ia32cap_P[4];
unsigned long *OPENSSL_ia32cap_loc(void)
{
if (sizeof(long) == 4)
@@ -663,6 +663,9 @@ unsigned long *OPENSSL_ia32cap_loc(void)
* is 32-bit.
*/
OPENSSL_ia32cap_P[1] = 0;
+
+ OPENSSL_ia32cap_P[2] = 0;
+
return (unsigned long *)OPENSSL_ia32cap_P;
}
@@ -676,7 +679,7 @@ typedef unsigned long long IA32CAP;
void OPENSSL_cpuid_setup(void)
{
static int trigger = 0;
- IA32CAP OPENSSL_ia32_cpuid(void);
+ IA32CAP OPENSSL_ia32_cpuid(unsigned int *);
IA32CAP vec;
char *env;
@@ -694,9 +697,23 @@ void OPENSSL_cpuid_setup(void)
vec = strtoul(env + off, NULL, 0);
# endif
if (off)
- vec = OPENSSL_ia32_cpuid() & ~vec;
+ vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P) & ~vec;
+ else if (env[0] == ':')
+ vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P);
+
+ OPENSSL_ia32cap_P[2] = 0;
+ if ((env = strchr(env, ':'))) {
+ unsigned int vecx;
+ env++;
+ off = (env[0] == '~') ? 1 : 0;
+ vecx = strtoul(env + off, NULL, 0);
+ if (off)
+ OPENSSL_ia32cap_P[2] &= ~vecx;
+ else
+ OPENSSL_ia32cap_P[2] = vecx;
+ }
} else
- vec = OPENSSL_ia32_cpuid();
+ vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P);
/*
* |(1<<10) sets a reserved bit to signal that variable
@@ -706,6 +723,8 @@ void OPENSSL_cpuid_setup(void)
OPENSSL_ia32cap_P[0] = (unsigned int)vec | (1 << 10);
OPENSSL_ia32cap_P[1] = (unsigned int)(vec >> 32);
}
+# else
+unsigned int OPENSSL_ia32cap_P[4];
# endif
#else
@@ -857,8 +876,12 @@ void OPENSSL_showfatal(const char *fmta, ...)
if ((h = GetStdHandle(STD_ERROR_HANDLE)) != NULL &&
GetFileType(h) != FILE_TYPE_UNKNOWN) {
/* must be console application */
+ int len;
+ DWORD out;
+
va_start(ap, fmta);
- vfprintf(stderr, fmta, ap);
+ len = _vsnprintf((char *)buf, sizeof(buf), fmta, ap);
+ WriteFile(h, buf, len < 0 ? sizeof(buf) : (DWORD) len, &out, NULL);
va_end(ap);
return;
}
@@ -965,7 +988,9 @@ void OpenSSLDie(const char *file, int line, const char *assertion)
/*
* Win32 abort() customarily shows a dialog, but we just did that...
*/
+# if !defined(_WIN32_WCE)
raise(SIGABRT);
+# endif
_exit(3);
#endif
}
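
For completeness, a capability can also be masked programmatically before first use, which mirrors what the extended colon syntax of the OPENSSL_ia32cap environment variable above does at startup. A sketch on that assumption; the AVX bit position (leaf-1 ECX bit 28, i.e. word 1 of the vector) is taken from the OPENSSL_ia32cap documentation and should be verified against your headers:

#include <openssl/crypto.h>
#include <openssl/evp.h>

int main(void)
{
    /* Backing store is now unsigned int[4], per the change above */
    unsigned int *cap = (unsigned int *)OPENSSL_ia32cap_loc();

    cap[1] &= ~(1U << 28);          /* clear AVX before any cipher init */
    OpenSSL_add_all_algorithms();   /* later selection sees the mask */
    return 0;
}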
diff --git a/crypto/cversion.c b/crypto/cversion.c
index 9e6f50d78182..bfff6995c12e 100644
--- a/crypto/cversion.c
+++ b/crypto/cversion.c
@@ -68,7 +68,11 @@ const char *SSLeay_version(int t)
return OPENSSL_VERSION_TEXT;
if (t == SSLEAY_BUILT_ON) {
#ifdef DATE
+# ifdef OPENSSL_USE_BUILD_DATE
return (DATE);
+# else
+ return ("built on: reproducible build, date unspecified");
+# endif
#else
return ("built on: date not available");
#endif
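
The effect is directly observable with SSLeay_version(); a trivial check:

#include <stdio.h>
#include <openssl/crypto.h>

int main(void)
{
    /* Without OPENSSL_USE_BUILD_DATE this now prints the fixed
     * "reproducible build" string instead of the compile date. */
    printf("%s\n", SSLeay_version(SSLEAY_BUILT_ON));
    return 0;
}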
diff --git a/crypto/des/Makefile b/crypto/des/Makefile
index fbc77c163cd3..8b5166ca9ff5 100644
--- a/crypto/des/Makefile
+++ b/crypto/des/Makefile
@@ -61,6 +61,8 @@ des: des.o cbc3_enc.o lib
des_enc-sparc.S: asm/des_enc.m4
m4 -B 8192 asm/des_enc.m4 > des_enc-sparc.S
+dest4-sparcv9.s: asm/dest4-sparcv9.pl
+ $(PERL) asm/dest4-sparcv9.pl $(CFLAGS) > $@
des-586.s: asm/des-586.pl ../perlasm/x86asm.pl ../perlasm/cbc.pl
$(PERL) asm/des-586.pl $(PERLASM_SCHEME) $(CFLAGS) > $@
diff --git a/crypto/des/asm/des-586.pl b/crypto/des/asm/des-586.pl
index 5b5f39cebd13..bd6a7dd6b76b 100644
--- a/crypto/des/asm/des-586.pl
+++ b/crypto/des/asm/des-586.pl
@@ -25,6 +25,7 @@ $small_footprint=1 if (grep(/\-DOPENSSL_SMALL_FOOTPRINT/,@ARGV));
# the folded loop is only 3% slower than unrolled, but >7 times smaller
&public_label("DES_SPtrans");
+&static_label("des_sptrans");
&DES_encrypt_internal();
&DES_decrypt_internal();
@@ -158,7 +159,7 @@ sub DES_encrypt
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop($trans);
- &lea ($trans,&DWP(&label("DES_SPtrans")."-".&label("pic_point"),$trans));
+ &lea ($trans,&DWP(&label("des_sptrans")."-".&label("pic_point"),$trans));
&mov( "ecx", &wparam(1) );
@@ -315,6 +316,7 @@ sub FP_new
sub DES_SPtrans
{
&set_label("DES_SPtrans",64);
+ &set_label("des_sptrans");
&data_word(0x02080800, 0x00080000, 0x02000002, 0x02080802);
&data_word(0x02000000, 0x00080802, 0x00080002, 0x02000002);
&data_word(0x00080802, 0x02080800, 0x02080000, 0x00000802);
diff --git a/crypto/des/asm/des_enc.m4 b/crypto/des/asm/des_enc.m4
index 328059547820..dda08e126dc6 100644
--- a/crypto/des/asm/des_enc.m4
+++ b/crypto/des/asm/des_enc.m4
@@ -46,6 +46,8 @@
.ident "des_enc.m4 2.1"
.file "des_enc-sparc.S"
+#include <openssl/opensslconf.h>
+
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64 /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
diff --git a/crypto/des/asm/dest4-sparcv9.pl b/crypto/des/asm/dest4-sparcv9.pl
new file mode 100755
index 000000000000..1dc60243d4fb
--- /dev/null
+++ b/crypto/des/asm/dest4-sparcv9.pl
@@ -0,0 +1,617 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. March 2013. All rights reserved.
+# ====================================================================
+
+######################################################################
+# DES for SPARC T4.
+#
+# As with other hardware-assisted ciphers CBC encrypt results [for
+# aligned data] are virtually identical to critical path lengths:
+#
+# DES Triple-DES
+# CBC encrypt 4.14/4.15(*) 11.7/11.7
+# CBC decrypt 1.77/4.11(**) 6.42/7.47
+#
+# (*) numbers after slash are for
+# misaligned data;
+# (**) this is the result for the
+# largest block size; unlike in
+# all other cases, results for
+# smaller blocks are better[?];
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$code.=<<___ if ($::abibits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+___
+
+$code.=<<___;
+.text
+___
+
+{ my ($inp,$out)=("%o0","%o1");
+
+$code.=<<___;
+.align 32
+.globl des_t4_key_expand
+.type des_t4_key_expand,#function
+des_t4_key_expand:
+ andcc $inp, 0x7, %g0
+ alignaddr $inp, %g0, $inp
+ bz,pt %icc, 1f
+ ldd [$inp + 0x00], %f0
+ ldd [$inp + 0x08], %f2
+ faligndata %f0, %f2, %f0
+1: des_kexpand %f0, 0, %f0
+ des_kexpand %f0, 1, %f2
+ std %f0, [$out + 0x00]
+ des_kexpand %f2, 3, %f6
+ std %f2, [$out + 0x08]
+ des_kexpand %f2, 2, %f4
+ des_kexpand %f6, 3, %f10
+ std %f6, [$out + 0x18]
+ des_kexpand %f6, 2, %f8
+ std %f4, [$out + 0x10]
+ des_kexpand %f10, 3, %f14
+ std %f10, [$out + 0x28]
+ des_kexpand %f10, 2, %f12
+ std %f8, [$out + 0x20]
+ des_kexpand %f14, 1, %f16
+ std %f14, [$out + 0x38]
+ des_kexpand %f16, 3, %f20
+ std %f12, [$out + 0x30]
+ des_kexpand %f16, 2, %f18
+ std %f16, [$out + 0x40]
+ des_kexpand %f20, 3, %f24
+ std %f20, [$out + 0x50]
+ des_kexpand %f20, 2, %f22
+ std %f18, [$out + 0x48]
+ des_kexpand %f24, 3, %f28
+ std %f24, [$out + 0x60]
+ des_kexpand %f24, 2, %f26
+ std %f22, [$out + 0x58]
+ des_kexpand %f28, 1, %f30
+ std %f28, [$out + 0x70]
+ std %f26, [$out + 0x68]
+ retl
+ std %f30, [$out + 0x78]
+.size des_t4_key_expand,.-des_t4_key_expand
+___
+}
+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
+ my ($ileft,$iright,$omask) = map("%g$_",(1..3));
+
+$code.=<<___;
+.globl des_t4_cbc_encrypt
+.align 32
+des_t4_cbc_encrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort
+ nop
+ ld [$ivec + 0], %f0 ! load ivec
+ ld [$ivec + 4], %f1
+
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright
+ and $out, 7, %g4
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask
+ srlx $len, 3, $len
+ movrz %g4, 0, $omask
+ prefetch [$out], 22
+
+ ldd [$key + 0x00], %f4 ! load key schedule
+ ldd [$key + 0x08], %f6
+ ldd [$key + 0x10], %f8
+ ldd [$key + 0x18], %f10
+ ldd [$key + 0x20], %f12
+ ldd [$key + 0x28], %f14
+ ldd [$key + 0x30], %f16
+ ldd [$key + 0x38], %f18
+ ldd [$key + 0x40], %f20
+ ldd [$key + 0x48], %f22
+ ldd [$key + 0x50], %f24
+ ldd [$key + 0x58], %f26
+ ldd [$key + 0x60], %f28
+ ldd [$key + 0x68], %f30
+ ldd [$key + 0x70], %f32
+ ldd [$key + 0x78], %f34
+
+.Ldes_cbc_enc_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f
+ nop
+
+ ldx [$inp + 8], %g5
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f2
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ fxor %f2, %f0, %f0 ! ^= ivec
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ des_round %f20, %f22, %f0, %f0
+ des_round %f24, %f26, %f0, %f0
+ des_round %f28, %f30, %f0, %f0
+ des_round %f32, %f34, %f0, %f0
+ des_iip %f0, %f0
+
+ brnz,pn $omask, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_cbc_enc_loop
+ add $out, 8, $out
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+.Lcbc_abort:
+ retl
+ nop
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~4x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f2 ! handle unaligned output
+
+ stda %f2, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask
+ stda %f2, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_cbc_enc_loop+4
+ orn %g0, $omask, $omask
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+.type des_t4_cbc_encrypt,#function
+.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
+
+.globl des_t4_cbc_decrypt
+.align 32
+des_t4_cbc_decrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort
+ nop
+ ld [$ivec + 0], %f2 ! load ivec
+ ld [$ivec + 4], %f3
+
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright
+ and $out, 7, %g4
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask
+ srlx $len, 3, $len
+ movrz %g4, 0, $omask
+ prefetch [$out], 22
+
+ ldd [$key + 0x78], %f4 ! load key schedule
+ ldd [$key + 0x70], %f6
+ ldd [$key + 0x68], %f8
+ ldd [$key + 0x60], %f10
+ ldd [$key + 0x58], %f12
+ ldd [$key + 0x50], %f14
+ ldd [$key + 0x48], %f16
+ ldd [$key + 0x40], %f18
+ ldd [$key + 0x38], %f20
+ ldd [$key + 0x30], %f22
+ ldd [$key + 0x28], %f24
+ ldd [$key + 0x20], %f26
+ ldd [$key + 0x18], %f28
+ ldd [$key + 0x10], %f30
+ ldd [$key + 0x08], %f32
+ ldd [$key + 0x00], %f34
+
+.Ldes_cbc_dec_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f
+ nop
+
+ ldx [$inp + 8], %g5
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f0
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ des_round %f20, %f22, %f0, %f0
+ des_round %f24, %f26, %f0, %f0
+ des_round %f28, %f30, %f0, %f0
+ des_round %f32, %f34, %f0, %f0
+ des_iip %f0, %f0
+
+ fxor %f2, %f0, %f0 ! ^= ivec
+ movxtod %g4, %f2
+
+ brnz,pn $omask, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_cbc_dec_loop
+ add $out, 8, $out
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~4x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f0 ! handle unaligned output
+
+ stda %f0, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_cbc_dec_loop+4
+ orn %g0, $omask, $omask
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+.type des_t4_cbc_decrypt,#function
+.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
+___
+
+# One might wonder why there are back-to-back des_iip/des_ip pairs
+# between EDE passes. Indeed, aren't they inverses of each other?
+# They almost are. The outcome of the pair is that the 32-bit words
+# are swapped in the target register. Consider the des_iip/des_ip
+# pair as a way to perform that swap; it's actually the fastest way
+# in this case.
+
+$code.=<<___;
+.globl des_t4_ede3_cbc_encrypt
+.align 32
+des_t4_ede3_cbc_encrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort
+ nop
+ ld [$ivec + 0], %f0 ! load ivec
+ ld [$ivec + 4], %f1
+
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright
+ and $out, 7, %g4
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask
+ srlx $len, 3, $len
+ movrz %g4, 0, $omask
+ prefetch [$out], 22
+
+ ldd [$key + 0x00], %f4 ! load key schedule
+ ldd [$key + 0x08], %f6
+ ldd [$key + 0x10], %f8
+ ldd [$key + 0x18], %f10
+ ldd [$key + 0x20], %f12
+ ldd [$key + 0x28], %f14
+ ldd [$key + 0x30], %f16
+ ldd [$key + 0x38], %f18
+ ldd [$key + 0x40], %f20
+ ldd [$key + 0x48], %f22
+ ldd [$key + 0x50], %f24
+ ldd [$key + 0x58], %f26
+ ldd [$key + 0x60], %f28
+ ldd [$key + 0x68], %f30
+ ldd [$key + 0x70], %f32
+ ldd [$key + 0x78], %f34
+
+.Ldes_ede3_cbc_enc_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f
+ nop
+
+ ldx [$inp + 8], %g5
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f2
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ fxor %f2, %f0, %f0 ! ^= ivec
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ ldd [$key + 0x100-0x08], %f36
+ ldd [$key + 0x100-0x10], %f38
+ des_round %f20, %f22, %f0, %f0
+ ldd [$key + 0x100-0x18], %f40
+ ldd [$key + 0x100-0x20], %f42
+ des_round %f24, %f26, %f0, %f0
+ ldd [$key + 0x100-0x28], %f44
+ ldd [$key + 0x100-0x30], %f46
+ des_round %f28, %f30, %f0, %f0
+ ldd [$key + 0x100-0x38], %f48
+ ldd [$key + 0x100-0x40], %f50
+ des_round %f32, %f34, %f0, %f0
+ ldd [$key + 0x100-0x48], %f52
+ ldd [$key + 0x100-0x50], %f54
+ des_iip %f0, %f0
+
+ ldd [$key + 0x100-0x58], %f56
+ ldd [$key + 0x100-0x60], %f58
+ des_ip %f0, %f0
+ ldd [$key + 0x100-0x68], %f60
+ ldd [$key + 0x100-0x70], %f62
+ des_round %f36, %f38, %f0, %f0
+ ldd [$key + 0x100-0x78], %f36
+ ldd [$key + 0x100-0x80], %f38
+ des_round %f40, %f42, %f0, %f0
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ ldd [$key + 0x100+0x00], %f40
+ ldd [$key + 0x100+0x08], %f42
+ des_round %f52, %f54, %f0, %f0
+ ldd [$key + 0x100+0x10], %f44
+ ldd [$key + 0x100+0x18], %f46
+ des_round %f56, %f58, %f0, %f0
+ ldd [$key + 0x100+0x20], %f48
+ ldd [$key + 0x100+0x28], %f50
+ des_round %f60, %f62, %f0, %f0
+ ldd [$key + 0x100+0x30], %f52
+ ldd [$key + 0x100+0x38], %f54
+ des_round %f36, %f38, %f0, %f0
+ ldd [$key + 0x100+0x40], %f56
+ ldd [$key + 0x100+0x48], %f58
+ des_iip %f0, %f0
+
+ ldd [$key + 0x100+0x50], %f60
+ ldd [$key + 0x100+0x58], %f62
+ des_ip %f0, %f0
+ ldd [$key + 0x100+0x60], %f36
+ ldd [$key + 0x100+0x68], %f38
+ des_round %f40, %f42, %f0, %f0
+ ldd [$key + 0x100+0x70], %f40
+ ldd [$key + 0x100+0x78], %f42
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ des_round %f52, %f54, %f0, %f0
+ des_round %f56, %f58, %f0, %f0
+ des_round %f60, %f62, %f0, %f0
+ des_round %f36, %f38, %f0, %f0
+ des_round %f40, %f42, %f0, %f0
+ des_iip %f0, %f0
+
+ brnz,pn $omask, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop
+ add $out, 8, $out
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~2x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f2 ! handle unaligned output
+
+ stda %f2, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask
+ stda %f2, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4
+ orn %g0, $omask, $omask
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+.type des_t4_ede3_cbc_encrypt,#function
+.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
+
+.globl des_t4_ede3_cbc_decrypt
+.align 32
+des_t4_ede3_cbc_decrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort
+ nop
+ ld [$ivec + 0], %f2 ! load ivec
+ ld [$ivec + 4], %f3
+
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright
+ and $out, 7, %g4
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask
+ srlx $len, 3, $len
+ movrz %g4, 0, $omask
+ prefetch [$out], 22
+
+ ldd [$key + 0x100+0x78], %f4 ! load key schedule
+ ldd [$key + 0x100+0x70], %f6
+ ldd [$key + 0x100+0x68], %f8
+ ldd [$key + 0x100+0x60], %f10
+ ldd [$key + 0x100+0x58], %f12
+ ldd [$key + 0x100+0x50], %f14
+ ldd [$key + 0x100+0x48], %f16
+ ldd [$key + 0x100+0x40], %f18
+ ldd [$key + 0x100+0x38], %f20
+ ldd [$key + 0x100+0x30], %f22
+ ldd [$key + 0x100+0x28], %f24
+ ldd [$key + 0x100+0x20], %f26
+ ldd [$key + 0x100+0x18], %f28
+ ldd [$key + 0x100+0x10], %f30
+ ldd [$key + 0x100+0x08], %f32
+ ldd [$key + 0x100+0x00], %f34
+
+.Ldes_ede3_cbc_dec_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f
+ nop
+
+ ldx [$inp + 8], %g5
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f0
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ ldd [$key + 0x80+0x00], %f36
+ ldd [$key + 0x80+0x08], %f38
+ des_round %f20, %f22, %f0, %f0
+ ldd [$key + 0x80+0x10], %f40
+ ldd [$key + 0x80+0x18], %f42
+ des_round %f24, %f26, %f0, %f0
+ ldd [$key + 0x80+0x20], %f44
+ ldd [$key + 0x80+0x28], %f46
+ des_round %f28, %f30, %f0, %f0
+ ldd [$key + 0x80+0x30], %f48
+ ldd [$key + 0x80+0x38], %f50
+ des_round %f32, %f34, %f0, %f0
+ ldd [$key + 0x80+0x40], %f52
+ ldd [$key + 0x80+0x48], %f54
+ des_iip %f0, %f0
+
+ ldd [$key + 0x80+0x50], %f56
+ ldd [$key + 0x80+0x58], %f58
+ des_ip %f0, %f0
+ ldd [$key + 0x80+0x60], %f60
+ ldd [$key + 0x80+0x68], %f62
+ des_round %f36, %f38, %f0, %f0
+ ldd [$key + 0x80+0x70], %f36
+ ldd [$key + 0x80+0x78], %f38
+ des_round %f40, %f42, %f0, %f0
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ ldd [$key + 0x80-0x08], %f40
+ ldd [$key + 0x80-0x10], %f42
+ des_round %f52, %f54, %f0, %f0
+ ldd [$key + 0x80-0x18], %f44
+ ldd [$key + 0x80-0x20], %f46
+ des_round %f56, %f58, %f0, %f0
+ ldd [$key + 0x80-0x28], %f48
+ ldd [$key + 0x80-0x30], %f50
+ des_round %f60, %f62, %f0, %f0
+ ldd [$key + 0x80-0x38], %f52
+ ldd [$key + 0x80-0x40], %f54
+ des_round %f36, %f38, %f0, %f0
+ ldd [$key + 0x80-0x48], %f56
+ ldd [$key + 0x80-0x50], %f58
+ des_iip %f0, %f0
+
+ ldd [$key + 0x80-0x58], %f60
+ ldd [$key + 0x80-0x60], %f62
+ des_ip %f0, %f0
+ ldd [$key + 0x80-0x68], %f36
+ ldd [$key + 0x80-0x70], %f38
+ des_round %f40, %f42, %f0, %f0
+ ldd [$key + 0x80-0x78], %f40
+ ldd [$key + 0x80-0x80], %f42
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ des_round %f52, %f54, %f0, %f0
+ des_round %f56, %f58, %f0, %f0
+ des_round %f60, %f62, %f0, %f0
+ des_round %f36, %f38, %f0, %f0
+ des_round %f40, %f42, %f0, %f0
+ des_iip %f0, %f0
+
+ fxor %f2, %f0, %f0 ! ^= ivec
+ movxtod %g4, %f2
+
+ brnz,pn $omask, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop
+ add $out, 8, $out
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f0 ! handle unaligned output
+
+ stda %f0, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4
+ orn %g0, $omask, $omask
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+.type des_t4_ede3_cbc_decrypt,#function
+.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
+___
+}
+$code.=<<___;
+.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler();
+
+close STDOUT;
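
These routines are not called directly by applications; on a T4 they are expected to back the generic DES EVP ciphers (the glue is outside this excerpt, so treat that routing as an assumption). Ordinary portable code like the sketch below then picks up the speedup:

#include <string.h>
#include <openssl/evp.h>

int main(void)
{
    unsigned char key[24] = { 0 }, iv[8] = { 0 };   /* demo values only */
    unsigned char in[16] = "sixteen bytes..", out[32];
    int outl = 0, tmpl = 0;
    EVP_CIPHER_CTX ctx;

    EVP_CIPHER_CTX_init(&ctx);
    /* EDE3-CBC: the des_t4_ede3_cbc_* routines above service this on T4 */
    if (EVP_EncryptInit_ex(&ctx, EVP_des_ede3_cbc(), NULL, key, iv) <= 0
        || EVP_EncryptUpdate(&ctx, out, &outl, in, sizeof(in)) <= 0
        || EVP_EncryptFinal_ex(&ctx, out + outl, &tmpl) <= 0)
        return 1;
    EVP_CIPHER_CTX_cleanup(&ctx);
    return 0;
}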
diff --git a/crypto/des/des_locl.h b/crypto/des/des_locl.h
index 1a8e41dd29c7..23ea9d32a7be 100644
--- a/crypto/des/des_locl.h
+++ b/crypto/des/des_locl.h
@@ -162,8 +162,10 @@
} \
}
-# if (defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER)) || defined(__ICC)
+# if (defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER))
# define ROTATE(a,n) (_lrotr(a,n))
+# elif defined(__ICC)
+# define ROTATE(a,n) (_rotr(a,n))
# elif defined(__GNUC__) && __GNUC__>=2 && !defined(__STRICT_ANSI__) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
# define ROTATE(a,n) ({ register unsigned int ret; \
diff --git a/crypto/des/read_pwd.c b/crypto/des/read_pwd.c
index 16ba0a9eb2a1..514a7063b4bf 100644
--- a/crypto/des/read_pwd.c
+++ b/crypto/des/read_pwd.c
@@ -172,7 +172,7 @@
# include <sys/ioctl.h>
#endif
-#if defined(OPENSSL_SYS_MSDOS) && !defined(__CYGWIN32__) && !defined(OPENSSL_SYS_WINCE)
+#if defined(OPENSSL_SYS_MSDOS) && !defined(OPENSSL_SYS_WINCE)
# include <conio.h>
# define fgets(a,b,c) noecho_fgets(a,b,c)
#endif
diff --git a/crypto/dh/Makefile b/crypto/dh/Makefile
index 6d574f407d39..46fa5ac57b47 100644
--- a/crypto/dh/Makefile
+++ b/crypto/dh/Makefile
@@ -18,9 +18,9 @@ APPS=
LIB=$(TOP)/libcrypto.a
LIBSRC= dh_asn1.c dh_gen.c dh_key.c dh_lib.c dh_check.c dh_err.c dh_depr.c \
- dh_ameth.c dh_pmeth.c dh_prn.c
+ dh_ameth.c dh_pmeth.c dh_prn.c dh_rfc5114.c dh_kdf.c
LIBOBJ= dh_asn1.o dh_gen.o dh_key.o dh_lib.o dh_check.o dh_err.o dh_depr.o \
- dh_ameth.o dh_pmeth.o dh_prn.o
+ dh_ameth.o dh_pmeth.o dh_prn.o dh_rfc5114.o dh_kdf.o
SRC= $(LIBSRC)
@@ -80,13 +80,13 @@ clean:
dh_ameth.o: ../../e_os.h ../../include/openssl/asn1.h
dh_ameth.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
-dh_ameth.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-dh_ameth.o: ../../include/openssl/dh.h ../../include/openssl/e_os2.h
-dh_ameth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
-dh_ameth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
-dh_ameth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
-dh_ameth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
-dh_ameth.o: ../../include/openssl/opensslconf.h
+dh_ameth.o: ../../include/openssl/buffer.h ../../include/openssl/cms.h
+dh_ameth.o: ../../include/openssl/crypto.h ../../include/openssl/dh.h
+dh_ameth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+dh_ameth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+dh_ameth.o: ../../include/openssl/err.h ../../include/openssl/evp.h
+dh_ameth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
+dh_ameth.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
dh_ameth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
dh_ameth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
dh_ameth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
@@ -134,6 +134,19 @@ dh_gen.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
dh_gen.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
dh_gen.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
dh_gen.o: ../cryptlib.h dh_gen.c
+dh_kdf.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
+dh_kdf.o: ../../include/openssl/buffer.h ../../include/openssl/cms.h
+dh_kdf.o: ../../include/openssl/crypto.h ../../include/openssl/dh.h
+dh_kdf.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+dh_kdf.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+dh_kdf.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
+dh_kdf.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+dh_kdf.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
+dh_kdf.o: ../../include/openssl/ossl_typ.h ../../include/openssl/pkcs7.h
+dh_kdf.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
+dh_kdf.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+dh_kdf.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h
+dh_kdf.o: dh_kdf.c
dh_key.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
dh_key.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
dh_key.o: ../../include/openssl/dh.h ../../include/openssl/e_os2.h
@@ -160,11 +173,12 @@ dh_pmeth.o: ../../e_os.h ../../include/openssl/asn1.h
dh_pmeth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h
dh_pmeth.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h
dh_pmeth.o: ../../include/openssl/crypto.h ../../include/openssl/dh.h
-dh_pmeth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
-dh_pmeth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
-dh_pmeth.o: ../../include/openssl/err.h ../../include/openssl/evp.h
-dh_pmeth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
-dh_pmeth.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
+dh_pmeth.o: ../../include/openssl/dsa.h ../../include/openssl/e_os2.h
+dh_pmeth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
+dh_pmeth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
+dh_pmeth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
+dh_pmeth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+dh_pmeth.o: ../../include/openssl/opensslconf.h
dh_pmeth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
dh_pmeth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
dh_pmeth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
@@ -180,3 +194,11 @@ dh_prn.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
dh_prn.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
dh_prn.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
dh_prn.o: ../../include/openssl/symhacks.h ../cryptlib.h dh_prn.c
+dh_rfc5114.o: ../../e_os.h ../../include/openssl/bio.h
+dh_rfc5114.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h
+dh_rfc5114.o: ../../include/openssl/crypto.h ../../include/openssl/dh.h
+dh_rfc5114.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+dh_rfc5114.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+dh_rfc5114.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+dh_rfc5114.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+dh_rfc5114.o: ../../include/openssl/symhacks.h ../cryptlib.h dh_rfc5114.c
diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index 4cbaa9784dba..0502f1a9cc14 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h
@@ -167,6 +167,9 @@ struct dh_st {
# define DH_CHECK_P_NOT_SAFE_PRIME 0x02
# define DH_UNABLE_TO_CHECK_GENERATOR 0x04
# define DH_NOT_SUITABLE_GENERATOR 0x08
+# define DH_CHECK_Q_NOT_PRIME 0x10
+# define DH_CHECK_INVALID_Q_VALUE 0x20
+# define DH_CHECK_INVALID_J_VALUE 0x40
/* DH_check_pub_key error codes */
# define DH_CHECK_PUBKEY_TOO_SMALL 0x01
@@ -217,8 +220,11 @@ int DH_check(const DH *dh, int *codes);
int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *codes);
int DH_generate_key(DH *dh);
int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh);
+int DH_compute_key_padded(unsigned char *key, const BIGNUM *pub_key, DH *dh);
DH *d2i_DHparams(DH **a, const unsigned char **pp, long length);
int i2d_DHparams(const DH *a, unsigned char **pp);
+DH *d2i_DHxparams(DH **a, const unsigned char **pp, long length);
+int i2d_DHxparams(const DH *a, unsigned char **pp);
# ifndef OPENSSL_NO_FP_API
int DHparams_print_fp(FILE *fp, const DH *x);
# endif
@@ -228,16 +234,109 @@ int DHparams_print(BIO *bp, const DH *x);
int DHparams_print(char *bp, const DH *x);
# endif
+/* RFC 5114 parameters */
+DH *DH_get_1024_160(void);
+DH *DH_get_2048_224(void);
+DH *DH_get_2048_256(void);
+
+/* RFC2631 KDF */
+int DH_KDF_X9_42(unsigned char *out, size_t outlen,
+ const unsigned char *Z, size_t Zlen,
+ ASN1_OBJECT *key_oid,
+ const unsigned char *ukm, size_t ukmlen, const EVP_MD *md);
+
# define EVP_PKEY_CTX_set_dh_paramgen_prime_len(ctx, len) \
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_PARAMGEN, \
EVP_PKEY_CTRL_DH_PARAMGEN_PRIME_LEN, len, NULL)
+# define EVP_PKEY_CTX_set_dh_paramgen_subprime_len(ctx, len) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_PARAMGEN, \
+ EVP_PKEY_CTRL_DH_PARAMGEN_SUBPRIME_LEN, len, NULL)
+
+# define EVP_PKEY_CTX_set_dh_paramgen_type(ctx, typ) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_PARAMGEN, \
+ EVP_PKEY_CTRL_DH_PARAMGEN_TYPE, typ, NULL)
+
# define EVP_PKEY_CTX_set_dh_paramgen_generator(ctx, gen) \
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_PARAMGEN, \
EVP_PKEY_CTRL_DH_PARAMGEN_GENERATOR, gen, NULL)
+# define EVP_PKEY_CTX_set_dh_rfc5114(ctx, gen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, EVP_PKEY_OP_PARAMGEN, \
+ EVP_PKEY_CTRL_DH_RFC5114, gen, NULL)
+
+# define EVP_PKEY_CTX_set_dhx_rfc5114(ctx, gen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, EVP_PKEY_OP_PARAMGEN, \
+ EVP_PKEY_CTRL_DH_RFC5114, gen, NULL)
+
+# define EVP_PKEY_CTX_set_dh_kdf_type(ctx, kdf) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_DH_KDF_TYPE, kdf, NULL)
+
+# define EVP_PKEY_CTX_get_dh_kdf_type(ctx) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_DH_KDF_TYPE, -2, NULL)
+
+# define EVP_PKEY_CTX_set0_dh_kdf_oid(ctx, oid) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_DH_KDF_OID, 0, (void *)oid)
+
+# define EVP_PKEY_CTX_get0_dh_kdf_oid(ctx, poid) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_DH_KDF_OID, 0, (void *)poid)
+
+# define EVP_PKEY_CTX_set_dh_kdf_md(ctx, md) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_DH_KDF_MD, 0, (void *)md)
+
+# define EVP_PKEY_CTX_get_dh_kdf_md(ctx, pmd) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_DH_KDF_MD, 0, (void *)pmd)
+
+# define EVP_PKEY_CTX_set_dh_kdf_outlen(ctx, len) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_DH_KDF_OUTLEN, len, NULL)
+
+# define EVP_PKEY_CTX_get_dh_kdf_outlen(ctx, plen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_DH_KDF_OUTLEN, 0, (void *)plen)
+
+# define EVP_PKEY_CTX_set0_dh_kdf_ukm(ctx, p, plen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_DH_KDF_UKM, plen, (void *)p)
+
+# define EVP_PKEY_CTX_get0_dh_kdf_ukm(ctx, p) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DHX, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_DH_KDF_UKM, 0, (void *)p)
+
# define EVP_PKEY_CTRL_DH_PARAMGEN_PRIME_LEN (EVP_PKEY_ALG_CTRL + 1)
# define EVP_PKEY_CTRL_DH_PARAMGEN_GENERATOR (EVP_PKEY_ALG_CTRL + 2)
+# define EVP_PKEY_CTRL_DH_RFC5114 (EVP_PKEY_ALG_CTRL + 3)
+# define EVP_PKEY_CTRL_DH_PARAMGEN_SUBPRIME_LEN (EVP_PKEY_ALG_CTRL + 4)
+# define EVP_PKEY_CTRL_DH_PARAMGEN_TYPE (EVP_PKEY_ALG_CTRL + 5)
+# define EVP_PKEY_CTRL_DH_KDF_TYPE (EVP_PKEY_ALG_CTRL + 6)
+# define EVP_PKEY_CTRL_DH_KDF_MD (EVP_PKEY_ALG_CTRL + 7)
+# define EVP_PKEY_CTRL_GET_DH_KDF_MD (EVP_PKEY_ALG_CTRL + 8)
+# define EVP_PKEY_CTRL_DH_KDF_OUTLEN (EVP_PKEY_ALG_CTRL + 9)
+# define EVP_PKEY_CTRL_GET_DH_KDF_OUTLEN (EVP_PKEY_ALG_CTRL + 10)
+# define EVP_PKEY_CTRL_DH_KDF_UKM (EVP_PKEY_ALG_CTRL + 11)
+# define EVP_PKEY_CTRL_GET_DH_KDF_UKM (EVP_PKEY_ALG_CTRL + 12)
+# define EVP_PKEY_CTRL_DH_KDF_OID (EVP_PKEY_ALG_CTRL + 13)
+# define EVP_PKEY_CTRL_GET_DH_KDF_OID (EVP_PKEY_ALG_CTRL + 14)
+
+/* KDF types */
+# define EVP_PKEY_DH_KDF_NONE 1
+# define EVP_PKEY_DH_KDF_X9_42 2
/* BEGIN ERROR CODES */
/*
@@ -252,6 +351,9 @@ void ERR_load_DH_strings(void);
# define DH_F_COMPUTE_KEY 102
# define DH_F_DHPARAMS_PRINT_FP 101
# define DH_F_DH_BUILTIN_GENPARAMS 106
+# define DH_F_DH_CMS_DECRYPT 117
+# define DH_F_DH_CMS_SET_PEERKEY 118
+# define DH_F_DH_CMS_SET_SHARED_INFO 119
# define DH_F_DH_COMPUTE_KEY 114
# define DH_F_DH_GENERATE_KEY 115
# define DH_F_DH_GENERATE_PARAMETERS_EX 116
@@ -273,6 +375,7 @@ void ERR_load_DH_strings(void);
# define DH_R_BN_ERROR 106
# define DH_R_DECODE_ERROR 104
# define DH_R_INVALID_PUBKEY 102
+# define DH_R_KDF_PARAMETER_ERROR 112
# define DH_R_KEYS_NOT_SET 108
# define DH_R_KEY_SIZE_TOO_SMALL 110
# define DH_R_MODULUS_TOO_LARGE 103
@@ -280,6 +383,8 @@ void ERR_load_DH_strings(void);
# define DH_R_NO_PARAMETERS_SET 107
# define DH_R_NO_PRIVATE_VALUE 100
# define DH_R_PARAMETER_ENCODING_ERROR 105
+# define DH_R_PEER_KEY_ERROR 113
+# define DH_R_SHARED_INFO_ERROR 114
#ifdef __cplusplus
}
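
The ctrl macros above map onto the generic derive API. A hedged sketch of an X9.42 KDF derivation follows; it assumes our_key and peer_key are already-loaded X9.42 DH (EVP_PKEY_DHX) keys, and the ESDH OID is chosen only for illustration.

#include <openssl/evp.h>
#include <openssl/dh.h>
#include <openssl/objects.h>

static int derive_x942(EVP_PKEY *our_key, EVP_PKEY *peer_key,
                       unsigned char *out, size_t outlen)
{
    EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(our_key, NULL);
    int rv = 0;

    if (ctx == NULL || EVP_PKEY_derive_init(ctx) <= 0)
        goto err;
    if (EVP_PKEY_derive_set_peer(ctx, peer_key) <= 0)
        goto err;
    if (EVP_PKEY_CTX_set_dh_kdf_type(ctx, EVP_PKEY_DH_KDF_X9_42) <= 0)
        goto err;
    if (EVP_PKEY_CTX_set_dh_kdf_md(ctx, EVP_sha1()) <= 0)
        goto err;
    if (EVP_PKEY_CTX_set_dh_kdf_outlen(ctx, (int)outlen) <= 0)
        goto err;
    /* set0: the context takes ownership of the OID */
    if (EVP_PKEY_CTX_set0_dh_kdf_oid(ctx,
            OBJ_nid2obj(NID_id_smime_alg_ESDH)) <= 0)
        goto err;
    if (EVP_PKEY_derive(ctx, out, &outlen) <= 0)
        goto err;
    rv = 1;
 err:
    EVP_PKEY_CTX_free(ctx);
    return rv;
}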
diff --git a/crypto/dh/dh_ameth.c b/crypto/dh/dh_ameth.c
index 873eb2e22de8..ac72468bd14b 100644
--- a/crypto/dh/dh_ameth.c
+++ b/crypto/dh/dh_ameth.c
@@ -63,6 +63,31 @@
#include <openssl/dh.h>
#include <openssl/bn.h>
#include "asn1_locl.h"
+#ifndef OPENSSL_NO_CMS
+# include <openssl/cms.h>
+#endif
+
+extern const EVP_PKEY_ASN1_METHOD dhx_asn1_meth;
+
+/*
+ * i2d/d2i-like DH parameter functions which use the appropriate routine for
+ * PKCS#3 DH or X9.42 DH.
+ */
+
+static DH *d2i_dhp(const EVP_PKEY *pkey, const unsigned char **pp,
+ long length)
+{
+ if (pkey->ameth == &dhx_asn1_meth)
+ return d2i_DHxparams(NULL, pp, length);
+ return d2i_DHparams(NULL, pp, length);
+}
+
+static int i2d_dhp(const EVP_PKEY *pkey, const DH *a, unsigned char **pp)
+{
+ if (pkey->ameth == &dhx_asn1_meth)
+ return i2d_DHxparams(a, pp);
+ return i2d_DHparams(a, pp);
+}
static void int_dh_free(EVP_PKEY *pkey)
{
@@ -94,7 +119,7 @@ static int dh_pub_decode(EVP_PKEY *pkey, X509_PUBKEY *pubkey)
pm = pstr->data;
pmlen = pstr->length;
- if (!(dh = d2i_DHparams(NULL, &pm, pmlen))) {
+ if (!(dh = d2i_dhp(pkey, &pm, pmlen))) {
DHerr(DH_F_DH_PUB_DECODE, DH_R_DECODE_ERROR);
goto err;
}
@@ -111,7 +136,7 @@ static int dh_pub_decode(EVP_PKEY *pkey, X509_PUBKEY *pubkey)
}
ASN1_INTEGER_free(public_key);
- EVP_PKEY_assign_DH(pkey, dh);
+ EVP_PKEY_assign(pkey, pkey->ameth->pkey_id, dh);
return 1;
err:
@@ -139,7 +164,7 @@ static int dh_pub_encode(X509_PUBKEY *pk, const EVP_PKEY *pkey)
DHerr(DH_F_DH_PUB_ENCODE, ERR_R_MALLOC_FAILURE);
goto err;
}
- str->length = i2d_DHparams(dh, &str->data);
+ str->length = i2d_dhp(pkey, dh, &str->data);
if (str->length <= 0) {
DHerr(DH_F_DH_PUB_ENCODE, ERR_R_MALLOC_FAILURE);
goto err;
@@ -159,7 +184,7 @@ static int dh_pub_encode(X509_PUBKEY *pk, const EVP_PKEY *pkey)
goto err;
}
- if (X509_PUBKEY_set0_param(pk, OBJ_nid2obj(EVP_PKEY_DH),
+ if (X509_PUBKEY_set0_param(pk, OBJ_nid2obj(pkey->ameth->pkey_id),
ptype, str, penc, penclen))
return 1;
@@ -204,7 +229,7 @@ static int dh_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
pstr = pval;
pm = pstr->data;
pmlen = pstr->length;
- if (!(dh = d2i_DHparams(NULL, &pm, pmlen)))
+ if (!(dh = d2i_dhp(pkey, &pm, pmlen)))
goto decerr;
/* We have parameters now set private key */
if (!(dh->priv_key = ASN1_INTEGER_to_BN(privkey, NULL))) {
@@ -215,7 +240,7 @@ static int dh_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
if (!DH_generate_key(dh))
goto dherr;
- EVP_PKEY_assign_DH(pkey, dh);
+ EVP_PKEY_assign(pkey, pkey->ameth->pkey_id, dh);
ASN1_STRING_clear_free(privkey);
@@ -243,7 +268,7 @@ static int dh_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
goto err;
}
- params->length = i2d_DHparams(pkey->pkey.dh, &params->data);
+ params->length = i2d_dhp(pkey, pkey->pkey.dh, &params->data);
if (params->length <= 0) {
DHerr(DH_F_DH_PRIV_ENCODE, ERR_R_MALLOC_FAILURE);
goto err;
@@ -263,7 +288,7 @@ static int dh_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
ASN1_STRING_clear_free(prkey);
prkey = NULL;
- if (!PKCS8_pkey_set0(p8, OBJ_nid2obj(NID_dhKeyAgreement), 0,
+ if (!PKCS8_pkey_set0(p8, OBJ_nid2obj(pkey->ameth->pkey_id), 0,
V_ASN1_SEQUENCE, params, dp, dplen))
goto err;
@@ -292,17 +317,17 @@ static int dh_param_decode(EVP_PKEY *pkey,
const unsigned char **pder, int derlen)
{
DH *dh;
- if (!(dh = d2i_DHparams(NULL, pder, derlen))) {
+ if (!(dh = d2i_dhp(pkey, pder, derlen))) {
DHerr(DH_F_DH_PARAM_DECODE, ERR_R_DH_LIB);
return 0;
}
- EVP_PKEY_assign_DH(pkey, dh);
+ EVP_PKEY_assign(pkey, pkey->ameth->pkey_id, dh);
return 1;
}
static int dh_param_encode(const EVP_PKEY *pkey, unsigned char **pder)
{
- return i2d_DHparams(pkey->pkey.dh, pder);
+ return i2d_dhp(pkey, pkey->pkey.dh, pder);
}
static int do_dh_print(BIO *bp, const DH *x, int indent,
@@ -334,15 +359,18 @@ static int do_dh_print(BIO *bp, const DH *x, int indent,
}
update_buflen(x->g, &buf_len);
+ update_buflen(x->q, &buf_len);
+ update_buflen(x->j, &buf_len);
+ update_buflen(x->counter, &buf_len);
update_buflen(pub_key, &buf_len);
update_buflen(priv_key, &buf_len);
if (ptype == 2)
- ktype = "PKCS#3 DH Private-Key";
+ ktype = "DH Private-Key";
else if (ptype == 1)
- ktype = "PKCS#3 DH Public-Key";
+ ktype = "DH Public-Key";
else
- ktype = "PKCS#3 DH Parameters";
+ ktype = "DH Parameters";
m = OPENSSL_malloc(buf_len + 10);
if (m == NULL) {
@@ -364,6 +392,29 @@ static int do_dh_print(BIO *bp, const DH *x, int indent,
goto err;
if (!ASN1_bn_print(bp, "generator:", x->g, m, indent))
goto err;
+ if (x->q && !ASN1_bn_print(bp, "subgroup order:", x->q, m, indent))
+ goto err;
+ if (x->j && !ASN1_bn_print(bp, "subgroup factor:", x->j, m, indent))
+ goto err;
+ if (x->seed) {
+ int i;
+ BIO_indent(bp, indent, 128);
+ BIO_puts(bp, "seed:");
+ for (i = 0; i < x->seedlen; i++) {
+ if ((i % 15) == 0) {
+ if (BIO_puts(bp, "\n") <= 0
+ || !BIO_indent(bp, indent + 4, 128))
+ goto err;
+ }
+ if (BIO_printf(bp, "%02x%s", x->seed[i],
+ ((i + 1) == x->seedlen) ? "" : ":") <= 0)
+ goto err;
+ }
+ if (BIO_write(bp, "\n", 1) <= 0)
+ goto err;
+ }
+ if (x->counter && !ASN1_bn_print(bp, "counter:", x->counter, m, indent))
+ goto err;
if (x->length != 0) {
BIO_indent(bp, indent, 128);
if (BIO_printf(bp, "recommended-private-length: %d bits\n",
@@ -396,29 +447,76 @@ static int dh_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b)
if (BN_cmp(a->pkey.dh->p, b->pkey.dh->p) ||
BN_cmp(a->pkey.dh->g, b->pkey.dh->g))
return 0;
- else
- return 1;
+ else if (a->ameth == &dhx_asn1_meth) {
+ if (BN_cmp(a->pkey.dh->q, b->pkey.dh->q))
+ return 0;
+ }
+ return 1;
}
-static int dh_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from)
+static int int_dh_bn_cpy(BIGNUM **dst, const BIGNUM *src)
{
BIGNUM *a;
+ if (src) {
+ a = BN_dup(src);
+ if (!a)
+ return 0;
+ } else
+ a = NULL;
+ if (*dst)
+ BN_free(*dst);
+ *dst = a;
+ return 1;
+}
- if ((a = BN_dup(from->pkey.dh->p)) == NULL)
+static int int_dh_param_copy(DH *to, const DH *from, int is_x942)
+{
+ if (is_x942 == -1)
+ is_x942 = !!from->q;
+ if (!int_dh_bn_cpy(&to->p, from->p))
return 0;
- if (to->pkey.dh->p != NULL)
- BN_free(to->pkey.dh->p);
- to->pkey.dh->p = a;
-
- if ((a = BN_dup(from->pkey.dh->g)) == NULL)
+ if (!int_dh_bn_cpy(&to->g, from->g))
return 0;
- if (to->pkey.dh->g != NULL)
- BN_free(to->pkey.dh->g);
- to->pkey.dh->g = a;
-
+ if (is_x942) {
+ if (!int_dh_bn_cpy(&to->q, from->q))
+ return 0;
+ if (!int_dh_bn_cpy(&to->j, from->j))
+ return 0;
+ if (to->seed) {
+ OPENSSL_free(to->seed);
+ to->seed = NULL;
+ to->seedlen = 0;
+ }
+ if (from->seed) {
+ to->seed = BUF_memdup(from->seed, from->seedlen);
+ if (!to->seed)
+ return 0;
+ to->seedlen = from->seedlen;
+ }
+ } else
+ to->length = from->length;
return 1;
}
+DH *DHparams_dup(DH *dh)
+{
+ DH *ret;
+ ret = DH_new();
+ if (!ret)
+ return NULL;
+ if (!int_dh_param_copy(ret, dh, -1)) {
+ DH_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
+static int dh_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from)
+{
+ return int_dh_param_copy(to->pkey.dh, from->pkey.dh,
+ from->ameth == &dhx_asn1_meth);
+}
+
static int dh_missing_parameters(const EVP_PKEY *a)
{
if (!a->pkey.dh->p || !a->pkey.dh->g)
@@ -459,6 +557,33 @@ int DHparams_print(BIO *bp, const DH *x)
return do_dh_print(bp, x, 4, NULL, 0);
}
+#ifndef OPENSSL_NO_CMS
+static int dh_cms_decrypt(CMS_RecipientInfo *ri);
+static int dh_cms_encrypt(CMS_RecipientInfo *ri);
+#endif
+
+static int dh_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
+{
+ switch (op) {
+#ifndef OPENSSL_NO_CMS
+
+ case ASN1_PKEY_CTRL_CMS_ENVELOPE:
+ if (arg1 == 1)
+ return dh_cms_decrypt(arg2);
+ else if (arg1 == 0)
+ return dh_cms_encrypt(arg2);
+ return -2;
+
+ case ASN1_PKEY_CTRL_CMS_RI_TYPE:
+ *(int *)arg2 = CMS_RECIPINFO_AGREE;
+ return 1;
+#endif
+ default:
+ return -2;
+ }
+
+}
+
const EVP_PKEY_ASN1_METHOD dh_asn1_meth = {
EVP_PKEY_DH,
EVP_PKEY_DH,
@@ -490,3 +615,343 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth = {
int_dh_free,
0
};
+
+const EVP_PKEY_ASN1_METHOD dhx_asn1_meth = {
+ EVP_PKEY_DHX,
+ EVP_PKEY_DHX,
+ 0,
+
+ "X9.42 DH",
+ "OpenSSL X9.42 DH method",
+
+ dh_pub_decode,
+ dh_pub_encode,
+ dh_pub_cmp,
+ dh_public_print,
+
+ dh_priv_decode,
+ dh_priv_encode,
+ dh_private_print,
+
+ int_dh_size,
+ dh_bits,
+
+ dh_param_decode,
+ dh_param_encode,
+ dh_missing_parameters,
+ dh_copy_parameters,
+ dh_cmp_parameters,
+ dh_param_print,
+ 0,
+
+ int_dh_free,
+ dh_pkey_ctrl
+};
+
+#ifndef OPENSSL_NO_CMS
+
+static int dh_cms_set_peerkey(EVP_PKEY_CTX *pctx,
+ X509_ALGOR *alg, ASN1_BIT_STRING *pubkey)
+{
+ ASN1_OBJECT *aoid;
+ int atype;
+ void *aval;
+ ASN1_INTEGER *public_key = NULL;
+ int rv = 0;
+ EVP_PKEY *pkpeer = NULL, *pk = NULL;
+ DH *dhpeer = NULL;
+ const unsigned char *p;
+ int plen;
+
+ X509_ALGOR_get0(&aoid, &atype, &aval, alg);
+ if (OBJ_obj2nid(aoid) != NID_dhpublicnumber)
+ goto err;
+ /* Only absent parameters allowed in RFC XXXX */
+ if (atype != V_ASN1_UNDEF && atype != V_ASN1_NULL)
+ goto err;
+
+ pk = EVP_PKEY_CTX_get0_pkey(pctx);
+ if (!pk)
+ goto err;
+ if (pk->type != EVP_PKEY_DHX)
+ goto err;
+ /* Get parameters from parent key */
+ dhpeer = DHparams_dup(pk->pkey.dh);
+ if (dhpeer == NULL)
+ goto err;
+ /* We have parameters; now decode the peer public key */
+ plen = ASN1_STRING_length(pubkey);
+ p = ASN1_STRING_data(pubkey);
+ if (!p || !plen)
+ goto err;
+
+ if (!(public_key = d2i_ASN1_INTEGER(NULL, &p, plen))) {
+ DHerr(DH_F_DH_CMS_SET_PEERKEY, DH_R_DECODE_ERROR);
+ goto err;
+ }
+
+ /* We have the parameters; now set the public key */
+ if (!(dhpeer->pub_key = ASN1_INTEGER_to_BN(public_key, NULL))) {
+ DHerr(DH_F_DH_CMS_SET_PEERKEY, DH_R_BN_DECODE_ERROR);
+ goto err;
+ }
+
+ pkpeer = EVP_PKEY_new();
+ if (!pkpeer)
+ goto err;
+ EVP_PKEY_assign(pkpeer, pk->ameth->pkey_id, dhpeer);
+ dhpeer = NULL;
+ if (EVP_PKEY_derive_set_peer(pctx, pkpeer) > 0)
+ rv = 1;
+ err:
+ if (public_key)
+ ASN1_INTEGER_free(public_key);
+ if (pkpeer)
+ EVP_PKEY_free(pkpeer);
+ if (dhpeer)
+ DH_free(dhpeer);
+ return rv;
+}
+
+static int dh_cms_set_shared_info(EVP_PKEY_CTX *pctx, CMS_RecipientInfo *ri)
+{
+ int rv = 0;
+
+ X509_ALGOR *alg, *kekalg = NULL;
+ ASN1_OCTET_STRING *ukm;
+ const unsigned char *p;
+ unsigned char *dukm = NULL;
+ size_t dukmlen = 0;
+ int keylen, plen;
+ const EVP_CIPHER *kekcipher;
+ EVP_CIPHER_CTX *kekctx;
+
+ if (!CMS_RecipientInfo_kari_get0_alg(ri, &alg, &ukm))
+ goto err;
+
+ /*
+ * For DH we only have one OID permissible. If ever any more get defined
+ * we will need something cleverer.
+ */
+ if (OBJ_obj2nid(alg->algorithm) != NID_id_smime_alg_ESDH) {
+ DHerr(DH_F_DH_CMS_SET_SHARED_INFO, DH_R_KDF_PARAMETER_ERROR);
+ goto err;
+ }
+
+ if (EVP_PKEY_CTX_set_dh_kdf_type(pctx, EVP_PKEY_DH_KDF_X9_42) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_set_dh_kdf_md(pctx, EVP_sha1()) <= 0)
+ goto err;
+
+ if (alg->parameter->type != V_ASN1_SEQUENCE)
+ goto err;
+
+ p = alg->parameter->value.sequence->data;
+ plen = alg->parameter->value.sequence->length;
+ kekalg = d2i_X509_ALGOR(NULL, &p, plen);
+ if (!kekalg)
+ goto err;
+ kekctx = CMS_RecipientInfo_kari_get0_ctx(ri);
+ if (!kekctx)
+ goto err;
+ kekcipher = EVP_get_cipherbyobj(kekalg->algorithm);
+ if (!kekcipher || EVP_CIPHER_mode(kekcipher) != EVP_CIPH_WRAP_MODE)
+ goto err;
+ if (!EVP_EncryptInit_ex(kekctx, kekcipher, NULL, NULL, NULL))
+ goto err;
+ if (EVP_CIPHER_asn1_to_param(kekctx, kekalg->parameter) <= 0)
+ goto err;
+
+ keylen = EVP_CIPHER_CTX_key_length(kekctx);
+ if (EVP_PKEY_CTX_set_dh_kdf_outlen(pctx, keylen) <= 0)
+ goto err;
+ /* Use OBJ_nid2obj to ensure we use a built-in OID that isn't freed */
+ if (EVP_PKEY_CTX_set0_dh_kdf_oid(pctx,
+ OBJ_nid2obj(EVP_CIPHER_type(kekcipher)))
+ <= 0)
+ goto err;
+
+ if (ukm) {
+ dukmlen = ASN1_STRING_length(ukm);
+ dukm = BUF_memdup(ASN1_STRING_data(ukm), dukmlen);
+ if (!dukm)
+ goto err;
+ }
+
+ if (EVP_PKEY_CTX_set0_dh_kdf_ukm(pctx, dukm, dukmlen) <= 0)
+ goto err;
+ dukm = NULL;
+
+ rv = 1;
+ err:
+ if (kekalg)
+ X509_ALGOR_free(kekalg);
+ if (dukm)
+ OPENSSL_free(dukm);
+ return rv;
+}
+
+static int dh_cms_decrypt(CMS_RecipientInfo *ri)
+{
+ EVP_PKEY_CTX *pctx;
+ pctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
+ if (!pctx)
+ return 0;
+ /* See if we need to set peer key */
+ if (!EVP_PKEY_CTX_get0_peerkey(pctx)) {
+ X509_ALGOR *alg;
+ ASN1_BIT_STRING *pubkey;
+ if (!CMS_RecipientInfo_kari_get0_orig_id(ri, &alg, &pubkey,
+ NULL, NULL, NULL))
+ return 0;
+ if (!alg || !pubkey)
+ return 0;
+ if (!dh_cms_set_peerkey(pctx, alg, pubkey)) {
+ DHerr(DH_F_DH_CMS_DECRYPT, DH_R_PEER_KEY_ERROR);
+ return 0;
+ }
+ }
+ /* Set DH derivation parameters and initialise unwrap context */
+ if (!dh_cms_set_shared_info(pctx, ri)) {
+ DHerr(DH_F_DH_CMS_DECRYPT, DH_R_SHARED_INFO_ERROR);
+ return 0;
+ }
+ return 1;
+}
+
+static int dh_cms_encrypt(CMS_RecipientInfo *ri)
+{
+ EVP_PKEY_CTX *pctx;
+ EVP_PKEY *pkey;
+ EVP_CIPHER_CTX *ctx;
+ int keylen;
+ X509_ALGOR *talg, *wrap_alg = NULL;
+ ASN1_OBJECT *aoid;
+ ASN1_BIT_STRING *pubkey;
+ ASN1_STRING *wrap_str;
+ ASN1_OCTET_STRING *ukm;
+ unsigned char *penc = NULL, *dukm = NULL;
+ int penclen;
+ size_t dukmlen = 0;
+ int rv = 0;
+ int kdf_type, wrap_nid;
+ const EVP_MD *kdf_md;
+ pctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
+ if (!pctx)
+ return 0;
+ /* Get ephemeral key */
+ pkey = EVP_PKEY_CTX_get0_pkey(pctx);
+ if (!CMS_RecipientInfo_kari_get0_orig_id(ri, &talg, &pubkey,
+ NULL, NULL, NULL))
+ goto err;
+ X509_ALGOR_get0(&aoid, NULL, NULL, talg);
+ /* Is everything uninitialised? */
+ if (aoid == OBJ_nid2obj(NID_undef)) {
+ ASN1_INTEGER *pubk;
+ pubk = BN_to_ASN1_INTEGER(pkey->pkey.dh->pub_key, NULL);
+ if (!pubk)
+ goto err;
+ /* Set the key */
+
+ penclen = i2d_ASN1_INTEGER(pubk, &penc);
+ ASN1_INTEGER_free(pubk);
+ if (penclen <= 0)
+ goto err;
+ ASN1_STRING_set0(pubkey, penc, penclen);
+ pubkey->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07);
+ pubkey->flags |= ASN1_STRING_FLAG_BITS_LEFT;
+
+ penc = NULL;
+ X509_ALGOR_set0(talg, OBJ_nid2obj(NID_dhpublicnumber),
+ V_ASN1_UNDEF, NULL);
+ }
+
+ /* See if custom parameters are set */
+ kdf_type = EVP_PKEY_CTX_get_dh_kdf_type(pctx);
+ if (kdf_type <= 0)
+ goto err;
+ if (!EVP_PKEY_CTX_get_dh_kdf_md(pctx, &kdf_md))
+ goto err;
+
+ if (kdf_type == EVP_PKEY_DH_KDF_NONE) {
+ kdf_type = EVP_PKEY_DH_KDF_X9_42;
+ if (EVP_PKEY_CTX_set_dh_kdf_type(pctx, kdf_type) <= 0)
+ goto err;
+ } else if (kdf_type != EVP_PKEY_DH_KDF_X9_42)
+ /* Unknown KDF */
+ goto err;
+ if (kdf_md == NULL) {
+ /* Only SHA1 supported */
+ kdf_md = EVP_sha1();
+ if (EVP_PKEY_CTX_set_dh_kdf_md(pctx, kdf_md) <= 0)
+ goto err;
+ } else if (EVP_MD_type(kdf_md) != NID_sha1)
+ /* Unsupported digest */
+ goto err;
+
+ if (!CMS_RecipientInfo_kari_get0_alg(ri, &talg, &ukm))
+ goto err;
+
+ /* Get wrap NID */
+ ctx = CMS_RecipientInfo_kari_get0_ctx(ri);
+ wrap_nid = EVP_CIPHER_CTX_type(ctx);
+ if (EVP_PKEY_CTX_set0_dh_kdf_oid(pctx, OBJ_nid2obj(wrap_nid)) <= 0)
+ goto err;
+ keylen = EVP_CIPHER_CTX_key_length(ctx);
+
+ /* Package wrap algorithm in an AlgorithmIdentifier */
+
+ wrap_alg = X509_ALGOR_new();
+ if (!wrap_alg)
+ goto err;
+ wrap_alg->algorithm = OBJ_nid2obj(wrap_nid);
+ wrap_alg->parameter = ASN1_TYPE_new();
+ if (!wrap_alg->parameter)
+ goto err;
+ if (EVP_CIPHER_param_to_asn1(ctx, wrap_alg->parameter) <= 0)
+ goto err;
+ if (ASN1_TYPE_get(wrap_alg->parameter) == NID_undef) {
+ ASN1_TYPE_free(wrap_alg->parameter);
+ wrap_alg->parameter = NULL;
+ }
+
+ if (EVP_PKEY_CTX_set_dh_kdf_outlen(pctx, keylen) <= 0)
+ goto err;
+
+ if (ukm) {
+ dukmlen = ASN1_STRING_length(ukm);
+ dukm = BUF_memdup(ASN1_STRING_data(ukm), dukmlen);
+ if (!dukm)
+ goto err;
+ }
+
+ if (EVP_PKEY_CTX_set0_dh_kdf_ukm(pctx, dukm, dukmlen) <= 0)
+ goto err;
+ dukm = NULL;
+
+ /*
+ * Now need to wrap encoding of wrap AlgorithmIdentifier into parameter
+ * of another AlgorithmIdentifier.
+ */
+ penc = NULL;
+ penclen = i2d_X509_ALGOR(wrap_alg, &penc);
+ if (!penc || !penclen)
+ goto err;
+ wrap_str = ASN1_STRING_new();
+ if (!wrap_str)
+ goto err;
+ ASN1_STRING_set0(wrap_str, penc, penclen);
+ penc = NULL;
+ X509_ALGOR_set0(talg, OBJ_nid2obj(NID_id_smime_alg_ESDH),
+ V_ASN1_SEQUENCE, wrap_str);
+
+ rv = 1;
+
+ err:
+ if (penc)
+ OPENSSL_free(penc);
+ if (wrap_alg)
+ X509_ALGOR_free(wrap_alg);
+ return rv;
+}
+
+#endif
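
[Editorial note: taken together, dh_cms_set_peerkey, dh_cms_set_shared_info, dh_cms_decrypt and dh_cms_encrypt let the generic CMS code drive X9.42 DH key agreement through the dh_pkey_ctrl hook above. A minimal sketch of how an application might exercise this path; rcert/rkey are assumed to hold an EVP_PKEY_DHX certificate and key, in/out are BIOs set up elsewhere, and error checks are omitted:]

    STACK_OF(X509) *recips = sk_X509_new_null();
    CMS_ContentInfo *cms;

    sk_X509_push(recips, rcert);        /* recipient with an X9.42 DH key */
    cms = CMS_encrypt(recips, in, EVP_aes_128_cbc(), CMS_BINARY);

    /* ... later, on the recipient side ... */
    CMS_decrypt(cms, rkey, rcert, NULL, out, CMS_BINARY);
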
diff --git a/crypto/dh/dh_asn1.c b/crypto/dh/dh_asn1.c
index e6ee3cfc66cc..f470214399b6 100644
--- a/crypto/dh/dh_asn1.c
+++ b/crypto/dh/dh_asn1.c
@@ -89,7 +89,101 @@ ASN1_SEQUENCE_cb(DHparams, dh_cb) = {
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DH, DHparams, DHparams)
-DH *DHparams_dup(DH *dh)
+/*
+ * Internal only structures for handling X9.42 DH: this gets translated to or
+ * from a DH structure straight away.
+ */
+
+typedef struct {
+ ASN1_BIT_STRING *seed;
+ BIGNUM *counter;
+} int_dhvparams;
+
+typedef struct {
+ BIGNUM *p;
+ BIGNUM *q;
+ BIGNUM *g;
+ BIGNUM *j;
+ int_dhvparams *vparams;
+} int_dhx942_dh;
+
+ASN1_SEQUENCE(DHvparams) = {
+ ASN1_SIMPLE(int_dhvparams, seed, ASN1_BIT_STRING),
+ ASN1_SIMPLE(int_dhvparams, counter, BIGNUM)
+} ASN1_SEQUENCE_END_name(int_dhvparams, DHvparams)
+
+ASN1_SEQUENCE(DHxparams) = {
+ ASN1_SIMPLE(int_dhx942_dh, p, BIGNUM),
+ ASN1_SIMPLE(int_dhx942_dh, g, BIGNUM),
+ ASN1_SIMPLE(int_dhx942_dh, q, BIGNUM),
+ ASN1_OPT(int_dhx942_dh, j, BIGNUM),
+ ASN1_OPT(int_dhx942_dh, vparams, DHvparams),
+} ASN1_SEQUENCE_END_name(int_dhx942_dh, DHxparams)
+
+int_dhx942_dh *d2i_int_dhx(int_dhx942_dh **a,
+ const unsigned char **pp, long length);
+int i2d_int_dhx(const int_dhx942_dh *a, unsigned char **pp);
+
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(int_dhx942_dh, DHxparams, int_dhx)
+
+/* Application-level function: read X9.42 DH parameters into a DH structure */
+
+DH *d2i_DHxparams(DH **a, const unsigned char **pp, long length)
{
- return ASN1_item_dup(ASN1_ITEM_rptr(DHparams), dh);
+ int_dhx942_dh *dhx = NULL;
+ DH *dh = NULL;
+ dh = DH_new();
+ if (!dh)
+ return NULL;
+ dhx = d2i_int_dhx(NULL, pp, length);
+ if (!dhx) {
+ DH_free(dh);
+ return NULL;
+ }
+
+ if (a) {
+ if (*a)
+ DH_free(*a);
+ *a = dh;
+ }
+
+ dh->p = dhx->p;
+ dh->q = dhx->q;
+ dh->g = dhx->g;
+ dh->j = dhx->j;
+
+ if (dhx->vparams) {
+ dh->seed = dhx->vparams->seed->data;
+ dh->seedlen = dhx->vparams->seed->length;
+ dh->counter = dhx->vparams->counter;
+ dhx->vparams->seed->data = NULL;
+ ASN1_BIT_STRING_free(dhx->vparams->seed);
+ OPENSSL_free(dhx->vparams);
+ dhx->vparams = NULL;
+ }
+
+ OPENSSL_free(dhx);
+ return dh;
+}
+
+int i2d_DHxparams(const DH *dh, unsigned char **pp)
+{
+ int_dhx942_dh dhx;
+ int_dhvparams dhv;
+ ASN1_BIT_STRING bs;
+ dhx.p = dh->p;
+ dhx.g = dh->g;
+ dhx.q = dh->q;
+ dhx.j = dh->j;
+ if (dh->counter && dh->seed && dh->seedlen > 0) {
+ bs.flags = ASN1_STRING_FLAG_BITS_LEFT;
+ bs.data = dh->seed;
+ bs.length = dh->seedlen;
+ dhv.seed = &bs;
+ dhv.counter = dh->counter;
+ dhx.vparams = &dhv;
+ } else
+ dhx.vparams = NULL;
+
+ return i2d_int_dhx(&dhx, pp);
}
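
[Editorial note: the new d2i_DHxparams/i2d_DHxparams pair round-trips the X9.42 encoding through the internal structures above. A quick sketch of that round trip, assuming dh already holds p, q and g:]

    unsigned char *der = NULL;
    const unsigned char *p;
    int len;
    DH *dh2;

    len = i2d_DHxparams(dh, &der);      /* <= 0 on error */
    p = der;
    dh2 = d2i_DHxparams(NULL, &p, len); /* NULL on error */
    OPENSSL_free(der);
    DH_free(dh2);
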
diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
index c39ed97ba627..347467c6a433 100644
--- a/crypto/dh/dh_check.c
+++ b/crypto/dh/dh_check.c
@@ -76,17 +76,43 @@ int DH_check(const DH *dh, int *ret)
int ok = 0;
BN_CTX *ctx = NULL;
BN_ULONG l;
- BIGNUM *q = NULL;
+ BIGNUM *t1 = NULL, *t2 = NULL;
*ret = 0;
ctx = BN_CTX_new();
if (ctx == NULL)
goto err;
- q = BN_new();
- if (q == NULL)
+ BN_CTX_start(ctx);
+ t1 = BN_CTX_get(ctx);
+ if (t1 == NULL)
+ goto err;
+ t2 = BN_CTX_get(ctx);
+ if (t2 == NULL)
goto err;
- if (BN_is_word(dh->g, DH_GENERATOR_2)) {
+ if (dh->q) {
+ if (BN_cmp(dh->g, BN_value_one()) <= 0)
+ *ret |= DH_NOT_SUITABLE_GENERATOR;
+ else if (BN_cmp(dh->g, dh->p) >= 0)
+ *ret |= DH_NOT_SUITABLE_GENERATOR;
+ else {
+ /* Check g^q == 1 mod p */
+ if (!BN_mod_exp(t1, dh->g, dh->q, dh->p, ctx))
+ goto err;
+ if (!BN_is_one(t1))
+ *ret |= DH_NOT_SUITABLE_GENERATOR;
+ }
+ if (!BN_is_prime_ex(dh->q, BN_prime_checks, ctx, NULL))
+ *ret |= DH_CHECK_Q_NOT_PRIME;
+ /* Check p == 1 (mod q), i.e. q divides p - 1 */
+ if (!BN_div(t1, t2, dh->p, dh->q, ctx))
+ goto err;
+ if (!BN_is_one(t2))
+ *ret |= DH_CHECK_INVALID_Q_VALUE;
+ if (dh->j && BN_cmp(dh->j, t1))
+ *ret |= DH_CHECK_INVALID_J_VALUE;
+
+ } else if (BN_is_word(dh->g, DH_GENERATOR_2)) {
l = BN_mod_word(dh->p, 24);
if (l != 11)
*ret |= DH_NOT_SUITABLE_GENERATOR;
@@ -107,18 +133,18 @@ int DH_check(const DH *dh, int *ret)
if (!BN_is_prime_ex(dh->p, BN_prime_checks, ctx, NULL))
*ret |= DH_CHECK_P_NOT_PRIME;
- else {
- if (!BN_rshift1(q, dh->p))
+ else if (!dh->q) {
+ if (!BN_rshift1(t1, dh->p))
goto err;
- if (!BN_is_prime_ex(q, BN_prime_checks, ctx, NULL))
+ if (!BN_is_prime_ex(t1, BN_prime_checks, ctx, NULL))
*ret |= DH_CHECK_P_NOT_SAFE_PRIME;
}
ok = 1;
err:
- if (ctx != NULL)
+ if (ctx != NULL) {
+ BN_CTX_end(ctx);
BN_CTX_free(ctx);
- if (q != NULL)
- BN_free(q);
+ }
return (ok);
}
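
[Editorial note: with this change DH_check also validates q and j when present: it confirms g generates the order-q subgroup, that q is prime and divides p - 1, and that any j matches (p - 1)/q, reporting problems through the DH_CHECK_Q_NOT_PRIME, DH_CHECK_INVALID_Q_VALUE and DH_CHECK_INVALID_J_VALUE bits used above. A short usage sketch:]

    int codes = 0;

    if (!DH_check(dh, &codes))
        /* internal error, e.g. out of memory */ ;
    if (codes & DH_CHECK_Q_NOT_PRIME)
        fprintf(stderr, "q is not prime\n");
    if (codes & DH_CHECK_INVALID_Q_VALUE)
        fprintf(stderr, "q does not divide p - 1\n");
    if (codes & DH_CHECK_INVALID_J_VALUE)
        fprintf(stderr, "j != (p - 1)/q\n");
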
diff --git a/crypto/dh/dh_err.c b/crypto/dh/dh_err.c
index 6ed5eb789016..b890cca81748 100644
--- a/crypto/dh/dh_err.c
+++ b/crypto/dh/dh_err.c
@@ -1,6 +1,6 @@
/* crypto/dh/dh_err.c */
/* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2013 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[] = {
{ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"},
{ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"},
{ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS), "DH_BUILTIN_GENPARAMS"},
+ {ERR_FUNC(DH_F_DH_CMS_DECRYPT), "DH_CMS_DECRYPT"},
+ {ERR_FUNC(DH_F_DH_CMS_SET_PEERKEY), "DH_CMS_SET_PEERKEY"},
+ {ERR_FUNC(DH_F_DH_CMS_SET_SHARED_INFO), "DH_CMS_SET_SHARED_INFO"},
{ERR_FUNC(DH_F_DH_COMPUTE_KEY), "DH_compute_key"},
{ERR_FUNC(DH_F_DH_GENERATE_KEY), "DH_generate_key"},
{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX), "DH_generate_parameters_ex"},
@@ -96,6 +99,7 @@ static ERR_STRING_DATA DH_str_reasons[] = {
{ERR_REASON(DH_R_BN_ERROR), "bn error"},
{ERR_REASON(DH_R_DECODE_ERROR), "decode error"},
{ERR_REASON(DH_R_INVALID_PUBKEY), "invalid public key"},
+ {ERR_REASON(DH_R_KDF_PARAMETER_ERROR), "kdf parameter error"},
{ERR_REASON(DH_R_KEYS_NOT_SET), "keys not set"},
{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL), "key size too small"},
{ERR_REASON(DH_R_MODULUS_TOO_LARGE), "modulus too large"},
@@ -103,6 +107,8 @@ static ERR_STRING_DATA DH_str_reasons[] = {
{ERR_REASON(DH_R_NO_PARAMETERS_SET), "no parameters set"},
{ERR_REASON(DH_R_NO_PRIVATE_VALUE), "no private value"},
{ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR), "parameter encoding error"},
+ {ERR_REASON(DH_R_PEER_KEY_ERROR), "peer key error"},
+ {ERR_REASON(DH_R_SHARED_INFO_ERROR), "shared info error"},
{0, NULL}
};
diff --git a/crypto/dh/dh_kdf.c b/crypto/dh/dh_kdf.c
new file mode 100644
index 000000000000..a882cb286e0e
--- /dev/null
+++ b/crypto/dh/dh_kdf.c
@@ -0,0 +1,187 @@
+/* crypto/dh/dh_kdf.c */
+/*
+ * Written by Stephen Henson for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <string.h>
+#include <openssl/dh.h>
+#include <openssl/evp.h>
+#include <openssl/asn1.h>
+#include <openssl/cms.h>
+
+/* Key derivation from X9.42/RFC2631 */
+
+#define DH_KDF_MAX (1L << 30)
+
+/* Skip past an ASN1 structure: for OBJECT skip content octets too */
+
+static int skip_asn1(unsigned char **pp, long *plen, int exptag)
+{
+ const unsigned char *q = *pp;
+ int i, tag, xclass;
+ long tmplen;
+ i = ASN1_get_object(&q, &tmplen, &tag, &xclass, *plen);
+ if (i & 0x80)
+ return 0;
+ if (tag != exptag || xclass != V_ASN1_UNIVERSAL)
+ return 0;
+ if (tag == V_ASN1_OBJECT)
+ q += tmplen;
+ *plen -= q - *pp;
+ *pp = (unsigned char *)q;
+ return 1;
+}
+
+/*
+ * Encode the DH shared info structure and return an offset to the counter
+ * value so we can update the structure without re-encoding it.
+ */
+
+static int dh_sharedinfo_encode(unsigned char **pder, unsigned char **pctr,
+ ASN1_OBJECT *key_oid, size_t outlen,
+ const unsigned char *ukm, size_t ukmlen)
+{
+ unsigned char *p;
+ int derlen;
+ long tlen;
+ /* "magic" value to check offset is sane */
+ static unsigned char ctr[4] = { 0xF3, 0x17, 0x22, 0x53 };
+ X509_ALGOR atmp;
+ ASN1_OCTET_STRING ctr_oct, ukm_oct, *pukm_oct;
+ ASN1_TYPE ctr_atype;
+ if (ukmlen > DH_KDF_MAX || outlen > DH_KDF_MAX)
+ return 0;
+ ctr_oct.data = ctr;
+ ctr_oct.length = 4;
+ ctr_oct.flags = 0;
+ ctr_oct.type = V_ASN1_OCTET_STRING;
+ ctr_atype.type = V_ASN1_OCTET_STRING;
+ ctr_atype.value.octet_string = &ctr_oct;
+ atmp.algorithm = key_oid;
+ atmp.parameter = &ctr_atype;
+ if (ukm) {
+ ukm_oct.type = V_ASN1_OCTET_STRING;
+ ukm_oct.flags = 0;
+ ukm_oct.data = (unsigned char *)ukm;
+ ukm_oct.length = ukmlen;
+ pukm_oct = &ukm_oct;
+ } else
+ pukm_oct = NULL;
+ derlen = CMS_SharedInfo_encode(pder, &atmp, pukm_oct, outlen);
+ if (derlen <= 0)
+ return 0;
+ p = *pder;
+ tlen = derlen;
+ if (!skip_asn1(&p, &tlen, V_ASN1_SEQUENCE))
+ return 0;
+ if (!skip_asn1(&p, &tlen, V_ASN1_SEQUENCE))
+ return 0;
+ if (!skip_asn1(&p, &tlen, V_ASN1_OBJECT))
+ return 0;
+ if (!skip_asn1(&p, &tlen, V_ASN1_OCTET_STRING))
+ return 0;
+ if (CRYPTO_memcmp(p, ctr, 4))
+ return 0;
+ *pctr = p;
+ return derlen;
+}
+
+int DH_KDF_X9_42(unsigned char *out, size_t outlen,
+ const unsigned char *Z, size_t Zlen,
+ ASN1_OBJECT *key_oid,
+ const unsigned char *ukm, size_t ukmlen, const EVP_MD *md)
+{
+ EVP_MD_CTX mctx;
+ int rv = 0;
+ unsigned int i;
+ size_t mdlen;
+ unsigned char *der = NULL, *ctr;
+ int derlen;
+ if (Zlen > DH_KDF_MAX)
+ return 0;
+ mdlen = EVP_MD_size(md);
+ EVP_MD_CTX_init(&mctx);
+ derlen = dh_sharedinfo_encode(&der, &ctr, key_oid, outlen, ukm, ukmlen);
+ if (derlen == 0)
+ goto err;
+ for (i = 1;; i++) {
+ unsigned char mtmp[EVP_MAX_MD_SIZE];
+ EVP_DigestInit_ex(&mctx, md, NULL);
+ if (!EVP_DigestUpdate(&mctx, Z, Zlen))
+ goto err;
+ ctr[3] = i & 0xFF;
+ ctr[2] = (i >> 8) & 0xFF;
+ ctr[1] = (i >> 16) & 0xFF;
+ ctr[0] = (i >> 24) & 0xFF;
+ if (!EVP_DigestUpdate(&mctx, der, derlen))
+ goto err;
+ if (outlen >= mdlen) {
+ if (!EVP_DigestFinal(&mctx, out, NULL))
+ goto err;
+ outlen -= mdlen;
+ if (outlen == 0)
+ break;
+ out += mdlen;
+ } else {
+ if (!EVP_DigestFinal(&mctx, mtmp, NULL))
+ goto err;
+ memcpy(out, mtmp, outlen);
+ OPENSSL_cleanse(mtmp, mdlen);
+ break;
+ }
+ }
+ rv = 1;
+ err:
+ if (der)
+ OPENSSL_free(der);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv;
+}
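
[Editorial note: DH_KDF_X9_42 implements the RFC 2631 concatenation KDF: for each 32-bit big-endian counter value it hashes Z || OtherInfo and appends digests until outlen bytes have been produced. A direct call might look like this sketch, where Z would normally be the zero-padded shared secret from DH_compute_key_padded:]

    unsigned char kek[16];

    /* Derive a 128-bit KEK for AES key wrap from shared secret Z. */
    if (!DH_KDF_X9_42(kek, sizeof(kek), Z, Zlen,
                      OBJ_nid2obj(NID_id_aes128_wrap),
                      NULL, 0,          /* no partyAInfo (ukm) */
                      EVP_sha1()))
        /* handle error */ ;
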
diff --git a/crypto/dh/dh_key.c b/crypto/dh/dh_key.c
index 9e1d8e58225b..1d80fb2c5f60 100644
--- a/crypto/dh/dh_key.c
+++ b/crypto/dh/dh_key.c
@@ -94,6 +94,20 @@ int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
return dh->meth->compute_key(key, pub_key, dh);
}
+int DH_compute_key_padded(unsigned char *key, const BIGNUM *pub_key, DH *dh)
+{
+ int rv, pad;
+ rv = dh->meth->compute_key(key, pub_key, dh);
+ if (rv <= 0)
+ return rv;
+ pad = BN_num_bytes(dh->p) - rv;
+ if (pad > 0) {
+ memmove(key + pad, key, rv);
+ memset(key, 0, pad);
+ }
+ return rv + pad;
+}
+
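[Editorial note: unlike DH_compute_key, which strips leading zero octets from the shared secret, DH_compute_key_padded left-pads it with zeros to the full modulus size, which is what the X9.42 KDF input requires. Sketch:]

    unsigned char *Z = OPENSSL_malloc(DH_size(dh));
    int Zlen;

    Zlen = DH_compute_key_padded(Z, peer_pub_key, dh);
    /* on success Zlen == DH_size(dh), even if the raw
     * secret had leading zero octets */
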
static DH_METHOD dh_ossl = {
"OpenSSL DH Method",
generate_key,
diff --git a/crypto/dh/dh_pmeth.c b/crypto/dh/dh_pmeth.c
index 65bc3882c7f2..b58e3fa86fad 100644
--- a/crypto/dh/dh_pmeth.c
+++ b/crypto/dh/dh_pmeth.c
@@ -63,6 +63,10 @@
#include <openssl/evp.h>
#include <openssl/dh.h>
#include <openssl/bn.h>
+#ifndef OPENSSL_NO_DSA
+# include <openssl/dsa.h>
+#endif
+#include <openssl/objects.h>
#include "evp_locl.h"
/* DH pkey context structure */
@@ -72,9 +76,23 @@ typedef struct {
int prime_len;
int generator;
int use_dsa;
+ int subprime_len;
+ /* message digest used for parameter generation */
+ const EVP_MD *md;
+ int rfc5114_param;
/* Keygen callback info */
int gentmp[2];
- /* message digest */
+ /* KDF (if any) to use for DH */
+ char kdf_type;
+ /* OID to use for KDF */
+ ASN1_OBJECT *kdf_oid;
+ /* Message digest to use for key derivation */
+ const EVP_MD *kdf_md;
+ /* User key material */
+ unsigned char *kdf_ukm;
+ size_t kdf_ukmlen;
+ /* KDF output length */
+ size_t kdf_outlen;
} DH_PKEY_CTX;
static int pkey_dh_init(EVP_PKEY_CTX *ctx)
@@ -84,8 +102,18 @@ static int pkey_dh_init(EVP_PKEY_CTX *ctx)
if (!dctx)
return 0;
dctx->prime_len = 1024;
+ dctx->subprime_len = -1;
dctx->generator = 2;
dctx->use_dsa = 0;
+ dctx->md = NULL;
+ dctx->rfc5114_param = 0;
+
+ dctx->kdf_type = EVP_PKEY_DH_KDF_NONE;
+ dctx->kdf_oid = NULL;
+ dctx->kdf_md = NULL;
+ dctx->kdf_ukm = NULL;
+ dctx->kdf_ukmlen = 0;
+ dctx->kdf_outlen = 0;
ctx->data = dctx;
ctx->keygen_info = dctx->gentmp;
@@ -102,16 +130,35 @@ static int pkey_dh_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
sctx = src->data;
dctx = dst->data;
dctx->prime_len = sctx->prime_len;
+ dctx->subprime_len = sctx->subprime_len;
dctx->generator = sctx->generator;
dctx->use_dsa = sctx->use_dsa;
+ dctx->md = sctx->md;
+ dctx->rfc5114_param = sctx->rfc5114_param;
+
+ dctx->kdf_type = sctx->kdf_type;
+ if (sctx->kdf_oid) {
+ dctx->kdf_oid = OBJ_dup(sctx->kdf_oid);
+ if (!dctx->kdf_oid)
+ return 0;
+ }
+ dctx->kdf_md = sctx->kdf_md;
+ if (sctx->kdf_ukm) {
+ dctx->kdf_ukm = BUF_memdup(sctx->kdf_ukm, sctx->kdf_ukmlen);
+ if (!dctx->kdf_ukm)
+ return 0;
+ dctx->kdf_ukmlen = sctx->kdf_ukmlen;
+ }
+ dctx->kdf_outlen = sctx->kdf_outlen;
return 1;
}
static void pkey_dh_cleanup(EVP_PKEY_CTX *ctx)
{
DH_PKEY_CTX *dctx = ctx->data;
- if (dctx)
+ if (dctx) {
+ if (dctx->kdf_ukm)
+ OPENSSL_free(dctx->kdf_ukm);
+ if (dctx->kdf_oid)
+ ASN1_OBJECT_free(dctx->kdf_oid);
OPENSSL_free(dctx);
+ }
}
static int pkey_dh_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
@@ -124,14 +171,89 @@ static int pkey_dh_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
dctx->prime_len = p1;
return 1;
+ case EVP_PKEY_CTRL_DH_PARAMGEN_SUBPRIME_LEN:
+ if (dctx->use_dsa == 0)
+ return -2;
+ dctx->subprime_len = p1;
+ return 1;
+
case EVP_PKEY_CTRL_DH_PARAMGEN_GENERATOR:
+ if (dctx->use_dsa)
+ return -2;
dctx->generator = p1;
return 1;
+ case EVP_PKEY_CTRL_DH_PARAMGEN_TYPE:
+#ifdef OPENSSL_NO_DSA
+ if (p1 != 0)
+ return -2;
+#else
+ if (p1 < 0 || p1 > 2)
+ return -2;
+#endif
+ dctx->use_dsa = p1;
+ return 1;
+
+ case EVP_PKEY_CTRL_DH_RFC5114:
+ if (p1 < 1 || p1 > 3)
+ return -2;
+ dctx->rfc5114_param = p1;
+ return 1;
+
case EVP_PKEY_CTRL_PEER_KEY:
/* Default behaviour is OK */
return 1;
+ case EVP_PKEY_CTRL_DH_KDF_TYPE:
+ if (p1 == -2)
+ return dctx->kdf_type;
+ if (p1 != EVP_PKEY_DH_KDF_NONE && p1 != EVP_PKEY_DH_KDF_X9_42)
+ return -2;
+ dctx->kdf_type = p1;
+ return 1;
+
+ case EVP_PKEY_CTRL_DH_KDF_MD:
+ dctx->kdf_md = p2;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_DH_KDF_MD:
+ *(const EVP_MD **)p2 = dctx->kdf_md;
+ return 1;
+
+ case EVP_PKEY_CTRL_DH_KDF_OUTLEN:
+ if (p1 <= 0)
+ return -2;
+ dctx->kdf_outlen = (size_t)p1;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_DH_KDF_OUTLEN:
+ *(int *)p2 = dctx->kdf_outlen;
+ return 1;
+
+ case EVP_PKEY_CTRL_DH_KDF_UKM:
+ if (dctx->kdf_ukm)
+ OPENSSL_free(dctx->kdf_ukm);
+ dctx->kdf_ukm = p2;
+ if (p2)
+ dctx->kdf_ukmlen = p1;
+ else
+ dctx->kdf_ukmlen = 0;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_DH_KDF_UKM:
+ *(unsigned char **)p2 = dctx->kdf_ukm;
+ return dctx->kdf_ukmlen;
+
+ case EVP_PKEY_CTRL_DH_KDF_OID:
+ if (dctx->kdf_oid)
+ ASN1_OBJECT_free(dctx->kdf_oid);
+ dctx->kdf_oid = p2;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_DH_KDF_OID:
+ *(ASN1_OBJECT **)p2 = dctx->kdf_oid;
+ return 1;
+
default:
return -2;
@@ -146,30 +268,139 @@ static int pkey_dh_ctrl_str(EVP_PKEY_CTX *ctx,
len = atoi(value);
return EVP_PKEY_CTX_set_dh_paramgen_prime_len(ctx, len);
}
+ if (!strcmp(type, "dh_rfc5114")) {
+ DH_PKEY_CTX *dctx = ctx->data;
+ int len;
+ len = atoi(value);
+ if (len < 0 || len > 3)
+ return -2;
+ dctx->rfc5114_param = len;
+ return 1;
+ }
if (!strcmp(type, "dh_paramgen_generator")) {
int len;
len = atoi(value);
return EVP_PKEY_CTX_set_dh_paramgen_generator(ctx, len);
}
+ if (!strcmp(type, "dh_paramgen_subprime_len")) {
+ int len;
+ len = atoi(value);
+ return EVP_PKEY_CTX_set_dh_paramgen_subprime_len(ctx, len);
+ }
+ if (!strcmp(type, "dh_paramgen_type")) {
+ int typ;
+ typ = atoi(value);
+ return EVP_PKEY_CTX_set_dh_paramgen_type(ctx, typ);
+ }
return -2;
}
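
[Editorial note: the new ctrl strings expose these options to generic code, for example the genpkey application's -pkeyopt handling. Selecting an RFC 5114 group by name might look like this sketch:]

    EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_id(EVP_PKEY_DH, NULL);
    EVP_PKEY *params = NULL;

    EVP_PKEY_paramgen_init(pctx);
    EVP_PKEY_CTX_ctrl_str(pctx, "dh_rfc5114", "3"); /* 2048-bit p, 256-bit q */
    EVP_PKEY_paramgen(pctx, &params);   /* result has type EVP_PKEY_DHX */
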
+#ifndef OPENSSL_NO_DSA
+
+extern int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
+ const EVP_MD *evpmd,
+ const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out, int *counter_ret,
+ unsigned long *h_ret, BN_GENCB *cb);
+
+extern int dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N,
+ const EVP_MD *evpmd,
+ const unsigned char *seed_in,
+ size_t seed_len, int idx,
+ unsigned char *seed_out, int *counter_ret,
+ unsigned long *h_ret, BN_GENCB *cb);
+
+static DSA *dsa_dh_generate(DH_PKEY_CTX *dctx, BN_GENCB *pcb)
+{
+ DSA *ret;
+ int rv = 0;
+ int prime_len = dctx->prime_len;
+ int subprime_len = dctx->subprime_len;
+ const EVP_MD *md = dctx->md;
+ if (dctx->use_dsa > 2)
+ return NULL;
+ ret = DSA_new();
+ if (!ret)
+ return NULL;
+ if (subprime_len == -1) {
+ if (prime_len >= 2048)
+ subprime_len = 256;
+ else
+ subprime_len = 160;
+ }
+ if (md == NULL) {
+ if (prime_len >= 2048)
+ md = EVP_sha256();
+ else
+ md = EVP_sha1();
+ }
+ if (dctx->use_dsa == 1)
+ rv = dsa_builtin_paramgen(ret, prime_len, subprime_len, md,
+ NULL, 0, NULL, NULL, NULL, pcb);
+ else if (dctx->use_dsa == 2)
+ rv = dsa_builtin_paramgen2(ret, prime_len, subprime_len, md,
+ NULL, 0, -1, NULL, NULL, NULL, pcb);
+ if (rv <= 0) {
+ DSA_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
+#endif
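
[Editorial note: dsa_dh_generate reuses the FIPS 186 DSA parameter generators to build primes with a known subgroup order; DSA_dup_DH in pkey_dh_paramgen below then converts the result into an X9.42 DH key. Driven through EVP, using the paramgen macros this change adds to dh.h, that is roughly:]

    EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_id(EVP_PKEY_DH, NULL);
    EVP_PKEY *params = NULL;

    EVP_PKEY_paramgen_init(pctx);
    EVP_PKEY_CTX_set_dh_paramgen_type(pctx, 2);          /* FIPS 186-3 style */
    EVP_PKEY_CTX_set_dh_paramgen_prime_len(pctx, 2048);
    EVP_PKEY_CTX_set_dh_paramgen_subprime_len(pctx, 256);
    EVP_PKEY_paramgen(pctx, &params);
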
+
static int pkey_dh_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
{
DH *dh = NULL;
DH_PKEY_CTX *dctx = ctx->data;
BN_GENCB *pcb, cb;
int ret;
+ if (dctx->rfc5114_param) {
+ switch (dctx->rfc5114_param) {
+ case 1:
+ dh = DH_get_1024_160();
+ break;
+
+ case 2:
+ dh = DH_get_2048_224();
+ break;
+
+ case 3:
+ dh = DH_get_2048_256();
+ break;
+
+ default:
+ return -2;
+ }
+ EVP_PKEY_assign(pkey, EVP_PKEY_DHX, dh);
+ return 1;
+ }
+
if (ctx->pkey_gencb) {
pcb = &cb;
evp_pkey_set_cb_translate(pcb, ctx);
} else
pcb = NULL;
+#ifndef OPENSSL_NO_DSA
+ if (dctx->use_dsa) {
+ DSA *dsa_dh;
+ dsa_dh = dsa_dh_generate(dctx, pcb);
+ if (!dsa_dh)
+ return 0;
+ dh = DSA_dup_DH(dsa_dh);
+ DSA_free(dsa_dh);
+ if (!dh)
+ return 0;
+ EVP_PKEY_assign(pkey, EVP_PKEY_DHX, dh);
+ return 1;
+ }
+#endif
dh = DH_new();
if (!dh)
return 0;
ret = DH_generate_parameters_ex(dh,
dctx->prime_len, dctx->generator, pcb);
+
if (ret)
EVP_PKEY_assign_DH(pkey, dh);
else
@@ -187,7 +418,7 @@ static int pkey_dh_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
dh = DH_new();
if (!dh)
return 0;
- EVP_PKEY_assign_DH(pkey, dh);
+ EVP_PKEY_assign(pkey, ctx->pmeth->pkey_id, dh);
/* Note: if error return, pkey is freed by parent routine */
if (!EVP_PKEY_copy_parameters(pkey, ctx->pkey))
return 0;
@@ -198,21 +429,96 @@ static int pkey_dh_derive(EVP_PKEY_CTX *ctx, unsigned char *key,
size_t *keylen)
{
int ret;
+ DH *dh;
+ DH_PKEY_CTX *dctx = ctx->data;
+ BIGNUM *dhpub;
if (!ctx->pkey || !ctx->peerkey) {
DHerr(DH_F_PKEY_DH_DERIVE, DH_R_KEYS_NOT_SET);
return 0;
}
- ret = DH_compute_key(key, ctx->peerkey->pkey.dh->pub_key,
- ctx->pkey->pkey.dh);
- if (ret < 0)
+ dh = ctx->pkey->pkey.dh;
+ dhpub = ctx->peerkey->pkey.dh->pub_key;
+ if (dctx->kdf_type == EVP_PKEY_DH_KDF_NONE) {
+ if (key == NULL) {
+ *keylen = DH_size(dh);
+ return 1;
+ }
+ ret = DH_compute_key(key, dhpub, dh);
+ if (ret < 0)
+ return ret;
+ *keylen = ret;
+ return 1;
+ } else if (dctx->kdf_type == EVP_PKEY_DH_KDF_X9_42) {
+ unsigned char *Z = NULL;
+ size_t Zlen = 0;
+ if (!dctx->kdf_outlen || !dctx->kdf_oid)
+ return 0;
+ if (key == NULL) {
+ *keylen = dctx->kdf_outlen;
+ return 1;
+ }
+ if (*keylen != dctx->kdf_outlen)
+ return 0;
+ ret = 0;
+ Zlen = DH_size(dh);
+ Z = OPENSSL_malloc(Zlen);
+ if (!Z) {
+ goto err;
+ }
+ if (DH_compute_key_padded(Z, dhpub, dh) <= 0)
+ goto err;
+ if (!DH_KDF_X9_42(key, *keylen, Z, Zlen, dctx->kdf_oid,
+ dctx->kdf_ukm, dctx->kdf_ukmlen, dctx->kdf_md))
+ goto err;
+ *keylen = dctx->kdf_outlen;
+ ret = 1;
+ err:
+ if (Z) {
+ OPENSSL_cleanse(Z, Zlen);
+ OPENSSL_free(Z);
+ }
return ret;
- *keylen = ret;
+ }
return 1;
}
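
[Editorial note: at the EVP level the KDF branch of pkey_dh_derive is configured with the EVP_PKEY_CTX_set_dh_kdf_* macros already used by dh_ameth.c above. A standalone X9.42 derivation could be sketched as follows; ourkey and peerkey are assumed EVP_PKEY_DHX keys sharing parameters, and error checks are omitted:]

    unsigned char key[16];
    size_t keylen = sizeof(key);
    EVP_PKEY_CTX *dctx = EVP_PKEY_CTX_new(ourkey, NULL);

    EVP_PKEY_derive_init(dctx);
    EVP_PKEY_derive_set_peer(dctx, peerkey);
    EVP_PKEY_CTX_set_dh_kdf_type(dctx, EVP_PKEY_DH_KDF_X9_42);
    EVP_PKEY_CTX_set_dh_kdf_md(dctx, EVP_sha1());
    EVP_PKEY_CTX_set_dh_kdf_outlen(dctx, sizeof(key));
    EVP_PKEY_CTX_set0_dh_kdf_oid(dctx, OBJ_nid2obj(NID_id_aes128_wrap));
    EVP_PKEY_derive(dctx, key, &keylen);
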
const EVP_PKEY_METHOD dh_pkey_meth = {
EVP_PKEY_DH,
- EVP_PKEY_FLAG_AUTOARGLEN,
+ 0,
+ pkey_dh_init,
+ pkey_dh_copy,
+ pkey_dh_cleanup,
+
+ 0,
+ pkey_dh_paramgen,
+
+ 0,
+ pkey_dh_keygen,
+
+ 0,
+ 0,
+
+ 0,
+ 0,
+
+ 0, 0,
+
+ 0, 0, 0, 0,
+
+ 0, 0,
+
+ 0, 0,
+
+ 0,
+ pkey_dh_derive,
+
+ pkey_dh_ctrl,
+ pkey_dh_ctrl_str
+};
+
+const EVP_PKEY_METHOD dhx_pkey_meth = {
+ EVP_PKEY_DHX,
+ 0,
pkey_dh_init,
pkey_dh_copy,
pkey_dh_cleanup,
diff --git a/crypto/dh/dh_rfc5114.c b/crypto/dh/dh_rfc5114.c
new file mode 100644
index 000000000000..e96e2aa3fc52
--- /dev/null
+++ b/crypto/dh/dh_rfc5114.c
@@ -0,0 +1,285 @@
+/*
+ * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project
+ * 2011.
+ */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/dh.h>
+#include <openssl/bn.h>
+
+/* DH parameters from RFC5114 */
+
+#if BN_BITS2 == 64
+static const BN_ULONG dh1024_160_p[] = {
+ 0xDF1FB2BC2E4A4371ULL, 0xE68CFDA76D4DA708ULL, 0x45BF37DF365C1A65ULL,
+ 0xA151AF5F0DC8B4BDULL, 0xFAA31A4FF55BCCC0ULL, 0x4EFFD6FAE5644738ULL,
+ 0x98488E9C219A7372ULL, 0xACCBDD7D90C4BD70ULL, 0x24975C3CD49B83BFULL,
+ 0x13ECB4AEA9061123ULL, 0x9838EF1E2EE652C0ULL, 0x6073E28675A23D18ULL,
+ 0x9A6A9DCA52D23B61ULL, 0x52C99FBCFB06A3C6ULL, 0xDE92DE5EAE5D54ECULL,
+ 0xB10B8F96A080E01DULL
+};
+
+static const BN_ULONG dh1024_160_g[] = {
+ 0x855E6EEB22B3B2E5ULL, 0x858F4DCEF97C2A24ULL, 0x2D779D5918D08BC8ULL,
+ 0xD662A4D18E73AFA3ULL, 0x1DBF0A0169B6A28AULL, 0xA6A24C087A091F53ULL,
+ 0x909D0D2263F80A76ULL, 0xD7FBD7D3B9A92EE1ULL, 0x5E91547F9E2749F4ULL,
+ 0x160217B4B01B886AULL, 0x777E690F5504F213ULL, 0x266FEA1E5C41564BULL,
+ 0xD6406CFF14266D31ULL, 0xF8104DD258AC507FULL, 0x6765A442EFB99905ULL,
+ 0xA4D1CBD5C3FD3412ULL
+};
+
+static const BN_ULONG dh1024_160_q[] = {
+ 0x64B7CB9D49462353ULL, 0x81A8DF278ABA4E7DULL, 0x00000000F518AA87ULL
+};
+
+static const BN_ULONG dh2048_224_p[] = {
+ 0x0AC4DFFE0C10E64FULL, 0xCF9DE5384E71B81CULL, 0x7EF363E2FFA31F71ULL,
+ 0xE3FB73C16B8E75B9ULL, 0xC9B53DCF4BA80A29ULL, 0x23F10B0E16E79763ULL,
+ 0xC52172E413042E9BULL, 0xBE60E69CC928B2B9ULL, 0x80CD86A1B9E587E8ULL,
+ 0x315D75E198C641A4ULL, 0xCDF93ACC44328387ULL, 0x15987D9ADC0A486DULL,
+ 0x7310F7121FD5A074ULL, 0x278273C7DE31EFDCULL, 0x1602E714415D9330ULL,
+ 0x81286130BC8985DBULL, 0xB3BF8A3170918836ULL, 0x6A00E0A0B9C49708ULL,
+ 0xC6BA0B2C8BBC27BEULL, 0xC9F98D11ED34DBF6ULL, 0x7AD5B7D0B6C12207ULL,
+ 0xD91E8FEF55B7394BULL, 0x9037C9EDEFDA4DF8ULL, 0x6D3F8152AD6AC212ULL,
+ 0x1DE6B85A1274A0A6ULL, 0xEB3D688A309C180EULL, 0xAF9A3C407BA1DF15ULL,
+ 0xE6FA141DF95A56DBULL, 0xB54B1597B61D0A75ULL, 0xA20D64E5683B9FD1ULL,
+ 0xD660FAA79559C51FULL, 0xAD107E1E9123A9D0ULL
+};
+
+static const BN_ULONG dh2048_224_g[] = {
+ 0x84B890D3191F2BFAULL, 0x81BC087F2A7065B3ULL, 0x19C418E1F6EC0179ULL,
+ 0x7B5A0F1C71CFFF4CULL, 0xEDFE72FE9B6AA4BDULL, 0x81E1BCFE94B30269ULL,
+ 0x566AFBB48D6C0191ULL, 0xB539CCE3409D13CDULL, 0x6AA21E7F5F2FF381ULL,
+ 0xD9E263E4770589EFULL, 0x10E183EDD19963DDULL, 0xB70A8137150B8EEBULL,
+ 0x051AE3D428C8F8ACULL, 0xBB77A86F0C1AB15BULL, 0x6E3025E316A330EFULL,
+ 0x19529A45D6F83456ULL, 0xF180EB34118E98D1ULL, 0xB5F6C6B250717CBEULL,
+ 0x09939D54DA7460CDULL, 0xE247150422EA1ED4ULL, 0xB8A762D0521BC98AULL,
+ 0xF4D027275AC1348BULL, 0xC17669101999024AULL, 0xBE5E9001A8D66AD7ULL,
+ 0xC57DB17C620A8652ULL, 0xAB739D7700C29F52ULL, 0xDD921F01A70C4AFAULL,
+ 0xA6824A4E10B9A6F0ULL, 0x74866A08CFE4FFE3ULL, 0x6CDEBE7B89998CAFULL,
+ 0x9DF30B5C8FFDAC50ULL, 0xAC4032EF4F2D9AE3ULL
+};
+
+static const BN_ULONG dh2048_224_q[] = {
+ 0xBF389A99B36371EBULL, 0x1F80535A4738CEBCULL, 0xC58D93FE99717710ULL,
+ 0x00000000801C0D34ULL
+};
+
+static const BN_ULONG dh2048_256_p[] = {
+ 0xDB094AE91E1A1597ULL, 0x693877FAD7EF09CAULL, 0x6116D2276E11715FULL,
+ 0xA4B54330C198AF12ULL, 0x75F26375D7014103ULL, 0xC3A3960A54E710C3ULL,
+ 0xDED4010ABD0BE621ULL, 0xC0B857F689962856ULL, 0xB3CA3F7971506026ULL,
+ 0x1CCACB83E6B486F6ULL, 0x67E144E514056425ULL, 0xF6A167B5A41825D9ULL,
+ 0x3AD8347796524D8EULL, 0xF13C6D9A51BFA4ABULL, 0x2D52526735488A0EULL,
+ 0xB63ACAE1CAA6B790ULL, 0x4FDB70C581B23F76ULL, 0xBC39A0BF12307F5CULL,
+ 0xB941F54EB1E59BB8ULL, 0x6C5BFC11D45F9088ULL, 0x22E0B1EF4275BF7BULL,
+ 0x91F9E6725B4758C0ULL, 0x5A8A9D306BCF67EDULL, 0x209E0C6497517ABDULL,
+ 0x3BF4296D830E9A7CULL, 0x16C3D91134096FAAULL, 0xFAF7DF4561B2AA30ULL,
+ 0xE00DF8F1D61957D4ULL, 0x5D2CEED4435E3B00ULL, 0x8CEEF608660DD0F2ULL,
+ 0xFFBBD19C65195999ULL, 0x87A8E61DB4B6663CULL
+};
+
+static const BN_ULONG dh2048_256_g[] = {
+ 0x664B4C0F6CC41659ULL, 0x5E2327CFEF98C582ULL, 0xD647D148D4795451ULL,
+ 0x2F63078490F00EF8ULL, 0x184B523D1DB246C3ULL, 0xC7891428CDC67EB6ULL,
+ 0x7FD028370DF92B52ULL, 0xB3353BBB64E0EC37ULL, 0xECD06E1557CD0915ULL,
+ 0xB7D2BBD2DF016199ULL, 0xC8484B1E052588B9ULL, 0xDB2A3B7313D3FE14ULL,
+ 0xD052B985D182EA0AULL, 0xA4BD1BFFE83B9C80ULL, 0xDFC967C1FB3F2E55ULL,
+ 0xB5045AF2767164E1ULL, 0x1D14348F6F2F9193ULL, 0x64E67982428EBC83ULL,
+ 0x8AC376D282D6ED38ULL, 0x777DE62AAAB8A862ULL, 0xDDF463E5E9EC144BULL,
+ 0x0196F931C77A57F2ULL, 0xA55AE31341000A65ULL, 0x901228F8C28CBB18ULL,
+ 0xBC3773BF7E8C6F62ULL, 0xBE3A6C1B0C6B47B1ULL, 0xFF4FED4AAC0BB555ULL,
+ 0x10DBC15077BE463FULL, 0x07F4793A1A0BA125ULL, 0x4CA7B18F21EF2054ULL,
+ 0x2E77506660EDBD48ULL, 0x3FB32C9B73134D0BULL
+};
+
+static const BN_ULONG dh2048_256_q[] = {
+ 0xA308B0FE64F5FBD3ULL, 0x99B1A47D1EB3750BULL, 0xB447997640129DA2ULL,
+ 0x8CF83642A709A097ULL
+};
+
+#elif BN_BITS2 == 32
+
+static const BN_ULONG dh1024_160_p[] = {
+ 0x2E4A4371, 0xDF1FB2BC, 0x6D4DA708, 0xE68CFDA7, 0x365C1A65, 0x45BF37DF,
+ 0x0DC8B4BD, 0xA151AF5F, 0xF55BCCC0, 0xFAA31A4F, 0xE5644738, 0x4EFFD6FA,
+ 0x219A7372, 0x98488E9C, 0x90C4BD70, 0xACCBDD7D, 0xD49B83BF, 0x24975C3C,
+ 0xA9061123, 0x13ECB4AE, 0x2EE652C0, 0x9838EF1E, 0x75A23D18, 0x6073E286,
+ 0x52D23B61, 0x9A6A9DCA, 0xFB06A3C6, 0x52C99FBC, 0xAE5D54EC, 0xDE92DE5E,
+ 0xA080E01D, 0xB10B8F96
+};
+
+static const BN_ULONG dh1024_160_g[] = {
+ 0x22B3B2E5, 0x855E6EEB, 0xF97C2A24, 0x858F4DCE, 0x18D08BC8, 0x2D779D59,
+ 0x8E73AFA3, 0xD662A4D1, 0x69B6A28A, 0x1DBF0A01, 0x7A091F53, 0xA6A24C08,
+ 0x63F80A76, 0x909D0D22, 0xB9A92EE1, 0xD7FBD7D3, 0x9E2749F4, 0x5E91547F,
+ 0xB01B886A, 0x160217B4, 0x5504F213, 0x777E690F, 0x5C41564B, 0x266FEA1E,
+ 0x14266D31, 0xD6406CFF, 0x58AC507F, 0xF8104DD2, 0xEFB99905, 0x6765A442,
+ 0xC3FD3412, 0xA4D1CBD5
+};
+
+static const BN_ULONG dh1024_160_q[] = {
+ 0x49462353, 0x64B7CB9D, 0x8ABA4E7D, 0x81A8DF27, 0xF518AA87
+};
+
+static const BN_ULONG dh2048_224_p[] = {
+ 0x0C10E64F, 0x0AC4DFFE, 0x4E71B81C, 0xCF9DE538, 0xFFA31F71, 0x7EF363E2,
+ 0x6B8E75B9, 0xE3FB73C1, 0x4BA80A29, 0xC9B53DCF, 0x16E79763, 0x23F10B0E,
+ 0x13042E9B, 0xC52172E4, 0xC928B2B9, 0xBE60E69C, 0xB9E587E8, 0x80CD86A1,
+ 0x98C641A4, 0x315D75E1, 0x44328387, 0xCDF93ACC, 0xDC0A486D, 0x15987D9A,
+ 0x1FD5A074, 0x7310F712, 0xDE31EFDC, 0x278273C7, 0x415D9330, 0x1602E714,
+ 0xBC8985DB, 0x81286130, 0x70918836, 0xB3BF8A31, 0xB9C49708, 0x6A00E0A0,
+ 0x8BBC27BE, 0xC6BA0B2C, 0xED34DBF6, 0xC9F98D11, 0xB6C12207, 0x7AD5B7D0,
+ 0x55B7394B, 0xD91E8FEF, 0xEFDA4DF8, 0x9037C9ED, 0xAD6AC212, 0x6D3F8152,
+ 0x1274A0A6, 0x1DE6B85A, 0x309C180E, 0xEB3D688A, 0x7BA1DF15, 0xAF9A3C40,
+ 0xF95A56DB, 0xE6FA141D, 0xB61D0A75, 0xB54B1597, 0x683B9FD1, 0xA20D64E5,
+ 0x9559C51F, 0xD660FAA7, 0x9123A9D0, 0xAD107E1E
+};
+
+static const BN_ULONG dh2048_224_g[] = {
+ 0x191F2BFA, 0x84B890D3, 0x2A7065B3, 0x81BC087F, 0xF6EC0179, 0x19C418E1,
+ 0x71CFFF4C, 0x7B5A0F1C, 0x9B6AA4BD, 0xEDFE72FE, 0x94B30269, 0x81E1BCFE,
+ 0x8D6C0191, 0x566AFBB4, 0x409D13CD, 0xB539CCE3, 0x5F2FF381, 0x6AA21E7F,
+ 0x770589EF, 0xD9E263E4, 0xD19963DD, 0x10E183ED, 0x150B8EEB, 0xB70A8137,
+ 0x28C8F8AC, 0x051AE3D4, 0x0C1AB15B, 0xBB77A86F, 0x16A330EF, 0x6E3025E3,
+ 0xD6F83456, 0x19529A45, 0x118E98D1, 0xF180EB34, 0x50717CBE, 0xB5F6C6B2,
+ 0xDA7460CD, 0x09939D54, 0x22EA1ED4, 0xE2471504, 0x521BC98A, 0xB8A762D0,
+ 0x5AC1348B, 0xF4D02727, 0x1999024A, 0xC1766910, 0xA8D66AD7, 0xBE5E9001,
+ 0x620A8652, 0xC57DB17C, 0x00C29F52, 0xAB739D77, 0xA70C4AFA, 0xDD921F01,
+ 0x10B9A6F0, 0xA6824A4E, 0xCFE4FFE3, 0x74866A08, 0x89998CAF, 0x6CDEBE7B,
+ 0x8FFDAC50, 0x9DF30B5C, 0x4F2D9AE3, 0xAC4032EF
+};
+
+static const BN_ULONG dh2048_224_q[] = {
+ 0xB36371EB, 0xBF389A99, 0x4738CEBC, 0x1F80535A, 0x99717710, 0xC58D93FE,
+ 0x801C0D34
+};
+
+static const BN_ULONG dh2048_256_p[] = {
+ 0x1E1A1597, 0xDB094AE9, 0xD7EF09CA, 0x693877FA, 0x6E11715F, 0x6116D227,
+ 0xC198AF12, 0xA4B54330, 0xD7014103, 0x75F26375, 0x54E710C3, 0xC3A3960A,
+ 0xBD0BE621, 0xDED4010A, 0x89962856, 0xC0B857F6, 0x71506026, 0xB3CA3F79,
+ 0xE6B486F6, 0x1CCACB83, 0x14056425, 0x67E144E5, 0xA41825D9, 0xF6A167B5,
+ 0x96524D8E, 0x3AD83477, 0x51BFA4AB, 0xF13C6D9A, 0x35488A0E, 0x2D525267,
+ 0xCAA6B790, 0xB63ACAE1, 0x81B23F76, 0x4FDB70C5, 0x12307F5C, 0xBC39A0BF,
+ 0xB1E59BB8, 0xB941F54E, 0xD45F9088, 0x6C5BFC11, 0x4275BF7B, 0x22E0B1EF,
+ 0x5B4758C0, 0x91F9E672, 0x6BCF67ED, 0x5A8A9D30, 0x97517ABD, 0x209E0C64,
+ 0x830E9A7C, 0x3BF4296D, 0x34096FAA, 0x16C3D911, 0x61B2AA30, 0xFAF7DF45,
+ 0xD61957D4, 0xE00DF8F1, 0x435E3B00, 0x5D2CEED4, 0x660DD0F2, 0x8CEEF608,
+ 0x65195999, 0xFFBBD19C, 0xB4B6663C, 0x87A8E61D
+};
+
+static const BN_ULONG dh2048_256_g[] = {
+ 0x6CC41659, 0x664B4C0F, 0xEF98C582, 0x5E2327CF, 0xD4795451, 0xD647D148,
+ 0x90F00EF8, 0x2F630784, 0x1DB246C3, 0x184B523D, 0xCDC67EB6, 0xC7891428,
+ 0x0DF92B52, 0x7FD02837, 0x64E0EC37, 0xB3353BBB, 0x57CD0915, 0xECD06E15,
+ 0xDF016199, 0xB7D2BBD2, 0x052588B9, 0xC8484B1E, 0x13D3FE14, 0xDB2A3B73,
+ 0xD182EA0A, 0xD052B985, 0xE83B9C80, 0xA4BD1BFF, 0xFB3F2E55, 0xDFC967C1,
+ 0x767164E1, 0xB5045AF2, 0x6F2F9193, 0x1D14348F, 0x428EBC83, 0x64E67982,
+ 0x82D6ED38, 0x8AC376D2, 0xAAB8A862, 0x777DE62A, 0xE9EC144B, 0xDDF463E5,
+ 0xC77A57F2, 0x0196F931, 0x41000A65, 0xA55AE313, 0xC28CBB18, 0x901228F8,
+ 0x7E8C6F62, 0xBC3773BF, 0x0C6B47B1, 0xBE3A6C1B, 0xAC0BB555, 0xFF4FED4A,
+ 0x77BE463F, 0x10DBC150, 0x1A0BA125, 0x07F4793A, 0x21EF2054, 0x4CA7B18F,
+ 0x60EDBD48, 0x2E775066, 0x73134D0B, 0x3FB32C9B
+};
+
+static const BN_ULONG dh2048_256_q[] = {
+ 0x64F5FBD3, 0xA308B0FE, 0x1EB3750B, 0x99B1A47D, 0x40129DA2, 0xB4479976,
+ 0xA709A097, 0x8CF83642
+};
+
+#else
+# error "unsupported BN_BITS2"
+#endif
+
+/* Macro to make a BIGNUM from static data */
+
+#define make_dh_bn(x) static const BIGNUM _bignum_##x = { (BN_ULONG *) x, \
+ sizeof(x)/sizeof(BN_ULONG),\
+ sizeof(x)/sizeof(BN_ULONG),\
+ 0, BN_FLG_STATIC_DATA }
+
+/*
+ * Macro to make a DH structure from BIGNUM data. NB: although just copying
+ * the static BIGNUM pointers would be more efficient, we can't, as they
+ * would get wiped by BN_clear_free() when DH_free() is called.
+ */
+
+#define make_dh(x) \
+DH * DH_get_##x(void) \
+ { \
+ DH *dh; \
+ make_dh_bn(dh##x##_p); \
+ make_dh_bn(dh##x##_q); \
+ make_dh_bn(dh##x##_g); \
+ dh = DH_new(); \
+ if (!dh) \
+ return NULL; \
+ dh->p = BN_dup(&_bignum_dh##x##_p); \
+ dh->g = BN_dup(&_bignum_dh##x##_g); \
+ dh->q = BN_dup(&_bignum_dh##x##_q); \
+ if (!dh->p || !dh->q || !dh->g) \
+ { \
+ DH_free(dh); \
+ return NULL; \
+ } \
+ return dh; \
+ }
+
+make_dh(1024_160)
+make_dh(2048_224)
+make_dh(2048_256)
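
[Editorial note: each DH_get_* accessor returns a freshly allocated DH whose p, q and g are BN_dup copies of the static tables, so the caller owns, and must free, the result. Sketch:]

    DH *dh = DH_get_2048_256();
    int codes = 0;

    if (dh == NULL || !DH_check(dh, &codes) || codes != 0)
        /* handle error */ ;
    DH_free(dh);
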
diff --git a/crypto/dh/dhtest.c b/crypto/dh/dhtest.c
index 5a4ee9a35d83..c9dd76bc75e1 100644
--- a/crypto/dh/dhtest.c
+++ b/crypto/dh/dhtest.c
@@ -96,6 +96,8 @@ static int MS_CALLBACK cb(int p, int n, BN_GENCB *arg);
static const char rnd_seed[] =
"string to make the random number generator think it has entropy";
+static int run_rfc5114_tests(void);
+
int main(int argc, char *argv[])
{
BN_GENCB _cb;
@@ -199,6 +201,8 @@ int main(int argc, char *argv[])
ret = 1;
} else
ret = 0;
+ if (!run_rfc5114_tests())
+ ret = 1;
err:
ERR_print_errors_fp(stderr);
@@ -238,4 +242,323 @@ static int MS_CALLBACK cb(int p, int n, BN_GENCB *arg)
# endif
return 1;
}
+
+/* Test data from RFC 5114 */
+
+static const unsigned char dhtest_1024_160_xA[] = {
+ 0xB9, 0xA3, 0xB3, 0xAE, 0x8F, 0xEF, 0xC1, 0xA2, 0x93, 0x04, 0x96, 0x50,
+ 0x70, 0x86, 0xF8, 0x45, 0x5D, 0x48, 0x94, 0x3E
+};
+
+static const unsigned char dhtest_1024_160_yA[] = {
+ 0x2A, 0x85, 0x3B, 0x3D, 0x92, 0x19, 0x75, 0x01, 0xB9, 0x01, 0x5B, 0x2D,
+ 0xEB, 0x3E, 0xD8, 0x4F, 0x5E, 0x02, 0x1D, 0xCC, 0x3E, 0x52, 0xF1, 0x09,
+ 0xD3, 0x27, 0x3D, 0x2B, 0x75, 0x21, 0x28, 0x1C, 0xBA, 0xBE, 0x0E, 0x76,
+ 0xFF, 0x57, 0x27, 0xFA, 0x8A, 0xCC, 0xE2, 0x69, 0x56, 0xBA, 0x9A, 0x1F,
+ 0xCA, 0x26, 0xF2, 0x02, 0x28, 0xD8, 0x69, 0x3F, 0xEB, 0x10, 0x84, 0x1D,
+ 0x84, 0xA7, 0x36, 0x00, 0x54, 0xEC, 0xE5, 0xA7, 0xF5, 0xB7, 0xA6, 0x1A,
+ 0xD3, 0xDF, 0xB3, 0xC6, 0x0D, 0x2E, 0x43, 0x10, 0x6D, 0x87, 0x27, 0xDA,
+ 0x37, 0xDF, 0x9C, 0xCE, 0x95, 0xB4, 0x78, 0x75, 0x5D, 0x06, 0xBC, 0xEA,
+ 0x8F, 0x9D, 0x45, 0x96, 0x5F, 0x75, 0xA5, 0xF3, 0xD1, 0xDF, 0x37, 0x01,
+ 0x16, 0x5F, 0xC9, 0xE5, 0x0C, 0x42, 0x79, 0xCE, 0xB0, 0x7F, 0x98, 0x95,
+ 0x40, 0xAE, 0x96, 0xD5, 0xD8, 0x8E, 0xD7, 0x76
+};
+
+static const unsigned char dhtest_1024_160_xB[] = {
+ 0x93, 0x92, 0xC9, 0xF9, 0xEB, 0x6A, 0x7A, 0x6A, 0x90, 0x22, 0xF7, 0xD8,
+ 0x3E, 0x72, 0x23, 0xC6, 0x83, 0x5B, 0xBD, 0xDA
+};
+
+static const unsigned char dhtest_1024_160_yB[] = {
+ 0x71, 0x7A, 0x6C, 0xB0, 0x53, 0x37, 0x1F, 0xF4, 0xA3, 0xB9, 0x32, 0x94,
+ 0x1C, 0x1E, 0x56, 0x63, 0xF8, 0x61, 0xA1, 0xD6, 0xAD, 0x34, 0xAE, 0x66,
+ 0x57, 0x6D, 0xFB, 0x98, 0xF6, 0xC6, 0xCB, 0xF9, 0xDD, 0xD5, 0xA5, 0x6C,
+ 0x78, 0x33, 0xF6, 0xBC, 0xFD, 0xFF, 0x09, 0x55, 0x82, 0xAD, 0x86, 0x8E,
+ 0x44, 0x0E, 0x8D, 0x09, 0xFD, 0x76, 0x9E, 0x3C, 0xEC, 0xCD, 0xC3, 0xD3,
+ 0xB1, 0xE4, 0xCF, 0xA0, 0x57, 0x77, 0x6C, 0xAA, 0xF9, 0x73, 0x9B, 0x6A,
+ 0x9F, 0xEE, 0x8E, 0x74, 0x11, 0xF8, 0xD6, 0xDA, 0xC0, 0x9D, 0x6A, 0x4E,
+ 0xDB, 0x46, 0xCC, 0x2B, 0x5D, 0x52, 0x03, 0x09, 0x0E, 0xAE, 0x61, 0x26,
+ 0x31, 0x1E, 0x53, 0xFD, 0x2C, 0x14, 0xB5, 0x74, 0xE6, 0xA3, 0x10, 0x9A,
+ 0x3D, 0xA1, 0xBE, 0x41, 0xBD, 0xCE, 0xAA, 0x18, 0x6F, 0x5C, 0xE0, 0x67,
+ 0x16, 0xA2, 0xB6, 0xA0, 0x7B, 0x3C, 0x33, 0xFE
+};
+
+static const unsigned char dhtest_1024_160_Z[] = {
+ 0x5C, 0x80, 0x4F, 0x45, 0x4D, 0x30, 0xD9, 0xC4, 0xDF, 0x85, 0x27, 0x1F,
+ 0x93, 0x52, 0x8C, 0x91, 0xDF, 0x6B, 0x48, 0xAB, 0x5F, 0x80, 0xB3, 0xB5,
+ 0x9C, 0xAA, 0xC1, 0xB2, 0x8F, 0x8A, 0xCB, 0xA9, 0xCD, 0x3E, 0x39, 0xF3,
+ 0xCB, 0x61, 0x45, 0x25, 0xD9, 0x52, 0x1D, 0x2E, 0x64, 0x4C, 0x53, 0xB8,
+ 0x07, 0xB8, 0x10, 0xF3, 0x40, 0x06, 0x2F, 0x25, 0x7D, 0x7D, 0x6F, 0xBF,
+ 0xE8, 0xD5, 0xE8, 0xF0, 0x72, 0xE9, 0xB6, 0xE9, 0xAF, 0xDA, 0x94, 0x13,
+ 0xEA, 0xFB, 0x2E, 0x8B, 0x06, 0x99, 0xB1, 0xFB, 0x5A, 0x0C, 0xAC, 0xED,
+ 0xDE, 0xAE, 0xAD, 0x7E, 0x9C, 0xFB, 0xB3, 0x6A, 0xE2, 0xB4, 0x20, 0x83,
+ 0x5B, 0xD8, 0x3A, 0x19, 0xFB, 0x0B, 0x5E, 0x96, 0xBF, 0x8F, 0xA4, 0xD0,
+ 0x9E, 0x34, 0x55, 0x25, 0x16, 0x7E, 0xCD, 0x91, 0x55, 0x41, 0x6F, 0x46,
+ 0xF4, 0x08, 0xED, 0x31, 0xB6, 0x3C, 0x6E, 0x6D
+};
+
+static const unsigned char dhtest_2048_224_xA[] = {
+ 0x22, 0xE6, 0x26, 0x01, 0xDB, 0xFF, 0xD0, 0x67, 0x08, 0xA6, 0x80, 0xF7,
+ 0x47, 0xF3, 0x61, 0xF7, 0x6D, 0x8F, 0x4F, 0x72, 0x1A, 0x05, 0x48, 0xE4,
+ 0x83, 0x29, 0x4B, 0x0C
+};
+
+static const unsigned char dhtest_2048_224_yA[] = {
+ 0x1B, 0x3A, 0x63, 0x45, 0x1B, 0xD8, 0x86, 0xE6, 0x99, 0xE6, 0x7B, 0x49,
+ 0x4E, 0x28, 0x8B, 0xD7, 0xF8, 0xE0, 0xD3, 0x70, 0xBA, 0xDD, 0xA7, 0xA0,
+ 0xEF, 0xD2, 0xFD, 0xE7, 0xD8, 0xF6, 0x61, 0x45, 0xCC, 0x9F, 0x28, 0x04,
+ 0x19, 0x97, 0x5E, 0xB8, 0x08, 0x87, 0x7C, 0x8A, 0x4C, 0x0C, 0x8E, 0x0B,
+ 0xD4, 0x8D, 0x4A, 0x54, 0x01, 0xEB, 0x1E, 0x87, 0x76, 0xBF, 0xEE, 0xE1,
+ 0x34, 0xC0, 0x38, 0x31, 0xAC, 0x27, 0x3C, 0xD9, 0xD6, 0x35, 0xAB, 0x0C,
+ 0xE0, 0x06, 0xA4, 0x2A, 0x88, 0x7E, 0x3F, 0x52, 0xFB, 0x87, 0x66, 0xB6,
+ 0x50, 0xF3, 0x80, 0x78, 0xBC, 0x8E, 0xE8, 0x58, 0x0C, 0xEF, 0xE2, 0x43,
+ 0x96, 0x8C, 0xFC, 0x4F, 0x8D, 0xC3, 0xDB, 0x08, 0x45, 0x54, 0x17, 0x1D,
+ 0x41, 0xBF, 0x2E, 0x86, 0x1B, 0x7B, 0xB4, 0xD6, 0x9D, 0xD0, 0xE0, 0x1E,
+ 0xA3, 0x87, 0xCB, 0xAA, 0x5C, 0xA6, 0x72, 0xAF, 0xCB, 0xE8, 0xBD, 0xB9,
+ 0xD6, 0x2D, 0x4C, 0xE1, 0x5F, 0x17, 0xDD, 0x36, 0xF9, 0x1E, 0xD1, 0xEE,
+ 0xDD, 0x65, 0xCA, 0x4A, 0x06, 0x45, 0x5C, 0xB9, 0x4C, 0xD4, 0x0A, 0x52,
+ 0xEC, 0x36, 0x0E, 0x84, 0xB3, 0xC9, 0x26, 0xE2, 0x2C, 0x43, 0x80, 0xA3,
+ 0xBF, 0x30, 0x9D, 0x56, 0x84, 0x97, 0x68, 0xB7, 0xF5, 0x2C, 0xFD, 0xF6,
+ 0x55, 0xFD, 0x05, 0x3A, 0x7E, 0xF7, 0x06, 0x97, 0x9E, 0x7E, 0x58, 0x06,
+ 0xB1, 0x7D, 0xFA, 0xE5, 0x3A, 0xD2, 0xA5, 0xBC, 0x56, 0x8E, 0xBB, 0x52,
+ 0x9A, 0x7A, 0x61, 0xD6, 0x8D, 0x25, 0x6F, 0x8F, 0xC9, 0x7C, 0x07, 0x4A,
+ 0x86, 0x1D, 0x82, 0x7E, 0x2E, 0xBC, 0x8C, 0x61, 0x34, 0x55, 0x31, 0x15,
+ 0xB7, 0x0E, 0x71, 0x03, 0x92, 0x0A, 0xA1, 0x6D, 0x85, 0xE5, 0x2B, 0xCB,
+ 0xAB, 0x8D, 0x78, 0x6A, 0x68, 0x17, 0x8F, 0xA8, 0xFF, 0x7C, 0x2F, 0x5C,
+ 0x71, 0x64, 0x8D, 0x6F
+};
+
+static const unsigned char dhtest_2048_224_xB[] = {
+ 0x4F, 0xF3, 0xBC, 0x96, 0xC7, 0xFC, 0x6A, 0x6D, 0x71, 0xD3, 0xB3, 0x63,
+ 0x80, 0x0A, 0x7C, 0xDF, 0xEF, 0x6F, 0xC4, 0x1B, 0x44, 0x17, 0xEA, 0x15,
+ 0x35, 0x3B, 0x75, 0x90
+};
+
+static const unsigned char dhtest_2048_224_yB[] = {
+ 0x4D, 0xCE, 0xE9, 0x92, 0xA9, 0x76, 0x2A, 0x13, 0xF2, 0xF8, 0x38, 0x44,
+ 0xAD, 0x3D, 0x77, 0xEE, 0x0E, 0x31, 0xC9, 0x71, 0x8B, 0x3D, 0xB6, 0xC2,
+ 0x03, 0x5D, 0x39, 0x61, 0x18, 0x2C, 0x3E, 0x0B, 0xA2, 0x47, 0xEC, 0x41,
+ 0x82, 0xD7, 0x60, 0xCD, 0x48, 0xD9, 0x95, 0x99, 0x97, 0x06, 0x22, 0xA1,
+ 0x88, 0x1B, 0xBA, 0x2D, 0xC8, 0x22, 0x93, 0x9C, 0x78, 0xC3, 0x91, 0x2C,
+ 0x66, 0x61, 0xFA, 0x54, 0x38, 0xB2, 0x07, 0x66, 0x22, 0x2B, 0x75, 0xE2,
+ 0x4C, 0x2E, 0x3A, 0xD0, 0xC7, 0x28, 0x72, 0x36, 0x12, 0x95, 0x25, 0xEE,
+ 0x15, 0xB5, 0xDD, 0x79, 0x98, 0xAA, 0x04, 0xC4, 0xA9, 0x69, 0x6C, 0xAC,
+ 0xD7, 0x17, 0x20, 0x83, 0xA9, 0x7A, 0x81, 0x66, 0x4E, 0xAD, 0x2C, 0x47,
+ 0x9E, 0x44, 0x4E, 0x4C, 0x06, 0x54, 0xCC, 0x19, 0xE2, 0x8D, 0x77, 0x03,
+ 0xCE, 0xE8, 0xDA, 0xCD, 0x61, 0x26, 0xF5, 0xD6, 0x65, 0xEC, 0x52, 0xC6,
+ 0x72, 0x55, 0xDB, 0x92, 0x01, 0x4B, 0x03, 0x7E, 0xB6, 0x21, 0xA2, 0xAC,
+ 0x8E, 0x36, 0x5D, 0xE0, 0x71, 0xFF, 0xC1, 0x40, 0x0A, 0xCF, 0x07, 0x7A,
+ 0x12, 0x91, 0x3D, 0xD8, 0xDE, 0x89, 0x47, 0x34, 0x37, 0xAB, 0x7B, 0xA3,
+ 0x46, 0x74, 0x3C, 0x1B, 0x21, 0x5D, 0xD9, 0xC1, 0x21, 0x64, 0xA7, 0xE4,
+ 0x05, 0x31, 0x18, 0xD1, 0x99, 0xBE, 0xC8, 0xEF, 0x6F, 0xC5, 0x61, 0x17,
+ 0x0C, 0x84, 0xC8, 0x7D, 0x10, 0xEE, 0x9A, 0x67, 0x4A, 0x1F, 0xA8, 0xFF,
+ 0xE1, 0x3B, 0xDF, 0xBA, 0x1D, 0x44, 0xDE, 0x48, 0x94, 0x6D, 0x68, 0xDC,
+ 0x0C, 0xDD, 0x77, 0x76, 0x35, 0xA7, 0xAB, 0x5B, 0xFB, 0x1E, 0x4B, 0xB7,
+ 0xB8, 0x56, 0xF9, 0x68, 0x27, 0x73, 0x4C, 0x18, 0x41, 0x38, 0xE9, 0x15,
+ 0xD9, 0xC3, 0x00, 0x2E, 0xBC, 0xE5, 0x31, 0x20, 0x54, 0x6A, 0x7E, 0x20,
+ 0x02, 0x14, 0x2B, 0x6C
+};
+
+static const unsigned char dhtest_2048_224_Z[] = {
+ 0x34, 0xD9, 0xBD, 0xDC, 0x1B, 0x42, 0x17, 0x6C, 0x31, 0x3F, 0xEA, 0x03,
+ 0x4C, 0x21, 0x03, 0x4D, 0x07, 0x4A, 0x63, 0x13, 0xBB, 0x4E, 0xCD, 0xB3,
+ 0x70, 0x3F, 0xFF, 0x42, 0x45, 0x67, 0xA4, 0x6B, 0xDF, 0x75, 0x53, 0x0E,
+ 0xDE, 0x0A, 0x9D, 0xA5, 0x22, 0x9D, 0xE7, 0xD7, 0x67, 0x32, 0x28, 0x6C,
+ 0xBC, 0x0F, 0x91, 0xDA, 0x4C, 0x3C, 0x85, 0x2F, 0xC0, 0x99, 0xC6, 0x79,
+ 0x53, 0x1D, 0x94, 0xC7, 0x8A, 0xB0, 0x3D, 0x9D, 0xEC, 0xB0, 0xA4, 0xE4,
+ 0xCA, 0x8B, 0x2B, 0xB4, 0x59, 0x1C, 0x40, 0x21, 0xCF, 0x8C, 0xE3, 0xA2,
+ 0x0A, 0x54, 0x1D, 0x33, 0x99, 0x40, 0x17, 0xD0, 0x20, 0x0A, 0xE2, 0xC9,
+ 0x51, 0x6E, 0x2F, 0xF5, 0x14, 0x57, 0x79, 0x26, 0x9E, 0x86, 0x2B, 0x0F,
+ 0xB4, 0x74, 0xA2, 0xD5, 0x6D, 0xC3, 0x1E, 0xD5, 0x69, 0xA7, 0x70, 0x0B,
+ 0x4C, 0x4A, 0xB1, 0x6B, 0x22, 0xA4, 0x55, 0x13, 0x53, 0x1E, 0xF5, 0x23,
+ 0xD7, 0x12, 0x12, 0x07, 0x7B, 0x5A, 0x16, 0x9B, 0xDE, 0xFF, 0xAD, 0x7A,
+ 0xD9, 0x60, 0x82, 0x84, 0xC7, 0x79, 0x5B, 0x6D, 0x5A, 0x51, 0x83, 0xB8,
+ 0x70, 0x66, 0xDE, 0x17, 0xD8, 0xD6, 0x71, 0xC9, 0xEB, 0xD8, 0xEC, 0x89,
+ 0x54, 0x4D, 0x45, 0xEC, 0x06, 0x15, 0x93, 0xD4, 0x42, 0xC6, 0x2A, 0xB9,
+ 0xCE, 0x3B, 0x1C, 0xB9, 0x94, 0x3A, 0x1D, 0x23, 0xA5, 0xEA, 0x3B, 0xCF,
+ 0x21, 0xA0, 0x14, 0x71, 0xE6, 0x7E, 0x00, 0x3E, 0x7F, 0x8A, 0x69, 0xC7,
+ 0x28, 0xBE, 0x49, 0x0B, 0x2F, 0xC8, 0x8C, 0xFE, 0xB9, 0x2D, 0xB6, 0xA2,
+ 0x15, 0xE5, 0xD0, 0x3C, 0x17, 0xC4, 0x64, 0xC9, 0xAC, 0x1A, 0x46, 0xE2,
+ 0x03, 0xE1, 0x3F, 0x95, 0x29, 0x95, 0xFB, 0x03, 0xC6, 0x9D, 0x3C, 0xC4,
+ 0x7F, 0xCB, 0x51, 0x0B, 0x69, 0x98, 0xFF, 0xD3, 0xAA, 0x6D, 0xE7, 0x3C,
+ 0xF9, 0xF6, 0x38, 0x69
+};
+
+static const unsigned char dhtest_2048_256_xA[] = {
+ 0x08, 0x81, 0x38, 0x2C, 0xDB, 0x87, 0x66, 0x0C, 0x6D, 0xC1, 0x3E, 0x61,
+ 0x49, 0x38, 0xD5, 0xB9, 0xC8, 0xB2, 0xF2, 0x48, 0x58, 0x1C, 0xC5, 0xE3,
+ 0x1B, 0x35, 0x45, 0x43, 0x97, 0xFC, 0xE5, 0x0E
+};
+
+static const unsigned char dhtest_2048_256_yA[] = {
+ 0x2E, 0x93, 0x80, 0xC8, 0x32, 0x3A, 0xF9, 0x75, 0x45, 0xBC, 0x49, 0x41,
+ 0xDE, 0xB0, 0xEC, 0x37, 0x42, 0xC6, 0x2F, 0xE0, 0xEC, 0xE8, 0x24, 0xA6,
+ 0xAB, 0xDB, 0xE6, 0x6C, 0x59, 0xBE, 0xE0, 0x24, 0x29, 0x11, 0xBF, 0xB9,
+ 0x67, 0x23, 0x5C, 0xEB, 0xA3, 0x5A, 0xE1, 0x3E, 0x4E, 0xC7, 0x52, 0xBE,
+ 0x63, 0x0B, 0x92, 0xDC, 0x4B, 0xDE, 0x28, 0x47, 0xA9, 0xC6, 0x2C, 0xB8,
+ 0x15, 0x27, 0x45, 0x42, 0x1F, 0xB7, 0xEB, 0x60, 0xA6, 0x3C, 0x0F, 0xE9,
+ 0x15, 0x9F, 0xCC, 0xE7, 0x26, 0xCE, 0x7C, 0xD8, 0x52, 0x3D, 0x74, 0x50,
+ 0x66, 0x7E, 0xF8, 0x40, 0xE4, 0x91, 0x91, 0x21, 0xEB, 0x5F, 0x01, 0xC8,
+ 0xC9, 0xB0, 0xD3, 0xD6, 0x48, 0xA9, 0x3B, 0xFB, 0x75, 0x68, 0x9E, 0x82,
+ 0x44, 0xAC, 0x13, 0x4A, 0xF5, 0x44, 0x71, 0x1C, 0xE7, 0x9A, 0x02, 0xDC,
+ 0xC3, 0x42, 0x26, 0x68, 0x47, 0x80, 0xDD, 0xDC, 0xB4, 0x98, 0x59, 0x41,
+ 0x06, 0xC3, 0x7F, 0x5B, 0xC7, 0x98, 0x56, 0x48, 0x7A, 0xF5, 0xAB, 0x02,
+ 0x2A, 0x2E, 0x5E, 0x42, 0xF0, 0x98, 0x97, 0xC1, 0xA8, 0x5A, 0x11, 0xEA,
+ 0x02, 0x12, 0xAF, 0x04, 0xD9, 0xB4, 0xCE, 0xBC, 0x93, 0x7C, 0x3C, 0x1A,
+ 0x3E, 0x15, 0xA8, 0xA0, 0x34, 0x2E, 0x33, 0x76, 0x15, 0xC8, 0x4E, 0x7F,
+ 0xE3, 0xB8, 0xB9, 0xB8, 0x7F, 0xB1, 0xE7, 0x3A, 0x15, 0xAF, 0x12, 0xA3,
+ 0x0D, 0x74, 0x6E, 0x06, 0xDF, 0xC3, 0x4F, 0x29, 0x0D, 0x79, 0x7C, 0xE5,
+ 0x1A, 0xA1, 0x3A, 0xA7, 0x85, 0xBF, 0x66, 0x58, 0xAF, 0xF5, 0xE4, 0xB0,
+ 0x93, 0x00, 0x3C, 0xBE, 0xAF, 0x66, 0x5B, 0x3C, 0x2E, 0x11, 0x3A, 0x3A,
+ 0x4E, 0x90, 0x52, 0x69, 0x34, 0x1D, 0xC0, 0x71, 0x14, 0x26, 0x68, 0x5F,
+ 0x4E, 0xF3, 0x7E, 0x86, 0x8A, 0x81, 0x26, 0xFF, 0x3F, 0x22, 0x79, 0xB5,
+ 0x7C, 0xA6, 0x7E, 0x29
+};
+
+static const unsigned char dhtest_2048_256_xB[] = {
+ 0x7D, 0x62, 0xA7, 0xE3, 0xEF, 0x36, 0xDE, 0x61, 0x7B, 0x13, 0xD1, 0xAF,
+ 0xB8, 0x2C, 0x78, 0x0D, 0x83, 0xA2, 0x3B, 0xD4, 0xEE, 0x67, 0x05, 0x64,
+ 0x51, 0x21, 0xF3, 0x71, 0xF5, 0x46, 0xA5, 0x3D
+};
+
+static const unsigned char dhtest_2048_256_yB[] = {
+ 0x57, 0x5F, 0x03, 0x51, 0xBD, 0x2B, 0x1B, 0x81, 0x74, 0x48, 0xBD, 0xF8,
+ 0x7A, 0x6C, 0x36, 0x2C, 0x1E, 0x28, 0x9D, 0x39, 0x03, 0xA3, 0x0B, 0x98,
+ 0x32, 0xC5, 0x74, 0x1F, 0xA2, 0x50, 0x36, 0x3E, 0x7A, 0xCB, 0xC7, 0xF7,
+ 0x7F, 0x3D, 0xAC, 0xBC, 0x1F, 0x13, 0x1A, 0xDD, 0x8E, 0x03, 0x36, 0x7E,
+ 0xFF, 0x8F, 0xBB, 0xB3, 0xE1, 0xC5, 0x78, 0x44, 0x24, 0x80, 0x9B, 0x25,
+ 0xAF, 0xE4, 0xD2, 0x26, 0x2A, 0x1A, 0x6F, 0xD2, 0xFA, 0xB6, 0x41, 0x05,
+ 0xCA, 0x30, 0xA6, 0x74, 0xE0, 0x7F, 0x78, 0x09, 0x85, 0x20, 0x88, 0x63,
+ 0x2F, 0xC0, 0x49, 0x23, 0x37, 0x91, 0xAD, 0x4E, 0xDD, 0x08, 0x3A, 0x97,
+ 0x8B, 0x88, 0x3E, 0xE6, 0x18, 0xBC, 0x5E, 0x0D, 0xD0, 0x47, 0x41, 0x5F,
+ 0x2D, 0x95, 0xE6, 0x83, 0xCF, 0x14, 0x82, 0x6B, 0x5F, 0xBE, 0x10, 0xD3,
+ 0xCE, 0x41, 0xC6, 0xC1, 0x20, 0xC7, 0x8A, 0xB2, 0x00, 0x08, 0xC6, 0x98,
+ 0xBF, 0x7F, 0x0B, 0xCA, 0xB9, 0xD7, 0xF4, 0x07, 0xBE, 0xD0, 0xF4, 0x3A,
+ 0xFB, 0x29, 0x70, 0xF5, 0x7F, 0x8D, 0x12, 0x04, 0x39, 0x63, 0xE6, 0x6D,
+ 0xDD, 0x32, 0x0D, 0x59, 0x9A, 0xD9, 0x93, 0x6C, 0x8F, 0x44, 0x13, 0x7C,
+ 0x08, 0xB1, 0x80, 0xEC, 0x5E, 0x98, 0x5C, 0xEB, 0xE1, 0x86, 0xF3, 0xD5,
+ 0x49, 0x67, 0x7E, 0x80, 0x60, 0x73, 0x31, 0xEE, 0x17, 0xAF, 0x33, 0x80,
+ 0xA7, 0x25, 0xB0, 0x78, 0x23, 0x17, 0xD7, 0xDD, 0x43, 0xF5, 0x9D, 0x7A,
+ 0xF9, 0x56, 0x8A, 0x9B, 0xB6, 0x3A, 0x84, 0xD3, 0x65, 0xF9, 0x22, 0x44,
+ 0xED, 0x12, 0x09, 0x88, 0x21, 0x93, 0x02, 0xF4, 0x29, 0x24, 0xC7, 0xCA,
+ 0x90, 0xB8, 0x9D, 0x24, 0xF7, 0x1B, 0x0A, 0xB6, 0x97, 0x82, 0x3D, 0x7D,
+ 0xEB, 0x1A, 0xFF, 0x5B, 0x0E, 0x8E, 0x4A, 0x45, 0xD4, 0x9F, 0x7F, 0x53,
+ 0x75, 0x7E, 0x19, 0x13
+};
+
+static const unsigned char dhtest_2048_256_Z[] = {
+ 0x86, 0xC7, 0x0B, 0xF8, 0xD0, 0xBB, 0x81, 0xBB, 0x01, 0x07, 0x8A, 0x17,
+ 0x21, 0x9C, 0xB7, 0xD2, 0x72, 0x03, 0xDB, 0x2A, 0x19, 0xC8, 0x77, 0xF1,
+ 0xD1, 0xF1, 0x9F, 0xD7, 0xD7, 0x7E, 0xF2, 0x25, 0x46, 0xA6, 0x8F, 0x00,
+ 0x5A, 0xD5, 0x2D, 0xC8, 0x45, 0x53, 0xB7, 0x8F, 0xC6, 0x03, 0x30, 0xBE,
+ 0x51, 0xEA, 0x7C, 0x06, 0x72, 0xCA, 0xC1, 0x51, 0x5E, 0x4B, 0x35, 0xC0,
+ 0x47, 0xB9, 0xA5, 0x51, 0xB8, 0x8F, 0x39, 0xDC, 0x26, 0xDA, 0x14, 0xA0,
+ 0x9E, 0xF7, 0x47, 0x74, 0xD4, 0x7C, 0x76, 0x2D, 0xD1, 0x77, 0xF9, 0xED,
+ 0x5B, 0xC2, 0xF1, 0x1E, 0x52, 0xC8, 0x79, 0xBD, 0x95, 0x09, 0x85, 0x04,
+ 0xCD, 0x9E, 0xEC, 0xD8, 0xA8, 0xF9, 0xB3, 0xEF, 0xBD, 0x1F, 0x00, 0x8A,
+ 0xC5, 0x85, 0x30, 0x97, 0xD9, 0xD1, 0x83, 0x7F, 0x2B, 0x18, 0xF7, 0x7C,
+ 0xD7, 0xBE, 0x01, 0xAF, 0x80, 0xA7, 0xC7, 0xB5, 0xEA, 0x3C, 0xA5, 0x4C,
+ 0xC0, 0x2D, 0x0C, 0x11, 0x6F, 0xEE, 0x3F, 0x95, 0xBB, 0x87, 0x39, 0x93,
+ 0x85, 0x87, 0x5D, 0x7E, 0x86, 0x74, 0x7E, 0x67, 0x6E, 0x72, 0x89, 0x38,
+ 0xAC, 0xBF, 0xF7, 0x09, 0x8E, 0x05, 0xBE, 0x4D, 0xCF, 0xB2, 0x40, 0x52,
+ 0xB8, 0x3A, 0xEF, 0xFB, 0x14, 0x78, 0x3F, 0x02, 0x9A, 0xDB, 0xDE, 0x7F,
+ 0x53, 0xFA, 0xE9, 0x20, 0x84, 0x22, 0x40, 0x90, 0xE0, 0x07, 0xCE, 0xE9,
+ 0x4D, 0x4B, 0xF2, 0xBA, 0xCE, 0x9F, 0xFD, 0x4B, 0x57, 0xD2, 0xAF, 0x7C,
+ 0x72, 0x4D, 0x0C, 0xAA, 0x19, 0xBF, 0x05, 0x01, 0xF6, 0xF1, 0x7B, 0x4A,
+ 0xA1, 0x0F, 0x42, 0x5E, 0x3E, 0xA7, 0x60, 0x80, 0xB4, 0xB9, 0xD6, 0xB3,
+ 0xCE, 0xFE, 0xA1, 0x15, 0xB2, 0xCE, 0xB8, 0x78, 0x9B, 0xB8, 0xA3, 0xB0,
+ 0xEA, 0x87, 0xFE, 0xBE, 0x63, 0xB6, 0xC8, 0xF8, 0x46, 0xEC, 0x6D, 0xB0,
+ 0xC2, 0x6C, 0x5D, 0x7C
+};
+
+typedef struct {
+ DH *(*get_param) (void);
+ const unsigned char *xA;
+ size_t xA_len;
+ const unsigned char *yA;
+ size_t yA_len;
+ const unsigned char *xB;
+ size_t xB_len;
+ const unsigned char *yB;
+ size_t yB_len;
+ const unsigned char *Z;
+ size_t Z_len;
+} rfc5114_td;
+
+# define make_rfc5114_td(pre) { \
+ DH_get_##pre, \
+ dhtest_##pre##_xA, sizeof(dhtest_##pre##_xA), \
+ dhtest_##pre##_yA, sizeof(dhtest_##pre##_yA), \
+ dhtest_##pre##_xB, sizeof(dhtest_##pre##_xB), \
+ dhtest_##pre##_yB, sizeof(dhtest_##pre##_yB), \
+ dhtest_##pre##_Z, sizeof(dhtest_##pre##_Z) \
+ }
+
+static const rfc5114_td rfctd[] = {
+ make_rfc5114_td(1024_160),
+ make_rfc5114_td(2048_224),
+ make_rfc5114_td(2048_256)
+};
+
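+/*
+ * For illustration (editorial sketch, not added by this patch): each
+ * make_rfc5114_td() entry above expands via token pasting, so the first
+ * row of rfctd is equivalent to writing out
+ *
+ *     { DH_get_1024_160,
+ *       dhtest_1024_160_xA, sizeof(dhtest_1024_160_xA),
+ *       dhtest_1024_160_yA, sizeof(dhtest_1024_160_yA),
+ *       dhtest_1024_160_xB, sizeof(dhtest_1024_160_xB),
+ *       dhtest_1024_160_yB, sizeof(dhtest_1024_160_yB),
+ *       dhtest_1024_160_Z,  sizeof(dhtest_1024_160_Z) }
+ */
+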
+static int run_rfc5114_tests(void)
+{
+ int i;
+ for (i = 0; i < (int)(sizeof(rfctd) / sizeof(rfc5114_td)); i++) {
+ DH *dhA, *dhB;
+ unsigned char *Z1 = NULL, *Z2 = NULL;
+ const rfc5114_td *td = rfctd + i;
+ /* Set up DH structures and set the key components */
+ dhA = td->get_param();
+ dhB = td->get_param();
+ if (!dhA || !dhB)
+ goto bad_err;
+
+ dhA->priv_key = BN_bin2bn(td->xA, td->xA_len, NULL);
+ dhA->pub_key = BN_bin2bn(td->yA, td->yA_len, NULL);
+
+ dhB->priv_key = BN_bin2bn(td->xB, td->xB_len, NULL);
+ dhB->pub_key = BN_bin2bn(td->yB, td->yB_len, NULL);
+
+ if (!dhA->priv_key || !dhA->pub_key
+ || !dhB->priv_key || !dhB->pub_key)
+ goto bad_err;
+
+ if ((td->Z_len != (size_t)DH_size(dhA))
+ || (td->Z_len != (size_t)DH_size(dhB)))
+ goto err;
+
+ Z1 = OPENSSL_malloc(DH_size(dhA));
+ Z2 = OPENSSL_malloc(DH_size(dhB));
+ if (Z1 == NULL || Z2 == NULL)
+ goto bad_err;
+ /*
+ * Work out shared secrets using both sides and compare with expected
+ * values.
+ */
+ if (!DH_compute_key(Z1, dhB->pub_key, dhA))
+ goto bad_err;
+ if (!DH_compute_key(Z2, dhA->pub_key, dhB))
+ goto bad_err;
+
+ if (memcmp(Z1, td->Z, td->Z_len))
+ goto err;
+ if (memcmp(Z2, td->Z, td->Z_len))
+ goto err;
+
+ printf("RFC5114 parameter test %d OK\n", i + 1);
+
+ DH_free(dhA);
+ DH_free(dhB);
+ OPENSSL_free(Z1);
+ OPENSSL_free(Z2);
+
+ }
+ return 1;
+ bad_err:
+ fprintf(stderr, "Initalisation error RFC5114 set %d\n", i + 1);
+ ERR_print_errors_fp(stderr);
+ return 0;
+ err:
+ fprintf(stderr, "Test failed RFC5114 set %d\n", i + 1);
+ return 0;
+}
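+
+/*
+ * Editorial sketch (not part of this change): the vectors above satisfy the
+ * usual DH relations
+ *
+ *     yA = g^xA mod p,  yB = g^xB mod p,  Z = g^(xA*xB) mod p
+ *
+ * so, for instance, one side's key pair can be cross-checked with:
+ *
+ *     BN_CTX *bctx = BN_CTX_new();
+ *     BIGNUM *y = BN_new();
+ *     if (bctx && y && BN_mod_exp(y, dhA->g, dhA->priv_key, dhA->p, bctx))
+ *         OPENSSL_assert(BN_cmp(y, dhA->pub_key) == 0);
+ */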
+
#endif
diff --git a/crypto/dsa/dsa.h b/crypto/dsa/dsa.h
index a2f0ee786370..545358fd02b2 100644
--- a/crypto/dsa/dsa.h
+++ b/crypto/dsa/dsa.h
@@ -287,6 +287,7 @@ void ERR_load_DSA_strings(void);
# define DSA_F_DO_DSA_PRINT 104
# define DSA_F_DSAPARAMS_PRINT 100
# define DSA_F_DSAPARAMS_PRINT_FP 101
+# define DSA_F_DSA_BUILTIN_PARAMGEN2 126
# define DSA_F_DSA_DO_SIGN 112
# define DSA_F_DSA_DO_VERIFY 113
# define DSA_F_DSA_GENERATE_KEY 124
@@ -316,12 +317,14 @@ void ERR_load_DSA_strings(void);
# define DSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE 100
# define DSA_R_DECODE_ERROR 104
# define DSA_R_INVALID_DIGEST_TYPE 106
+# define DSA_R_INVALID_PARAMETERS 112
# define DSA_R_MISSING_PARAMETERS 101
# define DSA_R_MODULUS_TOO_LARGE 103
# define DSA_R_NEED_NEW_SETUP_VALUES 110
# define DSA_R_NON_FIPS_DSA_METHOD 111
# define DSA_R_NO_PARAMETERS_SET 107
# define DSA_R_PARAMETER_ENCODING_ERROR 105
+# define DSA_R_Q_NOT_PRIME 113
#ifdef __cplusplus
}
diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index a2840eaed095..2a5cd71371a7 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c
@@ -601,10 +601,14 @@ static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
X509_ALGOR_set0(alg2, OBJ_nid2obj(snid), V_ASN1_UNDEF, 0);
}
return 1;
+
+ case ASN1_PKEY_CTRL_CMS_RI_TYPE:
+ *(int *)arg2 = CMS_RECIPINFO_NONE;
+ return 1;
#endif
case ASN1_PKEY_CTRL_DEFAULT_MD_NID:
- *(int *)arg2 = NID_sha1;
+ *(int *)arg2 = NID_sha256;
return 2;
default:
diff --git a/crypto/dsa/dsa_err.c b/crypto/dsa/dsa_err.c
index 746f5dfe6dc7..f5ddc66b8a73 100644
--- a/crypto/dsa/dsa_err.c
+++ b/crypto/dsa/dsa_err.c
@@ -1,6 +1,6 @@
/* crypto/dsa/dsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2013 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -74,6 +74,7 @@ static ERR_STRING_DATA DSA_str_functs[] = {
{ERR_FUNC(DSA_F_DO_DSA_PRINT), "DO_DSA_PRINT"},
{ERR_FUNC(DSA_F_DSAPARAMS_PRINT), "DSAparams_print"},
{ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP), "DSAparams_print_fp"},
+ {ERR_FUNC(DSA_F_DSA_BUILTIN_PARAMGEN2), "DSA_BUILTIN_PARAMGEN2"},
{ERR_FUNC(DSA_F_DSA_DO_SIGN), "DSA_do_sign"},
{ERR_FUNC(DSA_F_DSA_DO_VERIFY), "DSA_do_verify"},
{ERR_FUNC(DSA_F_DSA_GENERATE_KEY), "DSA_generate_key"},
@@ -107,12 +108,14 @@ static ERR_STRING_DATA DSA_str_reasons[] = {
"data too large for key size"},
{ERR_REASON(DSA_R_DECODE_ERROR), "decode error"},
{ERR_REASON(DSA_R_INVALID_DIGEST_TYPE), "invalid digest type"},
+ {ERR_REASON(DSA_R_INVALID_PARAMETERS), "invalid parameters"},
{ERR_REASON(DSA_R_MISSING_PARAMETERS), "missing parameters"},
{ERR_REASON(DSA_R_MODULUS_TOO_LARGE), "modulus too large"},
{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES), "need new setup values"},
{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD), "non fips dsa method"},
{ERR_REASON(DSA_R_NO_PARAMETERS_SET), "no parameters set"},
{ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR), "parameter encoding error"},
+ {ERR_REASON(DSA_R_Q_NOT_PRIME), "q not prime"},
{0, NULL}
};
diff --git a/crypto/dsa/dsa_gen.c b/crypto/dsa/dsa_gen.c
index d686ab0af770..5a328aaab5b4 100644
--- a/crypto/dsa/dsa_gen.c
+++ b/crypto/dsa/dsa_gen.c
@@ -86,6 +86,8 @@
# include "dsa_locl.h"
# ifdef OPENSSL_FIPS
+/* Work around a bug in the prototype */
+# define fips_dsa_builtin_paramgen2 fips_dsa_paramgen_bad
# include <openssl/fips.h>
# endif
@@ -383,4 +385,371 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
BN_MONT_CTX_free(mont);
return ok;
}
+
+# ifdef OPENSSL_FIPS
+# undef fips_dsa_builtin_paramgen2
+extern int fips_dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N,
+ const EVP_MD *evpmd,
+ const unsigned char *seed_in,
+ size_t seed_len, int idx,
+ unsigned char *seed_out,
+ int *counter_ret, unsigned long *h_ret,
+ BN_GENCB *cb);
+# endif
+
+/*
+ * This is a parameter generation algorithm for the DSA2 algorithm as
+ * described in FIPS 186-3.
+ */
+
+int dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N,
+ const EVP_MD *evpmd, const unsigned char *seed_in,
+ size_t seed_len, int idx, unsigned char *seed_out,
+ int *counter_ret, unsigned long *h_ret,
+ BN_GENCB *cb)
+{
+ int ok = -1;
+ unsigned char *seed = NULL, *seed_tmp = NULL;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ int mdsize;
+ BIGNUM *r0, *W, *X, *c, *test;
+ BIGNUM *g = NULL, *q = NULL, *p = NULL;
+ BN_MONT_CTX *mont = NULL;
+ int i, k, n = 0, m = 0, qsize = N >> 3;
+ int counter = 0;
+ int r = 0;
+ BN_CTX *ctx = NULL;
+ EVP_MD_CTX mctx;
+ unsigned int h = 2;
+
+# ifdef OPENSSL_FIPS
+
+ if (FIPS_mode())
+ return fips_dsa_builtin_paramgen2(ret, L, N, evpmd,
+ seed_in, seed_len, idx,
+ seed_out, counter_ret, h_ret, cb);
+# endif
+
+ EVP_MD_CTX_init(&mctx);
+
+ if (evpmd == NULL) {
+ if (N == 160)
+ evpmd = EVP_sha1();
+ else if (N == 224)
+ evpmd = EVP_sha224();
+ else
+ evpmd = EVP_sha256();
+ }
+
+ mdsize = EVP_MD_size(evpmd);
+ /* If only doing unverifiable g generation, we don't need a seed */
+ if (!ret->p || !ret->q || idx >= 0) {
+ if (seed_len == 0)
+ seed_len = mdsize;
+
+ seed = OPENSSL_malloc(seed_len);
+
+ if (seed_out)
+ seed_tmp = seed_out;
+ else
+ seed_tmp = OPENSSL_malloc(seed_len);
+
+ if (!seed || !seed_tmp)
+ goto err;
+
+ if (seed_in)
+ memcpy(seed, seed_in, seed_len);
+
+ }
+
+ if ((ctx = BN_CTX_new()) == NULL)
+ goto err;
+
+ if ((mont = BN_MONT_CTX_new()) == NULL)
+ goto err;
+
+ BN_CTX_start(ctx);
+ r0 = BN_CTX_get(ctx);
+ g = BN_CTX_get(ctx);
+ W = BN_CTX_get(ctx);
+ X = BN_CTX_get(ctx);
+ c = BN_CTX_get(ctx);
+ test = BN_CTX_get(ctx);
+
+ /* If p and q are already supplied, generate g only */
+ if (ret->p && ret->q) {
+ p = ret->p;
+ q = ret->q;
+ if (idx >= 0)
+ memcpy(seed_tmp, seed, seed_len);
+ goto g_only;
+ } else {
+ p = BN_CTX_get(ctx);
+ q = BN_CTX_get(ctx);
+ }
+
+ if (!BN_lshift(test, BN_value_one(), L - 1))
+ goto err;
+ for (;;) {
+ for (;;) { /* find q */
+ unsigned char *pmd;
+ /* step 1 */
+ if (!BN_GENCB_call(cb, 0, m++))
+ goto err;
+
+ if (!seed_in) {
+ if (RAND_pseudo_bytes(seed, seed_len) < 0)
+ goto err;
+ }
+ /* step 2 */
+ if (!EVP_Digest(seed, seed_len, md, NULL, evpmd, NULL))
+ goto err;
+ /* Take least significant bits of md */
+ if (mdsize > qsize)
+ pmd = md + mdsize - qsize;
+ else
+ pmd = md;
+
+ if (mdsize < qsize)
+ memset(md + mdsize, 0, qsize - mdsize);
+
+ /* step 3 */
+ pmd[0] |= 0x80;
+ pmd[qsize - 1] |= 0x01;
+ if (!BN_bin2bn(pmd, qsize, q))
+ goto err;
+
+ /* step 4 */
+ r = BN_is_prime_fasttest_ex(q, DSS_prime_checks, ctx,
+ seed_in ? 1 : 0, cb);
+ if (r > 0)
+ break;
+ if (r != 0)
+ goto err;
+ /* Provided seed didn't produce a prime: error */
+ if (seed_in) {
+ ok = 0;
+ DSAerr(DSA_F_DSA_BUILTIN_PARAMGEN2, DSA_R_Q_NOT_PRIME);
+ goto err;
+ }
+
+ /* do a callback call */
+ /* step 5 */
+ }
+ /* Copy seed to seed_out before we mess with it */
+ if (seed_out)
+ memcpy(seed_out, seed, seed_len);
+
+ if (!BN_GENCB_call(cb, 2, 0))
+ goto err;
+ if (!BN_GENCB_call(cb, 3, 0))
+ goto err;
+
+ /* step 6 */
+ counter = 0;
+ /* "offset = 1" */
+
+ n = (L - 1) / (mdsize << 3);
+
+ for (;;) {
+ if ((counter != 0) && !BN_GENCB_call(cb, 0, counter))
+ goto err;
+
+ /* step 7 */
+ BN_zero(W);
+ /* now 'seed' contains "SEED + offset - 1" */
+ for (k = 0; k <= n; k++) {
+ /*
+ * obtain "SEED + offset + k" by incrementing:
+ */
+ for (i = seed_len - 1; i >= 0; i--) {
+ seed[i]++;
+ if (seed[i] != 0)
+ break;
+ }
+
+ if (!EVP_Digest(seed, seed_len, md, NULL, evpmd, NULL))
+ goto err;
+
+ /* step 8 */
+ if (!BN_bin2bn(md, mdsize, r0))
+ goto err;
+ if (!BN_lshift(r0, r0, (mdsize << 3) * k))
+ goto err;
+ if (!BN_add(W, W, r0))
+ goto err;
+ }
+
+ /* more of step 8 */
+ if (!BN_mask_bits(W, L - 1))
+ goto err;
+ if (!BN_copy(X, W))
+ goto err;
+ if (!BN_add(X, X, test))
+ goto err;
+
+ /* step 9 */
+ if (!BN_lshift1(r0, q))
+ goto err;
+ if (!BN_mod(c, X, r0, ctx))
+ goto err;
+ if (!BN_sub(r0, c, BN_value_one()))
+ goto err;
+ if (!BN_sub(p, X, r0))
+ goto err;
+
+ /* step 10 */
+ if (BN_cmp(p, test) >= 0) {
+ /* step 11 */
+ r = BN_is_prime_fasttest_ex(p, DSS_prime_checks, ctx, 1, cb);
+ if (r > 0)
+ goto end; /* found it */
+ if (r != 0)
+ goto err;
+ }
+
+ /* step 13 */
+ counter++;
+ /* "offset = offset + n + 1" */
+
+ /* step 14 */
+ if (counter >= (int)(4 * L))
+ break;
+ }
+ if (seed_in) {
+ ok = 0;
+ DSAerr(DSA_F_DSA_BUILTIN_PARAMGEN2, DSA_R_INVALID_PARAMETERS);
+ goto err;
+ }
+ }
+ end:
+ if (!BN_GENCB_call(cb, 2, 1))
+ goto err;
+
+ g_only:
+
+ /* We now need to generate g */
+ /* Set r0=(p-1)/q */
+ if (!BN_sub(test, p, BN_value_one()))
+ goto err;
+ if (!BN_div(r0, NULL, test, q, ctx))
+ goto err;
+
+ if (idx < 0) {
+ if (!BN_set_word(test, h))
+ goto err;
+ } else
+ h = 1;
+ if (!BN_MONT_CTX_set(mont, p, ctx))
+ goto err;
+
+ for (;;) {
+ static const unsigned char ggen[4] = { 0x67, 0x67, 0x65, 0x6e };
+ if (idx >= 0) {
+ md[0] = idx & 0xff;
+ md[1] = (h >> 8) & 0xff;
+ md[2] = h & 0xff;
+ if (!EVP_DigestInit_ex(&mctx, evpmd, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&mctx, seed_tmp, seed_len))
+ goto err;
+ if (!EVP_DigestUpdate(&mctx, ggen, sizeof(ggen)))
+ goto err;
+ if (!EVP_DigestUpdate(&mctx, md, 3))
+ goto err;
+ if (!EVP_DigestFinal_ex(&mctx, md, NULL))
+ goto err;
+ if (!BN_bin2bn(md, mdsize, test))
+ goto err;
+ }
+ /* g=test^r0%p */
+ if (!BN_mod_exp_mont(g, test, r0, p, ctx, mont))
+ goto err;
+ if (!BN_is_one(g))
+ break;
+ if (idx < 0 && !BN_add(test, test, BN_value_one()))
+ goto err;
+ h++;
+ if (idx >= 0 && h > 0xffff)
+ goto err;
+ }
+
+ if (!BN_GENCB_call(cb, 3, 1))
+ goto err;
+
+ ok = 1;
+ err:
+ if (ok == 1) {
+ if (p != ret->p) {
+ if (ret->p)
+ BN_free(ret->p);
+ ret->p = BN_dup(p);
+ }
+ if (q != ret->q) {
+ if (ret->q)
+ BN_free(ret->q);
+ ret->q = BN_dup(q);
+ }
+ if (ret->g)
+ BN_free(ret->g);
+ ret->g = BN_dup(g);
+ if (ret->p == NULL || ret->q == NULL || ret->g == NULL) {
+ ok = -1;
+ goto err;
+ }
+ if (counter_ret != NULL)
+ *counter_ret = counter;
+ if (h_ret != NULL)
+ *h_ret = h;
+ }
+ if (seed)
+ OPENSSL_free(seed);
+ if (seed_out != seed_tmp)
+ OPENSSL_free(seed_tmp);
+ if (ctx) {
+ BN_CTX_end(ctx);
+ BN_CTX_free(ctx);
+ }
+ if (mont != NULL)
+ BN_MONT_CTX_free(mont);
+ EVP_MD_CTX_cleanup(&mctx);
+ return ok;
+}
+
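+/*
+ * Illustrative use of the internal helper above (editorial sketch, not
+ * part of this patch): generate fresh 2048/256 parameters with a random
+ * seed and an unverifiable g (idx < 0); a NULL digest selects SHA-256
+ * when N == 256.
+ *
+ *     DSA *dsa = DSA_new();
+ *     int counter;
+ *     unsigned long h;
+ *     if (dsa == NULL
+ *         || dsa_builtin_paramgen2(dsa, 2048, 256, NULL, NULL, 0, -1,
+ *                                  NULL, &counter, &h, NULL) <= 0)
+ *         goto err;
+ */
+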
+int dsa_paramgen_check_g(DSA *dsa)
+{
+ BN_CTX *ctx;
+ BIGNUM *tmp;
+ BN_MONT_CTX *mont = NULL;
+ int rv = -1;
+ ctx = BN_CTX_new();
+ if (!ctx)
+ return -1;
+ BN_CTX_start(ctx);
+ if (BN_cmp(dsa->g, BN_value_one()) <= 0)
+ return 0;
+ if (BN_cmp(dsa->g, dsa->p) >= 0)
+ return 0;
+ tmp = BN_CTX_get(ctx);
+ if (!tmp)
+ goto err;
+ if ((mont = BN_MONT_CTX_new()) == NULL)
+ goto err;
+ if (!BN_MONT_CTX_set(mont, dsa->p, ctx))
+ goto err;
+ /* Work out g^q mod p */
+ if (!BN_mod_exp_mont(tmp, dsa->g, dsa->q, dsa->p, ctx, mont))
+ goto err;
+ if (!BN_cmp(tmp, BN_value_one()))
+ rv = 1;
+ else
+ rv = 0;
+ err:
+ BN_CTX_end(ctx);
+ if (mont)
+ BN_MONT_CTX_free(mont);
+ BN_CTX_free(ctx);
+ return rv;
+
+}
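+
+/*
+ * Why the g^q test above suffices (editorial note): parameters generated
+ * as in dsa_builtin_paramgen2() have p = q*r + 1 and g = t^r mod p for
+ * some t, so by Fermat's little theorem
+ *
+ *     g^q = t^(r*q) = t^(p-1) = 1 (mod p)
+ *
+ * Conversely, g^q == 1 with 1 < g < p and q prime forces g to have order
+ * exactly q.
+ */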
#endif
diff --git a/crypto/dsa/dsa_locl.h b/crypto/dsa/dsa_locl.h
index f32ee964d0e3..9c23c3ef90e2 100644
--- a/crypto/dsa/dsa_locl.h
+++ b/crypto/dsa/dsa_locl.h
@@ -59,3 +59,11 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
size_t seed_len, unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret,
BN_GENCB *cb);
+
+int dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N,
+ const EVP_MD *evpmd, const unsigned char *seed_in,
+ size_t seed_len, int idx, unsigned char *seed_out,
+ int *counter_ret, unsigned long *h_ret,
+ BN_GENCB *cb);
+
+int dsa_paramgen_check_g(DSA *dsa);
diff --git a/crypto/dsa/dsa_ossl.c b/crypto/dsa/dsa_ossl.c
index 6edb26d97397..f0ec8faa84cc 100644
--- a/crypto/dsa/dsa_ossl.c
+++ b/crypto/dsa/dsa_ossl.c
@@ -398,11 +398,7 @@ static int dsa_do_verify(const unsigned char *dgst, int dgst_len,
ret = (BN_ucmp(&u1, sig->r) == 0);
err:
- /*
- * XXX: surely this is wrong - if ret is 0, it just didn't verify; there
- * is no error in BN. Test should be ret == -1 (Ben)
- */
- if (ret != 1)
+ if (ret < 0)
DSAerr(DSA_F_DSA_DO_VERIFY, ERR_R_BN_LIB);
if (ctx != NULL)
BN_CTX_free(ctx);
diff --git a/crypto/dsa/dsa_pmeth.c b/crypto/dsa/dsa_pmeth.c
index 0d480f6a707e..42b8bb086251 100644
--- a/crypto/dsa/dsa_pmeth.c
+++ b/crypto/dsa/dsa_pmeth.c
@@ -197,6 +197,10 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
dctx->md = p2;
return 1;
+ case EVP_PKEY_CTRL_GET_MD:
+ *(const EVP_MD **)p2 = dctx->md;
+ return 1;
+
case EVP_PKEY_CTRL_DIGESTINIT:
case EVP_PKEY_CTRL_PKCS7_SIGN:
case EVP_PKEY_CTRL_CMS_SIGN:
diff --git a/crypto/ebcdic.c b/crypto/ebcdic.c
index 4b7652c0ecf7..fd6df92b468b 100644
--- a/crypto/ebcdic.c
+++ b/crypto/ebcdic.c
@@ -3,7 +3,7 @@
#ifndef CHARSET_EBCDIC
# include <openssl/e_os2.h>
-# if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
+# if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX) || defined(__clang__)
static void *dummy = &dummy;
# endif
diff --git a/crypto/ec/Makefile b/crypto/ec/Makefile
index 2753b28bec0a..359ef4e40fd4 100644
--- a/crypto/ec/Makefile
+++ b/crypto/ec/Makefile
@@ -11,6 +11,8 @@ MAKEFILE= Makefile
AR= ar r
CFLAGS= $(INCLUDES) $(CFLAG)
+ASFLAGS= $(INCLUDES) $(ASFLAG)
+AFLAGS= $(ASFLAGS)
GENERAL=Makefile
TEST=ectest.c
@@ -27,7 +29,7 @@ LIBOBJ= ec_lib.o ecp_smpl.o ecp_mont.o ecp_nist.o ec_cvt.o ec_mult.o\
ec_err.o ec_curve.o ec_check.o ec_print.o ec_asn1.o ec_key.o\
ec2_smpl.o ec2_mult.o ec_ameth.o ec_pmeth.o eck_prn.o \
ecp_nistp224.o ecp_nistp256.o ecp_nistp521.o ecp_nistputil.o \
- ecp_oct.o ec2_oct.o ec_oct.o
+ ecp_oct.o ec2_oct.o ec_oct.o $(EC_ASM)
SRC= $(LIBSRC)
@@ -46,6 +48,12 @@ lib: $(LIBOBJ)
$(RANLIB) $(LIB) || echo Never mind.
@touch lib
+ecp_nistz256-x86_64.s: asm/ecp_nistz256-x86_64.pl
+ $(PERL) asm/ecp_nistz256-x86_64.pl $(PERLASM_SCHEME) > $@
+
+ecp_nistz256-avx2.s: asm/ecp_nistz256-avx2.pl
+ $(PERL) asm/ecp_nistz256-avx2.pl $(PERLASM_SCHEME) > $@
+
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
@@ -110,14 +118,14 @@ ec2_smpl.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ec2_smpl.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ec2_smpl.o: ../../include/openssl/symhacks.h ec2_smpl.c ec_lcl.h
ec_ameth.o: ../../e_os.h ../../include/openssl/asn1.h
-ec_ameth.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
-ec_ameth.o: ../../include/openssl/buffer.h ../../include/openssl/cms.h
-ec_ameth.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ec_ameth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
-ec_ameth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
-ec_ameth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
-ec_ameth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
-ec_ameth.o: ../../include/openssl/opensslconf.h
+ec_ameth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h
+ec_ameth.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h
+ec_ameth.o: ../../include/openssl/cms.h ../../include/openssl/crypto.h
+ec_ameth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+ec_ameth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+ec_ameth.o: ../../include/openssl/err.h ../../include/openssl/evp.h
+ec_ameth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
+ec_ameth.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
ec_ameth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ec_ameth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
ec_ameth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
@@ -198,18 +206,19 @@ ec_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ec_oct.o: ../../include/openssl/symhacks.h ec_lcl.h ec_oct.c
ec_pmeth.o: ../../e_os.h ../../include/openssl/asn1.h
ec_pmeth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h
-ec_pmeth.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-ec_pmeth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
-ec_pmeth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
-ec_pmeth.o: ../../include/openssl/err.h ../../include/openssl/evp.h
-ec_pmeth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
-ec_pmeth.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
+ec_pmeth.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h
+ec_pmeth.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ec_pmeth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
+ec_pmeth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
+ec_pmeth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
+ec_pmeth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+ec_pmeth.o: ../../include/openssl/opensslconf.h
ec_pmeth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ec_pmeth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
ec_pmeth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
ec_pmeth.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
ec_pmeth.o: ../../include/openssl/x509_vfy.h ../cryptlib.h ../evp/evp_locl.h
-ec_pmeth.o: ec_pmeth.c
+ec_pmeth.o: ec_lcl.h ec_pmeth.c
ec_print.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
ec_print.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h
ec_print.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
diff --git a/crypto/ec/asm/ecp_nistz256-avx2.pl b/crypto/ec/asm/ecp_nistz256-avx2.pl
new file mode 100755
index 000000000000..4c220aa645f1
--- /dev/null
+++ b/crypto/ec/asm/ecp_nistz256-avx2.pl
@@ -0,0 +1,2093 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright 2014 Intel Corporation #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center #
+# (2) University of Haifa #
+# Reference: #
+# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
+# 256 Bit Primes" #
+# #
+##############################################################################
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+ $addx = ($ver>=3.03);
+}
+
+if ($avx>=2) {{
+$digit_size = "\$29";
+$n_digits = "\$9";
+
+$code.=<<___;
+.text
+
+.align 64
+.LAVX2_AND_MASK:
+.LAVX2_POLY:
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
+.quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
+.quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
+
+.LAVX2_POLY_x2:
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
+.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
+.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
+.quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
+.quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
+.quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
+
+.LAVX2_POLY_x8:
+.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
+.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
+.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
+.quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
+.quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
+.quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
+
+.LONE:
+.quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
+.quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+
+# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
+# Montgomery form (*2^256) to our format (*2^261)
+
+.LTO_MONT_AVX2:
+.quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
+
+.LFROM_MONT_AVX2:
+.quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
+.quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+
+.LIntOne:
+.long 1,1,1,1,1,1,1,1
+___
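+
+# Editorial note (a sketch of the arithmetic, not added by this patch):
+# conversion between the two Montgomery domains is one redundant-form
+# multiplication followed by the usual reduction by 2^261 (nine 29-bit
+# digit steps).  Multiplying a value stored as a*2^256 mod p by the RR
+# constant above gives
+#
+#     (a*2^256) * 2^266 / 2^261 = a*2^261 (mod p)
+#
+# .LFROM_MONT_AVX2 plays the symmetric role for the reverse direction.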
+
+{
+# This function receives a pointer to an array of four affine points
+# (X, Y, <1>) and rearranges the data for AVX2 execution, while
+# converting it to redundant 2^29-radix form.
+
+my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
+ $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
+
+$code.=<<___;
+.globl ecp_nistz256_avx2_transpose_convert
+.type ecp_nistz256_avx2_transpose_convert,\@function,2
+.align 64
+ecp_nistz256_avx2_transpose_convert:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -8-16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ # Load the data
+ vmovdqa 32*0(%rsi), $X0
+ lea 112(%rsi), %rax # size optimization
+ vmovdqa 32*1(%rsi), $Y0
+ lea .LAVX2_AND_MASK(%rip), %rdx
+ vmovdqa 32*2(%rsi), $X1
+ vmovdqa 32*3(%rsi), $Y1
+ vmovdqa 32*4-112(%rax), $X2
+ vmovdqa 32*5-112(%rax), $Y2
+ vmovdqa 32*6-112(%rax), $X3
+ vmovdqa 32*7-112(%rax), $Y3
+
+ # Transpose X and Y independently
+ vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
+ vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
+ vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
+ vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
+
+ vpunpcklqdq $Y1, $Y0, $T4
+ vpunpcklqdq $Y3, $Y2, $T5
+ vpunpckhqdq $Y1, $Y0, $T6
+ vpunpckhqdq $Y3, $Y2, $T7
+
+ vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
+ vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
+ vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
+ vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
+
+ vperm2i128 \$0x20, $T5, $T4, $Y0
+ vperm2i128 \$0x20, $T7, $T6, $Y1
+ vperm2i128 \$0x31, $T5, $T4, $Y2
+ vperm2i128 \$0x31, $T7, $T6, $Y3
+ vmovdqa (%rdx), $T7
+
+ vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
+ vpsrlq \$29, $X0, $X0
+ vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
+ vpsrlq \$29, $X0, $X0
+ vpsllq \$6, $X1, $T2
+ vpxor $X0, $T2, $T2
+ vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
+ vpsrlq \$23, $X1, $X1
+ vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
+ vpsrlq \$29, $X1, $X1
+ vpsllq \$12, $X2, $T4
+ vpxor $X1, $T4, $T4
+ vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
+ vpsrlq \$17, $X2, $X2
+ vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
+ vpsrlq \$29, $X2, $X2
+ vpsllq \$18, $X3, $T6
+ vpxor $X2, $T6, $T6
+ vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
+ vpsrlq \$11, $X3, $X3
+ vmovdqa $T0, 32*0(%rdi)
+ lea 112(%rdi), %rax # size optimization
+ vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
+ vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
+
+ vmovdqa $T1, 32*1(%rdi)
+ vmovdqa $T2, 32*2(%rdi)
+ vmovdqa $T3, 32*3(%rdi)
+ vmovdqa $T4, 32*4-112(%rax)
+ vmovdqa $T5, 32*5-112(%rax)
+ vmovdqa $T6, 32*6-112(%rax)
+ vmovdqa $T0, 32*7-112(%rax)
+ vmovdqa $X3, 32*8-112(%rax)
+ lea 448(%rdi), %rax # size optimization
+
+ vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
+ vpsrlq \$29, $Y0, $Y0
+ vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
+ vpsrlq \$29, $Y0, $Y0
+ vpsllq \$6, $Y1, $T2
+ vpxor $Y0, $T2, $T2
+ vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
+ vpsrlq \$23, $Y1, $Y1
+ vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
+ vpsrlq \$29, $Y1, $Y1
+ vpsllq \$12, $Y2, $T4
+ vpxor $Y1, $T4, $T4
+ vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
+ vpsrlq \$17, $Y2, $Y2
+ vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
+ vpsrlq \$29, $Y2, $Y2
+ vpsllq \$18, $Y3, $T6
+ vpxor $Y2, $T6, $T6
+ vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
+ vpsrlq \$11, $Y3, $Y3
+ vmovdqa $T0, 32*9-448(%rax)
+ vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
+ vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
+
+ vmovdqa $T1, 32*10-448(%rax)
+ vmovdqa $T2, 32*11-448(%rax)
+ vmovdqa $T3, 32*12-448(%rax)
+ vmovdqa $T4, 32*13-448(%rax)
+ vmovdqa $T5, 32*14-448(%rax)
+ vmovdqa $T6, 32*15-448(%rax)
+ vmovdqa $T0, 32*16-448(%rax)
+ vmovdqa $Y3, 32*17-448(%rax)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*0(%rsp), %xmm6
+ movaps 16*1(%rsp), %xmm7
+ movaps 16*2(%rsp), %xmm8
+ movaps 16*3(%rsp), %xmm9
+ movaps 16*4(%rsp), %xmm10
+ movaps 16*5(%rsp), %xmm11
+ movaps 16*6(%rsp), %xmm12
+ movaps 16*7(%rsp), %xmm13
+ movaps 16*8(%rsp), %xmm14
+ movaps 16*9(%rsp), %xmm15
+ lea 8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
+___
+}
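+
+# Editorial sketch (not part of the patch): per 256-bit value, the digit
+# split above is equivalent to the following C, with mask = (1<<29)-1
+# (^ and | coincide here because the operands have disjoint bits):
+#
+#     uint64_t in[4], out[9];
+#     out[0] = in[0] & mask;
+#     out[1] = (in[0] >> 29) & mask;
+#     out[2] = ((in[0] >> 58) ^ (in[1] << 6)) & mask;
+#     out[3] = (in[1] >> 23) & mask;
+#     out[4] = ((in[1] >> 52) ^ (in[2] << 12)) & mask;
+#     out[5] = (in[2] >> 17) & mask;
+#     out[6] = ((in[2] >> 46) ^ (in[3] << 18)) & mask;
+#     out[7] = (in[3] >> 11) & mask;
+#     out[8] = in[3] >> 40;
+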
+{
+################################################################################
+# This function receives a pointer to an array of four AVX2-formatted points
+# (X, Y, Z), converts the data back to the normal representation, and
+# rearranges it.
+
+my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
+my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
+
+$code.=<<___;
+
+.globl ecp_nistz256_avx2_convert_transpose_back
+.type ecp_nistz256_avx2_convert_transpose_back,\@function,2
+.align 32
+ecp_nistz256_avx2_convert_transpose_back:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -8-16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ mov \$3, %ecx
+
+.Lconv_loop:
+ vmovdqa 32*0(%rsi), $D0
+ lea 160(%rsi), %rax # size optimization
+ vmovdqa 32*1(%rsi), $D1
+ vmovdqa 32*2(%rsi), $D2
+ vmovdqa 32*3(%rsi), $D3
+ vmovdqa 32*4-160(%rax), $D4
+ vmovdqa 32*5-160(%rax), $D5
+ vmovdqa 32*6-160(%rax), $D6
+ vmovdqa 32*7-160(%rax), $D7
+ vmovdqa 32*8-160(%rax), $D8
+
+ vpsllq \$29, $D1, $D1
+ vpsllq \$58, $D2, $T0
+ vpaddq $D1, $D0, $D0
+ vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
+
+ vpsrlq \$6, $D2, $D2
+ vpsllq \$23, $D3, $D3
+ vpsllq \$52, $D4, $T1
+ vpaddq $D2, $D3, $D3
+ vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
+
+ vpsrlq \$12, $D4, $D4
+ vpsllq \$17, $D5, $D5
+ vpsllq \$46, $D6, $T2
+ vpaddq $D4, $D5, $D5
+ vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
+
+ vpsrlq \$18, $D6, $D6
+ vpsllq \$11, $D7, $D7
+ vpsllq \$40, $D8, $T3
+ vpaddq $D6, $D7, $D7
+ vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
+
+ vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
+ vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
+ vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
+ vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
+
+ vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
+ vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
+ vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
+ vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
+
+ vmovdqa $D0, 32*0(%rdi)
+ vmovdqa $D1, 32*3(%rdi)
+ vmovdqa $D2, 32*6(%rdi)
+ vmovdqa $D3, 32*9(%rdi)
+
+ lea 32*9(%rsi), %rsi
+ lea 32*1(%rdi), %rdi
+
+ dec %ecx
+ jnz .Lconv_loop
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*0(%rsp), %xmm6
+ movaps 16*1(%rsp), %xmm7
+ movaps 16*2(%rsp), %xmm8
+ movaps 16*3(%rsp), %xmm9
+ movaps 16*4(%rsp), %xmm10
+ movaps 16*5(%rsp), %xmm11
+ movaps 16*6(%rsp), %xmm12
+ movaps 16*7(%rsp), %xmm13
+ movaps 16*8(%rsp), %xmm14
+ movaps 16*9(%rsp), %xmm15
+ lea 8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
+___
+}
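+
+# Editorial sketch: the repacking above is the inverse of the 29-bit split,
+# i.e. per 256-bit value (additions rather than XORs, since redundant
+# digits may still carry):
+#
+#     out[0] = d[0] + (d[1] << 29) + (d[2] << 58);
+#     out[1] = (d[2] >> 6)  + (d[3] << 23) + (d[4] << 52);
+#     out[2] = (d[4] >> 12) + (d[5] << 17) + (d[6] << 46);
+#     out[3] = (d[6] >> 18) + (d[7] << 11) + (d[8] << 40);
+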
+{
+my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
+my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
+
+sub NORMALIZE {
+my $ret=<<___;
+ vpsrlq $digit_size, $ACC0, $T0
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpaddq $T0, $ACC1, $ACC1
+
+ vpsrlq $digit_size, $ACC1, $T0
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpaddq $T0, $ACC2, $ACC2
+
+ vpsrlq $digit_size, $ACC2, $T0
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpaddq $T0, $ACC3, $ACC3
+
+ vpsrlq $digit_size, $ACC3, $T0
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpaddq $T0, $ACC4, $ACC4
+
+ vpsrlq $digit_size, $ACC4, $T0
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpaddq $T0, $ACC5, $ACC5
+
+ vpsrlq $digit_size, $ACC5, $T0
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpsrlq $digit_size, $ACC6, $T0
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpaddq $T0, $ACC7, $ACC7
+
+ vpsrlq $digit_size, $ACC7, $T0
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpaddq $T0, $ACC8, $ACC8
+ #vpand $AND_MASK, $ACC8, $ACC8
+___
+ $ret;
+}
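+
+# Editorial note (sketch): NORMALIZE is one carry-propagation pass over the
+# nine redundant digits -- per lane, for i = 0..7:
+#
+#     carry     = acc[i] >> 29;
+#     acc[i]   &= (1 << 29) - 1;
+#     acc[i+1] += carry;
+#
+# The top digit acc[8] is deliberately left unmasked (see the commented-out
+# vpand), which is safe as long as subsequent arithmetic tolerates a
+# slightly wide top digit.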
+
+sub STORE {
+my $ret=<<___;
+ vmovdqa $ACC0, 32*0(%rdi)
+ lea 160(%rdi), %rax # size optimization
+ vmovdqa $ACC1, 32*1(%rdi)
+ vmovdqa $ACC2, 32*2(%rdi)
+ vmovdqa $ACC3, 32*3(%rdi)
+ vmovdqa $ACC4, 32*4-160(%rax)
+ vmovdqa $ACC5, 32*5-160(%rax)
+ vmovdqa $ACC6, 32*6-160(%rax)
+ vmovdqa $ACC7, 32*7-160(%rax)
+ vmovdqa $ACC8, 32*8-160(%rax)
+___
+ $ret;
+}
+
+$code.=<<___;
+.type avx2_normalize,\@abi-omnipotent
+.align 32
+avx2_normalize:
+ vpsrlq $digit_size, $ACC0, $T0
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpaddq $T0, $ACC1, $ACC1
+
+ vpsrlq $digit_size, $ACC1, $T0
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpaddq $T0, $ACC2, $ACC2
+
+ vpsrlq $digit_size, $ACC2, $T0
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpaddq $T0, $ACC3, $ACC3
+
+ vpsrlq $digit_size, $ACC3, $T0
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpaddq $T0, $ACC4, $ACC4
+
+ vpsrlq $digit_size, $ACC4, $T0
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpaddq $T0, $ACC5, $ACC5
+
+ vpsrlq $digit_size, $ACC5, $T0
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpsrlq $digit_size, $ACC6, $T0
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpaddq $T0, $ACC7, $ACC7
+
+ vpsrlq $digit_size, $ACC7, $T0
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpaddq $T0, $ACC8, $ACC8
+ #vpand $AND_MASK, $ACC8, $ACC8
+
+ ret
+.size avx2_normalize,.-avx2_normalize
+
+.type avx2_normalize_n_store,\@abi-omnipotent
+.align 32
+avx2_normalize_n_store:
+ vpsrlq $digit_size, $ACC0, $T0
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpaddq $T0, $ACC1, $ACC1
+
+ vpsrlq $digit_size, $ACC1, $T0
+ vpand $AND_MASK, $ACC1, $ACC1
+ vmovdqa $ACC0, 32*0(%rdi)
+ lea 160(%rdi), %rax # size optimization
+ vpaddq $T0, $ACC2, $ACC2
+
+ vpsrlq $digit_size, $ACC2, $T0
+ vpand $AND_MASK, $ACC2, $ACC2
+ vmovdqa $ACC1, 32*1(%rdi)
+ vpaddq $T0, $ACC3, $ACC3
+
+ vpsrlq $digit_size, $ACC3, $T0
+ vpand $AND_MASK, $ACC3, $ACC3
+ vmovdqa $ACC2, 32*2(%rdi)
+ vpaddq $T0, $ACC4, $ACC4
+
+ vpsrlq $digit_size, $ACC4, $T0
+ vpand $AND_MASK, $ACC4, $ACC4
+ vmovdqa $ACC3, 32*3(%rdi)
+ vpaddq $T0, $ACC5, $ACC5
+
+ vpsrlq $digit_size, $ACC5, $T0
+ vpand $AND_MASK, $ACC5, $ACC5
+ vmovdqa $ACC4, 32*4-160(%rax)
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpsrlq $digit_size, $ACC6, $T0
+ vpand $AND_MASK, $ACC6, $ACC6
+ vmovdqa $ACC5, 32*5-160(%rax)
+ vpaddq $T0, $ACC7, $ACC7
+
+ vpsrlq $digit_size, $ACC7, $T0
+ vpand $AND_MASK, $ACC7, $ACC7
+ vmovdqa $ACC6, 32*6-160(%rax)
+ vpaddq $T0, $ACC8, $ACC8
+ #vpand $AND_MASK, $ACC8, $ACC8
+ vmovdqa $ACC7, 32*7-160(%rax)
+ vmovdqa $ACC8, 32*8-160(%rax)
+
+ ret
+.size avx2_normalize_n_store,.-avx2_normalize_n_store
+
+################################################################################
+# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.type avx2_mul_x4,\@abi-omnipotent
+.align 32
+avx2_mul_x4:
+ lea .LAVX2_POLY(%rip), %rax
+
+ vpxor $ACC0, $ACC0, $ACC0
+ vpxor $ACC1, $ACC1, $ACC1
+ vpxor $ACC2, $ACC2, $ACC2
+ vpxor $ACC3, $ACC3, $ACC3
+ vpxor $ACC4, $ACC4, $ACC4
+ vpxor $ACC5, $ACC5, $ACC5
+ vpxor $ACC6, $ACC6, $ACC6
+ vpxor $ACC7, $ACC7, $ACC7
+
+ vmovdqa 32*7(%rax), %ymm14
+ vmovdqa 32*8(%rax), %ymm15
+
+ mov $n_digits, $itr
+ lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
+ jmp .Lavx2_mul_x4_loop
+
+.align 32
+.Lavx2_mul_x4_loop:
+ vmovdqa 32*0($b_ptr), $B
+ lea 32*1($b_ptr), $b_ptr
+
+ vpmuludq 32*0+512($a_ptr), $B, $T0
+ vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
+ vpaddq $T0, $ACC0, $ACC0
+ vpmuludq 32*2+512($a_ptr), $B, $T0
+ vpaddq $OVERFLOW, $ACC1, $ACC1
+ vpand $AND_MASK, $ACC0, $Y
+ vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC2
+ vpmuludq 32*4+512($a_ptr), $B, $T0
+ vpaddq $OVERFLOW, $ACC3, $ACC3
+ vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
+ vpaddq $T0, $ACC4, $ACC4
+ vpmuludq 32*6+512($a_ptr), $B, $T0
+ vpaddq $OVERFLOW, $ACC5, $ACC5
+ vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ # Skip some multiplications, optimizing for the constant poly
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*8+512($a_ptr), $B, $ACC8
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ .byte 0x67
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $OVERFLOW
+ .byte 0x67
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $T0
+ vpaddq $OVERFLOW, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $OVERFLOW
+ vpaddq $T0, $ACC7, $ACC6
+ vpaddq $OVERFLOW, $ACC8, $ACC7
+
+ dec $itr
+ jnz .Lavx2_mul_x4_loop
+
+ vpxor $ACC8, $ACC8, $ACC8
+
+ ret
+.size avx2_mul_x4,.-avx2_mul_x4
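+
+# Editorial note (sketch): each iteration of the loop above interleaves one
+# schoolbook multiply-accumulate by the next digit of B with one word-level
+# Montgomery reduction step.  Since p = -1 (mod 2^29), -p^(-1) = 1
+# (mod 2^29), so the reduction multiplier is simply y = acc[0] & mask;
+# adding y*p and shifting down one digit, nine times, divides the product
+# by 2^261.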
+
+# Function optimized for the constant 1
+################################################################################
+# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
+.type avx2_mul_by1_x4,\@abi-omnipotent
+.align 32
+avx2_mul_by1_x4:
+ lea .LAVX2_POLY(%rip), %rax
+
+ vpxor $ACC0, $ACC0, $ACC0
+ vpxor $ACC1, $ACC1, $ACC1
+ vpxor $ACC2, $ACC2, $ACC2
+ vpxor $ACC3, $ACC3, $ACC3
+ vpxor $ACC4, $ACC4, $ACC4
+ vpxor $ACC5, $ACC5, $ACC5
+ vpxor $ACC6, $ACC6, $ACC6
+ vpxor $ACC7, $ACC7, $ACC7
+ vpxor $ACC8, $ACC8, $ACC8
+
+ vmovdqa 32*3+.LONE(%rip), %ymm14
+ vmovdqa 32*7+.LONE(%rip), %ymm15
+
+ mov $n_digits, $itr
+ jmp .Lavx2_mul_by1_x4_loop
+
+.align 32
+.Lavx2_mul_by1_x4_loop:
+ vmovdqa 32*0($a_ptr), $B
+ .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
+
+ vpsllq \$5, $B, $OVERFLOW
+ vpmuludq %ymm14, $B, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC3
+ .byte 0x67
+ vpmuludq $AND_MASK, $B, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $T0, $ACC4, $ACC4
+ vpaddq $T0, $ACC5, $ACC5
+ vpaddq $T0, $ACC6, $ACC6
+ vpsllq \$23, $B, $T0
+
+ .byte 0x67,0x67
+ vpmuludq %ymm15, $B, $OVERFLOW
+ vpsubq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ .byte 0x67,0x67
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $OVERFLOW
+ vmovdqa $ACC5, $ACC4
+ vpmuludq 32*7(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC6, $ACC5
+ vpaddq $T0, $ACC7, $ACC6
+ vpmuludq 32*8(%rax), $Y, $ACC7
+
+ dec $itr
+ jnz .Lavx2_mul_by1_x4_loop
+
+ ret
+.size avx2_mul_by1_x4,.-avx2_mul_by1_x4
+
+################################################################################
+# void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.type avx2_sqr_x4,\@abi-omnipotent
+.align 32
+avx2_sqr_x4:
+ lea .LAVX2_POLY(%rip), %rax
+
+ vmovdqa 32*7(%rax), %ymm14
+ vmovdqa 32*8(%rax), %ymm15
+
+ vmovdqa 32*0($a_ptr), $B
+ vmovdqa 32*1($a_ptr), $ACC1
+ vmovdqa 32*2($a_ptr), $ACC2
+ vmovdqa 32*3($a_ptr), $ACC3
+ vmovdqa 32*4($a_ptr), $ACC4
+ vmovdqa 32*5($a_ptr), $ACC5
+ vmovdqa 32*6($a_ptr), $ACC6
+ vmovdqa 32*7($a_ptr), $ACC7
+ vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
+ vmovdqa 32*8($a_ptr), $ACC8
+ vpaddq $ACC2, $ACC2, $ACC2
+ vmovdqa $ACC1, 32*0(%rcx)
+ vpaddq $ACC3, $ACC3, $ACC3
+ vmovdqa $ACC2, 32*1(%rcx)
+ vpaddq $ACC4, $ACC4, $ACC4
+ vmovdqa $ACC3, 32*2(%rcx)
+ vpaddq $ACC5, $ACC5, $ACC5
+ vmovdqa $ACC4, 32*3(%rcx)
+ vpaddq $ACC6, $ACC6, $ACC6
+ vmovdqa $ACC5, 32*4(%rcx)
+ vpaddq $ACC7, $ACC7, $ACC7
+ vmovdqa $ACC6, 32*5(%rcx)
+ vpaddq $ACC8, $ACC8, $ACC8
+ vmovdqa $ACC7, 32*6(%rcx)
+ vmovdqa $ACC8, 32*7(%rcx)
+
+ #itr 1
+ vpmuludq $B, $B, $ACC0
+ vpmuludq $B, $ACC1, $ACC1
+ vpand $AND_MASK, $ACC0, $Y
+ vpmuludq $B, $ACC2, $ACC2
+ vpmuludq $B, $ACC3, $ACC3
+ vpmuludq $B, $ACC4, $ACC4
+ vpmuludq $B, $ACC5, $ACC5
+ vpmuludq $B, $ACC6, $ACC6
+ vpmuludq $AND_MASK, $Y, $T0
+ vpmuludq $B, $ACC7, $ACC7
+ vpmuludq $B, $ACC8, $ACC8
+ vmovdqa 32*1($a_ptr), $B
+
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 2
+ vpmuludq $B, $B, $OVERFLOW
+ vpand $AND_MASK, $ACC0, $Y
+ vpmuludq 32*1(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC1, $ACC1
+ vpmuludq 32*2(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC2
+ vpmuludq 32*3(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC3, $ACC3
+ vpmuludq 32*4(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC4, $ACC4
+ vpmuludq 32*5(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC5, $ACC5
+ vpmuludq 32*6(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*2($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 3
+ vpmuludq $B, $B, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpmuludq 32*2(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC2
+ vpmuludq 32*3(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC3, $ACC3
+ vpmuludq 32*4(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC4, $ACC4
+ vpmuludq 32*5(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC5, $ACC5
+ vpmuludq 32*6(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*3($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 4
+ vpmuludq $B, $B, $OVERFLOW
+ vpmuludq 32*3(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC3, $ACC3
+ vpmuludq 32*4(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC4, $ACC4
+ vpmuludq 32*5(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC5, $ACC5
+ vpmuludq 32*6(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*4($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 5
+ vpmuludq $B, $B, $T0
+ vpmuludq 32*4(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC4, $ACC4
+ vpmuludq 32*5(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC5, $ACC5
+ vpmuludq 32*6(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*5($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 6
+ vpmuludq $B, $B, $OVERFLOW
+ vpmuludq 32*5(%rcx), $B, $T0
+ vpaddq $OVERFLOW, $ACC5, $ACC5
+ vpmuludq 32*6(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*6($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 7
+ vpmuludq $B, $B, $T0
+ vpmuludq 32*6(%rcx), $B, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC6
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*7($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 8
+ vpmuludq $B, $B, $OVERFLOW
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC7
+ vpmuludq 32*7(%rcx), $B, $ACC8
+ vmovdqa 32*8($a_ptr), $B
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpand $AND_MASK, $ACC0, $Y
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ #itr 9
+ vpmuludq $B, $B, $ACC8
+
+ vpmuludq $AND_MASK, $Y, $T0
+ vpaddq $T0, $ACC0, $OVERFLOW
+ vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
+ vpaddq $T0, $ACC1, $ACC0
+ vpaddq $T0, $ACC2, $ACC1
+ vpmuludq 32*3(%rax), $Y, $T0
+ vpaddq $OVERFLOW, $ACC0, $ACC0
+ vpaddq $T0, $ACC3, $ACC2
+ vmovdqa $ACC4, $ACC3
+ vpsllq \$18, $Y, $T0
+ vmovdqa $ACC5, $ACC4
+ vpmuludq %ymm14, $Y, $OVERFLOW
+ vpaddq $T0, $ACC6, $ACC5
+ vpmuludq %ymm15, $Y, $T0
+ vpaddq $OVERFLOW, $ACC7, $ACC6
+ vpaddq $T0, $ACC8, $ACC7
+
+ vpxor $ACC8, $ACC8, $ACC8
+
+ ret
+.size avx2_sqr_x4,.-avx2_sqr_x4
+
+################################################################################
+# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.type avx2_sub_x4,\@abi-omnipotent
+.align 32
+avx2_sub_x4:
+ vmovdqa 32*0($a_ptr), $ACC0
+ lea 160($a_ptr), $a_ptr
+ lea .LAVX2_POLY_x8+128(%rip), %rax
+ lea 128($b_ptr), $b_ptr
+ vmovdqa 32*1-160($a_ptr), $ACC1
+ vmovdqa 32*2-160($a_ptr), $ACC2
+ vmovdqa 32*3-160($a_ptr), $ACC3
+ vmovdqa 32*4-160($a_ptr), $ACC4
+ vmovdqa 32*5-160($a_ptr), $ACC5
+ vmovdqa 32*6-160($a_ptr), $ACC6
+ vmovdqa 32*7-160($a_ptr), $ACC7
+ vmovdqa 32*8-160($a_ptr), $ACC8
+
+ vpaddq 32*0-128(%rax), $ACC0, $ACC0
+ vpaddq 32*1-128(%rax), $ACC1, $ACC1
+ vpaddq 32*2-128(%rax), $ACC2, $ACC2
+ vpaddq 32*3-128(%rax), $ACC3, $ACC3
+ vpaddq 32*4-128(%rax), $ACC4, $ACC4
+ vpaddq 32*5-128(%rax), $ACC5, $ACC5
+ vpaddq 32*6-128(%rax), $ACC6, $ACC6
+ vpaddq 32*7-128(%rax), $ACC7, $ACC7
+ vpaddq 32*8-128(%rax), $ACC8, $ACC8
+
+ vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
+ vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
+ vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
+ vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
+ vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
+ vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
+ vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
+ vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
+ vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
+
+ ret
+.size avx2_sub_x4,.-avx2_sub_x4
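+
+# Editorial note: rather than propagating borrows, avx2_sub_x4 adds a
+# redundant representation of 8*p (.LAVX2_POLY_x8) before subtracting, so
+# every result digit stays non-negative; the extra multiple of p is
+# harmless modulo p and is absorbed by later reduction.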
+
+.type avx2_select_n_store,\@abi-omnipotent
+.align 32
+avx2_select_n_store:
+ vmovdqa `8+32*9*8`(%rsp), $Y
+ vpor `8+32*9*8+32`(%rsp), $Y, $Y
+
+ vpandn $ACC0, $Y, $ACC0
+ vpandn $ACC1, $Y, $ACC1
+ vpandn $ACC2, $Y, $ACC2
+ vpandn $ACC3, $Y, $ACC3
+ vpandn $ACC4, $Y, $ACC4
+ vpandn $ACC5, $Y, $ACC5
+ vpandn $ACC6, $Y, $ACC6
+ vmovdqa `8+32*9*8+32`(%rsp), $B
+ vpandn $ACC7, $Y, $ACC7
+ vpandn `8+32*9*8`(%rsp), $B, $B
+ vpandn $ACC8, $Y, $ACC8
+
+ vpand 32*0(%rsi), $B, $T0
+ lea 160(%rsi), %rax
+ vpand 32*1(%rsi), $B, $Y
+ vpxor $T0, $ACC0, $ACC0
+ vpand 32*2(%rsi), $B, $T0
+ vpxor $Y, $ACC1, $ACC1
+ vpand 32*3(%rsi), $B, $Y
+ vpxor $T0, $ACC2, $ACC2
+ vpand 32*4-160(%rax), $B, $T0
+ vpxor $Y, $ACC3, $ACC3
+ vpand 32*5-160(%rax), $B, $Y
+ vpxor $T0, $ACC4, $ACC4
+ vpand 32*6-160(%rax), $B, $T0
+ vpxor $Y, $ACC5, $ACC5
+ vpand 32*7-160(%rax), $B, $Y
+ vpxor $T0, $ACC6, $ACC6
+ vpand 32*8-160(%rax), $B, $T0
+ vmovdqa `8+32*9*8+32`(%rsp), $B
+ vpxor $Y, $ACC7, $ACC7
+
+ vpand 32*0(%rdx), $B, $Y
+ lea 160(%rdx), %rax
+ vpxor $T0, $ACC8, $ACC8
+ vpand 32*1(%rdx), $B, $T0
+ vpxor $Y, $ACC0, $ACC0
+ vpand 32*2(%rdx), $B, $Y
+ vpxor $T0, $ACC1, $ACC1
+ vpand 32*3(%rdx), $B, $T0
+ vpxor $Y, $ACC2, $ACC2
+ vpand 32*4-160(%rax), $B, $Y
+ vpxor $T0, $ACC3, $ACC3
+ vpand 32*5-160(%rax), $B, $T0
+ vpxor $Y, $ACC4, $ACC4
+ vpand 32*6-160(%rax), $B, $Y
+ vpxor $T0, $ACC5, $ACC5
+ vpand 32*7-160(%rax), $B, $T0
+ vpxor $Y, $ACC6, $ACC6
+ vpand 32*8-160(%rax), $B, $Y
+ vpxor $T0, $ACC7, $ACC7
+ vpxor $Y, $ACC8, $ACC8
+ `&STORE`
+
+ ret
+.size avx2_select_n_store,.-avx2_select_n_store
+___
+$code.=<<___ if (0); # inlined
+################################################################################
+# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
+.type avx2_mul_by2_x4,\@abi-omnipotent
+.align 32
+avx2_mul_by2_x4:
+ vmovdqa 32*0($a_ptr), $ACC0
+ lea 160($a_ptr), %rax
+ vmovdqa 32*1($a_ptr), $ACC1
+ vmovdqa 32*2($a_ptr), $ACC2
+ vmovdqa 32*3($a_ptr), $ACC3
+ vmovdqa 32*4-160(%rax), $ACC4
+ vmovdqa 32*5-160(%rax), $ACC5
+ vmovdqa 32*6-160(%rax), $ACC6
+ vmovdqa 32*7-160(%rax), $ACC7
+ vmovdqa 32*8-160(%rax), $ACC8
+
+ vpaddq $ACC0, $ACC0, $ACC0
+ vpaddq $ACC1, $ACC1, $ACC1
+ vpaddq $ACC2, $ACC2, $ACC2
+ vpaddq $ACC3, $ACC3, $ACC3
+ vpaddq $ACC4, $ACC4, $ACC4
+ vpaddq $ACC5, $ACC5, $ACC5
+ vpaddq $ACC6, $ACC6, $ACC6
+ vpaddq $ACC7, $ACC7, $ACC7
+ vpaddq $ACC8, $ACC8, $ACC8
+
+ ret
+.size avx2_mul_by2_x4,.-avx2_mul_by2_x4
+___
+my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
+my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.globl ecp_nistz256_avx2_point_add_affine_x4
+.type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
+.align 32
+ecp_nistz256_avx2_point_add_affine_x4:
+ mov %rsp, %rax
+ push %rbp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ lea -8(%rax), %rbp
+
+# Result + 32*0 = Result.X
+# Result + 32*9 = Result.Y
+# Result + 32*18 = Result.Z
+
+# A + 32*0 = A.X
+# A + 32*9 = A.Y
+# A + 32*18 = A.Z
+
+# B + 32*0 = B.X
+# B + 32*9 = B.Y
+
+ sub \$`32*9*8+32*2+32*8`, %rsp
+ and \$-64, %rsp
+
+ mov $r_ptr_in, $r_ptr
+ mov $a_ptr_in, $a_ptr
+ mov $b_ptr_in, $b_ptr
+
+ vmovdqa 32*0($a_ptr_in), %ymm0
+ vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+ vpxor %ymm1, %ymm1, %ymm1
+ lea 256($a_ptr_in), %rax # size optimization
+ vpor 32*1($a_ptr_in), %ymm0, %ymm0
+ vpor 32*2($a_ptr_in), %ymm0, %ymm0
+ vpor 32*3($a_ptr_in), %ymm0, %ymm0
+ vpor 32*4-256(%rax), %ymm0, %ymm0
+ lea 256(%rax), %rcx # size optimization
+ vpor 32*5-256(%rax), %ymm0, %ymm0
+ vpor 32*6-256(%rax), %ymm0, %ymm0
+ vpor 32*7-256(%rax), %ymm0, %ymm0
+ vpor 32*8-256(%rax), %ymm0, %ymm0
+ vpor 32*9-256(%rax), %ymm0, %ymm0
+ vpor 32*10-256(%rax), %ymm0, %ymm0
+ vpor 32*11-256(%rax), %ymm0, %ymm0
+ vpor 32*12-512(%rcx), %ymm0, %ymm0
+ vpor 32*13-512(%rcx), %ymm0, %ymm0
+ vpor 32*14-512(%rcx), %ymm0, %ymm0
+ vpor 32*15-512(%rcx), %ymm0, %ymm0
+ vpor 32*16-512(%rcx), %ymm0, %ymm0
+ vpor 32*17-512(%rcx), %ymm0, %ymm0
+ vpcmpeqq %ymm1, %ymm0, %ymm0
+ vmovdqa %ymm0, `32*9*8`(%rsp)
+
+ vpxor %ymm1, %ymm1, %ymm1
+ vmovdqa 32*0($b_ptr), %ymm0
+ lea 256($b_ptr), %rax # size optimization
+ vpor 32*1($b_ptr), %ymm0, %ymm0
+ vpor 32*2($b_ptr), %ymm0, %ymm0
+ vpor 32*3($b_ptr), %ymm0, %ymm0
+ vpor 32*4-256(%rax), %ymm0, %ymm0
+ lea 256(%rax), %rcx # size optimization
+ vpor 32*5-256(%rax), %ymm0, %ymm0
+ vpor 32*6-256(%rax), %ymm0, %ymm0
+ vpor 32*7-256(%rax), %ymm0, %ymm0
+ vpor 32*8-256(%rax), %ymm0, %ymm0
+ vpor 32*9-256(%rax), %ymm0, %ymm0
+ vpor 32*10-256(%rax), %ymm0, %ymm0
+ vpor 32*11-256(%rax), %ymm0, %ymm0
+ vpor 32*12-512(%rcx), %ymm0, %ymm0
+ vpor 32*13-512(%rcx), %ymm0, %ymm0
+ vpor 32*14-512(%rcx), %ymm0, %ymm0
+ vpor 32*15-512(%rcx), %ymm0, %ymm0
+ vpor 32*16-512(%rcx), %ymm0, %ymm0
+ vpor 32*17-512(%rcx), %ymm0, %ymm0
+ vpcmpeqq %ymm1, %ymm0, %ymm0
+ vmovdqa %ymm0, `32*9*8+32`(%rsp)
+
+ # Z1^2 = Z1*Z1
+ lea `32*9*2`($a_ptr), %rsi
+ lea `32*9*2`(%rsp), %rdi
+ lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
+ call avx2_sqr_x4
+ call avx2_normalize_n_store
+
+ # U2 = X2*Z1^2
+ lea `32*9*0`($b_ptr), %rsi
+ lea `32*9*2`(%rsp), %rdx
+ lea `32*9*0`(%rsp), %rdi
+ call avx2_mul_x4
+ #call avx2_normalize
+ `&STORE`
+
+ # S2 = Z1*Z1^2 = Z1^3
+ lea `32*9*2`($a_ptr), %rsi
+ lea `32*9*2`(%rsp), %rdx
+ lea `32*9*1`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ # S2 = S2*Y2 = Y2*Z1^3
+ lea `32*9*1`($b_ptr), %rsi
+ lea `32*9*1`(%rsp), %rdx
+ lea `32*9*1`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ # H = U2 - U1 = U2 - X1
+ lea `32*9*0`(%rsp), %rsi
+ lea `32*9*0`($a_ptr), %rdx
+ lea `32*9*3`(%rsp), %rdi
+ call avx2_sub_x4
+ call avx2_normalize_n_store
+
+ # R = S2 - S1 = S2 - Y1
+ lea `32*9*1`(%rsp), %rsi
+ lea `32*9*1`($a_ptr), %rdx
+ lea `32*9*4`(%rsp), %rdi
+ call avx2_sub_x4
+ call avx2_normalize_n_store
+
+	# Z3 = H*Z1*Z2 = H*Z1 (Z2 = 1, since B is affine)
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*2`($a_ptr), %rdx
+ lea `32*9*2`($r_ptr), %rdi
+ call avx2_mul_x4
+ call avx2_normalize
+
+	lea	.LONE(%rip), %rsi		# Z3 = 1 if A is infinity (B is affine)
+	lea	`32*9*2`($a_ptr), %rdx		# Z3 = Z1 if B is infinity
+ call avx2_select_n_store
+
+	# Rsqr = R^2
+ lea `32*9*4`(%rsp), %rsi
+ lea `32*9*6`(%rsp), %rdi
+ lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
+ call avx2_sqr_x4
+ call avx2_normalize_n_store
+
+	# Hsqr = H^2
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*5`(%rsp), %rdi
+ call avx2_sqr_x4
+ call avx2_normalize_n_store
+
+ # H^3 = H^2*H
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*5`(%rsp), %rdx
+ lea `32*9*7`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ # U2 = U1*H^2
+ lea `32*9*0`($a_ptr), %rsi
+ lea `32*9*5`(%rsp), %rdx
+ lea `32*9*0`(%rsp), %rdi
+ call avx2_mul_x4
+ #call avx2_normalize
+ `&STORE`
+
+	# Hsqr slot = U2*2 (for X3 = R^2 - H^3 - 2*U2)
+ #lea 32*9*0(%rsp), %rsi
+ #lea 32*9*5(%rsp), %rdi
+ #call avx2_mul_by2_x4
+
+ vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
+ lea `32*9*5`(%rsp), %rdi
+ vpaddq $ACC1, $ACC1, $ACC1
+ vpaddq $ACC2, $ACC2, $ACC2
+ vpaddq $ACC3, $ACC3, $ACC3
+ vpaddq $ACC4, $ACC4, $ACC4
+ vpaddq $ACC5, $ACC5, $ACC5
+ vpaddq $ACC6, $ACC6, $ACC6
+ vpaddq $ACC7, $ACC7, $ACC7
+ vpaddq $ACC8, $ACC8, $ACC8
+ call avx2_normalize_n_store
+
+ # X3 = R^2 - H^3
+ #lea 32*9*6(%rsp), %rsi
+ #lea 32*9*7(%rsp), %rdx
+ #lea 32*9*5(%rsp), %rcx
+ #lea 32*9*0($r_ptr), %rdi
+ #call avx2_sub_x4
+ #NORMALIZE
+ #STORE
+
+ # X3 = X3 - U2*2
+ #lea 32*9*0($r_ptr), %rsi
+ #lea 32*9*0($r_ptr), %rdi
+ #call avx2_sub_x4
+ #NORMALIZE
+ #STORE
+
+ lea `32*9*6+128`(%rsp), %rsi
+ lea .LAVX2_POLY_x2+128(%rip), %rax
+ lea `32*9*7+128`(%rsp), %rdx
+ lea `32*9*5+128`(%rsp), %rcx
+ lea `32*9*0`($r_ptr), %rdi
+
+ vmovdqa 32*0-128(%rsi), $ACC0
+ vmovdqa 32*1-128(%rsi), $ACC1
+ vmovdqa 32*2-128(%rsi), $ACC2
+ vmovdqa 32*3-128(%rsi), $ACC3
+ vmovdqa 32*4-128(%rsi), $ACC4
+ vmovdqa 32*5-128(%rsi), $ACC5
+ vmovdqa 32*6-128(%rsi), $ACC6
+ vmovdqa 32*7-128(%rsi), $ACC7
+ vmovdqa 32*8-128(%rsi), $ACC8
+
+ vpaddq 32*0-128(%rax), $ACC0, $ACC0
+ vpaddq 32*1-128(%rax), $ACC1, $ACC1
+ vpaddq 32*2-128(%rax), $ACC2, $ACC2
+ vpaddq 32*3-128(%rax), $ACC3, $ACC3
+ vpaddq 32*4-128(%rax), $ACC4, $ACC4
+ vpaddq 32*5-128(%rax), $ACC5, $ACC5
+ vpaddq 32*6-128(%rax), $ACC6, $ACC6
+ vpaddq 32*7-128(%rax), $ACC7, $ACC7
+ vpaddq 32*8-128(%rax), $ACC8, $ACC8
+
+ vpsubq 32*0-128(%rdx), $ACC0, $ACC0
+ vpsubq 32*1-128(%rdx), $ACC1, $ACC1
+ vpsubq 32*2-128(%rdx), $ACC2, $ACC2
+ vpsubq 32*3-128(%rdx), $ACC3, $ACC3
+ vpsubq 32*4-128(%rdx), $ACC4, $ACC4
+ vpsubq 32*5-128(%rdx), $ACC5, $ACC5
+ vpsubq 32*6-128(%rdx), $ACC6, $ACC6
+ vpsubq 32*7-128(%rdx), $ACC7, $ACC7
+ vpsubq 32*8-128(%rdx), $ACC8, $ACC8
+
+ vpsubq 32*0-128(%rcx), $ACC0, $ACC0
+ vpsubq 32*1-128(%rcx), $ACC1, $ACC1
+ vpsubq 32*2-128(%rcx), $ACC2, $ACC2
+ vpsubq 32*3-128(%rcx), $ACC3, $ACC3
+ vpsubq 32*4-128(%rcx), $ACC4, $ACC4
+ vpsubq 32*5-128(%rcx), $ACC5, $ACC5
+ vpsubq 32*6-128(%rcx), $ACC6, $ACC6
+ vpsubq 32*7-128(%rcx), $ACC7, $ACC7
+ vpsubq 32*8-128(%rcx), $ACC8, $ACC8
+ call avx2_normalize
+
+	lea	32*0($b_ptr), %rsi		# X3 = X2 if A is infinity
+	lea	32*0($a_ptr), %rdx		# X3 = X1 if B is infinity
+ call avx2_select_n_store
+
+ # H = U2 - X3
+ lea `32*9*0`(%rsp), %rsi
+ lea `32*9*0`($r_ptr), %rdx
+ lea `32*9*3`(%rsp), %rdi
+ call avx2_sub_x4
+ call avx2_normalize_n_store
+
+ #
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*4`(%rsp), %rdx
+ lea `32*9*3`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ #
+ lea `32*9*7`(%rsp), %rsi
+ lea `32*9*1`($a_ptr), %rdx
+ lea `32*9*1`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ #
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*1`(%rsp), %rdx
+ lea `32*9*1`($r_ptr), %rdi
+ call avx2_sub_x4
+ call avx2_normalize
+
+	lea	32*9($b_ptr), %rsi		# Y3 = Y2 if A is infinity
+	lea	32*9($a_ptr), %rdx		# Y3 = Y1 if B is infinity
+ call avx2_select_n_store
+
+ #lea 32*9*0($r_ptr), %rsi
+ #lea 32*9*0($r_ptr), %rdi
+ #call avx2_mul_by1_x4
+ #NORMALIZE
+ #STORE
+
+ lea `32*9*1`($r_ptr), %rsi
+ lea `32*9*1`($r_ptr), %rdi
+ call avx2_mul_by1_x4
+ call avx2_normalize_n_store
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-16*10(%rbp), %xmm6	# restore the non-volatile xmm registers
+	movaps	-16*9(%rbp), %xmm7
+	movaps	-16*8(%rbp), %xmm8
+	movaps	-16*7(%rbp), %xmm9
+	movaps	-16*6(%rbp), %xmm10
+	movaps	-16*5(%rbp), %xmm11
+	movaps	-16*4(%rbp), %xmm12
+	movaps	-16*3(%rbp), %xmm13
+	movaps	-16*2(%rbp), %xmm14
+	movaps	-16*1(%rbp), %xmm15
+___
+$code.=<<___;
+ mov %rbp, %rsp
+ pop %rbp
+ ret
+.size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
+
+################################################################################
+# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.globl ecp_nistz256_avx2_point_add_affines_x4
+.type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
+.align 32
+ecp_nistz256_avx2_point_add_affines_x4:
+ mov %rsp, %rax
+ push %rbp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ lea -8(%rax), %rbp
+
+# Result + 32*0 = Result.X
+# Result + 32*9 = Result.Y
+# Result + 32*18 = Result.Z
+
+# A + 32*0 = A.X
+# A + 32*9 = A.Y
+
+# B + 32*0 = B.X
+# B + 32*9 = B.Y
+
+ sub \$`32*9*8+32*2+32*8`, %rsp
+ and \$-64, %rsp
+
+ mov $r_ptr_in, $r_ptr
+ mov $a_ptr_in, $a_ptr
+ mov $b_ptr_in, $b_ptr
+
+ vmovdqa 32*0($a_ptr_in), %ymm0
+ vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+ vpxor %ymm1, %ymm1, %ymm1
+ lea 256($a_ptr_in), %rax # size optimization
+ vpor 32*1($a_ptr_in), %ymm0, %ymm0
+ vpor 32*2($a_ptr_in), %ymm0, %ymm0
+ vpor 32*3($a_ptr_in), %ymm0, %ymm0
+ vpor 32*4-256(%rax), %ymm0, %ymm0
+ lea 256(%rax), %rcx # size optimization
+ vpor 32*5-256(%rax), %ymm0, %ymm0
+ vpor 32*6-256(%rax), %ymm0, %ymm0
+ vpor 32*7-256(%rax), %ymm0, %ymm0
+ vpor 32*8-256(%rax), %ymm0, %ymm0
+ vpor 32*9-256(%rax), %ymm0, %ymm0
+ vpor 32*10-256(%rax), %ymm0, %ymm0
+ vpor 32*11-256(%rax), %ymm0, %ymm0
+ vpor 32*12-512(%rcx), %ymm0, %ymm0
+ vpor 32*13-512(%rcx), %ymm0, %ymm0
+ vpor 32*14-512(%rcx), %ymm0, %ymm0
+ vpor 32*15-512(%rcx), %ymm0, %ymm0
+ vpor 32*16-512(%rcx), %ymm0, %ymm0
+ vpor 32*17-512(%rcx), %ymm0, %ymm0
+ vpcmpeqq %ymm1, %ymm0, %ymm0
+ vmovdqa %ymm0, `32*9*8`(%rsp)
+
+ vpxor %ymm1, %ymm1, %ymm1
+ vmovdqa 32*0($b_ptr), %ymm0
+ lea 256($b_ptr), %rax # size optimization
+ vpor 32*1($b_ptr), %ymm0, %ymm0
+ vpor 32*2($b_ptr), %ymm0, %ymm0
+ vpor 32*3($b_ptr), %ymm0, %ymm0
+ vpor 32*4-256(%rax), %ymm0, %ymm0
+ lea 256(%rax), %rcx # size optimization
+ vpor 32*5-256(%rax), %ymm0, %ymm0
+ vpor 32*6-256(%rax), %ymm0, %ymm0
+ vpor 32*7-256(%rax), %ymm0, %ymm0
+ vpor 32*8-256(%rax), %ymm0, %ymm0
+ vpor 32*9-256(%rax), %ymm0, %ymm0
+ vpor 32*10-256(%rax), %ymm0, %ymm0
+ vpor 32*11-256(%rax), %ymm0, %ymm0
+ vpor 32*12-512(%rcx), %ymm0, %ymm0
+ vpor 32*13-512(%rcx), %ymm0, %ymm0
+ vpor 32*14-512(%rcx), %ymm0, %ymm0
+ vpor 32*15-512(%rcx), %ymm0, %ymm0
+ vpor 32*16-512(%rcx), %ymm0, %ymm0
+ vpor 32*17-512(%rcx), %ymm0, %ymm0
+ vpcmpeqq %ymm1, %ymm0, %ymm0
+ vmovdqa %ymm0, `32*9*8+32`(%rsp)
+
+ # H = U2 - U1 = X2 - X1
+ lea `32*9*0`($b_ptr), %rsi
+ lea `32*9*0`($a_ptr), %rdx
+ lea `32*9*3`(%rsp), %rdi
+ call avx2_sub_x4
+ call avx2_normalize_n_store
+
+ # R = S2 - S1 = Y2 - Y1
+ lea `32*9*1`($b_ptr), %rsi
+ lea `32*9*1`($a_ptr), %rdx
+ lea `32*9*4`(%rsp), %rdi
+ call avx2_sub_x4
+ call avx2_normalize_n_store
+
+ # Z3 = H*Z1*Z2 = H
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*2`($r_ptr), %rdi
+ call avx2_mul_by1_x4
+ call avx2_normalize
+
+ vmovdqa `32*9*8`(%rsp), $B
+ vpor `32*9*8+32`(%rsp), $B, $B
+
+ vpandn $ACC0, $B, $ACC0
+ lea .LONE+128(%rip), %rax
+ vpandn $ACC1, $B, $ACC1
+ vpandn $ACC2, $B, $ACC2
+ vpandn $ACC3, $B, $ACC3
+ vpandn $ACC4, $B, $ACC4
+ vpandn $ACC5, $B, $ACC5
+ vpandn $ACC6, $B, $ACC6
+ vpandn $ACC7, $B, $ACC7
+
+ vpand 32*0-128(%rax), $B, $T0
+ vpandn $ACC8, $B, $ACC8
+ vpand 32*1-128(%rax), $B, $Y
+ vpxor $T0, $ACC0, $ACC0
+ vpand 32*2-128(%rax), $B, $T0
+ vpxor $Y, $ACC1, $ACC1
+ vpand 32*3-128(%rax), $B, $Y
+ vpxor $T0, $ACC2, $ACC2
+ vpand 32*4-128(%rax), $B, $T0
+ vpxor $Y, $ACC3, $ACC3
+ vpand 32*5-128(%rax), $B, $Y
+ vpxor $T0, $ACC4, $ACC4
+ vpand 32*6-128(%rax), $B, $T0
+ vpxor $Y, $ACC5, $ACC5
+ vpand 32*7-128(%rax), $B, $Y
+ vpxor $T0, $ACC6, $ACC6
+ vpand 32*8-128(%rax), $B, $T0
+ vpxor $Y, $ACC7, $ACC7
+ vpxor $T0, $ACC8, $ACC8
+ `&STORE`
+
+	# Rsqr = R^2
+ lea `32*9*4`(%rsp), %rsi
+ lea `32*9*6`(%rsp), %rdi
+ lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
+ call avx2_sqr_x4
+ call avx2_normalize_n_store
+
+	# Hsqr = H^2
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*5`(%rsp), %rdi
+ call avx2_sqr_x4
+ call avx2_normalize_n_store
+
+ # H^3 = H^2*H
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*5`(%rsp), %rdx
+ lea `32*9*7`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ # U2 = U1*H^2
+ lea `32*9*0`($a_ptr), %rsi
+ lea `32*9*5`(%rsp), %rdx
+ lea `32*9*0`(%rsp), %rdi
+ call avx2_mul_x4
+ #call avx2_normalize
+ `&STORE`
+
+	# Hsqr slot = U2*2 (for X3 = R^2 - H^3 - 2*U2)
+ #lea 32*9*0(%rsp), %rsi
+ #lea 32*9*5(%rsp), %rdi
+ #call avx2_mul_by2_x4
+
+ vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
+ lea `32*9*5`(%rsp), %rdi
+ vpaddq $ACC1, $ACC1, $ACC1
+ vpaddq $ACC2, $ACC2, $ACC2
+ vpaddq $ACC3, $ACC3, $ACC3
+ vpaddq $ACC4, $ACC4, $ACC4
+ vpaddq $ACC5, $ACC5, $ACC5
+ vpaddq $ACC6, $ACC6, $ACC6
+ vpaddq $ACC7, $ACC7, $ACC7
+ vpaddq $ACC8, $ACC8, $ACC8
+ call avx2_normalize_n_store
+
+ # X3 = R^2 - H^3
+ #lea 32*9*6(%rsp), %rsi
+ #lea 32*9*7(%rsp), %rdx
+ #lea 32*9*5(%rsp), %rcx
+ #lea 32*9*0($r_ptr), %rdi
+ #call avx2_sub_x4
+ #NORMALIZE
+ #STORE
+
+ # X3 = X3 - U2*2
+ #lea 32*9*0($r_ptr), %rsi
+ #lea 32*9*0($r_ptr), %rdi
+ #call avx2_sub_x4
+ #NORMALIZE
+ #STORE
+
+ lea `32*9*6+128`(%rsp), %rsi
+ lea .LAVX2_POLY_x2+128(%rip), %rax
+ lea `32*9*7+128`(%rsp), %rdx
+ lea `32*9*5+128`(%rsp), %rcx
+ lea `32*9*0`($r_ptr), %rdi
+
+ vmovdqa 32*0-128(%rsi), $ACC0
+ vmovdqa 32*1-128(%rsi), $ACC1
+ vmovdqa 32*2-128(%rsi), $ACC2
+ vmovdqa 32*3-128(%rsi), $ACC3
+ vmovdqa 32*4-128(%rsi), $ACC4
+ vmovdqa 32*5-128(%rsi), $ACC5
+ vmovdqa 32*6-128(%rsi), $ACC6
+ vmovdqa 32*7-128(%rsi), $ACC7
+ vmovdqa 32*8-128(%rsi), $ACC8
+
+ vpaddq 32*0-128(%rax), $ACC0, $ACC0
+ vpaddq 32*1-128(%rax), $ACC1, $ACC1
+ vpaddq 32*2-128(%rax), $ACC2, $ACC2
+ vpaddq 32*3-128(%rax), $ACC3, $ACC3
+ vpaddq 32*4-128(%rax), $ACC4, $ACC4
+ vpaddq 32*5-128(%rax), $ACC5, $ACC5
+ vpaddq 32*6-128(%rax), $ACC6, $ACC6
+ vpaddq 32*7-128(%rax), $ACC7, $ACC7
+ vpaddq 32*8-128(%rax), $ACC8, $ACC8
+
+ vpsubq 32*0-128(%rdx), $ACC0, $ACC0
+ vpsubq 32*1-128(%rdx), $ACC1, $ACC1
+ vpsubq 32*2-128(%rdx), $ACC2, $ACC2
+ vpsubq 32*3-128(%rdx), $ACC3, $ACC3
+ vpsubq 32*4-128(%rdx), $ACC4, $ACC4
+ vpsubq 32*5-128(%rdx), $ACC5, $ACC5
+ vpsubq 32*6-128(%rdx), $ACC6, $ACC6
+ vpsubq 32*7-128(%rdx), $ACC7, $ACC7
+ vpsubq 32*8-128(%rdx), $ACC8, $ACC8
+
+ vpsubq 32*0-128(%rcx), $ACC0, $ACC0
+ vpsubq 32*1-128(%rcx), $ACC1, $ACC1
+ vpsubq 32*2-128(%rcx), $ACC2, $ACC2
+ vpsubq 32*3-128(%rcx), $ACC3, $ACC3
+ vpsubq 32*4-128(%rcx), $ACC4, $ACC4
+ vpsubq 32*5-128(%rcx), $ACC5, $ACC5
+ vpsubq 32*6-128(%rcx), $ACC6, $ACC6
+ vpsubq 32*7-128(%rcx), $ACC7, $ACC7
+ vpsubq 32*8-128(%rcx), $ACC8, $ACC8
+ call avx2_normalize
+
+	lea	32*0($b_ptr), %rsi		# X3 = X2 if A is infinity
+	lea	32*0($a_ptr), %rdx		# X3 = X1 if B is infinity
+ call avx2_select_n_store
+
+ # H = U2 - X3
+ lea `32*9*0`(%rsp), %rsi
+ lea `32*9*0`($r_ptr), %rdx
+ lea `32*9*3`(%rsp), %rdi
+ call avx2_sub_x4
+ call avx2_normalize_n_store
+
+ # H = H*R
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*4`(%rsp), %rdx
+ lea `32*9*3`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ # S2 = S1 * H^3
+ lea `32*9*7`(%rsp), %rsi
+ lea `32*9*1`($a_ptr), %rdx
+ lea `32*9*1`(%rsp), %rdi
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ #
+ lea `32*9*3`(%rsp), %rsi
+ lea `32*9*1`(%rsp), %rdx
+ lea `32*9*1`($r_ptr), %rdi
+ call avx2_sub_x4
+ call avx2_normalize
+
+	lea	32*9($b_ptr), %rsi		# Y3 = Y2 if A is infinity
+	lea	32*9($a_ptr), %rdx		# Y3 = Y1 if B is infinity
+ call avx2_select_n_store
+
+ #lea 32*9*0($r_ptr), %rsi
+ #lea 32*9*0($r_ptr), %rdi
+ #call avx2_mul_by1_x4
+ #NORMALIZE
+ #STORE
+
+ lea `32*9*1`($r_ptr), %rsi
+ lea `32*9*1`($r_ptr), %rdi
+ call avx2_mul_by1_x4
+ call avx2_normalize_n_store
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-16*10(%rbp), %xmm6	# restore the non-volatile xmm registers
+	movaps	-16*9(%rbp), %xmm7
+	movaps	-16*8(%rbp), %xmm8
+	movaps	-16*7(%rbp), %xmm9
+	movaps	-16*6(%rbp), %xmm10
+	movaps	-16*5(%rbp), %xmm11
+	movaps	-16*4(%rbp), %xmm12
+	movaps	-16*3(%rbp), %xmm13
+	movaps	-16*2(%rbp), %xmm14
+	movaps	-16*1(%rbp), %xmm15
+___
+$code.=<<___;
+ mov %rbp, %rsp
+ pop %rbp
+ ret
+.size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
+
+################################################################################
+# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
+.globl ecp_nistz256_avx2_to_mont
+.type ecp_nistz256_avx2_to_mont,\@function,2
+.align 32
+ecp_nistz256_avx2_to_mont:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+	mov	%rsp, %rax		# %rax must hold the pre-adjustment stack pointer
+	lea	-8-16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+ lea .LTO_MONT_AVX2(%rip), %rdx
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*0(%rsp), %xmm6
+ movaps 16*1(%rsp), %xmm7
+ movaps 16*2(%rsp), %xmm8
+ movaps 16*3(%rsp), %xmm9
+ movaps 16*4(%rsp), %xmm10
+ movaps 16*5(%rsp), %xmm11
+ movaps 16*6(%rsp), %xmm12
+ movaps 16*7(%rsp), %xmm13
+ movaps 16*8(%rsp), %xmm14
+ movaps 16*9(%rsp), %xmm15
+ lea 8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
+
+################################################################################
+# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
+.globl ecp_nistz256_avx2_from_mont
+.type ecp_nistz256_avx2_from_mont,\@function,2
+.align 32
+ecp_nistz256_avx2_from_mont:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+	mov	%rsp, %rax		# %rax must hold the pre-adjustment stack pointer
+	lea	-8-16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+ lea .LFROM_MONT_AVX2(%rip), %rdx
+ call avx2_mul_x4
+ call avx2_normalize_n_store
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*0(%rsp), %xmm6
+ movaps 16*1(%rsp), %xmm7
+ movaps 16*2(%rsp), %xmm8
+ movaps 16*3(%rsp), %xmm9
+ movaps 16*4(%rsp), %xmm10
+ movaps 16*5(%rsp), %xmm11
+ movaps 16*6(%rsp), %xmm12
+ movaps 16*7(%rsp), %xmm13
+ movaps 16*8(%rsp), %xmm14
+ movaps 16*9(%rsp), %xmm15
+ lea 8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
+
+################################################################################
+# void ecp_nistz256_avx2_set1(void* RESULTx4);
+.globl ecp_nistz256_avx2_set1
+.type ecp_nistz256_avx2_set1,\@function,1
+.align 32
+ecp_nistz256_avx2_set1:
+ lea .LONE+128(%rip), %rax
+ lea 128(%rdi), %rdi
+ vzeroupper
+ vmovdqa 32*0-128(%rax), %ymm0
+ vmovdqa 32*1-128(%rax), %ymm1
+ vmovdqa 32*2-128(%rax), %ymm2
+ vmovdqa 32*3-128(%rax), %ymm3
+ vmovdqa 32*4-128(%rax), %ymm4
+ vmovdqa 32*5-128(%rax), %ymm5
+ vmovdqa %ymm0, 32*0-128(%rdi)
+ vmovdqa 32*6-128(%rax), %ymm0
+ vmovdqa %ymm1, 32*1-128(%rdi)
+ vmovdqa 32*7-128(%rax), %ymm1
+ vmovdqa %ymm2, 32*2-128(%rdi)
+ vmovdqa 32*8-128(%rax), %ymm2
+ vmovdqa %ymm3, 32*3-128(%rdi)
+ vmovdqa %ymm4, 32*4-128(%rdi)
+ vmovdqa %ymm5, 32*5-128(%rdi)
+ vmovdqa %ymm0, 32*6-128(%rdi)
+ vmovdqa %ymm1, 32*7-128(%rdi)
+ vmovdqa %ymm2, 32*8-128(%rdi)
+
+ vzeroupper
+ ret
+.size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
+___
+}
+{
+################################################################################
+# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
+# int index0, int index1, int index2, int index3);
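+#
+# The loop below is constant-time: each of the four lanes reads all 64
+# entries (64 bytes each) of its own sub-table on every call, and only
+# the entry whose position matches that lane's (secret) index is kept,
+# via a vpcmpeqd-generated mask, so the memory access pattern does not
+# depend on the indices. A minimal single-lane C sketch of the idea --
+# illustrative only; the name and layout below are hypothetical and not
+# part of this file:
+#
+#	#include <stdint.h>
+#	void select_one_lane(uint32_t val[16], const uint32_t in_t[64][16],
+#	                     uint32_t index)	/* 1..64; 0 = infinity */
+#	{
+#		for (int j = 0; j < 16; j++)
+#			val[j] = 0;
+#		for (uint32_t i = 0; i < 64; i++) {
+#			/* mask is all-ones iff i + 1 == index, else zero */
+#			uint32_t mask = 0 - (uint32_t)(i + 1 == index);
+#			for (int j = 0; j < 16; j++)
+#				val[j] ^= in_t[i][j] & mask;
+#		}
+#	}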
+################################################################################
+
+my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
+my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
+my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
+my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
+
+$code.=<<___;
+.globl ecp_nistz256_avx2_multi_select_w7
+.type ecp_nistz256_avx2_multi_select_w7,\@function,6
+.align 32
+ecp_nistz256_avx2_multi_select_w7:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+	mov	%rsp, %rax		# %rax must hold the pre-adjustment stack pointer
+	lea	-8-16*10(%rsp), %rsp
+ vmovaps %xmm6, -8-16*10(%rax)
+ vmovaps %xmm7, -8-16*9(%rax)
+ vmovaps %xmm8, -8-16*8(%rax)
+ vmovaps %xmm9, -8-16*7(%rax)
+ vmovaps %xmm10, -8-16*6(%rax)
+ vmovaps %xmm11, -8-16*5(%rax)
+ vmovaps %xmm12, -8-16*4(%rax)
+ vmovaps %xmm13, -8-16*3(%rax)
+ vmovaps %xmm14, -8-16*2(%rax)
+ vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+ lea .LIntOne(%rip), %rax
+
+ vmovd $index0, %xmm0
+ vmovd $index1, %xmm1
+ vmovd $index2, %xmm2
+ vmovd $index3, %xmm3
+
+ vpxor $R0a, $R0a, $R0a
+ vpxor $R0b, $R0b, $R0b
+ vpxor $R1a, $R1a, $R1a
+ vpxor $R1b, $R1b, $R1b
+ vpxor $R2a, $R2a, $R2a
+ vpxor $R2b, $R2b, $R2b
+ vpxor $R3a, $R3a, $R3a
+ vpxor $R3b, $R3b, $R3b
+ vmovdqa (%rax), $M0
+
+ vpermd $INDEX0, $R0a, $INDEX0
+ vpermd $INDEX1, $R0a, $INDEX1
+ vpermd $INDEX2, $R0a, $INDEX2
+ vpermd $INDEX3, $R0a, $INDEX3
+
+ mov \$64, %ecx
+ lea 112($val), $val # size optimization
+ jmp .Lmulti_select_loop_avx2
+
+# INDEX == 0 corresponds to the point at infinity (0,0)
+.align 32
+.Lmulti_select_loop_avx2:
+ vpcmpeqd $INDEX0, $M0, $TMP0
+
+ vmovdqa `32*0+32*64*2*0`($in_t), $T0
+ vmovdqa `32*1+32*64*2*0`($in_t), $T1
+ vpand $TMP0, $T0, $T0
+ vpand $TMP0, $T1, $T1
+ vpxor $T0, $R0a, $R0a
+ vpxor $T1, $R0b, $R0b
+
+ vpcmpeqd $INDEX1, $M0, $TMP0
+
+ vmovdqa `32*0+32*64*2*1`($in_t), $T0
+ vmovdqa `32*1+32*64*2*1`($in_t), $T1
+ vpand $TMP0, $T0, $T0
+ vpand $TMP0, $T1, $T1
+ vpxor $T0, $R1a, $R1a
+ vpxor $T1, $R1b, $R1b
+
+ vpcmpeqd $INDEX2, $M0, $TMP0
+
+ vmovdqa `32*0+32*64*2*2`($in_t), $T0
+ vmovdqa `32*1+32*64*2*2`($in_t), $T1
+ vpand $TMP0, $T0, $T0
+ vpand $TMP0, $T1, $T1
+ vpxor $T0, $R2a, $R2a
+ vpxor $T1, $R2b, $R2b
+
+ vpcmpeqd $INDEX3, $M0, $TMP0
+
+ vmovdqa `32*0+32*64*2*3`($in_t), $T0
+ vmovdqa `32*1+32*64*2*3`($in_t), $T1
+ vpand $TMP0, $T0, $T0
+ vpand $TMP0, $T1, $T1
+ vpxor $T0, $R3a, $R3a
+ vpxor $T1, $R3b, $R3b
+
+ vpaddd (%rax), $M0, $M0 # increment
+ lea 32*2($in_t), $in_t
+
+ dec %ecx
+ jnz .Lmulti_select_loop_avx2
+
+ vmovdqu $R0a, 32*0-112($val)
+ vmovdqu $R0b, 32*1-112($val)
+ vmovdqu $R1a, 32*2-112($val)
+ vmovdqu $R1b, 32*3-112($val)
+ vmovdqu $R2a, 32*4-112($val)
+ vmovdqu $R2b, 32*5-112($val)
+ vmovdqu $R3a, 32*6-112($val)
+ vmovdqu $R3b, 32*7-112($val)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*0(%rsp), %xmm6
+ movaps 16*1(%rsp), %xmm7
+ movaps 16*2(%rsp), %xmm8
+ movaps 16*3(%rsp), %xmm9
+ movaps 16*4(%rsp), %xmm10
+ movaps 16*5(%rsp), %xmm11
+ movaps 16*6(%rsp), %xmm12
+ movaps 16*7(%rsp), %xmm13
+ movaps 16*8(%rsp), %xmm14
+ movaps 16*9(%rsp), %xmm15
+ lea 8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
+
+.extern OPENSSL_ia32cap_P
+.globl ecp_nistz_avx2_eligible
+.type ecp_nistz_avx2_eligible,\@abi-omnipotent
+.align 32
+ecp_nistz_avx2_eligible:
+	mov	OPENSSL_ia32cap_P+8(%rip),%eax
+	shr	\$5,%eax
+	and	\$1,%eax		# bit 5 of the 3rd capability word = AVX2
+ ret
+.size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
+___
+}
+}} else {{ # assembler is too old
+$code.=<<___;
+.text
+
+.globl ecp_nistz256_avx2_transpose_convert
+.globl ecp_nistz256_avx2_convert_transpose_back
+.globl ecp_nistz256_avx2_point_add_affine_x4
+.globl ecp_nistz256_avx2_point_add_affines_x4
+.globl ecp_nistz256_avx2_to_mont
+.globl ecp_nistz256_avx2_from_mont
+.globl ecp_nistz256_avx2_set1
+.globl ecp_nistz256_avx2_multi_select_w7
+.type ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
+ecp_nistz256_avx2_transpose_convert:
+ecp_nistz256_avx2_convert_transpose_back:
+ecp_nistz256_avx2_point_add_affine_x4:
+ecp_nistz256_avx2_point_add_affines_x4:
+ecp_nistz256_avx2_to_mont:
+ecp_nistz256_avx2_from_mont:
+ecp_nistz256_avx2_set1:
+ecp_nistz256_avx2_multi_select_w7:
+ .byte 0x0f,0x0b # ud2
+ ret
+.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
+
+.globl ecp_nistz_avx2_eligible
+.type ecp_nistz_avx2_eligible,\@abi-omnipotent
+ecp_nistz_avx2_eligible:
+ xor %eax,%eax
+ ret
+.size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
+___
+}}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
new file mode 100755
index 000000000000..84379fce1cb9
--- /dev/null
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -0,0 +1,2997 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright 2014 Intel Corporation #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center #
+# (2) University of Haifa #
+# Reference: #
+# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
+# 256 Bit Primes" #
+# #
+##############################################################################
+
+# Further optimization by <appro@openssl.org>:
+#
+# this/original with/without -DECP_NISTZ256_ASM(*)
+# Opteron +12-49% +110-150%
+# Bulldozer +14-45% +175-210%
+# P4 +18-46% n/a :-(
+# Westmere +12-34% +80-87%
+# Sandy Bridge +9-35% +110-120%
+# Ivy Bridge +9-35% +110-125%
+# Haswell +8-37% +140-160%
+# Broadwell +18-58% +145-210%
+# Atom +15-50% +130-180%
+# VIA Nano +43-160% +300-480%
+#
+# (*) "without -DECP_NISTZ256_ASM" refers to build with
+# "enable-ec_nistp_64_gcc_128";
+#
+# Ranges denote minimum and maximum improvement coefficients depending
+# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
+# server-side operation. Keep in mind that +100% means 2x improvement.
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+ $addx = ($ver>=3.03);
+}
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+# The NIST P-256 prime, p = 2^256 - 2^224 + 2^192 + 2^96 - 1, in little-endian limbs
+.align 64
+.Lpoly:
+.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+# R^2 = 2^512 mod p, precomputed to convert values into Montgomery form (R = 2^256)
+.LRR:
+.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
+
+.LOne:
+.long 1,1,1,1,1,1,1,1
+.LTwo:
+.long 2,2,2,2,2,2,2,2
+.LThree:
+.long 3,3,3,3,3,3,3,3
+.LONE_mont:
+.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+___
+
+{
+################################################################################
+# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
+
+my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
+my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+
+.globl ecp_nistz256_mul_by_2
+.type ecp_nistz256_mul_by_2,\@function,2
+.align 64
+ecp_nistz256_mul_by_2:
+ push %r12
+ push %r13
+
+ mov 8*0($a_ptr), $a0
+ mov 8*1($a_ptr), $a1
+ add $a0, $a0 # a0:a3+a0:a3
+ mov 8*2($a_ptr), $a2
+ adc $a1, $a1
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+ mov $a0, $t0
+ adc $a2, $a2
+ adc $a3, $a3
+ mov $a1, $t1
+ sbb $t4, $t4
+
+ sub 8*0($a_ptr), $a0
+ mov $a2, $t2
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a3, $t3
+ sbb 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+################################################################################
+# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
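+# Computes res = a/2 mod p. Since p is odd, a + p is even whenever a is
+# odd, so the routine adds p unconditionally, keeps the carry as a 257th
+# bit, cancels the addition with cmovz when a was even, and then shifts
+# the whole value right by one bit. No secret-dependent branch is taken.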
+.globl ecp_nistz256_div_by_2
+.type ecp_nistz256_div_by_2,\@function,2
+.align 32
+ecp_nistz256_div_by_2:
+ push %r12
+ push %r13
+
+ mov 8*0($a_ptr), $a0
+ mov 8*1($a_ptr), $a1
+ mov 8*2($a_ptr), $a2
+ mov $a0, $t0
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+
+ mov $a1, $t1
+ xor $t4, $t4
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ adc \$0, $t4
+ xor $a_ptr, $a_ptr # borrow $a_ptr
+ test \$1, $t0
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ cmovz $t2, $a2
+ cmovz $t3, $a3
+ cmovz $a_ptr, $t4
+
+ mov $a1, $t0 # a0:a3>>1
+ shr \$1, $a0
+ shl \$63, $t0
+ mov $a2, $t1
+ shr \$1, $a1
+ or $t0, $a0
+ shl \$63, $t1
+ mov $a3, $t2
+ shr \$1, $a2
+ or $t1, $a1
+ shl \$63, $t2
+ shr \$1, $a3
+ shl \$63, $t4
+ or $t2, $a2
+ or $t4, $a3
+
+ mov $a0, 8*0($r_ptr)
+ mov $a1, 8*1($r_ptr)
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+################################################################################
+# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_mul_by_3
+.type ecp_nistz256_mul_by_3,\@function,2
+.align 32
+ecp_nistz256_mul_by_3:
+ push %r12
+ push %r13
+
+ mov 8*0($a_ptr), $a0
+ xor $t4, $t4
+ mov 8*1($a_ptr), $a1
+ add $a0, $a0 # a0:a3+a0:a3
+ mov 8*2($a_ptr), $a2
+ adc $a1, $a1
+ mov 8*3($a_ptr), $a3
+ mov $a0, $t0
+ adc $a2, $a2
+ adc $a3, $a3
+ mov $a1, $t1
+ adc \$0, $t4
+
+ sub \$-1, $a0
+ mov $a2, $t2
+ sbb .Lpoly+8*1(%rip), $a1
+ sbb \$0, $a2
+ mov $a3, $t3
+ sbb .Lpoly+8*3(%rip), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ cmovz $t2, $a2
+ cmovz $t3, $a3
+
+ xor $t4, $t4
+ add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
+ adc 8*1($a_ptr), $a1
+ mov $a0, $t0
+ adc 8*2($a_ptr), $a2
+ adc 8*3($a_ptr), $a3
+ mov $a1, $t1
+ adc \$0, $t4
+
+ sub \$-1, $a0
+ mov $a2, $t2
+ sbb .Lpoly+8*1(%rip), $a1
+ sbb \$0, $a2
+ mov $a3, $t3
+ sbb .Lpoly+8*3(%rip), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+################################################################################
+# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+.globl ecp_nistz256_add
+.type ecp_nistz256_add,\@function,3
+.align 32
+ecp_nistz256_add:
+ push %r12
+ push %r13
+
+ mov 8*0($a_ptr), $a0
+ xor $t4, $t4
+ mov 8*1($a_ptr), $a1
+ mov 8*2($a_ptr), $a2
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+
+ add 8*0($b_ptr), $a0
+ adc 8*1($b_ptr), $a1
+ mov $a0, $t0
+ adc 8*2($b_ptr), $a2
+ adc 8*3($b_ptr), $a3
+ mov $a1, $t1
+ adc \$0, $t4
+
+ sub 8*0($a_ptr), $a0
+ mov $a2, $t2
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a3, $t3
+ sbb 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_add,.-ecp_nistz256_add
+
+################################################################################
+# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+.globl ecp_nistz256_sub
+.type ecp_nistz256_sub,\@function,3
+.align 32
+ecp_nistz256_sub:
+ push %r12
+ push %r13
+
+ mov 8*0($a_ptr), $a0
+ xor $t4, $t4
+ mov 8*1($a_ptr), $a1
+ mov 8*2($a_ptr), $a2
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+
+ sub 8*0($b_ptr), $a0
+ sbb 8*1($b_ptr), $a1
+ mov $a0, $t0
+ sbb 8*2($b_ptr), $a2
+ sbb 8*3($b_ptr), $a3
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+################################################################################
+# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_neg
+.type ecp_nistz256_neg,\@function,2
+.align 32
+ecp_nistz256_neg:
+ push %r12
+ push %r13
+
+ xor $a0, $a0
+ xor $a1, $a1
+ xor $a2, $a2
+ xor $a3, $a3
+ xor $t4, $t4
+
+ sub 8*0($a_ptr), $a0
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a0, $t0
+ sbb 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+___
+}
+{
+my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
+my ($poly1,$poly3)=($acc6,$acc7);
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_to_mont(
+# uint64_t res[4],
+# uint64_t in[4]);
+.globl ecp_nistz256_to_mont
+.type ecp_nistz256_to_mont,\@function,2
+.align 32
+ecp_nistz256_to_mont:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+___
+$code.=<<___;
+ lea .LRR(%rip), $b_org
+ jmp .Lmul_mont
+.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+################################################################################
+# void ecp_nistz256_mul_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# uint64_t b[4]);
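+#
+# Computes res = a*b*2^-256 mod p, i.e. Montgomery multiplication with
+# R = 2^256; with both inputs in Montgomery form the result is the
+# Montgomery form of the product.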
+
+.globl ecp_nistz256_mul_mont
+.type ecp_nistz256_mul_mont,\@function,3
+.align 32
+ecp_nistz256_mul_mont:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+___
+$code.=<<___;
+.Lmul_mont:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($addx);
+ cmp \$0x80100, %ecx
+ je .Lmul_montx
+___
+$code.=<<___;
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rax
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+
+ call __ecp_nistz256_mul_montq
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+
+ call __ecp_nistz256_mul_montx
+___
+$code.=<<___;
+.Lmul_mont_done:
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+.type __ecp_nistz256_mul_montq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_montq:
+ ########################################################################
+ # Multiply a by b[0]
+ mov %rax, $t1
+ mulq $acc1
+ mov .Lpoly+8*1(%rip),$poly1
+ mov %rax, $acc0
+ mov $t1, %rax
+ mov %rdx, $acc1
+
+ mulq $acc2
+ mov .Lpoly+8*3(%rip),$poly3
+ add %rax, $acc1
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq $acc3
+ add %rax, $acc2
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mulq $acc4
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ xor $acc5, $acc5
+ mov %rdx, $acc4
+
+ ########################################################################
+ # First reduction step
+ # Basically now we want to multiply acc[0] by p256,
+ # and add the result to the acc.
+ # Due to the special form of p256 we do some optimizations
+ #
+ # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+ # then we add acc[0] and get acc[0] x 2^96
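+	#
+	# In detail: p256[0] = 2^64 - 1 and p256[1] = 2^32 - 1, so the low
+	# two limbs satisfy p256[1]*2^64 + p256[0] = 2^96 - 1, and therefore
+	#
+	#   acc[0] * p256[0..1]          = acc[0]*2^96 - acc[0]
+	#   acc[0] * p256[0..1] + acc[0] = acc[0]*2^96
+	#
+	# which is realized below with one shl/shr pair; p256[2] = 0
+	# contributes nothing and only p256[3] needs a real mulq.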
+
+ mov $acc0, $t1
+ shl \$32, $acc0
+ mulq $poly3
+ shr \$32, $t1
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t1, $acc2
+ adc %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ adc %rdx, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0
+
+ ########################################################################
+ # Multiply by b[1]
+ mov %rax, $t1
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*1($a_ptr)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*2($a_ptr)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*3($a_ptr)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ########################################################################
+ # Second reduction step
+ mov $acc1, $t1
+ shl \$32, $acc1
+ mulq $poly3
+ shr \$32, $t1
+ add $acc1, $acc2
+ adc $t1, $acc3
+ adc %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+ xor $acc1, $acc1
+
+ ########################################################################
+ # Multiply by b[2]
+ mov %rax, $t1
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*1($a_ptr)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*2($a_ptr)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*3($a_ptr)
+ add $t0, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ########################################################################
+ # Third reduction step
+ mov $acc2, $t1
+ shl \$32, $acc2
+ mulq $poly3
+ shr \$32, $t1
+ add $acc2, $acc3
+ adc $t1, $acc4
+ adc %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2
+
+ ########################################################################
+ # Multiply by b[3]
+ mov %rax, $t1
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*1($a_ptr)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*2($a_ptr)
+ add $t0, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*3($a_ptr)
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ########################################################################
+ # Final reduction step
+ mov $acc3, $t1
+ shl \$32, $acc3
+ mulq $poly3
+ shr \$32, $t1
+ add $acc3, $acc4
+ adc $t1, $acc5
+ mov $acc4, $t0
+ adc %rax, $acc0
+ adc %rdx, $acc1
+ mov $acc5, $t1
+ adc \$0, $acc2
+
+ ########################################################################
+ # Branch-less conditional subtraction of P
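+	# The reduced value is below 2*P, so a single trial subtraction
+	# suffices: subtract P unconditionally, then use the final borrow
+	# to cmov back the saved limbs. No secret-dependent branch is taken.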
+ sub \$-1, $acc4 # .Lpoly[0]
+ mov $acc0, $t2
+ sbb $poly1, $acc5 # .Lpoly[1]
+ sbb \$0, $acc0 # .Lpoly[2]
+ mov $acc1, $t3
+ sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
+
+ cmovc $t0, $acc4
+ cmovc $t1, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $t2, $acc0
+ mov $acc5, 8*1($r_ptr)
+ cmovc $t3, $acc1
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+################################################################################
+# void ecp_nistz256_sqr_mont(
+# uint64_t res[4],
+# uint64_t a[4]);
+
+# we optimize the square according to S.Gueron and V.Krasnov,
+# "Speeding up Big-Number Squaring"
+.globl ecp_nistz256_sqr_mont
+.type ecp_nistz256_sqr_mont,\@function,2
+.align 32
+ecp_nistz256_sqr_mont:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+___
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($addx);
+ cmp \$0x80100, %ecx
+ je .Lsqr_montx
+___
+$code.=<<___;
+ mov 8*0($a_ptr), %rax
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+
+ call __ecp_nistz256_sqr_montq
+___
+$code.=<<___ if ($addx);
+ jmp .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea -128($a_ptr), $a_ptr # control u-op density
+
+ call __ecp_nistz256_sqr_montx
+___
+$code.=<<___;
+.Lsqr_mont_done:
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sqr_montq:
+ mov %rax, $acc5
+ mulq $acc6 # a[1]*a[0]
+ mov %rax, $acc1
+ mov $acc7, %rax
+ mov %rdx, $acc2
+
+ mulq $acc5 # a[0]*a[2]
+ add %rax, $acc2
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mulq $acc5 # a[0]*a[3]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ #################################
+ mulq $acc6 # a[1]*a[2]
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq $acc6 # a[1]*a[3]
+ add %rax, $acc4
+ mov $acc0, %rax
+ adc \$0, %rdx
+ add $t1, $acc4
+ mov %rdx, $acc5
+ adc \$0, $acc5
+
+ #################################
+ mulq $acc7 # a[2]*a[3]
+ xor $acc7, $acc7
+ add %rax, $acc5
+ mov 8*0($a_ptr), %rax
+ mov %rdx, $acc6
+ adc \$0, $acc6
+
+ add $acc1, $acc1 # acc1:6<<1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ mulq %rax
+ mov %rax, $acc0
+ mov 8*1($a_ptr), %rax
+ mov %rdx, $t0
+
+ mulq %rax
+ add $t0, $acc1
+ adc %rax, $acc2
+ mov 8*2($a_ptr), %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq %rax
+ add $t0, $acc3
+ adc %rax, $acc4
+ mov 8*3($a_ptr), %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq %rax
+ add $t0, $acc5
+ adc %rax, $acc6
+ mov $acc0, %rax
+ adc %rdx, $acc7
+
+ mov .Lpoly+8*1(%rip), $a_ptr
+ mov .Lpoly+8*3(%rip), $t1
+
+ ##########################################
+ # Now the reduction
+ # First iteration
+ mov $acc0, $t0
+ shl \$32, $acc0
+ mulq $t1
+ shr \$32, $t0
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t0, $acc2
+ adc %rax, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ##########################################
+ # Second iteration
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t1
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
+ mov $acc2, %rax
+ adc \$0, %rdx
+
+ ##########################################
+ # Third iteration
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t1
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
+ mov $acc3, %rax
+ adc \$0, %rdx
+
+ ###########################################
+ # Last iteration
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t1
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ adc %rax, $acc2
+ adc \$0, %rdx
+ xor $acc3, $acc3
+
+ ############################################
+ # Add the rest of the acc
+ add $acc0, $acc4
+ adc $acc1, $acc5
+ mov $acc4, $acc0
+ adc $acc2, $acc6
+ adc %rdx, $acc7
+ mov $acc5, $acc1
+ adc \$0, $acc3
+
+ sub \$-1, $acc4 # .Lpoly[0]
+ mov $acc6, $acc2
+ sbb $a_ptr, $acc5 # .Lpoly[1]
+ sbb \$0, $acc6 # .Lpoly[2]
+ mov $acc7, $t0
+ sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $acc3
+
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $acc2, $acc6
+ mov $acc5, 8*1($r_ptr)
+ cmovc $t0, $acc7
+ mov $acc6, 8*2($r_ptr)
+ mov $acc7, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+___
+
+if ($addx) {
+$code.=<<___;
+.type __ecp_nistz256_mul_montx,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_montx:
+ ########################################################################
+ # Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mov \$32, $poly1
+ xor $acc5, $acc5 # cf=0
+ mulx $acc3, $t1, $acc3
+ mov .Lpoly+8*3(%rip), $poly3
+ adc $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ adc $t1, $acc2
+ shlx $poly1,$acc0,$t1
+ adc $t0, $acc3
+ shrx $poly1,$acc0,$t0
+ adc \$0, $acc4
+
+ ########################################################################
+ # First reduction step
+ add $t1, $acc1
+ adc $t0, $acc2
+
+ mulx $poly3, $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adc $t0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0 # $acc0=0,cf=0,of=0
+
+ ########################################################################
+ # Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ adcx $t0, $acc4
+ shlx $poly1, $acc1, $t0
+ adox $t1, $acc5
+ shrx $poly1, $acc1, $t1
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0
+
+ ########################################################################
+ # Second reduction step
+ add $t0, $acc2
+ adc $t1, $acc3
+
+ mulx $poly3, $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adc $t0, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+	xor	$acc1, $acc1			# $acc1=0,cf=0,of=0
+
+ ########################################################################
+ # Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ adcx $t0, $acc5
+ shlx $poly1, $acc2, $t0
+ adox $t1, $acc0
+ shrx $poly1, $acc2, $t1
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1
+
+ ########################################################################
+ # Third reduction step
+ add $t0, $acc3
+ adc $t1, $acc4
+
+ mulx $poly3, $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adc $t0, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2 # $acc2=0,cf=0,of=0
+
+ ########################################################################
+ # Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ adcx $t0, $acc0
+ shlx $poly1, $acc3, $t0
+ adox $t1, $acc1
+ shrx $poly1, $acc3, $t1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2
+
+ ########################################################################
+ # Fourth reduction step
+ add $t0, $acc4
+ adc $t1, $acc5
+
+ mulx $poly3, $t0, $t1
+ mov $acc4, $t2
+ mov .Lpoly+8*1(%rip), $poly1
+ adc $t0, $acc0
+ mov $acc5, $t3
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ########################################################################
+ # Branch-less conditional subtraction of P
+ xor %eax, %eax
+ mov $acc0, $t0
+ sbb \$-1, $acc4 # .Lpoly[0]
+ sbb $poly1, $acc5 # .Lpoly[1]
+ sbb \$0, $acc0 # .Lpoly[2]
+ mov $acc1, $t1
+ sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
+
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $t0, $acc0
+ mov $acc5, 8*1($r_ptr)
+ cmovc $t1, $acc1
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type __ecp_nistz256_sqr_montx,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sqr_montx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ xor %eax, %eax
+ adc $t0, $acc2
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov 8*0+128($a_ptr), %rdx
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
+
+ mulx %rdx, $acc0, $t1
+ mov 8*1+128($a_ptr), %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ mov 8*2+128($a_ptr), %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ .byte 0x67
+ mulx %rdx, $t0, $t1
+ mov 8*3+128($a_ptr), %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ mov \$32, $a_ptr
+ adox $t1, $acc5
+ .byte 0x67,0x67
+ mulx %rdx, $t0, $t4
+ mov $acc0, %rdx
+ adox $t0, $acc6
+ shlx $a_ptr, $acc0, $t0
+ adox $t4, $acc7
+ shrx $a_ptr, $acc0, $t4
+ mov .Lpoly+8*3(%rip), $t1
+
+ # reduction step 1
+ add $t0, $acc1
+ adc $t4, $acc2
+
+ mulx $t1, $t0, $acc0
+ mov $acc1, %rdx
+ adc $t0, $acc3
+ shlx $a_ptr, $acc1, $t0
+ adc \$0, $acc0
+ shrx $a_ptr, $acc1, $t4
+
+ # reduction step 2
+ add $t0, $acc2
+ adc $t4, $acc3
+
+ mulx $t1, $t0, $acc1
+ mov $acc2, %rdx
+ adc $t0, $acc0
+ shlx $a_ptr, $acc2, $t0
+ adc \$0, $acc1
+ shrx $a_ptr, $acc2, $t4
+
+ # reduction step 3
+ add $t0, $acc3
+ adc $t4, $acc0
+
+ mulx $t1, $t0, $acc2
+ mov $acc3, %rdx
+ adc $t0, $acc1
+ shlx $a_ptr, $acc3, $t0
+ adc \$0, $acc2
+ shrx $a_ptr, $acc3, $t4
+
+ # reduction step 4
+ add $t0, $acc0
+ adc $t4, $acc1
+
+ mulx $t1, $t0, $acc3
+ adc $t0, $acc2
+ adc \$0, $acc3
+
+ xor $t3, $t3 # cf=0
+ adc $acc0, $acc4 # accumulate upper half
+ mov .Lpoly+8*1(%rip), $a_ptr
+ adc $acc1, $acc5
+ mov $acc4, $acc0
+ adc $acc2, $acc6
+ adc $acc3, $acc7
+ mov $acc5, $acc1
+ adc \$0, $t3
+
+ xor %eax, %eax # cf=0
+ sbb \$-1, $acc4 # .Lpoly[0]
+ mov $acc6, $acc2
+ sbb $a_ptr, $acc5 # .Lpoly[1]
+ sbb \$0, $acc6 # .Lpoly[2]
+ mov $acc7, $acc3
+ sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $t3
+
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $acc2, $acc6
+ mov $acc5, 8*1($r_ptr)
+ cmovc $acc3, $acc7
+ mov $acc6, 8*2($r_ptr)
+ mov $acc7, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+___
+}
+}
+{
+my ($r_ptr,$in_ptr)=("%rdi","%rsi");
+my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_from_mont(
+# uint64_t res[4],
+# uint64_t in[4]);
+# This one performs Montgomery multiplication by 1, so we only need the reduction
+
+.globl ecp_nistz256_from_mont
+.type ecp_nistz256_from_mont,\@function,2
+.align 32
+ecp_nistz256_from_mont:
+ push %r12
+ push %r13
+
+ mov 8*0($in_ptr), %rax
+ mov .Lpoly+8*3(%rip), $t2
+ mov 8*1($in_ptr), $acc1
+ mov 8*2($in_ptr), $acc2
+ mov 8*3($in_ptr), $acc3
+ mov %rax, $acc0
+ mov .Lpoly+8*1(%rip), $t1
+
+ #########################################
+ # First iteration
+ mov %rax, $t0
+ shl \$32, $acc0
+ mulq $t2
+ shr \$32, $t0
+ add $acc0, $acc1
+ adc $t0, $acc2
+ adc %rax, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ #########################################
+ # Second iteration
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t2
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
+ mov $acc2, %rax
+ adc \$0, %rdx
+
+ ##########################################
+ # Third iteration
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t2
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
+ mov $acc3, %rax
+ adc \$0, %rdx
+
+ ###########################################
+ # Last iteration
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t2
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ mov $acc0, $t0
+ adc %rax, $acc2
+ mov $acc1, $in_ptr
+ adc \$0, %rdx
+
+ ###########################################
+ # Branch-less conditional subtraction
+ sub \$-1, $acc0
+ mov $acc2, %rax
+ sbb $t1, $acc1
+ sbb \$0, $acc2
+ mov %rdx, $acc3
+ sbb $t2, %rdx
+ sbb $t2, $t2
+
+ cmovnz $t0, $acc0
+ cmovnz $in_ptr, $acc1
+ mov $acc0, 8*0($r_ptr)
+ cmovnz %rax, $acc2
+ mov $acc1, 8*1($r_ptr)
+ cmovz %rdx, $acc3
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+___
+}
+{
+my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
+my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
+my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
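+#
+# Constant-time gather: all 16 entries (96 bytes each) of the window table
+# are read and the one whose position matches `index` (1..16, with 0
+# selecting nothing, i.e. the point at infinity) is accumulated through a
+# pcmpeqd mask, so the access pattern does not depend on the secret index.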
+.globl ecp_nistz256_select_w5
+.type ecp_nistz256_select_w5,\@abi-omnipotent
+.align 32
+ecp_nistz256_select_w5:
+___
+$code.=<<___ if ($avx>1);
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
+ test \$`1<<5`, %eax
+ jnz .Lavx2_select_w5
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_select_w5:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+ movdqa .LOne(%rip), $ONE
+ movd $index, $INDEX
+
+ pxor $Ra, $Ra
+ pxor $Rb, $Rb
+ pxor $Rc, $Rc
+ pxor $Rd, $Rd
+ pxor $Re, $Re
+ pxor $Rf, $Rf
+
+ movdqa $ONE, $M0
+ pshufd \$0, $INDEX, $INDEX
+
+ mov \$16, %rax
+.Lselect_loop_sse_w5:
+
+ movdqa $M0, $TMP0
+ paddd $ONE, $M0
+ pcmpeqd $INDEX, $TMP0
+
+ movdqa 16*0($in_t), $T0a
+ movdqa 16*1($in_t), $T0b
+ movdqa 16*2($in_t), $T0c
+ movdqa 16*3($in_t), $T0d
+ movdqa 16*4($in_t), $T0e
+ movdqa 16*5($in_t), $T0f
+ lea 16*6($in_t), $in_t
+
+ pand $TMP0, $T0a
+ pand $TMP0, $T0b
+ por $T0a, $Ra
+ pand $TMP0, $T0c
+ por $T0b, $Rb
+ pand $TMP0, $T0d
+ por $T0c, $Rc
+ pand $TMP0, $T0e
+ por $T0d, $Rd
+ pand $TMP0, $T0f
+ por $T0e, $Re
+ por $T0f, $Rf
+
+ dec %rax
+ jnz .Lselect_loop_sse_w5
+
+ movdqu $Ra, 16*0($val)
+ movdqu $Rb, 16*1($val)
+ movdqu $Rc, 16*2($val)
+ movdqu $Rd, 16*3($val)
+ movdqu $Re, 16*4($val)
+ movdqu $Rf, 16*5($val)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp), %xmm6
+ movaps 0x10(%rsp), %xmm7
+ movaps 0x20(%rsp), %xmm8
+ movaps 0x30(%rsp), %xmm9
+ movaps 0x40(%rsp), %xmm10
+ movaps 0x50(%rsp), %xmm11
+ movaps 0x60(%rsp), %xmm12
+ movaps 0x70(%rsp), %xmm13
+ movaps 0x80(%rsp), %xmm14
+ movaps 0x90(%rsp), %xmm15
+ lea 0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_select_w5:
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+################################################################################
+# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w7
+.type ecp_nistz256_select_w7,\@abi-omnipotent
+.align 32
+ecp_nistz256_select_w7:
+___
+$code.=<<___ if ($avx>1);
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
+ test \$`1<<5`, %eax
+ jnz .Lavx2_select_w7
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_select_w7:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+ movdqa .LOne(%rip), $M0
+ movd $index, $INDEX
+
+ pxor $Ra, $Ra
+ pxor $Rb, $Rb
+ pxor $Rc, $Rc
+ pxor $Rd, $Rd
+
+ movdqa $M0, $ONE
+ pshufd \$0, $INDEX, $INDEX
+ mov \$64, %rax
+
+.Lselect_loop_sse_w7:
+ movdqa $M0, $TMP0
+ paddd $ONE, $M0
+ movdqa 16*0($in_t), $T0a
+ movdqa 16*1($in_t), $T0b
+ pcmpeqd $INDEX, $TMP0
+ movdqa 16*2($in_t), $T0c
+ movdqa 16*3($in_t), $T0d
+ lea 16*4($in_t), $in_t
+
+ pand $TMP0, $T0a
+ pand $TMP0, $T0b
+ por $T0a, $Ra
+ pand $TMP0, $T0c
+ por $T0b, $Rb
+ pand $TMP0, $T0d
+ por $T0c, $Rc
+ prefetcht0 255($in_t)
+ por $T0d, $Rd
+
+ dec %rax
+ jnz .Lselect_loop_sse_w7
+
+ movdqu $Ra, 16*0($val)
+ movdqu $Rb, 16*1($val)
+ movdqu $Rc, 16*2($val)
+ movdqu $Rd, 16*3($val)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp), %xmm6
+ movaps 0x10(%rsp), %xmm7
+ movaps 0x20(%rsp), %xmm8
+ movaps 0x30(%rsp), %xmm9
+ movaps 0x40(%rsp), %xmm10
+ movaps 0x50(%rsp), %xmm11
+ movaps 0x60(%rsp), %xmm12
+ movaps 0x70(%rsp), %xmm13
+ movaps 0x80(%rsp), %xmm14
+ movaps 0x90(%rsp), %xmm15
+ lea 0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_select_w7:
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+___
+}
+if ($avx>1) {
+my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
+my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
+my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
+.align 32
+ecp_nistz256_avx2_select_w5:
+.Lavx2_select_w5:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_avx2_select_w5:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+ vmovdqa .LTwo(%rip), $TWO
+
+ vpxor $Ra, $Ra, $Ra
+ vpxor $Rb, $Rb, $Rb
+ vpxor $Rc, $Rc, $Rc
+
+ vmovdqa .LOne(%rip), $M0
+ vmovdqa .LTwo(%rip), $M1
+
+ vmovd $index, %xmm1
+ vpermd $INDEX, $Ra, $INDEX
+
+ mov \$8, %rax
+.Lselect_loop_avx2_w5:
+
+ vmovdqa 32*0($in_t), $T0a
+ vmovdqa 32*1($in_t), $T0b
+ vmovdqa 32*2($in_t), $T0c
+
+ vmovdqa 32*3($in_t), $T1a
+ vmovdqa 32*4($in_t), $T1b
+ vmovdqa 32*5($in_t), $T1c
+
+ vpcmpeqd $INDEX, $M0, $TMP0
+ vpcmpeqd $INDEX, $M1, $TMP1
+
+ vpaddd $TWO, $M0, $M0
+ vpaddd $TWO, $M1, $M1
+ lea 32*6($in_t), $in_t
+
+ vpand $TMP0, $T0a, $T0a
+ vpand $TMP0, $T0b, $T0b
+ vpand $TMP0, $T0c, $T0c
+ vpand $TMP1, $T1a, $T1a
+ vpand $TMP1, $T1b, $T1b
+ vpand $TMP1, $T1c, $T1c
+
+ vpxor $T0a, $Ra, $Ra
+ vpxor $T0b, $Rb, $Rb
+ vpxor $T0c, $Rc, $Rc
+ vpxor $T1a, $Ra, $Ra
+ vpxor $T1b, $Rb, $Rb
+ vpxor $T1c, $Rc, $Rc
+
+ dec %rax
+ jnz .Lselect_loop_avx2_w5
+
+ vmovdqu $Ra, 32*0($val)
+ vmovdqu $Rb, 32*1($val)
+ vmovdqu $Rc, 32*2($val)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp), %xmm6
+ movaps 0x10(%rsp), %xmm7
+ movaps 0x20(%rsp), %xmm8
+ movaps 0x30(%rsp), %xmm9
+ movaps 0x40(%rsp), %xmm10
+ movaps 0x50(%rsp), %xmm11
+ movaps 0x60(%rsp), %xmm12
+ movaps 0x70(%rsp), %xmm13
+ movaps 0x80(%rsp), %xmm14
+ movaps 0x90(%rsp), %xmm15
+ lea 0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_avx2_select_w5:
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+___
+}
+if ($avx>1) {
+my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
+my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
+my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
+my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
+
+$code.=<<___;
+
+################################################################################
+# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_avx2_select_w7
+.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
+.align 32
+ecp_nistz256_avx2_select_w7:
+.Lavx2_select_w7:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_avx2_select_w7:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+ vmovdqa .LThree(%rip), $THREE
+
+ vpxor $Ra, $Ra, $Ra
+ vpxor $Rb, $Rb, $Rb
+
+ vmovdqa .LOne(%rip), $M0
+ vmovdqa .LTwo(%rip), $M1
+ vmovdqa .LThree(%rip), $M2
+
+ vmovd $index, %xmm1
+ vpermd $INDEX, $Ra, $INDEX
+ # Skip index = 0, because it is implicitly the point at infinity
+
+ mov \$21, %rax
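+ # 21 iterations of 3 entries cover 63 of the 64 table slots; the
+ # tail after the loop picks up the last one.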
+.Lselect_loop_avx2_w7:
+
+ vmovdqa 32*0($in_t), $T0a
+ vmovdqa 32*1($in_t), $T0b
+
+ vmovdqa 32*2($in_t), $T1a
+ vmovdqa 32*3($in_t), $T1b
+
+ vmovdqa 32*4($in_t), $T2a
+ vmovdqa 32*5($in_t), $T2b
+
+ vpcmpeqd $INDEX, $M0, $TMP0
+ vpcmpeqd $INDEX, $M1, $TMP1
+ vpcmpeqd $INDEX, $M2, $TMP2
+
+ vpaddd $THREE, $M0, $M0
+ vpaddd $THREE, $M1, $M1
+ vpaddd $THREE, $M2, $M2
+ lea 32*6($in_t), $in_t
+
+ vpand $TMP0, $T0a, $T0a
+ vpand $TMP0, $T0b, $T0b
+ vpand $TMP1, $T1a, $T1a
+ vpand $TMP1, $T1b, $T1b
+ vpand $TMP2, $T2a, $T2a
+ vpand $TMP2, $T2b, $T2b
+
+ vpxor $T0a, $Ra, $Ra
+ vpxor $T0b, $Rb, $Rb
+ vpxor $T1a, $Ra, $Ra
+ vpxor $T1b, $Rb, $Rb
+ vpxor $T2a, $Ra, $Ra
+ vpxor $T2b, $Rb, $Rb
+
+ dec %rax
+ jnz .Lselect_loop_avx2_w7
+
+ vmovdqa 32*0($in_t), $T0a
+ vmovdqa 32*1($in_t), $T0b
+
+ vpcmpeqd $INDEX, $M0, $TMP0
+
+ vpand $TMP0, $T0a, $T0a
+ vpand $TMP0, $T0b, $T0b
+
+ vpxor $T0a, $Ra, $Ra
+ vpxor $T0b, $Rb, $Rb
+
+ vmovdqu $Ra, 32*0($val)
+ vmovdqu $Rb, 32*1($val)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp), %xmm6
+ movaps 0x10(%rsp), %xmm7
+ movaps 0x20(%rsp), %xmm8
+ movaps 0x30(%rsp), %xmm9
+ movaps 0x40(%rsp), %xmm10
+ movaps 0x50(%rsp), %xmm11
+ movaps 0x60(%rsp), %xmm12
+ movaps 0x70(%rsp), %xmm13
+ movaps 0x80(%rsp), %xmm14
+ movaps 0x90(%rsp), %xmm15
+ lea 0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_avx2_select_w7:
+___
+$code.=<<___;
+ ret
+.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+___
+} else {
+$code.=<<___;
+.globl ecp_nistz256_avx2_select_w7
+.type ecp_nistz256_avx2_select_w7,\@function,3
+.align 32
+ecp_nistz256_avx2_select_w7:
+ .byte 0x0f,0x0b # ud2
+ ret
+.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+___
+}
+{{{
+########################################################################
+# This block implements the higher-level point_double, point_add and
+# point_add_affine operations. The key to performance here is to let
+# the out-of-order execution logic overlap computations from the next
+# step with tail processing from the current step. By using a tailored
+# calling sequence we minimize inter-step overhead and give the
+# processor a better shot at overlapping operations...
+#
+# You will notice that input data is copied to the stack. The trouble
+# is that there are no registers to spare for holding the original
+# pointers, and reloading them would create undesired dependencies on
+# the effective-address calculation paths. In other words, it is all
+# done to favour the out-of-order execution logic.
+# <appro@openssl.org>
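+#
+# For reference, reading the inline comments in gen_double() together,
+# the emitted sequence follows the usual Jacobian doubling formulas
+# (a sketch, not library code; names match the comments below):
+#
+#	S = 2*Y1;  Zsqr = Z1^2;  S = S^2;		# S = 4*Y1^2
+#	Z3 = 2*Y1*Z1;
+#	M = (X1 + Zsqr)*(X1 - Zsqr);  M = 3*M;		# 3*(X1^2 - Z1^4)
+#	Y3 = (S^2)/2;					# 8*Y1^4
+#	S = S*X1;  tmp0 = 2*S;				# S = 4*X1*Y1^2
+#	X3 = M^2 - tmp0;
+#	Y3 = M*(S - X3) - Y3;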
+
+my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
+my ($poly1,$poly3)=($acc6,$acc7);
+
+sub load_for_mul () {
+my ($a,$b,$src0) = @_;
+my $bias = $src0 eq "%rax" ? 0 : -128;
+
+" mov $b, $src0
+ lea $b, $b_ptr
+ mov 8*0+$a, $acc1
+ mov 8*1+$a, $acc2
+ lea $bias+$a, $a_ptr
+ mov 8*2+$a, $acc3
+ mov 8*3+$a, $acc4"
+}
+
+sub load_for_sqr () {
+my ($a,$src0) = @_;
+my $bias = $src0 eq "%rax" ? 0 : -128;
+
+" mov 8*0+$a, $src0
+ mov 8*1+$a, $acc6
+ lea $bias+$a, $a_ptr
+ mov 8*2+$a, $acc7
+ mov 8*3+$a, $acc0"
+}
+
+ {
+########################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
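+# i.e. a 256-bit operand lives in (acc4,acc5,acc0,acc1), exactly where
+# __ecp_nistz256_mul_mont and __ecp_nistz256_sqr_mont leave their
+# result, so the helpers below can be chained without register moves.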
+
+$code.=<<___;
+.type __ecp_nistz256_add_toq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_add_toq:
+ add 8*0($b_ptr), $a0
+ adc 8*1($b_ptr), $a1
+ mov $a0, $t0
+ adc 8*2($b_ptr), $a2
+ adc 8*3($b_ptr), $a3
+ mov $a1, $t1
+ sbb $t4, $t4
+
+ sub \$-1, $a0
+ mov $a2, $t2
+ sbb $poly1, $a1
+ sbb \$0, $a2
+ mov $a3, $t3
+ sbb $poly3, $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
+
+.type __ecp_nistz256_sub_fromq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sub_fromq:
+ sub 8*0($b_ptr), $a0
+ sbb 8*1($b_ptr), $a1
+ mov $a0, $t0
+ sbb 8*2($b_ptr), $a2
+ sbb 8*3($b_ptr), $a3
+ mov $a1, $t1
+ sbb $t4, $t4
+
+ add \$-1, $a0
+ mov $a2, $t2
+ adc $poly1, $a1
+ adc \$0, $a2
+ mov $a3, $t3
+ adc $poly3, $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
+
+.type __ecp_nistz256_subq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_subq:
+ sub $a0, $t0
+ sbb $a1, $t1
+ mov $t0, $a0
+ sbb $a2, $t2
+ sbb $a3, $t3
+ mov $t1, $a1
+ sbb $t4, $t4
+
+ add \$-1, $t0
+ mov $t2, $a2
+ adc $poly1, $t1
+ adc \$0, $t2
+ mov $t3, $a3
+ adc $poly3, $t3
+ test $t4, $t4
+
+ cmovnz $t0, $a0
+ cmovnz $t1, $a1
+ cmovnz $t2, $a2
+ cmovnz $t3, $a3
+
+ ret
+.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
+
+.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_by_2q:
+ add $a0, $a0 # a0:a3+a0:a3
+ adc $a1, $a1
+ mov $a0, $t0
+ adc $a2, $a2
+ adc $a3, $a3
+ mov $a1, $t1
+ sbb $t4, $t4
+
+ sub \$-1, $a0
+ mov $a2, $t2
+ sbb $poly1, $a1
+ sbb \$0, $a2
+ mov $a3, $t3
+ sbb $poly3, $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+___
+ }
+sub gen_double () {
+ my $x = shift;
+ my ($src0,$sfx,$bias);
+ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
+
+ if ($x ne "x") {
+ $src0 = "%rax";
+ $sfx = "";
+ $bias = 0;
+
+$code.=<<___;
+.globl ecp_nistz256_point_double
+.type ecp_nistz256_point_double,\@function,2
+.align 32
+ecp_nistz256_point_double:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lpoint_doublex
+___
+ } else {
+ $src0 = "%rdx";
+ $sfx = "x";
+ $bias = 128;
+
+$code.=<<___;
+.type ecp_nistz256_point_doublex,\@function,2
+.align 32
+ecp_nistz256_point_doublex:
+.Lpoint_doublex:
+___
+ }
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$32*5+8, %rsp
+
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
+ mov $a_ptr, $b_ptr # backup copy
+ movdqu 0x10($a_ptr), %xmm1
+ mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
+ mov 0x20+8*1($a_ptr), $acc5
+ mov 0x20+8*2($a_ptr), $acc0
+ mov 0x20+8*3($a_ptr), $acc1
+ mov .Lpoly+8*1(%rip), $poly1
+ mov .Lpoly+8*3(%rip), $poly3
+ movdqa %xmm0, $in_x(%rsp)
+ movdqa %xmm1, $in_x+0x10(%rsp)
+ lea 0x20($r_ptr), $acc2
+ lea 0x40($r_ptr), $acc3
+ movq $r_ptr, %xmm0
+ movq $acc2, %xmm1
+ movq $acc3, %xmm2
+
+ lea $S(%rsp), $r_ptr
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
+
+ mov 0x40+8*0($a_ptr), $src0
+ mov 0x40+8*1($a_ptr), $acc6
+ mov 0x40+8*2($a_ptr), $acc7
+ mov 0x40+8*3($a_ptr), $acc0
+ lea 0x40-$bias($a_ptr), $a_ptr
+ lea $Zsqr(%rsp), $r_ptr
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
+
+ `&load_for_sqr("$S(%rsp)", "$src0")`
+ lea $S(%rsp), $r_ptr
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
+
+ mov 0x20($b_ptr), $src0 # $b_ptr is still valid
+ mov 0x40+8*0($b_ptr), $acc1
+ mov 0x40+8*1($b_ptr), $acc2
+ mov 0x40+8*2($b_ptr), $acc3
+ mov 0x40+8*3($b_ptr), $acc4
+ lea 0x40-$bias($b_ptr), $a_ptr
+ lea 0x20($b_ptr), $b_ptr
+ movq %xmm2, $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
+
+ mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
+ mov $in_x+8*1(%rsp), $acc5
+ lea $Zsqr(%rsp), $b_ptr
+ mov $in_x+8*2(%rsp), $acc0
+ mov $in_x+8*3(%rsp), $acc1
+ lea $M(%rsp), $r_ptr
+ call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
+
+ mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
+ mov $in_x+8*1(%rsp), $acc5
+ lea $Zsqr(%rsp), $b_ptr
+ mov $in_x+8*2(%rsp), $acc0
+ mov $in_x+8*3(%rsp), $acc1
+ lea $Zsqr(%rsp), $r_ptr
+ call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
+
+ `&load_for_sqr("$S(%rsp)", "$src0")`
+ movq %xmm1, $r_ptr
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
+___
+{
+######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
+# operate in 4-5-6-7 "name space" that matches squaring output
+#
+my ($poly1,$poly3)=($a_ptr,$t1);
+my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
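+# The division is done branch-free: conditionally add p (selected with
+# cmov on the low bit so the value becomes even), then shift the value,
+# including the carry bit kept in t4, right by one.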
+
+$code.=<<___;
+ xor $t4, $t4
+ mov $a0, $t0
+ add \$-1, $a0
+ mov $a1, $t1
+ adc $poly1, $a1
+ mov $a2, $t2
+ adc \$0, $a2
+ mov $a3, $t3
+ adc $poly3, $a3
+ adc \$0, $t4
+ xor $a_ptr, $a_ptr # borrow $a_ptr
+ test \$1, $t0
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ cmovz $t2, $a2
+ cmovz $t3, $a3
+ cmovz $a_ptr, $t4
+
+ mov $a1, $t0 # a0:a3>>1
+ shr \$1, $a0
+ shl \$63, $t0
+ mov $a2, $t1
+ shr \$1, $a1
+ or $t0, $a0
+ shl \$63, $t1
+ mov $a3, $t2
+ shr \$1, $a2
+ or $t1, $a1
+ shl \$63, $t2
+ mov $a0, 8*0($r_ptr)
+ shr \$1, $a3
+ mov $a1, 8*1($r_ptr)
+ shl \$63, $t4
+ or $t2, $a2
+ or $t4, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+___
+}
+$code.=<<___;
+ `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
+ lea $M(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
+
+ lea $tmp0(%rsp), $r_ptr
+ call __ecp_nistz256_mul_by_2$x
+
+ lea $M(%rsp), $b_ptr
+ lea $M(%rsp), $r_ptr
+ call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
+
+ `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
+ lea $S(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
+
+ lea $tmp0(%rsp), $r_ptr
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
+
+ `&load_for_sqr("$M(%rsp)", "$src0")`
+ movq %xmm0, $r_ptr
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
+
+ lea $tmp0(%rsp), $b_ptr
+ mov $acc6, $acc0 # harmonize sqr output and sub input
+ mov $acc7, $acc1
+ mov $a_ptr, $poly1
+ mov $t1, $poly3
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
+
+ mov $S+8*0(%rsp), $t0
+ mov $S+8*1(%rsp), $t1
+ mov $S+8*2(%rsp), $t2
+ mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
+ lea $S(%rsp), $r_ptr
+ call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
+
+ mov $M(%rsp), $src0
+ lea $M(%rsp), $b_ptr
+ mov $acc4, $acc6 # harmonize sub output and mul input
+ xor %ecx, %ecx
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
+ mov $acc5, $acc2
+ mov $acc5, $S+8*1(%rsp)
+ cmovz $acc0, $acc3
+ mov $acc0, $S+8*2(%rsp)
+ lea $S-$bias(%rsp), $a_ptr
+ cmovz $acc1, $acc4
+ mov $acc1, $S+8*3(%rsp)
+ mov $acc6, $acc1
+ lea $S(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
+
+ movq %xmm1, $b_ptr
+ movq %xmm1, $r_ptr
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
+
+ add \$32*5+8, %rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
+___
+}
+&gen_double("q");
+
+sub gen_add () {
+ my $x = shift;
+ my ($src0,$sfx,$bias);
+ my ($H,$Hsqr,$R,$Rsqr,$Hcub,
+ $U1,$U2,$S1,$S2,
+ $res_x,$res_y,$res_z,
+ $in1_x,$in1_y,$in1_z,
+ $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
+ my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
+
+ if ($x ne "x") {
+ $src0 = "%rax";
+ $sfx = "";
+ $bias = 0;
+
+$code.=<<___;
+.globl ecp_nistz256_point_add
+.type ecp_nistz256_point_add,\@function,3
+.align 32
+ecp_nistz256_point_add:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lpoint_addx
+___
+ } else {
+ $src0 = "%rdx";
+ $sfx = "x";
+ $bias = 128;
+
+$code.=<<___;
+.type ecp_nistz256_point_addx,\@function,3
+.align 32
+ecp_nistz256_point_addx:
+.Lpoint_addx:
+___
+ }
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$32*18+8, %rsp
+
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
+ movdqu 0x10($a_ptr), %xmm1
+ movdqu 0x20($a_ptr), %xmm2
+ movdqu 0x30($a_ptr), %xmm3
+ movdqu 0x40($a_ptr), %xmm4
+ movdqu 0x50($a_ptr), %xmm5
+ mov $a_ptr, $b_ptr # reassign
+ mov $b_org, $a_ptr # reassign
+ movdqa %xmm0, $in1_x(%rsp)
+ movdqa %xmm1, $in1_x+0x10(%rsp)
+ por %xmm0, %xmm1
+ movdqa %xmm2, $in1_y(%rsp)
+ movdqa %xmm3, $in1_y+0x10(%rsp)
+ por %xmm2, %xmm3
+ movdqa %xmm4, $in1_z(%rsp)
+ movdqa %xmm5, $in1_z+0x10(%rsp)
+ por %xmm1, %xmm3
+
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
+ pshufd \$0xb1, %xmm3, %xmm5
+ movdqu 0x10($a_ptr), %xmm1
+ movdqu 0x20($a_ptr), %xmm2
+ por %xmm3, %xmm5
+ movdqu 0x30($a_ptr), %xmm3
+ mov 0x40+8*0($a_ptr), $src0 # load original in2_z
+ mov 0x40+8*1($a_ptr), $acc6
+ mov 0x40+8*2($a_ptr), $acc7
+ mov 0x40+8*3($a_ptr), $acc0
+ movdqa %xmm0, $in2_x(%rsp)
+ pshufd \$0x1e, %xmm5, %xmm4
+ movdqa %xmm1, $in2_x+0x10(%rsp)
+ por %xmm0, %xmm1
+ movq $r_ptr, %xmm0 # save $r_ptr
+ movdqa %xmm2, $in2_y(%rsp)
+ movdqa %xmm3, $in2_y+0x10(%rsp)
+ por %xmm2, %xmm3
+ por %xmm4, %xmm5
+ pxor %xmm4, %xmm4
+ por %xmm1, %xmm3
+
+ lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
+ mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
+ mov $acc6, $in2_z+8*1(%rsp)
+ mov $acc7, $in2_z+8*2(%rsp)
+ mov $acc0, $in2_z+8*3(%rsp)
+ lea $Z2sqr(%rsp), $r_ptr # Z2^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
+
+ pcmpeqd %xmm4, %xmm5
+ pshufd \$0xb1, %xmm3, %xmm4
+ por %xmm3, %xmm4
+ pshufd \$0, %xmm5, %xmm5 # in1infty
+ pshufd \$0x1e, %xmm4, %xmm3
+ por %xmm3, %xmm4
+ pxor %xmm3, %xmm3
+ pcmpeqd %xmm3, %xmm4
+ pshufd \$0, %xmm4, %xmm4 # in2infty
+ mov 0x40+8*0($b_ptr), $src0 # load original in1_z
+ mov 0x40+8*1($b_ptr), $acc6
+ mov 0x40+8*2($b_ptr), $acc7
+ mov 0x40+8*3($b_ptr), $acc0
+
+ lea 0x40-$bias($b_ptr), $a_ptr
+ lea $Z1sqr(%rsp), $r_ptr # Z1^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
+
+ `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
+ lea $S1(%rsp), $r_ptr # S1 = Z2^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
+
+ `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
+ lea $S2(%rsp), $r_ptr # S2 = Z1^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
+
+ `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
+ lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
+
+ `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
+ lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
+
+ lea $S1(%rsp), $b_ptr
+ lea $R(%rsp), $r_ptr # R = S2 - S1
+ call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
+
+ or $acc5, $acc4 # see if result is zero
+ movdqa %xmm4, %xmm2
+ or $acc0, $acc4
+ or $acc1, $acc4
+ por %xmm5, %xmm2 # in1infty || in2infty
+ movq $acc4, %xmm3
+
+ `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
+ lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
+
+ `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
+ lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
+
+ lea $U1(%rsp), $b_ptr
+ lea $H(%rsp), $r_ptr # H = U2 - U1
+ call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
+
+ or $acc5, $acc4 # see if result is zero
+ or $acc0, $acc4
+ or $acc1, $acc4
+
+ .byte 0x3e # predict taken
+ jnz .Ladd_proceed$x # is_equal(U1,U2)?
+ movq %xmm2, $acc0
+ movq %xmm3, $acc1
+ test $acc0, $acc0
+ jnz .Ladd_proceed$x # (in1infty || in2infty)?
+ test $acc1, $acc1
+ jz .Ladd_proceed$x # is_equal(S1,S2)?
+
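+ # Both inputs are finite and share an x-coordinate (H == 0) while
+ # S1 != S2, i.e. the points are inverses of each other, so the sum
+ # is the point at infinity; it is encoded as all-zero words below.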
+ movq %xmm0, $r_ptr # restore $r_ptr
+ pxor %xmm0, %xmm0
+ movdqu %xmm0, 0x00($r_ptr)
+ movdqu %xmm0, 0x10($r_ptr)
+ movdqu %xmm0, 0x20($r_ptr)
+ movdqu %xmm0, 0x30($r_ptr)
+ movdqu %xmm0, 0x40($r_ptr)
+ movdqu %xmm0, 0x50($r_ptr)
+ jmp .Ladd_done$x
+
+.align 32
+.Ladd_proceed$x:
+ `&load_for_sqr("$R(%rsp)", "$src0")`
+ lea $Rsqr(%rsp), $r_ptr # R^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
+
+ `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
+
+ `&load_for_sqr("$H(%rsp)", "$src0")`
+ lea $Hsqr(%rsp), $r_ptr # H^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
+
+ `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
+
+ `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
+ lea $Hcub(%rsp), $r_ptr # H^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
+
+ `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
+ lea $U2(%rsp), $r_ptr # U1*H^2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
+___
+{
+#######################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
+my ($poly1, $poly3)=($acc6,$acc7);
+
+$code.=<<___;
+ #lea $U2(%rsp), $a_ptr
+ #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
+ #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+
+ add $acc0, $acc0 # a0:a3+a0:a3
+ lea $Rsqr(%rsp), $a_ptr
+ adc $acc1, $acc1
+ mov $acc0, $t0
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ mov $acc1, $t1
+ sbb $t4, $t4
+
+ sub \$-1, $acc0
+ mov $acc2, $t2
+ sbb $poly1, $acc1
+ sbb \$0, $acc2
+ mov $acc3, $t3
+ sbb $poly3, $acc3
+ test $t4, $t4
+
+ cmovz $t0, $acc0
+ mov 8*0($a_ptr), $t0
+ cmovz $t1, $acc1
+ mov 8*1($a_ptr), $t1
+ cmovz $t2, $acc2
+ mov 8*2($a_ptr), $t2
+ cmovz $t3, $acc3
+ mov 8*3($a_ptr), $t3
+
+ call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
+
+ lea $Hcub(%rsp), $b_ptr
+ lea $res_x(%rsp), $r_ptr
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
+
+ mov $U2+8*0(%rsp), $t0
+ mov $U2+8*1(%rsp), $t1
+ mov $U2+8*2(%rsp), $t2
+ mov $U2+8*3(%rsp), $t3
+ lea $res_y(%rsp), $r_ptr
+
+ call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
+
+ mov $acc0, 8*0($r_ptr) # save the result, as
+ mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
+___
+}
+$code.=<<___;
+ `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
+ lea $S2(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
+
+ `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
+ lea $res_y(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
+
+ lea $S2(%rsp), $b_ptr
+ lea $res_y(%rsp), $r_ptr
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
+
+ movq %xmm0, $r_ptr # restore $r_ptr
+
+ movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
+ movdqa %xmm5, %xmm1
+ pandn $res_z(%rsp), %xmm0
+ movdqa %xmm5, %xmm2
+ pandn $res_z+0x10(%rsp), %xmm1
+ movdqa %xmm5, %xmm3
+ pand $in2_z(%rsp), %xmm2
+ pand $in2_z+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+
+ movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
+ movdqa %xmm4, %xmm1
+ pandn %xmm2, %xmm0
+ movdqa %xmm4, %xmm2
+ pandn %xmm3, %xmm1
+ movdqa %xmm4, %xmm3
+ pand $in1_z(%rsp), %xmm2
+ pand $in1_z+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+ movdqu %xmm2, 0x40($r_ptr)
+ movdqu %xmm3, 0x50($r_ptr)
+
+ movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
+ movdqa %xmm5, %xmm1
+ pandn $res_x(%rsp), %xmm0
+ movdqa %xmm5, %xmm2
+ pandn $res_x+0x10(%rsp), %xmm1
+ movdqa %xmm5, %xmm3
+ pand $in2_x(%rsp), %xmm2
+ pand $in2_x+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+
+ movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
+ movdqa %xmm4, %xmm1
+ pandn %xmm2, %xmm0
+ movdqa %xmm4, %xmm2
+ pandn %xmm3, %xmm1
+ movdqa %xmm4, %xmm3
+ pand $in1_x(%rsp), %xmm2
+ pand $in1_x+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+ movdqu %xmm2, 0x00($r_ptr)
+ movdqu %xmm3, 0x10($r_ptr)
+
+ movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
+ movdqa %xmm5, %xmm1
+ pandn $res_y(%rsp), %xmm0
+ movdqa %xmm5, %xmm2
+ pandn $res_y+0x10(%rsp), %xmm1
+ movdqa %xmm5, %xmm3
+ pand $in2_y(%rsp), %xmm2
+ pand $in2_y+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+
+ movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
+ movdqa %xmm4, %xmm1
+ pandn %xmm2, %xmm0
+ movdqa %xmm4, %xmm2
+ pandn %xmm3, %xmm1
+ movdqa %xmm4, %xmm3
+ pand $in1_y(%rsp), %xmm2
+ pand $in1_y+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+ movdqu %xmm2, 0x20($r_ptr)
+ movdqu %xmm3, 0x30($r_ptr)
+
+.Ladd_done$x:
+ add \$32*18+8, %rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
+___
+}
+&gen_add("q");
+
+sub gen_add_affine () {
+ my $x = shift;
+ my ($src0,$sfx,$bias);
+ my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
+ $res_x,$res_y,$res_z,
+ $in1_x,$in1_y,$in1_z,
+ $in2_x,$in2_y)=map(32*$_,(0..14));
+ my $Z1sqr = $S2;
+
+ if ($x ne "x") {
+ $src0 = "%rax";
+ $sfx = "";
+ $bias = 0;
+
+$code.=<<___;
+.globl ecp_nistz256_point_add_affine
+.type ecp_nistz256_point_add_affine,\@function,3
+.align 32
+ecp_nistz256_point_add_affine:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lpoint_add_affinex
+___
+ } else {
+ $src0 = "%rdx";
+ $sfx = "x";
+ $bias = 128;
+
+$code.=<<___;
+.type ecp_nistz256_point_add_affinex,\@function,3
+.align 32
+ecp_nistz256_point_add_affinex:
+.Lpoint_add_affinex:
+___
+ }
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$32*15+8, %rsp
+
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
+ mov $b_org, $b_ptr # reassign
+ movdqu 0x10($a_ptr), %xmm1
+ movdqu 0x20($a_ptr), %xmm2
+ movdqu 0x30($a_ptr), %xmm3
+ movdqu 0x40($a_ptr), %xmm4
+ movdqu 0x50($a_ptr), %xmm5
+ mov 0x40+8*0($a_ptr), $src0 # load original in1_z
+ mov 0x40+8*1($a_ptr), $acc6
+ mov 0x40+8*2($a_ptr), $acc7
+ mov 0x40+8*3($a_ptr), $acc0
+ movdqa %xmm0, $in1_x(%rsp)
+ movdqa %xmm1, $in1_x+0x10(%rsp)
+ por %xmm0, %xmm1
+ movdqa %xmm2, $in1_y(%rsp)
+ movdqa %xmm3, $in1_y+0x10(%rsp)
+ por %xmm2, %xmm3
+ movdqa %xmm4, $in1_z(%rsp)
+ movdqa %xmm5, $in1_z+0x10(%rsp)
+ por %xmm1, %xmm3
+
+ movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
+ pshufd \$0xb1, %xmm3, %xmm5
+ movdqu 0x10($b_ptr), %xmm1
+ movdqu 0x20($b_ptr), %xmm2
+ por %xmm3, %xmm5
+ movdqu 0x30($b_ptr), %xmm3
+ movdqa %xmm0, $in2_x(%rsp)
+ pshufd \$0x1e, %xmm5, %xmm4
+ movdqa %xmm1, $in2_x+0x10(%rsp)
+ por %xmm0, %xmm1
+ movq $r_ptr, %xmm0 # save $r_ptr
+ movdqa %xmm2, $in2_y(%rsp)
+ movdqa %xmm3, $in2_y+0x10(%rsp)
+ por %xmm2, %xmm3
+ por %xmm4, %xmm5
+ pxor %xmm4, %xmm4
+ por %xmm1, %xmm3
+
+ lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
+ lea $Z1sqr(%rsp), $r_ptr # Z1^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
+
+ pcmpeqd %xmm4, %xmm5
+ pshufd \$0xb1, %xmm3, %xmm4
+ mov 0x00($b_ptr), $src0 # $b_ptr is still valid
+ #lea 0x00($b_ptr), $b_ptr
+ mov $acc4, $acc1 # harmonize sqr output and mul input
+ por %xmm3, %xmm4
+ pshufd \$0, %xmm5, %xmm5 # in1infty
+ pshufd \$0x1e, %xmm4, %xmm3
+ mov $acc5, $acc2
+ por %xmm3, %xmm4
+ pxor %xmm3, %xmm3
+ mov $acc6, $acc3
+ pcmpeqd %xmm3, %xmm4
+ pshufd \$0, %xmm4, %xmm4 # in2infty
+
+ lea $Z1sqr-$bias(%rsp), $a_ptr
+ mov $acc7, $acc4
+ lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
+
+ lea $in1_x(%rsp), $b_ptr
+ lea $H(%rsp), $r_ptr # H = U2 - U1
+ call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
+
+ `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
+ lea $S2(%rsp), $r_ptr # S2 = Z1^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
+
+ `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
+
+ `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
+ lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
+
+ lea $in1_y(%rsp), $b_ptr
+ lea $R(%rsp), $r_ptr # R = S2 - S1
+ call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
+
+ `&load_for_sqr("$H(%rsp)", "$src0")`
+ lea $Hsqr(%rsp), $r_ptr # H^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
+
+ `&load_for_sqr("$R(%rsp)", "$src0")`
+ lea $Rsqr(%rsp), $r_ptr # R^2
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
+
+ `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
+ lea $Hcub(%rsp), $r_ptr # H^3
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
+
+ `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
+ lea $U2(%rsp), $r_ptr # U1*H^2
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
+___
+{
+#######################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
+my ($poly1, $poly3)=($acc6,$acc7);
+
+$code.=<<___;
+ #lea $U2(%rsp), $a_ptr
+ #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
+ #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+
+ add $acc0, $acc0 # a0:a3+a0:a3
+ lea $Rsqr(%rsp), $a_ptr
+ adc $acc1, $acc1
+ mov $acc0, $t0
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ mov $acc1, $t1
+ sbb $t4, $t4
+
+ sub \$-1, $acc0
+ mov $acc2, $t2
+ sbb $poly1, $acc1
+ sbb \$0, $acc2
+ mov $acc3, $t3
+ sbb $poly3, $acc3
+ test $t4, $t4
+
+ cmovz $t0, $acc0
+ mov 8*0($a_ptr), $t0
+ cmovz $t1, $acc1
+ mov 8*1($a_ptr), $t1
+ cmovz $t2, $acc2
+ mov 8*2($a_ptr), $t2
+ cmovz $t3, $acc3
+ mov 8*3($a_ptr), $t3
+
+ call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
+
+ lea $Hcub(%rsp), $b_ptr
+ lea $res_x(%rsp), $r_ptr
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
+
+ mov $U2+8*0(%rsp), $t0
+ mov $U2+8*1(%rsp), $t1
+ mov $U2+8*2(%rsp), $t2
+ mov $U2+8*3(%rsp), $t3
+ lea $H(%rsp), $r_ptr
+
+ call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
+
+ mov $acc0, 8*0($r_ptr) # save the result, as
+ mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
+___
+}
+$code.=<<___;
+ `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
+ lea $S2(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
+
+ `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
+ lea $H(%rsp), $r_ptr
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
+
+ lea $S2(%rsp), $b_ptr
+ lea $res_y(%rsp), $r_ptr
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
+
+ movq %xmm0, $r_ptr # restore $r_ptr
+
+ movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
+ movdqa %xmm5, %xmm1
+ pandn $res_z(%rsp), %xmm0
+ movdqa %xmm5, %xmm2
+ pandn $res_z+0x10(%rsp), %xmm1
+ movdqa %xmm5, %xmm3
+ pand .LONE_mont(%rip), %xmm2
+ pand .LONE_mont+0x10(%rip), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+
+ movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
+ movdqa %xmm4, %xmm1
+ pandn %xmm2, %xmm0
+ movdqa %xmm4, %xmm2
+ pandn %xmm3, %xmm1
+ movdqa %xmm4, %xmm3
+ pand $in1_z(%rsp), %xmm2
+ pand $in1_z+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+ movdqu %xmm2, 0x40($r_ptr)
+ movdqu %xmm3, 0x50($r_ptr)
+
+ movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
+ movdqa %xmm5, %xmm1
+ pandn $res_x(%rsp), %xmm0
+ movdqa %xmm5, %xmm2
+ pandn $res_x+0x10(%rsp), %xmm1
+ movdqa %xmm5, %xmm3
+ pand $in2_x(%rsp), %xmm2
+ pand $in2_x+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+
+ movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
+ movdqa %xmm4, %xmm1
+ pandn %xmm2, %xmm0
+ movdqa %xmm4, %xmm2
+ pandn %xmm3, %xmm1
+ movdqa %xmm4, %xmm3
+ pand $in1_x(%rsp), %xmm2
+ pand $in1_x+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+ movdqu %xmm2, 0x00($r_ptr)
+ movdqu %xmm3, 0x10($r_ptr)
+
+ movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
+ movdqa %xmm5, %xmm1
+ pandn $res_y(%rsp), %xmm0
+ movdqa %xmm5, %xmm2
+ pandn $res_y+0x10(%rsp), %xmm1
+ movdqa %xmm5, %xmm3
+ pand $in2_y(%rsp), %xmm2
+ pand $in2_y+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+
+ movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
+ movdqa %xmm4, %xmm1
+ pandn %xmm2, %xmm0
+ movdqa %xmm4, %xmm2
+ pandn %xmm3, %xmm1
+ movdqa %xmm4, %xmm3
+ pand $in1_y(%rsp), %xmm2
+ pand $in1_y+0x10(%rsp), %xmm3
+ por %xmm0, %xmm2
+ por %xmm1, %xmm3
+ movdqu %xmm2, 0x20($r_ptr)
+ movdqu %xmm3, 0x30($r_ptr)
+
+ add \$32*15+8, %rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
+___
+}
+&gen_add_affine("q");
+
+########################################################################
+# AD*X magic
+#
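+# The "x" subroutines below are carry-flag-discipline variants of their
+# "q" twins: CF is cleared explicitly up front (xor) and propagated with
+# adc/sbb, so they can be dropped into the mulx/adcx/adox code paths
+# selected when the BMI2 and ADX capability bits (mask 0x80100) are set.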
+if ($addx) { {
+########################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
+
+$code.=<<___;
+.type __ecp_nistz256_add_tox,\@abi-omnipotent
+.align 32
+__ecp_nistz256_add_tox:
+ xor $t4, $t4
+ adc 8*0($b_ptr), $a0
+ adc 8*1($b_ptr), $a1
+ mov $a0, $t0
+ adc 8*2($b_ptr), $a2
+ adc 8*3($b_ptr), $a3
+ mov $a1, $t1
+ adc \$0, $t4
+
+ xor $t3, $t3
+ sbb \$-1, $a0
+ mov $a2, $t2
+ sbb $poly1, $a1
+ sbb \$0, $a2
+ mov $a3, $t3
+ sbb $poly3, $a3
+
+ bt \$0, $t4
+ cmovnc $t0, $a0
+ cmovnc $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovnc $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovnc $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type __ecp_nistz256_sub_fromx,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sub_fromx:
+ xor $t4, $t4
+ sbb 8*0($b_ptr), $a0
+ sbb 8*1($b_ptr), $a1
+ mov $a0, $t0
+ sbb 8*2($b_ptr), $a2
+ sbb 8*3($b_ptr), $a3
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ xor $t3, $t3
+ adc \$-1, $a0
+ mov $a2, $t2
+ adc $poly1, $a1
+ adc \$0, $a2
+ mov $a3, $t3
+ adc $poly3, $a3
+
+ bt \$0, $t4
+ cmovnc $t0, $a0
+ cmovnc $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovnc $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovnc $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type __ecp_nistz256_subx,\@abi-omnipotent
+.align 32
+__ecp_nistz256_subx:
+ xor $t4, $t4
+ sbb $a0, $t0
+ sbb $a1, $t1
+ mov $t0, $a0
+ sbb $a2, $t2
+ sbb $a3, $t3
+ mov $t1, $a1
+ sbb \$0, $t4
+
+ xor $a3, $a3
+ adc \$-1, $t0
+ mov $t2, $a2
+ adc $poly1, $t1
+ adc \$0, $t2
+ mov $t3, $a3
+ adc $poly3, $t3
+
+ bt \$0, $t4
+ cmovc $t0, $a0
+ cmovc $t1, $a1
+ cmovc $t2, $a2
+ cmovc $t3, $a3
+
+ ret
+.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_by_2x:
+ xor $t4, $t4
+ adc $a0, $a0 # a0:a3+a0:a3
+ adc $a1, $a1
+ mov $a0, $t0
+ adc $a2, $a2
+ adc $a3, $a3
+ mov $a1, $t1
+ adc \$0, $t4
+
+ xor $t3, $t3
+ sbb \$-1, $a0
+ mov $a2, $t2
+ sbb $poly1, $a1
+ sbb \$0, $a2
+ mov $a3, $t3
+ sbb $poly3, $a3
+
+ bt \$0, $t4
+ cmovnc $t0, $a0
+ cmovnc $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovnc $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovnc $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ ret
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+___
+ }
+&gen_double("x");
+&gen_add("x");
+&gen_add_affine("x");
+}
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/ec/ec.h b/crypto/ec/ec.h
index c4e7aea938c2..6d3178f609f4 100644
--- a/crypto/ec/ec.h
+++ b/crypto/ec/ec.h
@@ -240,6 +240,12 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
*/
const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group);
+/** Returns the Montgomery data for order(Generator)
+ * \param group EC_GROUP object
+ * \return the Montgomery data for the order of the generator
+ * (possibly NULL).
+ */
+BN_MONT_CTX *EC_GROUP_get_mont_data(const EC_GROUP *group);
+
/** Gets the order of a EC_GROUP
* \param group EC_GROUP object
* \param order BIGNUM to which the order is copied
@@ -404,6 +410,9 @@ typedef struct {
*/
size_t EC_get_builtin_curves(EC_builtin_curve *r, size_t nitems);
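+/* Translate between curve NIDs and NIST aliases such as "P-256".
+ * EC_curve_nid2nist() returns NULL and EC_curve_nist2nid() returns
+ * NID_undef when the curve has no NIST name. */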
+const char *EC_curve_nid2nist(int nid);
+int EC_curve_nist2nid(const char *name);
+
/********************************************************************/
/* EC_POINT functions */
/********************************************************************/
@@ -986,10 +995,78 @@ int EC_KEY_print_fp(FILE *fp, const EC_KEY *key, int off);
# endif
# define EVP_PKEY_CTX_set_ec_paramgen_curve_nid(ctx, nid) \
- EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, EVP_PKEY_OP_PARAMGEN, \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_PARAMGEN|EVP_PKEY_OP_KEYGEN, \
EVP_PKEY_CTRL_EC_PARAMGEN_CURVE_NID, nid, NULL)
+# define EVP_PKEY_CTX_set_ec_param_enc(ctx, flag) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_PARAMGEN|EVP_PKEY_OP_KEYGEN, \
+ EVP_PKEY_CTRL_EC_PARAM_ENC, flag, NULL)
+
+# define EVP_PKEY_CTX_set_ecdh_cofactor_mode(ctx, flag) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_ECDH_COFACTOR, flag, NULL)
+
+# define EVP_PKEY_CTX_get_ecdh_cofactor_mode(ctx) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_ECDH_COFACTOR, -2, NULL)
+
+# define EVP_PKEY_CTX_set_ecdh_kdf_type(ctx, kdf) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_KDF_TYPE, kdf, NULL)
+
+# define EVP_PKEY_CTX_get_ecdh_kdf_type(ctx) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_KDF_TYPE, -2, NULL)
+
+# define EVP_PKEY_CTX_set_ecdh_kdf_md(ctx, md) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_KDF_MD, 0, (void *)md)
+
+# define EVP_PKEY_CTX_get_ecdh_kdf_md(ctx, pmd) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_EC_KDF_MD, 0, (void *)pmd)
+
+# define EVP_PKEY_CTX_set_ecdh_kdf_outlen(ctx, len) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_KDF_OUTLEN, len, NULL)
+
+# define EVP_PKEY_CTX_get_ecdh_kdf_outlen(ctx, plen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_EC_KDF_OUTLEN, 0, (void *)plen)
+
+# define EVP_PKEY_CTX_set0_ecdh_kdf_ukm(ctx, p, plen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_EC_KDF_UKM, plen, (void *)p)
+
+# define EVP_PKEY_CTX_get0_ecdh_kdf_ukm(ctx, p) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, \
+ EVP_PKEY_OP_DERIVE, \
+ EVP_PKEY_CTRL_GET_EC_KDF_UKM, 0, (void *)p)
+
# define EVP_PKEY_CTRL_EC_PARAMGEN_CURVE_NID (EVP_PKEY_ALG_CTRL + 1)
+# define EVP_PKEY_CTRL_EC_PARAM_ENC (EVP_PKEY_ALG_CTRL + 2)
+# define EVP_PKEY_CTRL_EC_ECDH_COFACTOR (EVP_PKEY_ALG_CTRL + 3)
+# define EVP_PKEY_CTRL_EC_KDF_TYPE (EVP_PKEY_ALG_CTRL + 4)
+# define EVP_PKEY_CTRL_EC_KDF_MD (EVP_PKEY_ALG_CTRL + 5)
+# define EVP_PKEY_CTRL_GET_EC_KDF_MD (EVP_PKEY_ALG_CTRL + 6)
+# define EVP_PKEY_CTRL_EC_KDF_OUTLEN (EVP_PKEY_ALG_CTRL + 7)
+# define EVP_PKEY_CTRL_GET_EC_KDF_OUTLEN (EVP_PKEY_ALG_CTRL + 8)
+# define EVP_PKEY_CTRL_EC_KDF_UKM (EVP_PKEY_ALG_CTRL + 9)
+# define EVP_PKEY_CTRL_GET_EC_KDF_UKM (EVP_PKEY_ALG_CTRL + 10)
+/* KDF types */
+# define EVP_PKEY_ECDH_KDF_NONE 1
+# define EVP_PKEY_ECDH_KDF_X9_62 2
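+
+/*-
+ * Typical use of the ECDH KDF controls above when deriving a
+ * key-wrapping key (an illustrative sketch, error checks elided;
+ * "pkey" and "peer" are assumed to be EC keys set up elsewhere):
+ *
+ * EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new(pkey, NULL);
+ * unsigned char key[32];
+ * size_t keylen = sizeof(key);
+ * EVP_PKEY_derive_init(pctx);
+ * EVP_PKEY_derive_set_peer(pctx, peer);
+ * EVP_PKEY_CTX_set_ecdh_kdf_type(pctx, EVP_PKEY_ECDH_KDF_X9_62);
+ * EVP_PKEY_CTX_set_ecdh_kdf_md(pctx, EVP_sha256());
+ * EVP_PKEY_CTX_set_ecdh_kdf_outlen(pctx, keylen);
+ * EVP_PKEY_derive(pctx, key, &keylen);
+ */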
/* BEGIN ERROR CODES */
/*
@@ -1007,6 +1084,8 @@ void ERR_load_EC_strings(void);
# define EC_F_D2I_ECPKPARAMETERS 145
# define EC_F_D2I_ECPRIVATEKEY 146
# define EC_F_DO_EC_KEY_PRINT 221
+# define EC_F_ECDH_CMS_DECRYPT 238
+# define EC_F_ECDH_CMS_SET_SHARED_INFO 239
# define EC_F_ECKEY_PARAM2TYPE 223
# define EC_F_ECKEY_PARAM_DECODE 212
# define EC_F_ECKEY_PRIV_DECODE 213
@@ -1018,6 +1097,12 @@ void ERR_load_EC_strings(void);
# define EC_F_ECPARAMETERS_PRINT_FP 148
# define EC_F_ECPKPARAMETERS_PRINT 149
# define EC_F_ECPKPARAMETERS_PRINT_FP 150
+# define EC_F_ECP_NISTZ256_GET_AFFINE 240
+# define EC_F_ECP_NISTZ256_MULT_PRECOMPUTE 243
+# define EC_F_ECP_NISTZ256_POINTS_MUL 241
+# define EC_F_ECP_NISTZ256_PRE_COMP_NEW 244
+# define EC_F_ECP_NISTZ256_SET_WORDS 245
+# define EC_F_ECP_NISTZ256_WINDOWED_MUL 242
# define EC_F_ECP_NIST_MOD_192 203
# define EC_F_ECP_NIST_MOD_224 204
# define EC_F_ECP_NIST_MOD_256 205
@@ -1157,6 +1242,7 @@ void ERR_load_EC_strings(void);
# define EC_R_INVALID_COMPRESSED_POINT 110
# define EC_R_INVALID_COMPRESSION_BIT 109
# define EC_R_INVALID_CURVE 141
+# define EC_R_INVALID_DIGEST 151
# define EC_R_INVALID_DIGEST_TYPE 138
# define EC_R_INVALID_ENCODING 102
# define EC_R_INVALID_FIELD 103
@@ -1165,6 +1251,7 @@ void ERR_load_EC_strings(void);
# define EC_R_INVALID_PENTANOMIAL_BASIS 132
# define EC_R_INVALID_PRIVATE_KEY 123
# define EC_R_INVALID_TRINOMIAL_BASIS 137
+# define EC_R_KDF_PARAMETER_ERROR 148
# define EC_R_KEYS_NOT_SET 140
# define EC_R_MISSING_PARAMETERS 124
# define EC_R_MISSING_PRIVATE_KEY 125
@@ -1175,9 +1262,11 @@ void ERR_load_EC_strings(void);
# define EC_R_NO_FIELD_MOD 133
# define EC_R_NO_PARAMETERS_SET 139
# define EC_R_PASSED_NULL_PARAMETER 134
+# define EC_R_PEER_KEY_ERROR 149
# define EC_R_PKPARAMETERS2GROUP_FAILURE 127
# define EC_R_POINT_AT_INFINITY 106
# define EC_R_POINT_IS_NOT_ON_CURVE 107
+# define EC_R_SHARED_INFO_ERROR 150
# define EC_R_SLOT_FULL 108
# define EC_R_UNDEFINED_GENERATOR 113
# define EC_R_UNDEFINED_ORDER 128
diff --git a/crypto/ec/ec_ameth.c b/crypto/ec/ec_ameth.c
index 5cefb5ad303a..83e208cfe491 100644
--- a/crypto/ec/ec_ameth.c
+++ b/crypto/ec/ec_ameth.c
@@ -64,8 +64,12 @@
#ifndef OPENSSL_NO_CMS
# include <openssl/cms.h>
#endif
+#include <openssl/asn1t.h>
#include "asn1_locl.h"
+static int ecdh_cms_decrypt(CMS_RecipientInfo *ri);
+static int ecdh_cms_encrypt(CMS_RecipientInfo *ri);
+
static int eckey_param2type(int *pptype, void **ppval, EC_KEY *ec_key)
{
const EC_GROUP *group;
@@ -580,10 +584,21 @@ static int ec_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
X509_ALGOR_set0(alg2, OBJ_nid2obj(snid), V_ASN1_UNDEF, 0);
}
return 1;
+
+ case ASN1_PKEY_CTRL_CMS_ENVELOPE:
+ if (arg1 == 1)
+ return ecdh_cms_decrypt(arg2);
+ else if (arg1 == 0)
+ return ecdh_cms_encrypt(arg2);
+ return -2;
+
+ case ASN1_PKEY_CTRL_CMS_RI_TYPE:
+ *(int *)arg2 = CMS_RECIPINFO_AGREE;
+ return 1;
#endif
case ASN1_PKEY_CTRL_DEFAULT_MD_NID:
- *(int *)arg2 = NID_sha1;
+ *(int *)arg2 = NID_sha256;
return 2;
default:
@@ -625,3 +640,326 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth = {
old_ec_priv_decode,
old_ec_priv_encode
};
+
+#ifndef OPENSSL_NO_CMS
+
+static int ecdh_cms_set_peerkey(EVP_PKEY_CTX *pctx,
+ X509_ALGOR *alg, ASN1_BIT_STRING *pubkey)
+{
+ ASN1_OBJECT *aoid;
+ int atype;
+ void *aval;
+ int rv = 0;
+ EVP_PKEY *pkpeer = NULL;
+ EC_KEY *ecpeer = NULL;
+ const unsigned char *p;
+ int plen;
+ X509_ALGOR_get0(&aoid, &atype, &aval, alg);
+ if (OBJ_obj2nid(aoid) != NID_X9_62_id_ecPublicKey)
+ goto err;
+ /* If parameters are absent, get the group from the main key */
+ if (atype == V_ASN1_UNDEF || atype == V_ASN1_NULL) {
+ const EC_GROUP *grp;
+ EVP_PKEY *pk;
+ pk = EVP_PKEY_CTX_get0_pkey(pctx);
+ if (!pk)
+ goto err;
+ grp = EC_KEY_get0_group(pk->pkey.ec);
+ ecpeer = EC_KEY_new();
+ if (!ecpeer)
+ goto err;
+ if (!EC_KEY_set_group(ecpeer, grp))
+ goto err;
+ } else {
+ ecpeer = eckey_type2param(atype, aval);
+ if (!ecpeer)
+ goto err;
+ }
+ /* We have the parameters; now set the public key */
+ plen = ASN1_STRING_length(pubkey);
+ p = ASN1_STRING_data(pubkey);
+ if (!p || !plen)
+ goto err;
+ if (!o2i_ECPublicKey(&ecpeer, &p, plen))
+ goto err;
+ pkpeer = EVP_PKEY_new();
+ if (!pkpeer)
+ goto err;
+ EVP_PKEY_set1_EC_KEY(pkpeer, ecpeer);
+ if (EVP_PKEY_derive_set_peer(pctx, pkpeer) > 0)
+ rv = 1;
+ err:
+ if (ecpeer)
+ EC_KEY_free(ecpeer);
+ if (pkpeer)
+ EVP_PKEY_free(pkpeer);
+ return rv;
+}
+
+/* Set KDF parameters based on KDF NID */
+static int ecdh_cms_set_kdf_param(EVP_PKEY_CTX *pctx, int eckdf_nid)
+{
+ int kdf_nid, kdfmd_nid, cofactor;
+ const EVP_MD *kdf_md;
+ if (eckdf_nid == NID_undef)
+ return 0;
+
+ /* Lookup KDF type, cofactor mode and digest */
+ if (!OBJ_find_sigid_algs(eckdf_nid, &kdfmd_nid, &kdf_nid))
+ return 0;
+
+ if (kdf_nid == NID_dh_std_kdf)
+ cofactor = 0;
+ else if (kdf_nid == NID_dh_cofactor_kdf)
+ cofactor = 1;
+ else
+ return 0;
+
+ if (EVP_PKEY_CTX_set_ecdh_cofactor_mode(pctx, cofactor) <= 0)
+ return 0;
+
+ if (EVP_PKEY_CTX_set_ecdh_kdf_type(pctx, EVP_PKEY_ECDH_KDF_X9_62) <= 0)
+ return 0;
+
+ kdf_md = EVP_get_digestbynid(kdfmd_nid);
+ if (!kdf_md)
+ return 0;
+
+ if (EVP_PKEY_CTX_set_ecdh_kdf_md(pctx, kdf_md) <= 0)
+ return 0;
+ return 1;
+}
+
+static int ecdh_cms_set_shared_info(EVP_PKEY_CTX *pctx, CMS_RecipientInfo *ri)
+{
+ int rv = 0;
+
+ X509_ALGOR *alg, *kekalg = NULL;
+ ASN1_OCTET_STRING *ukm;
+ const unsigned char *p;
+ unsigned char *der = NULL;
+ int plen, keylen;
+ const EVP_CIPHER *kekcipher;
+ EVP_CIPHER_CTX *kekctx;
+
+ if (!CMS_RecipientInfo_kari_get0_alg(ri, &alg, &ukm))
+ return 0;
+
+ if (!ecdh_cms_set_kdf_param(pctx, OBJ_obj2nid(alg->algorithm))) {
+ ECerr(EC_F_ECDH_CMS_SET_SHARED_INFO, EC_R_KDF_PARAMETER_ERROR);
+ return 0;
+ }
+
+ if (alg->parameter->type != V_ASN1_SEQUENCE)
+ return 0;
+
+ p = alg->parameter->value.sequence->data;
+ plen = alg->parameter->value.sequence->length;
+ kekalg = d2i_X509_ALGOR(NULL, &p, plen);
+ if (!kekalg)
+ goto err;
+ kekctx = CMS_RecipientInfo_kari_get0_ctx(ri);
+ if (!kekctx)
+ goto err;
+ kekcipher = EVP_get_cipherbyobj(kekalg->algorithm);
+ if (!kekcipher || EVP_CIPHER_mode(kekcipher) != EVP_CIPH_WRAP_MODE)
+ goto err;
+ if (!EVP_EncryptInit_ex(kekctx, kekcipher, NULL, NULL, NULL))
+ goto err;
+ if (EVP_CIPHER_asn1_to_param(kekctx, kekalg->parameter) <= 0)
+ goto err;
+
+ keylen = EVP_CIPHER_CTX_key_length(kekctx);
+ if (EVP_PKEY_CTX_set_ecdh_kdf_outlen(pctx, keylen) <= 0)
+ goto err;
+
+ plen = CMS_SharedInfo_encode(&der, kekalg, ukm, keylen);
+
+ if (!plen)
+ goto err;
+
+ if (EVP_PKEY_CTX_set0_ecdh_kdf_ukm(pctx, der, plen) <= 0)
+ goto err;
+ der = NULL;
+
+ rv = 1;
+ err:
+ if (kekalg)
+ X509_ALGOR_free(kekalg);
+ if (der)
+ OPENSSL_free(der);
+ return rv;
+}
+
+static int ecdh_cms_decrypt(CMS_RecipientInfo *ri)
+{
+ EVP_PKEY_CTX *pctx;
+ pctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
+ if (!pctx)
+ return 0;
+ /* See if we need to set peer key */
+ if (!EVP_PKEY_CTX_get0_peerkey(pctx)) {
+ X509_ALGOR *alg;
+ ASN1_BIT_STRING *pubkey;
+ if (!CMS_RecipientInfo_kari_get0_orig_id(ri, &alg, &pubkey,
+ NULL, NULL, NULL))
+ return 0;
+ if (!alg || !pubkey)
+ return 0;
+ if (!ecdh_cms_set_peerkey(pctx, alg, pubkey)) {
+ ECerr(EC_F_ECDH_CMS_DECRYPT, EC_R_PEER_KEY_ERROR);
+ return 0;
+ }
+ }
+ /* Set ECDH derivation parameters and initialise unwrap context */
+ if (!ecdh_cms_set_shared_info(pctx, ri)) {
+ ECerr(EC_F_ECDH_CMS_DECRYPT, EC_R_SHARED_INFO_ERROR);
+ return 0;
+ }
+ return 1;
+}
+
+static int ecdh_cms_encrypt(CMS_RecipientInfo *ri)
+{
+ EVP_PKEY_CTX *pctx;
+ EVP_PKEY *pkey;
+ EVP_CIPHER_CTX *ctx;
+ int keylen;
+ X509_ALGOR *talg, *wrap_alg = NULL;
+ ASN1_OBJECT *aoid;
+ ASN1_BIT_STRING *pubkey;
+ ASN1_STRING *wrap_str;
+ ASN1_OCTET_STRING *ukm;
+ unsigned char *penc = NULL;
+ int penclen;
+ int rv = 0;
+ int ecdh_nid, kdf_type, kdf_nid, wrap_nid;
+ const EVP_MD *kdf_md;
+ pctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
+ if (!pctx)
+ return 0;
+ /* Get ephemeral key */
+ pkey = EVP_PKEY_CTX_get0_pkey(pctx);
+ if (!CMS_RecipientInfo_kari_get0_orig_id(ri, &talg, &pubkey,
+ NULL, NULL, NULL))
+ goto err;
+ X509_ALGOR_get0(&aoid, NULL, NULL, talg);
+ /* Is everything uninitialised? */
+ if (aoid == OBJ_nid2obj(NID_undef)) {
+
+ EC_KEY *eckey = pkey->pkey.ec;
+ /* Set the key */
+ unsigned char *p;
+
+ penclen = i2o_ECPublicKey(eckey, NULL);
+ if (penclen <= 0)
+ goto err;
+ penc = OPENSSL_malloc(penclen);
+ if (!penc)
+ goto err;
+ p = penc;
+ penclen = i2o_ECPublicKey(eckey, &p);
+ if (penclen <= 0)
+ goto err;
+ ASN1_STRING_set0(pubkey, penc, penclen);
+ pubkey->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07);
+ pubkey->flags |= ASN1_STRING_FLAG_BITS_LEFT;
+
+ penc = NULL;
+ X509_ALGOR_set0(talg, OBJ_nid2obj(NID_X9_62_id_ecPublicKey),
+ V_ASN1_UNDEF, NULL);
+ }
+
+ /* See if custom parameters are set */
+ kdf_type = EVP_PKEY_CTX_get_ecdh_kdf_type(pctx);
+ if (kdf_type <= 0)
+ goto err;
+ if (!EVP_PKEY_CTX_get_ecdh_kdf_md(pctx, &kdf_md))
+ goto err;
+ ecdh_nid = EVP_PKEY_CTX_get_ecdh_cofactor_mode(pctx);
+ if (ecdh_nid < 0)
+ goto err;
+ else if (ecdh_nid == 0)
+ ecdh_nid = NID_dh_std_kdf;
+ else if (ecdh_nid == 1)
+ ecdh_nid = NID_dh_cofactor_kdf;
+
+ if (kdf_type == EVP_PKEY_ECDH_KDF_NONE) {
+ kdf_type = EVP_PKEY_ECDH_KDF_X9_62;
+ if (EVP_PKEY_CTX_set_ecdh_kdf_type(pctx, kdf_type) <= 0)
+ goto err;
+ } else if (kdf_type != EVP_PKEY_ECDH_KDF_X9_62)
+ /* Unknown KDF */
+ goto err;
+ if (kdf_md == NULL) {
+ /* FIXME: pick a better default MD later */
+ kdf_md = EVP_sha1();
+ if (EVP_PKEY_CTX_set_ecdh_kdf_md(pctx, kdf_md) <= 0)
+ goto err;
+ }
+
+ if (!CMS_RecipientInfo_kari_get0_alg(ri, &talg, &ukm))
+ goto err;
+
+ /* Lookup NID for KDF+cofactor+digest */
+
+ if (!OBJ_find_sigid_by_algs(&kdf_nid, EVP_MD_type(kdf_md), ecdh_nid))
+ goto err;
+ /* Get wrap NID */
+ ctx = CMS_RecipientInfo_kari_get0_ctx(ri);
+ wrap_nid = EVP_CIPHER_CTX_type(ctx);
+ keylen = EVP_CIPHER_CTX_key_length(ctx);
+
+ /* Package wrap algorithm in an AlgorithmIdentifier */
+
+ wrap_alg = X509_ALGOR_new();
+ if (!wrap_alg)
+ goto err;
+ wrap_alg->algorithm = OBJ_nid2obj(wrap_nid);
+ wrap_alg->parameter = ASN1_TYPE_new();
+ if (!wrap_alg->parameter)
+ goto err;
+ if (EVP_CIPHER_param_to_asn1(ctx, wrap_alg->parameter) <= 0)
+ goto err;
+ if (ASN1_TYPE_get(wrap_alg->parameter) == NID_undef) {
+ ASN1_TYPE_free(wrap_alg->parameter);
+ wrap_alg->parameter = NULL;
+ }
+
+ if (EVP_PKEY_CTX_set_ecdh_kdf_outlen(pctx, keylen) <= 0)
+ goto err;
+
+ penclen = CMS_SharedInfo_encode(&penc, wrap_alg, ukm, keylen);
+
+ if (!penclen)
+ goto err;
+
+ if (EVP_PKEY_CTX_set0_ecdh_kdf_ukm(pctx, penc, penclen) <= 0)
+ goto err;
+ penc = NULL;
+
+ /*
+ * Now we need to wrap the encoding of the wrap AlgorithmIdentifier
+ * into the parameter of another AlgorithmIdentifier.
+ */
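+ /*-
+ * i.e. the resulting keyEncryptionAlgorithm has the RFC 5753 shape
+ *
+ * AlgorithmIdentifier {
+ *     algorithm   dhSinglePass-stdDH-sha1kdf-scheme (for example),
+ *     parameters  AlgorithmIdentifier { id-aes128-wrap, ... }
+ * }
+ */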
+ penclen = i2d_X509_ALGOR(wrap_alg, &penc);
+ if (!penc || !penclen)
+ goto err;
+ wrap_str = ASN1_STRING_new();
+ if (!wrap_str)
+ goto err;
+ ASN1_STRING_set0(wrap_str, penc, penclen);
+ penc = NULL;
+ X509_ALGOR_set0(talg, OBJ_nid2obj(kdf_nid), V_ASN1_SEQUENCE, wrap_str);
+
+ rv = 1;
+
+ err:
+ if (penc)
+ OPENSSL_free(penc);
+ if (wrap_alg)
+ X509_ALGOR_free(wrap_alg);
+ return rv;
+}
+
+#endif
diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c
index e4bcb5dd81dc..6dbe9d8258de 100644
--- a/crypto/ec/ec_curve.c
+++ b/crypto/ec/ec_curve.c
@@ -69,11 +69,16 @@
*
*/
+#include <string.h>
#include "ec_lcl.h"
#include <openssl/err.h>
#include <openssl/obj_mac.h>
#include <openssl/opensslconf.h>
+#ifdef OPENSSL_FIPS
+# include <openssl/fips.h>
+#endif
+
typedef struct {
int field_type, /* either NID_X9_62_prime_field or
* NID_X9_62_characteristic_two_field */
@@ -2282,6 +2287,554 @@ static const struct {
#endif
+/*
+ * These curves were added by Annie Yousar <a.yousar@informatik.hu-berlin.de>.
+ * For the definition of the RFC 5639 curves see
+ * http://www.ietf.org/rfc/rfc5639.txt. These curves are generated verifiably
+ * at random; nevertheless, the seed is omitted as a parameter because the
+ * generation mechanism differs from those defined in ANSI X9.62.
+ */
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 20 * 6];
+} _EC_brainpoolP160r1 = {
+ {
+ NID_X9_62_prime_field, 0, 20, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xE9, 0x5E, 0x4A, 0x5F, 0x73, 0x70, 0x59, 0xDC, 0x60, 0xDF, 0xC7, 0xAD,
+ 0x95, 0xB3, 0xD8, 0x13, 0x95, 0x15, 0x62, 0x0F,
+ /* a */
+ 0x34, 0x0E, 0x7B, 0xE2, 0xA2, 0x80, 0xEB, 0x74, 0xE2, 0xBE, 0x61, 0xBA,
+ 0xDA, 0x74, 0x5D, 0x97, 0xE8, 0xF7, 0xC3, 0x00,
+ /* b */
+ 0x1E, 0x58, 0x9A, 0x85, 0x95, 0x42, 0x34, 0x12, 0x13, 0x4F, 0xAA, 0x2D,
+ 0xBD, 0xEC, 0x95, 0xC8, 0xD8, 0x67, 0x5E, 0x58,
+ /* x */
+ 0xBE, 0xD5, 0xAF, 0x16, 0xEA, 0x3F, 0x6A, 0x4F, 0x62, 0x93, 0x8C, 0x46,
+ 0x31, 0xEB, 0x5A, 0xF7, 0xBD, 0xBC, 0xDB, 0xC3,
+ /* y */
+ 0x16, 0x67, 0xCB, 0x47, 0x7A, 0x1A, 0x8E, 0xC3, 0x38, 0xF9, 0x47, 0x41,
+ 0x66, 0x9C, 0x97, 0x63, 0x16, 0xDA, 0x63, 0x21,
+ /* order */
+ 0xE9, 0x5E, 0x4A, 0x5F, 0x73, 0x70, 0x59, 0xDC, 0x60, 0xDF, 0x59, 0x91,
+ 0xD4, 0x50, 0x29, 0x40, 0x9E, 0x60, 0xFC, 0x09
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 20 * 6];
+} _EC_brainpoolP160t1 = {
+ {
+ NID_X9_62_prime_field, 0, 20, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xE9, 0x5E, 0x4A, 0x5F, 0x73, 0x70, 0x59, 0xDC, 0x60, 0xDF, 0xC7, 0xAD,
+ 0x95, 0xB3, 0xD8, 0x13, 0x95, 0x15, 0x62, 0x0F,
+ /* a */
+ 0xE9, 0x5E, 0x4A, 0x5F, 0x73, 0x70, 0x59, 0xDC, 0x60, 0xDF, 0xC7, 0xAD,
+ 0x95, 0xB3, 0xD8, 0x13, 0x95, 0x15, 0x62, 0x0C,
+ /* b */
+ 0x7A, 0x55, 0x6B, 0x6D, 0xAE, 0x53, 0x5B, 0x7B, 0x51, 0xED, 0x2C, 0x4D,
+ 0x7D, 0xAA, 0x7A, 0x0B, 0x5C, 0x55, 0xF3, 0x80,
+ /* x */
+ 0xB1, 0x99, 0xB1, 0x3B, 0x9B, 0x34, 0xEF, 0xC1, 0x39, 0x7E, 0x64, 0xBA,
+ 0xEB, 0x05, 0xAC, 0xC2, 0x65, 0xFF, 0x23, 0x78,
+ /* y */
+ 0xAD, 0xD6, 0x71, 0x8B, 0x7C, 0x7C, 0x19, 0x61, 0xF0, 0x99, 0x1B, 0x84,
+ 0x24, 0x43, 0x77, 0x21, 0x52, 0xC9, 0xE0, 0xAD,
+ /* order */
+ 0xE9, 0x5E, 0x4A, 0x5F, 0x73, 0x70, 0x59, 0xDC, 0x60, 0xDF, 0x59, 0x91,
+ 0xD4, 0x50, 0x29, 0x40, 0x9E, 0x60, 0xFC, 0x09
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 24 * 6];
+} _EC_brainpoolP192r1 = {
+ {
+ NID_X9_62_prime_field, 0, 24, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xC3, 0x02, 0xF4, 0x1D, 0x93, 0x2A, 0x36, 0xCD, 0xA7, 0xA3, 0x46, 0x30,
+ 0x93, 0xD1, 0x8D, 0xB7, 0x8F, 0xCE, 0x47, 0x6D, 0xE1, 0xA8, 0x62, 0x97,
+ /* a */
+ 0x6A, 0x91, 0x17, 0x40, 0x76, 0xB1, 0xE0, 0xE1, 0x9C, 0x39, 0xC0, 0x31,
+ 0xFE, 0x86, 0x85, 0xC1, 0xCA, 0xE0, 0x40, 0xE5, 0xC6, 0x9A, 0x28, 0xEF,
+ /* b */
+ 0x46, 0x9A, 0x28, 0xEF, 0x7C, 0x28, 0xCC, 0xA3, 0xDC, 0x72, 0x1D, 0x04,
+ 0x4F, 0x44, 0x96, 0xBC, 0xCA, 0x7E, 0xF4, 0x14, 0x6F, 0xBF, 0x25, 0xC9,
+ /* x */
+ 0xC0, 0xA0, 0x64, 0x7E, 0xAA, 0xB6, 0xA4, 0x87, 0x53, 0xB0, 0x33, 0xC5,
+ 0x6C, 0xB0, 0xF0, 0x90, 0x0A, 0x2F, 0x5C, 0x48, 0x53, 0x37, 0x5F, 0xD6,
+ /* y */
+ 0x14, 0xB6, 0x90, 0x86, 0x6A, 0xBD, 0x5B, 0xB8, 0x8B, 0x5F, 0x48, 0x28,
+ 0xC1, 0x49, 0x00, 0x02, 0xE6, 0x77, 0x3F, 0xA2, 0xFA, 0x29, 0x9B, 0x8F,
+ /* order */
+ 0xC3, 0x02, 0xF4, 0x1D, 0x93, 0x2A, 0x36, 0xCD, 0xA7, 0xA3, 0x46, 0x2F,
+ 0x9E, 0x9E, 0x91, 0x6B, 0x5B, 0xE8, 0xF1, 0x02, 0x9A, 0xC4, 0xAC, 0xC1
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 24 * 6];
+} _EC_brainpoolP192t1 = {
+ {
+ NID_X9_62_prime_field, 0, 24, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xC3, 0x02, 0xF4, 0x1D, 0x93, 0x2A, 0x36, 0xCD, 0xA7, 0xA3, 0x46, 0x30,
+ 0x93, 0xD1, 0x8D, 0xB7, 0x8F, 0xCE, 0x47, 0x6D, 0xE1, 0xA8, 0x62, 0x97,
+ /* a */
+ 0xC3, 0x02, 0xF4, 0x1D, 0x93, 0x2A, 0x36, 0xCD, 0xA7, 0xA3, 0x46, 0x30,
+ 0x93, 0xD1, 0x8D, 0xB7, 0x8F, 0xCE, 0x47, 0x6D, 0xE1, 0xA8, 0x62, 0x94,
+ /* b */
+ 0x13, 0xD5, 0x6F, 0xFA, 0xEC, 0x78, 0x68, 0x1E, 0x68, 0xF9, 0xDE, 0xB4,
+ 0x3B, 0x35, 0xBE, 0xC2, 0xFB, 0x68, 0x54, 0x2E, 0x27, 0x89, 0x7B, 0x79,
+ /* x */
+ 0x3A, 0xE9, 0xE5, 0x8C, 0x82, 0xF6, 0x3C, 0x30, 0x28, 0x2E, 0x1F, 0xE7,
+ 0xBB, 0xF4, 0x3F, 0xA7, 0x2C, 0x44, 0x6A, 0xF6, 0xF4, 0x61, 0x81, 0x29,
+ /* y */
+ 0x09, 0x7E, 0x2C, 0x56, 0x67, 0xC2, 0x22, 0x3A, 0x90, 0x2A, 0xB5, 0xCA,
+ 0x44, 0x9D, 0x00, 0x84, 0xB7, 0xE5, 0xB3, 0xDE, 0x7C, 0xCC, 0x01, 0xC9,
+ /* order */
+ 0xC3, 0x02, 0xF4, 0x1D, 0x93, 0x2A, 0x36, 0xCD, 0xA7, 0xA3, 0x46, 0x2F,
+ 0x9E, 0x9E, 0x91, 0x6B, 0x5B, 0xE8, 0xF1, 0x02, 0x9A, 0xC4, 0xAC, 0xC1
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 28 * 6];
+} _EC_brainpoolP224r1 = {
+ {
+ NID_X9_62_prime_field, 0, 28, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xD7, 0xC1, 0x34, 0xAA, 0x26, 0x43, 0x66, 0x86, 0x2A, 0x18, 0x30, 0x25,
+ 0x75, 0xD1, 0xD7, 0x87, 0xB0, 0x9F, 0x07, 0x57, 0x97, 0xDA, 0x89, 0xF5,
+ 0x7E, 0xC8, 0xC0, 0xFF,
+ /* a */
+ 0x68, 0xA5, 0xE6, 0x2C, 0xA9, 0xCE, 0x6C, 0x1C, 0x29, 0x98, 0x03, 0xA6,
+ 0xC1, 0x53, 0x0B, 0x51, 0x4E, 0x18, 0x2A, 0xD8, 0xB0, 0x04, 0x2A, 0x59,
+ 0xCA, 0xD2, 0x9F, 0x43,
+ /* b */
+ 0x25, 0x80, 0xF6, 0x3C, 0xCF, 0xE4, 0x41, 0x38, 0x87, 0x07, 0x13, 0xB1,
+ 0xA9, 0x23, 0x69, 0xE3, 0x3E, 0x21, 0x35, 0xD2, 0x66, 0xDB, 0xB3, 0x72,
+ 0x38, 0x6C, 0x40, 0x0B,
+ /* x */
+ 0x0D, 0x90, 0x29, 0xAD, 0x2C, 0x7E, 0x5C, 0xF4, 0x34, 0x08, 0x23, 0xB2,
+ 0xA8, 0x7D, 0xC6, 0x8C, 0x9E, 0x4C, 0xE3, 0x17, 0x4C, 0x1E, 0x6E, 0xFD,
+ 0xEE, 0x12, 0xC0, 0x7D,
+ /* y */
+ 0x58, 0xAA, 0x56, 0xF7, 0x72, 0xC0, 0x72, 0x6F, 0x24, 0xC6, 0xB8, 0x9E,
+ 0x4E, 0xCD, 0xAC, 0x24, 0x35, 0x4B, 0x9E, 0x99, 0xCA, 0xA3, 0xF6, 0xD3,
+ 0x76, 0x14, 0x02, 0xCD,
+ /* order */
+ 0xD7, 0xC1, 0x34, 0xAA, 0x26, 0x43, 0x66, 0x86, 0x2A, 0x18, 0x30, 0x25,
+ 0x75, 0xD0, 0xFB, 0x98, 0xD1, 0x16, 0xBC, 0x4B, 0x6D, 0xDE, 0xBC, 0xA3,
+ 0xA5, 0xA7, 0x93, 0x9F
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 28 * 6];
+} _EC_brainpoolP224t1 = {
+ {
+ NID_X9_62_prime_field, 0, 28, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xD7, 0xC1, 0x34, 0xAA, 0x26, 0x43, 0x66, 0x86, 0x2A, 0x18, 0x30, 0x25,
+ 0x75, 0xD1, 0xD7, 0x87, 0xB0, 0x9F, 0x07, 0x57, 0x97, 0xDA, 0x89, 0xF5,
+ 0x7E, 0xC8, 0xC0, 0xFF,
+ /* a */
+ 0xD7, 0xC1, 0x34, 0xAA, 0x26, 0x43, 0x66, 0x86, 0x2A, 0x18, 0x30, 0x25,
+ 0x75, 0xD1, 0xD7, 0x87, 0xB0, 0x9F, 0x07, 0x57, 0x97, 0xDA, 0x89, 0xF5,
+ 0x7E, 0xC8, 0xC0, 0xFC,
+ /* b */
+ 0x4B, 0x33, 0x7D, 0x93, 0x41, 0x04, 0xCD, 0x7B, 0xEF, 0x27, 0x1B, 0xF6,
+ 0x0C, 0xED, 0x1E, 0xD2, 0x0D, 0xA1, 0x4C, 0x08, 0xB3, 0xBB, 0x64, 0xF1,
+ 0x8A, 0x60, 0x88, 0x8D,
+ /* x */
+ 0x6A, 0xB1, 0xE3, 0x44, 0xCE, 0x25, 0xFF, 0x38, 0x96, 0x42, 0x4E, 0x7F,
+ 0xFE, 0x14, 0x76, 0x2E, 0xCB, 0x49, 0xF8, 0x92, 0x8A, 0xC0, 0xC7, 0x60,
+ 0x29, 0xB4, 0xD5, 0x80,
+ /* y */
+ 0x03, 0x74, 0xE9, 0xF5, 0x14, 0x3E, 0x56, 0x8C, 0xD2, 0x3F, 0x3F, 0x4D,
+ 0x7C, 0x0D, 0x4B, 0x1E, 0x41, 0xC8, 0xCC, 0x0D, 0x1C, 0x6A, 0xBD, 0x5F,
+ 0x1A, 0x46, 0xDB, 0x4C,
+ /* order */
+ 0xD7, 0xC1, 0x34, 0xAA, 0x26, 0x43, 0x66, 0x86, 0x2A, 0x18, 0x30, 0x25,
+ 0x75, 0xD0, 0xFB, 0x98, 0xD1, 0x16, 0xBC, 0x4B, 0x6D, 0xDE, 0xBC, 0xA3,
+ 0xA5, 0xA7, 0x93, 0x9F
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 32 * 6];
+} _EC_brainpoolP256r1 = {
+ {
+ NID_X9_62_prime_field, 0, 32, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xA9, 0xFB, 0x57, 0xDB, 0xA1, 0xEE, 0xA9, 0xBC, 0x3E, 0x66, 0x0A, 0x90,
+ 0x9D, 0x83, 0x8D, 0x72, 0x6E, 0x3B, 0xF6, 0x23, 0xD5, 0x26, 0x20, 0x28,
+ 0x20, 0x13, 0x48, 0x1D, 0x1F, 0x6E, 0x53, 0x77,
+ /* a */
+ 0x7D, 0x5A, 0x09, 0x75, 0xFC, 0x2C, 0x30, 0x57, 0xEE, 0xF6, 0x75, 0x30,
+ 0x41, 0x7A, 0xFF, 0xE7, 0xFB, 0x80, 0x55, 0xC1, 0x26, 0xDC, 0x5C, 0x6C,
+ 0xE9, 0x4A, 0x4B, 0x44, 0xF3, 0x30, 0xB5, 0xD9,
+ /* b */
+ 0x26, 0xDC, 0x5C, 0x6C, 0xE9, 0x4A, 0x4B, 0x44, 0xF3, 0x30, 0xB5, 0xD9,
+ 0xBB, 0xD7, 0x7C, 0xBF, 0x95, 0x84, 0x16, 0x29, 0x5C, 0xF7, 0xE1, 0xCE,
+ 0x6B, 0xCC, 0xDC, 0x18, 0xFF, 0x8C, 0x07, 0xB6,
+ /* x */
+ 0x8B, 0xD2, 0xAE, 0xB9, 0xCB, 0x7E, 0x57, 0xCB, 0x2C, 0x4B, 0x48, 0x2F,
+ 0xFC, 0x81, 0xB7, 0xAF, 0xB9, 0xDE, 0x27, 0xE1, 0xE3, 0xBD, 0x23, 0xC2,
+ 0x3A, 0x44, 0x53, 0xBD, 0x9A, 0xCE, 0x32, 0x62,
+ /* y */
+ 0x54, 0x7E, 0xF8, 0x35, 0xC3, 0xDA, 0xC4, 0xFD, 0x97, 0xF8, 0x46, 0x1A,
+ 0x14, 0x61, 0x1D, 0xC9, 0xC2, 0x77, 0x45, 0x13, 0x2D, 0xED, 0x8E, 0x54,
+ 0x5C, 0x1D, 0x54, 0xC7, 0x2F, 0x04, 0x69, 0x97,
+ /* order */
+ 0xA9, 0xFB, 0x57, 0xDB, 0xA1, 0xEE, 0xA9, 0xBC, 0x3E, 0x66, 0x0A, 0x90,
+ 0x9D, 0x83, 0x8D, 0x71, 0x8C, 0x39, 0x7A, 0xA3, 0xB5, 0x61, 0xA6, 0xF7,
+ 0x90, 0x1E, 0x0E, 0x82, 0x97, 0x48, 0x56, 0xA7
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 32 * 6];
+} _EC_brainpoolP256t1 = {
+ {
+ NID_X9_62_prime_field, 0, 32, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xA9, 0xFB, 0x57, 0xDB, 0xA1, 0xEE, 0xA9, 0xBC, 0x3E, 0x66, 0x0A, 0x90,
+ 0x9D, 0x83, 0x8D, 0x72, 0x6E, 0x3B, 0xF6, 0x23, 0xD5, 0x26, 0x20, 0x28,
+ 0x20, 0x13, 0x48, 0x1D, 0x1F, 0x6E, 0x53, 0x77,
+ /* a */
+ 0xA9, 0xFB, 0x57, 0xDB, 0xA1, 0xEE, 0xA9, 0xBC, 0x3E, 0x66, 0x0A, 0x90,
+ 0x9D, 0x83, 0x8D, 0x72, 0x6E, 0x3B, 0xF6, 0x23, 0xD5, 0x26, 0x20, 0x28,
+ 0x20, 0x13, 0x48, 0x1D, 0x1F, 0x6E, 0x53, 0x74,
+ /* b */
+ 0x66, 0x2C, 0x61, 0xC4, 0x30, 0xD8, 0x4E, 0xA4, 0xFE, 0x66, 0xA7, 0x73,
+ 0x3D, 0x0B, 0x76, 0xB7, 0xBF, 0x93, 0xEB, 0xC4, 0xAF, 0x2F, 0x49, 0x25,
+ 0x6A, 0xE5, 0x81, 0x01, 0xFE, 0xE9, 0x2B, 0x04,
+ /* x */
+ 0xA3, 0xE8, 0xEB, 0x3C, 0xC1, 0xCF, 0xE7, 0xB7, 0x73, 0x22, 0x13, 0xB2,
+ 0x3A, 0x65, 0x61, 0x49, 0xAF, 0xA1, 0x42, 0xC4, 0x7A, 0xAF, 0xBC, 0x2B,
+ 0x79, 0xA1, 0x91, 0x56, 0x2E, 0x13, 0x05, 0xF4,
+ /* y */
+ 0x2D, 0x99, 0x6C, 0x82, 0x34, 0x39, 0xC5, 0x6D, 0x7F, 0x7B, 0x22, 0xE1,
+ 0x46, 0x44, 0x41, 0x7E, 0x69, 0xBC, 0xB6, 0xDE, 0x39, 0xD0, 0x27, 0x00,
+ 0x1D, 0xAB, 0xE8, 0xF3, 0x5B, 0x25, 0xC9, 0xBE,
+ /* order */
+ 0xA9, 0xFB, 0x57, 0xDB, 0xA1, 0xEE, 0xA9, 0xBC, 0x3E, 0x66, 0x0A, 0x90,
+ 0x9D, 0x83, 0x8D, 0x71, 0x8C, 0x39, 0x7A, 0xA3, 0xB5, 0x61, 0xA6, 0xF7,
+ 0x90, 0x1E, 0x0E, 0x82, 0x97, 0x48, 0x56, 0xA7
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 40 * 6];
+} _EC_brainpoolP320r1 = {
+ {
+ NID_X9_62_prime_field, 0, 40, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xD3, 0x5E, 0x47, 0x20, 0x36, 0xBC, 0x4F, 0xB7, 0xE1, 0x3C, 0x78, 0x5E,
+ 0xD2, 0x01, 0xE0, 0x65, 0xF9, 0x8F, 0xCF, 0xA6, 0xF6, 0xF4, 0x0D, 0xEF,
+ 0x4F, 0x92, 0xB9, 0xEC, 0x78, 0x93, 0xEC, 0x28, 0xFC, 0xD4, 0x12, 0xB1,
+ 0xF1, 0xB3, 0x2E, 0x27,
+ /* a */
+ 0x3E, 0xE3, 0x0B, 0x56, 0x8F, 0xBA, 0xB0, 0xF8, 0x83, 0xCC, 0xEB, 0xD4,
+ 0x6D, 0x3F, 0x3B, 0xB8, 0xA2, 0xA7, 0x35, 0x13, 0xF5, 0xEB, 0x79, 0xDA,
+ 0x66, 0x19, 0x0E, 0xB0, 0x85, 0xFF, 0xA9, 0xF4, 0x92, 0xF3, 0x75, 0xA9,
+ 0x7D, 0x86, 0x0E, 0xB4,
+ /* b */
+ 0x52, 0x08, 0x83, 0x94, 0x9D, 0xFD, 0xBC, 0x42, 0xD3, 0xAD, 0x19, 0x86,
+ 0x40, 0x68, 0x8A, 0x6F, 0xE1, 0x3F, 0x41, 0x34, 0x95, 0x54, 0xB4, 0x9A,
+ 0xCC, 0x31, 0xDC, 0xCD, 0x88, 0x45, 0x39, 0x81, 0x6F, 0x5E, 0xB4, 0xAC,
+ 0x8F, 0xB1, 0xF1, 0xA6,
+ /* x */
+ 0x43, 0xBD, 0x7E, 0x9A, 0xFB, 0x53, 0xD8, 0xB8, 0x52, 0x89, 0xBC, 0xC4,
+ 0x8E, 0xE5, 0xBF, 0xE6, 0xF2, 0x01, 0x37, 0xD1, 0x0A, 0x08, 0x7E, 0xB6,
+ 0xE7, 0x87, 0x1E, 0x2A, 0x10, 0xA5, 0x99, 0xC7, 0x10, 0xAF, 0x8D, 0x0D,
+ 0x39, 0xE2, 0x06, 0x11,
+ /* y */
+ 0x14, 0xFD, 0xD0, 0x55, 0x45, 0xEC, 0x1C, 0xC8, 0xAB, 0x40, 0x93, 0x24,
+ 0x7F, 0x77, 0x27, 0x5E, 0x07, 0x43, 0xFF, 0xED, 0x11, 0x71, 0x82, 0xEA,
+ 0xA9, 0xC7, 0x78, 0x77, 0xAA, 0xAC, 0x6A, 0xC7, 0xD3, 0x52, 0x45, 0xD1,
+ 0x69, 0x2E, 0x8E, 0xE1,
+ /* order */
+ 0xD3, 0x5E, 0x47, 0x20, 0x36, 0xBC, 0x4F, 0xB7, 0xE1, 0x3C, 0x78, 0x5E,
+ 0xD2, 0x01, 0xE0, 0x65, 0xF9, 0x8F, 0xCF, 0xA5, 0xB6, 0x8F, 0x12, 0xA3,
+ 0x2D, 0x48, 0x2E, 0xC7, 0xEE, 0x86, 0x58, 0xE9, 0x86, 0x91, 0x55, 0x5B,
+ 0x44, 0xC5, 0x93, 0x11
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 40 * 6];
+} _EC_brainpoolP320t1 = {
+ {
+ NID_X9_62_prime_field, 0, 40, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xD3, 0x5E, 0x47, 0x20, 0x36, 0xBC, 0x4F, 0xB7, 0xE1, 0x3C, 0x78, 0x5E,
+ 0xD2, 0x01, 0xE0, 0x65, 0xF9, 0x8F, 0xCF, 0xA6, 0xF6, 0xF4, 0x0D, 0xEF,
+ 0x4F, 0x92, 0xB9, 0xEC, 0x78, 0x93, 0xEC, 0x28, 0xFC, 0xD4, 0x12, 0xB1,
+ 0xF1, 0xB3, 0x2E, 0x27,
+ /* a */
+ 0xD3, 0x5E, 0x47, 0x20, 0x36, 0xBC, 0x4F, 0xB7, 0xE1, 0x3C, 0x78, 0x5E,
+ 0xD2, 0x01, 0xE0, 0x65, 0xF9, 0x8F, 0xCF, 0xA6, 0xF6, 0xF4, 0x0D, 0xEF,
+ 0x4F, 0x92, 0xB9, 0xEC, 0x78, 0x93, 0xEC, 0x28, 0xFC, 0xD4, 0x12, 0xB1,
+ 0xF1, 0xB3, 0x2E, 0x24,
+ /* b */
+ 0xA7, 0xF5, 0x61, 0xE0, 0x38, 0xEB, 0x1E, 0xD5, 0x60, 0xB3, 0xD1, 0x47,
+ 0xDB, 0x78, 0x20, 0x13, 0x06, 0x4C, 0x19, 0xF2, 0x7E, 0xD2, 0x7C, 0x67,
+ 0x80, 0xAA, 0xF7, 0x7F, 0xB8, 0xA5, 0x47, 0xCE, 0xB5, 0xB4, 0xFE, 0xF4,
+ 0x22, 0x34, 0x03, 0x53,
+ /* x */
+ 0x92, 0x5B, 0xE9, 0xFB, 0x01, 0xAF, 0xC6, 0xFB, 0x4D, 0x3E, 0x7D, 0x49,
+ 0x90, 0x01, 0x0F, 0x81, 0x34, 0x08, 0xAB, 0x10, 0x6C, 0x4F, 0x09, 0xCB,
+ 0x7E, 0xE0, 0x78, 0x68, 0xCC, 0x13, 0x6F, 0xFF, 0x33, 0x57, 0xF6, 0x24,
+ 0xA2, 0x1B, 0xED, 0x52,
+ /* y */
+ 0x63, 0xBA, 0x3A, 0x7A, 0x27, 0x48, 0x3E, 0xBF, 0x66, 0x71, 0xDB, 0xEF,
+ 0x7A, 0xBB, 0x30, 0xEB, 0xEE, 0x08, 0x4E, 0x58, 0xA0, 0xB0, 0x77, 0xAD,
+ 0x42, 0xA5, 0xA0, 0x98, 0x9D, 0x1E, 0xE7, 0x1B, 0x1B, 0x9B, 0xC0, 0x45,
+ 0x5F, 0xB0, 0xD2, 0xC3,
+ /* order */
+ 0xD3, 0x5E, 0x47, 0x20, 0x36, 0xBC, 0x4F, 0xB7, 0xE1, 0x3C, 0x78, 0x5E,
+ 0xD2, 0x01, 0xE0, 0x65, 0xF9, 0x8F, 0xCF, 0xA5, 0xB6, 0x8F, 0x12, 0xA3,
+ 0x2D, 0x48, 0x2E, 0xC7, 0xEE, 0x86, 0x58, 0xE9, 0x86, 0x91, 0x55, 0x5B,
+ 0x44, 0xC5, 0x93, 0x11
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 48 * 6];
+} _EC_brainpoolP384r1 = {
+ {
+ NID_X9_62_prime_field, 0, 48, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0x8C, 0xB9, 0x1E, 0x82, 0xA3, 0x38, 0x6D, 0x28, 0x0F, 0x5D, 0x6F, 0x7E,
+ 0x50, 0xE6, 0x41, 0xDF, 0x15, 0x2F, 0x71, 0x09, 0xED, 0x54, 0x56, 0xB4,
+ 0x12, 0xB1, 0xDA, 0x19, 0x7F, 0xB7, 0x11, 0x23, 0xAC, 0xD3, 0xA7, 0x29,
+ 0x90, 0x1D, 0x1A, 0x71, 0x87, 0x47, 0x00, 0x13, 0x31, 0x07, 0xEC, 0x53,
+ /* a */
+ 0x7B, 0xC3, 0x82, 0xC6, 0x3D, 0x8C, 0x15, 0x0C, 0x3C, 0x72, 0x08, 0x0A,
+ 0xCE, 0x05, 0xAF, 0xA0, 0xC2, 0xBE, 0xA2, 0x8E, 0x4F, 0xB2, 0x27, 0x87,
+ 0x13, 0x91, 0x65, 0xEF, 0xBA, 0x91, 0xF9, 0x0F, 0x8A, 0xA5, 0x81, 0x4A,
+ 0x50, 0x3A, 0xD4, 0xEB, 0x04, 0xA8, 0xC7, 0xDD, 0x22, 0xCE, 0x28, 0x26,
+ /* b */
+ 0x04, 0xA8, 0xC7, 0xDD, 0x22, 0xCE, 0x28, 0x26, 0x8B, 0x39, 0xB5, 0x54,
+ 0x16, 0xF0, 0x44, 0x7C, 0x2F, 0xB7, 0x7D, 0xE1, 0x07, 0xDC, 0xD2, 0xA6,
+ 0x2E, 0x88, 0x0E, 0xA5, 0x3E, 0xEB, 0x62, 0xD5, 0x7C, 0xB4, 0x39, 0x02,
+ 0x95, 0xDB, 0xC9, 0x94, 0x3A, 0xB7, 0x86, 0x96, 0xFA, 0x50, 0x4C, 0x11,
+ /* x */
+ 0x1D, 0x1C, 0x64, 0xF0, 0x68, 0xCF, 0x45, 0xFF, 0xA2, 0xA6, 0x3A, 0x81,
+ 0xB7, 0xC1, 0x3F, 0x6B, 0x88, 0x47, 0xA3, 0xE7, 0x7E, 0xF1, 0x4F, 0xE3,
+ 0xDB, 0x7F, 0xCA, 0xFE, 0x0C, 0xBD, 0x10, 0xE8, 0xE8, 0x26, 0xE0, 0x34,
+ 0x36, 0xD6, 0x46, 0xAA, 0xEF, 0x87, 0xB2, 0xE2, 0x47, 0xD4, 0xAF, 0x1E,
+ /* y */
+ 0x8A, 0xBE, 0x1D, 0x75, 0x20, 0xF9, 0xC2, 0xA4, 0x5C, 0xB1, 0xEB, 0x8E,
+ 0x95, 0xCF, 0xD5, 0x52, 0x62, 0xB7, 0x0B, 0x29, 0xFE, 0xEC, 0x58, 0x64,
+ 0xE1, 0x9C, 0x05, 0x4F, 0xF9, 0x91, 0x29, 0x28, 0x0E, 0x46, 0x46, 0x21,
+ 0x77, 0x91, 0x81, 0x11, 0x42, 0x82, 0x03, 0x41, 0x26, 0x3C, 0x53, 0x15,
+ /* order */
+ 0x8C, 0xB9, 0x1E, 0x82, 0xA3, 0x38, 0x6D, 0x28, 0x0F, 0x5D, 0x6F, 0x7E,
+ 0x50, 0xE6, 0x41, 0xDF, 0x15, 0x2F, 0x71, 0x09, 0xED, 0x54, 0x56, 0xB3,
+ 0x1F, 0x16, 0x6E, 0x6C, 0xAC, 0x04, 0x25, 0xA7, 0xCF, 0x3A, 0xB6, 0xAF,
+ 0x6B, 0x7F, 0xC3, 0x10, 0x3B, 0x88, 0x32, 0x02, 0xE9, 0x04, 0x65, 0x65
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 48 * 6];
+} _EC_brainpoolP384t1 = {
+ {
+ NID_X9_62_prime_field, 0, 48, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0x8C, 0xB9, 0x1E, 0x82, 0xA3, 0x38, 0x6D, 0x28, 0x0F, 0x5D, 0x6F, 0x7E,
+ 0x50, 0xE6, 0x41, 0xDF, 0x15, 0x2F, 0x71, 0x09, 0xED, 0x54, 0x56, 0xB4,
+ 0x12, 0xB1, 0xDA, 0x19, 0x7F, 0xB7, 0x11, 0x23, 0xAC, 0xD3, 0xA7, 0x29,
+ 0x90, 0x1D, 0x1A, 0x71, 0x87, 0x47, 0x00, 0x13, 0x31, 0x07, 0xEC, 0x53,
+ /* a */
+ 0x8C, 0xB9, 0x1E, 0x82, 0xA3, 0x38, 0x6D, 0x28, 0x0F, 0x5D, 0x6F, 0x7E,
+ 0x50, 0xE6, 0x41, 0xDF, 0x15, 0x2F, 0x71, 0x09, 0xED, 0x54, 0x56, 0xB4,
+ 0x12, 0xB1, 0xDA, 0x19, 0x7F, 0xB7, 0x11, 0x23, 0xAC, 0xD3, 0xA7, 0x29,
+ 0x90, 0x1D, 0x1A, 0x71, 0x87, 0x47, 0x00, 0x13, 0x31, 0x07, 0xEC, 0x50,
+ /* b */
+ 0x7F, 0x51, 0x9E, 0xAD, 0xA7, 0xBD, 0xA8, 0x1B, 0xD8, 0x26, 0xDB, 0xA6,
+ 0x47, 0x91, 0x0F, 0x8C, 0x4B, 0x93, 0x46, 0xED, 0x8C, 0xCD, 0xC6, 0x4E,
+ 0x4B, 0x1A, 0xBD, 0x11, 0x75, 0x6D, 0xCE, 0x1D, 0x20, 0x74, 0xAA, 0x26,
+ 0x3B, 0x88, 0x80, 0x5C, 0xED, 0x70, 0x35, 0x5A, 0x33, 0xB4, 0x71, 0xEE,
+ /* x */
+ 0x18, 0xDE, 0x98, 0xB0, 0x2D, 0xB9, 0xA3, 0x06, 0xF2, 0xAF, 0xCD, 0x72,
+ 0x35, 0xF7, 0x2A, 0x81, 0x9B, 0x80, 0xAB, 0x12, 0xEB, 0xD6, 0x53, 0x17,
+ 0x24, 0x76, 0xFE, 0xCD, 0x46, 0x2A, 0xAB, 0xFF, 0xC4, 0xFF, 0x19, 0x1B,
+ 0x94, 0x6A, 0x5F, 0x54, 0xD8, 0xD0, 0xAA, 0x2F, 0x41, 0x88, 0x08, 0xCC,
+ /* y */
+ 0x25, 0xAB, 0x05, 0x69, 0x62, 0xD3, 0x06, 0x51, 0xA1, 0x14, 0xAF, 0xD2,
+ 0x75, 0x5A, 0xD3, 0x36, 0x74, 0x7F, 0x93, 0x47, 0x5B, 0x7A, 0x1F, 0xCA,
+ 0x3B, 0x88, 0xF2, 0xB6, 0xA2, 0x08, 0xCC, 0xFE, 0x46, 0x94, 0x08, 0x58,
+ 0x4D, 0xC2, 0xB2, 0x91, 0x26, 0x75, 0xBF, 0x5B, 0x9E, 0x58, 0x29, 0x28,
+ /* order */
+ 0x8C, 0xB9, 0x1E, 0x82, 0xA3, 0x38, 0x6D, 0x28, 0x0F, 0x5D, 0x6F, 0x7E,
+ 0x50, 0xE6, 0x41, 0xDF, 0x15, 0x2F, 0x71, 0x09, 0xED, 0x54, 0x56, 0xB3,
+ 0x1F, 0x16, 0x6E, 0x6C, 0xAC, 0x04, 0x25, 0xA7, 0xCF, 0x3A, 0xB6, 0xAF,
+ 0x6B, 0x7F, 0xC3, 0x10, 0x3B, 0x88, 0x32, 0x02, 0xE9, 0x04, 0x65, 0x65
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 64 * 6];
+} _EC_brainpoolP512r1 = {
+ {
+ NID_X9_62_prime_field, 0, 64, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xAA, 0xDD, 0x9D, 0xB8, 0xDB, 0xE9, 0xC4, 0x8B, 0x3F, 0xD4, 0xE6, 0xAE,
+ 0x33, 0xC9, 0xFC, 0x07, 0xCB, 0x30, 0x8D, 0xB3, 0xB3, 0xC9, 0xD2, 0x0E,
+ 0xD6, 0x63, 0x9C, 0xCA, 0x70, 0x33, 0x08, 0x71, 0x7D, 0x4D, 0x9B, 0x00,
+ 0x9B, 0xC6, 0x68, 0x42, 0xAE, 0xCD, 0xA1, 0x2A, 0xE6, 0xA3, 0x80, 0xE6,
+ 0x28, 0x81, 0xFF, 0x2F, 0x2D, 0x82, 0xC6, 0x85, 0x28, 0xAA, 0x60, 0x56,
+ 0x58, 0x3A, 0x48, 0xF3,
+ /* a */
+ 0x78, 0x30, 0xA3, 0x31, 0x8B, 0x60, 0x3B, 0x89, 0xE2, 0x32, 0x71, 0x45,
+ 0xAC, 0x23, 0x4C, 0xC5, 0x94, 0xCB, 0xDD, 0x8D, 0x3D, 0xF9, 0x16, 0x10,
+ 0xA8, 0x34, 0x41, 0xCA, 0xEA, 0x98, 0x63, 0xBC, 0x2D, 0xED, 0x5D, 0x5A,
+ 0xA8, 0x25, 0x3A, 0xA1, 0x0A, 0x2E, 0xF1, 0xC9, 0x8B, 0x9A, 0xC8, 0xB5,
+ 0x7F, 0x11, 0x17, 0xA7, 0x2B, 0xF2, 0xC7, 0xB9, 0xE7, 0xC1, 0xAC, 0x4D,
+ 0x77, 0xFC, 0x94, 0xCA,
+ /* b */
+ 0x3D, 0xF9, 0x16, 0x10, 0xA8, 0x34, 0x41, 0xCA, 0xEA, 0x98, 0x63, 0xBC,
+ 0x2D, 0xED, 0x5D, 0x5A, 0xA8, 0x25, 0x3A, 0xA1, 0x0A, 0x2E, 0xF1, 0xC9,
+ 0x8B, 0x9A, 0xC8, 0xB5, 0x7F, 0x11, 0x17, 0xA7, 0x2B, 0xF2, 0xC7, 0xB9,
+ 0xE7, 0xC1, 0xAC, 0x4D, 0x77, 0xFC, 0x94, 0xCA, 0xDC, 0x08, 0x3E, 0x67,
+ 0x98, 0x40, 0x50, 0xB7, 0x5E, 0xBA, 0xE5, 0xDD, 0x28, 0x09, 0xBD, 0x63,
+ 0x80, 0x16, 0xF7, 0x23,
+ /* x */
+ 0x81, 0xAE, 0xE4, 0xBD, 0xD8, 0x2E, 0xD9, 0x64, 0x5A, 0x21, 0x32, 0x2E,
+ 0x9C, 0x4C, 0x6A, 0x93, 0x85, 0xED, 0x9F, 0x70, 0xB5, 0xD9, 0x16, 0xC1,
+ 0xB4, 0x3B, 0x62, 0xEE, 0xF4, 0xD0, 0x09, 0x8E, 0xFF, 0x3B, 0x1F, 0x78,
+ 0xE2, 0xD0, 0xD4, 0x8D, 0x50, 0xD1, 0x68, 0x7B, 0x93, 0xB9, 0x7D, 0x5F,
+ 0x7C, 0x6D, 0x50, 0x47, 0x40, 0x6A, 0x5E, 0x68, 0x8B, 0x35, 0x22, 0x09,
+ 0xBC, 0xB9, 0xF8, 0x22,
+ /* y */
+ 0x7D, 0xDE, 0x38, 0x5D, 0x56, 0x63, 0x32, 0xEC, 0xC0, 0xEA, 0xBF, 0xA9,
+ 0xCF, 0x78, 0x22, 0xFD, 0xF2, 0x09, 0xF7, 0x00, 0x24, 0xA5, 0x7B, 0x1A,
+ 0xA0, 0x00, 0xC5, 0x5B, 0x88, 0x1F, 0x81, 0x11, 0xB2, 0xDC, 0xDE, 0x49,
+ 0x4A, 0x5F, 0x48, 0x5E, 0x5B, 0xCA, 0x4B, 0xD8, 0x8A, 0x27, 0x63, 0xAE,
+ 0xD1, 0xCA, 0x2B, 0x2F, 0xA8, 0xF0, 0x54, 0x06, 0x78, 0xCD, 0x1E, 0x0F,
+ 0x3A, 0xD8, 0x08, 0x92,
+ /* order */
+ 0xAA, 0xDD, 0x9D, 0xB8, 0xDB, 0xE9, 0xC4, 0x8B, 0x3F, 0xD4, 0xE6, 0xAE,
+ 0x33, 0xC9, 0xFC, 0x07, 0xCB, 0x30, 0x8D, 0xB3, 0xB3, 0xC9, 0xD2, 0x0E,
+ 0xD6, 0x63, 0x9C, 0xCA, 0x70, 0x33, 0x08, 0x70, 0x55, 0x3E, 0x5C, 0x41,
+ 0x4C, 0xA9, 0x26, 0x19, 0x41, 0x86, 0x61, 0x19, 0x7F, 0xAC, 0x10, 0x47,
+ 0x1D, 0xB1, 0xD3, 0x81, 0x08, 0x5D, 0xDA, 0xDD, 0xB5, 0x87, 0x96, 0x82,
+ 0x9C, 0xA9, 0x00, 0x69
+ }
+};
+
+static const struct {
+ EC_CURVE_DATA h;
+ unsigned char data[0 + 64 * 6];
+} _EC_brainpoolP512t1 = {
+ {
+ NID_X9_62_prime_field, 0, 64, 1
+ },
+ {
+ /* no seed */
+ /* p */
+ 0xAA, 0xDD, 0x9D, 0xB8, 0xDB, 0xE9, 0xC4, 0x8B, 0x3F, 0xD4, 0xE6, 0xAE,
+ 0x33, 0xC9, 0xFC, 0x07, 0xCB, 0x30, 0x8D, 0xB3, 0xB3, 0xC9, 0xD2, 0x0E,
+ 0xD6, 0x63, 0x9C, 0xCA, 0x70, 0x33, 0x08, 0x71, 0x7D, 0x4D, 0x9B, 0x00,
+ 0x9B, 0xC6, 0x68, 0x42, 0xAE, 0xCD, 0xA1, 0x2A, 0xE6, 0xA3, 0x80, 0xE6,
+ 0x28, 0x81, 0xFF, 0x2F, 0x2D, 0x82, 0xC6, 0x85, 0x28, 0xAA, 0x60, 0x56,
+ 0x58, 0x3A, 0x48, 0xF3,
+ /* a */
+ 0xAA, 0xDD, 0x9D, 0xB8, 0xDB, 0xE9, 0xC4, 0x8B, 0x3F, 0xD4, 0xE6, 0xAE,
+ 0x33, 0xC9, 0xFC, 0x07, 0xCB, 0x30, 0x8D, 0xB3, 0xB3, 0xC9, 0xD2, 0x0E,
+ 0xD6, 0x63, 0x9C, 0xCA, 0x70, 0x33, 0x08, 0x71, 0x7D, 0x4D, 0x9B, 0x00,
+ 0x9B, 0xC6, 0x68, 0x42, 0xAE, 0xCD, 0xA1, 0x2A, 0xE6, 0xA3, 0x80, 0xE6,
+ 0x28, 0x81, 0xFF, 0x2F, 0x2D, 0x82, 0xC6, 0x85, 0x28, 0xAA, 0x60, 0x56,
+ 0x58, 0x3A, 0x48, 0xF0,
+ /* b */
+ 0x7C, 0xBB, 0xBC, 0xF9, 0x44, 0x1C, 0xFA, 0xB7, 0x6E, 0x18, 0x90, 0xE4,
+ 0x68, 0x84, 0xEA, 0xE3, 0x21, 0xF7, 0x0C, 0x0B, 0xCB, 0x49, 0x81, 0x52,
+ 0x78, 0x97, 0x50, 0x4B, 0xEC, 0x3E, 0x36, 0xA6, 0x2B, 0xCD, 0xFA, 0x23,
+ 0x04, 0x97, 0x65, 0x40, 0xF6, 0x45, 0x00, 0x85, 0xF2, 0xDA, 0xE1, 0x45,
+ 0xC2, 0x25, 0x53, 0xB4, 0x65, 0x76, 0x36, 0x89, 0x18, 0x0E, 0xA2, 0x57,
+ 0x18, 0x67, 0x42, 0x3E,
+ /* x */
+ 0x64, 0x0E, 0xCE, 0x5C, 0x12, 0x78, 0x87, 0x17, 0xB9, 0xC1, 0xBA, 0x06,
+ 0xCB, 0xC2, 0xA6, 0xFE, 0xBA, 0x85, 0x84, 0x24, 0x58, 0xC5, 0x6D, 0xDE,
+ 0x9D, 0xB1, 0x75, 0x8D, 0x39, 0xC0, 0x31, 0x3D, 0x82, 0xBA, 0x51, 0x73,
+ 0x5C, 0xDB, 0x3E, 0xA4, 0x99, 0xAA, 0x77, 0xA7, 0xD6, 0x94, 0x3A, 0x64,
+ 0xF7, 0xA3, 0xF2, 0x5F, 0xE2, 0x6F, 0x06, 0xB5, 0x1B, 0xAA, 0x26, 0x96,
+ 0xFA, 0x90, 0x35, 0xDA,
+ /* y */
+ 0x5B, 0x53, 0x4B, 0xD5, 0x95, 0xF5, 0xAF, 0x0F, 0xA2, 0xC8, 0x92, 0x37,
+ 0x6C, 0x84, 0xAC, 0xE1, 0xBB, 0x4E, 0x30, 0x19, 0xB7, 0x16, 0x34, 0xC0,
+ 0x11, 0x31, 0x15, 0x9C, 0xAE, 0x03, 0xCE, 0xE9, 0xD9, 0x93, 0x21, 0x84,
+ 0xBE, 0xEF, 0x21, 0x6B, 0xD7, 0x1D, 0xF2, 0xDA, 0xDF, 0x86, 0xA6, 0x27,
+ 0x30, 0x6E, 0xCF, 0xF9, 0x6D, 0xBB, 0x8B, 0xAC, 0xE1, 0x98, 0xB6, 0x1E,
+ 0x00, 0xF8, 0xB3, 0x32,
+ /* order */
+ 0xAA, 0xDD, 0x9D, 0xB8, 0xDB, 0xE9, 0xC4, 0x8B, 0x3F, 0xD4, 0xE6, 0xAE,
+ 0x33, 0xC9, 0xFC, 0x07, 0xCB, 0x30, 0x8D, 0xB3, 0xB3, 0xC9, 0xD2, 0x0E,
+ 0xD6, 0x63, 0x9C, 0xCA, 0x70, 0x33, 0x08, 0x70, 0x55, 0x3E, 0x5C, 0x41,
+ 0x4C, 0xA9, 0x26, 0x19, 0x41, 0x86, 0x61, 0x19, 0x7F, 0xAC, 0x10, 0x47,
+ 0x1D, 0xB1, 0xD3, 0x81, 0x08, 0x5D, 0xDA, 0xDD, 0xB5, 0x87, 0x96, 0x82,
+ 0x9C, 0xA9, 0x00, 0x69
+ }
+};
+
typedef struct _ec_list_element_st {
int nid;
const EC_CURVE_DATA *data;
@@ -2343,13 +2896,15 @@ static const ec_list_element curve_list[] = {
"X9.62 curve over a 239 bit prime field"},
{NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0,
"X9.62 curve over a 239 bit prime field"},
-#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
- {NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method,
- "X9.62/SECG curve over a 256 bit prime field"},
+ {NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h,
+#if defined(ECP_NISTZ256_ASM)
+ EC_GFp_nistz256_method,
+#elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128)
+ EC_GFp_nistp256_method,
#else
- {NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0,
- "X9.62/SECG curve over a 256 bit prime field"},
+ 0,
#endif
+ "X9.62/SECG curve over a 256 bit prime field"},
#ifndef OPENSSL_NO_EC2M
/* characteristic two field curves */
/* NIST/SECG curves */
@@ -2460,6 +3015,35 @@ static const ec_list_element curve_list[] = {
"\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
"\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
#endif
+ /* brainpool curves */
+ {NID_brainpoolP160r1, &_EC_brainpoolP160r1.h, 0,
+ "RFC 5639 curve over a 160 bit prime field"},
+ {NID_brainpoolP160t1, &_EC_brainpoolP160t1.h, 0,
+ "RFC 5639 curve over a 160 bit prime field"},
+ {NID_brainpoolP192r1, &_EC_brainpoolP192r1.h, 0,
+ "RFC 5639 curve over a 192 bit prime field"},
+ {NID_brainpoolP192t1, &_EC_brainpoolP192t1.h, 0,
+ "RFC 5639 curve over a 192 bit prime field"},
+ {NID_brainpoolP224r1, &_EC_brainpoolP224r1.h, 0,
+ "RFC 5639 curve over a 224 bit prime field"},
+ {NID_brainpoolP224t1, &_EC_brainpoolP224t1.h, 0,
+ "RFC 5639 curve over a 224 bit prime field"},
+ {NID_brainpoolP256r1, &_EC_brainpoolP256r1.h, 0,
+ "RFC 5639 curve over a 256 bit prime field"},
+ {NID_brainpoolP256t1, &_EC_brainpoolP256t1.h, 0,
+ "RFC 5639 curve over a 256 bit prime field"},
+ {NID_brainpoolP320r1, &_EC_brainpoolP320r1.h, 0,
+ "RFC 5639 curve over a 320 bit prime field"},
+ {NID_brainpoolP320t1, &_EC_brainpoolP320t1.h, 0,
+ "RFC 5639 curve over a 320 bit prime field"},
+ {NID_brainpoolP384r1, &_EC_brainpoolP384r1.h, 0,
+ "RFC 5639 curve over a 384 bit prime field"},
+ {NID_brainpoolP384t1, &_EC_brainpoolP384t1.h, 0,
+ "RFC 5639 curve over a 384 bit prime field"},
+ {NID_brainpoolP512r1, &_EC_brainpoolP512r1.h, 0,
+ "RFC 5639 curve over a 512 bit prime field"},
+ {NID_brainpoolP512t1, &_EC_brainpoolP512t1.h, 0,
+ "RFC 5639 curve over a 512 bit prime field"},
};
#define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
@@ -2578,6 +3162,10 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid)
size_t i;
EC_GROUP *ret = NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_group_new_by_curve_name(nid);
+#endif
if (nid <= 0)
return NULL;
@@ -2613,3 +3201,48 @@ size_t EC_get_builtin_curves(EC_builtin_curve *r, size_t nitems)
return curve_list_length;
}
+
+/* Functions to translate between common NIST curve names and NIDs */
+
+typedef struct {
+ const char *name; /* NIST Name of curve */
+ int nid; /* Curve NID */
+} EC_NIST_NAME;
+
+static EC_NIST_NAME nist_curves[] = {
+ {"B-163", NID_sect163r2},
+ {"B-233", NID_sect233r1},
+ {"B-283", NID_sect283r1},
+ {"B-409", NID_sect409r1},
+ {"B-571", NID_sect571r1},
+ {"K-163", NID_sect163k1},
+ {"K-233", NID_sect233k1},
+ {"K-283", NID_sect283k1},
+ {"K-409", NID_sect409k1},
+ {"K-571", NID_sect571k1},
+ {"P-192", NID_X9_62_prime192v1},
+ {"P-224", NID_secp224r1},
+ {"P-256", NID_X9_62_prime256v1},
+ {"P-384", NID_secp384r1},
+ {"P-521", NID_secp521r1}
+};
+
+const char *EC_curve_nid2nist(int nid)
+{
+ size_t i;
+ for (i = 0; i < sizeof(nist_curves) / sizeof(EC_NIST_NAME); i++) {
+ if (nist_curves[i].nid == nid)
+ return nist_curves[i].name;
+ }
+ return NULL;
+}
+
+int EC_curve_nist2nid(const char *name)
+{
+ size_t i;
+ for (i = 0; i < sizeof(nist_curves) / sizeof(EC_NIST_NAME); i++) {
+ if (!strcmp(nist_curves[i].name, name))
+ return nist_curves[i].nid;
+ }
+ return NID_undef;
+}
diff --git a/crypto/ec/ec_cvt.c b/crypto/ec/ec_cvt.c
index 487d727407e0..5a832ba1cfa4 100644
--- a/crypto/ec/ec_cvt.c
+++ b/crypto/ec/ec_cvt.c
@@ -72,12 +72,20 @@
#include <openssl/err.h>
#include "ec_lcl.h"
+#ifdef OPENSSL_FIPS
+# include <openssl/fips.h>
+#endif
+
EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a,
const BIGNUM *b, BN_CTX *ctx)
{
const EC_METHOD *meth;
EC_GROUP *ret;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_group_new_curve_gfp(p, a, b, ctx);
+#endif
#if defined(OPENSSL_BN_ASM_MONT)
/*
* This might appear controversial, but the fact is that generic
@@ -152,6 +160,10 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a,
const EC_METHOD *meth;
EC_GROUP *ret;
+# ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_group_new_curve_gf2m(p, a, b, ctx);
+# endif
meth = EC_GF2m_simple_method();
ret = EC_GROUP_new(meth);
diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c
index 58eae7c6668e..6fe5baafd4b3 100644
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c
@@ -1,6 +1,6 @@
/* crypto/ec/ec_err.c */
/* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2015 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@ static ERR_STRING_DATA EC_str_functs[] = {
{ERR_FUNC(EC_F_D2I_ECPKPARAMETERS), "d2i_ECPKParameters"},
{ERR_FUNC(EC_F_D2I_ECPRIVATEKEY), "d2i_ECPrivateKey"},
{ERR_FUNC(EC_F_DO_EC_KEY_PRINT), "DO_EC_KEY_PRINT"},
+ {ERR_FUNC(EC_F_ECDH_CMS_DECRYPT), "ECDH_CMS_DECRYPT"},
+ {ERR_FUNC(EC_F_ECDH_CMS_SET_SHARED_INFO), "ECDH_CMS_SET_SHARED_INFO"},
{ERR_FUNC(EC_F_ECKEY_PARAM2TYPE), "ECKEY_PARAM2TYPE"},
{ERR_FUNC(EC_F_ECKEY_PARAM_DECODE), "ECKEY_PARAM_DECODE"},
{ERR_FUNC(EC_F_ECKEY_PRIV_DECODE), "ECKEY_PRIV_DECODE"},
@@ -87,6 +89,13 @@ static ERR_STRING_DATA EC_str_functs[] = {
{ERR_FUNC(EC_F_ECPARAMETERS_PRINT_FP), "ECParameters_print_fp"},
{ERR_FUNC(EC_F_ECPKPARAMETERS_PRINT), "ECPKParameters_print"},
{ERR_FUNC(EC_F_ECPKPARAMETERS_PRINT_FP), "ECPKParameters_print_fp"},
+ {ERR_FUNC(EC_F_ECP_NISTZ256_GET_AFFINE), "ecp_nistz256_get_affine"},
+ {ERR_FUNC(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE),
+ "ecp_nistz256_mult_precompute"},
+ {ERR_FUNC(EC_F_ECP_NISTZ256_POINTS_MUL), "ecp_nistz256_points_mul"},
+ {ERR_FUNC(EC_F_ECP_NISTZ256_PRE_COMP_NEW), "ecp_nistz256_pre_comp_new"},
+ {ERR_FUNC(EC_F_ECP_NISTZ256_SET_WORDS), "ecp_nistz256_set_words"},
+ {ERR_FUNC(EC_F_ECP_NISTZ256_WINDOWED_MUL), "ecp_nistz256_windowed_mul"},
{ERR_FUNC(EC_F_ECP_NIST_MOD_192), "ECP_NIST_MOD_192"},
{ERR_FUNC(EC_F_ECP_NIST_MOD_224), "ECP_NIST_MOD_224"},
{ERR_FUNC(EC_F_ECP_NIST_MOD_256), "ECP_NIST_MOD_256"},
@@ -271,6 +280,7 @@ static ERR_STRING_DATA EC_str_reasons[] = {
{ERR_REASON(EC_R_INVALID_COMPRESSED_POINT), "invalid compressed point"},
{ERR_REASON(EC_R_INVALID_COMPRESSION_BIT), "invalid compression bit"},
{ERR_REASON(EC_R_INVALID_CURVE), "invalid curve"},
+ {ERR_REASON(EC_R_INVALID_DIGEST), "invalid digest"},
{ERR_REASON(EC_R_INVALID_DIGEST_TYPE), "invalid digest type"},
{ERR_REASON(EC_R_INVALID_ENCODING), "invalid encoding"},
{ERR_REASON(EC_R_INVALID_FIELD), "invalid field"},
@@ -279,6 +289,7 @@ static ERR_STRING_DATA EC_str_reasons[] = {
{ERR_REASON(EC_R_INVALID_PENTANOMIAL_BASIS), "invalid pentanomial basis"},
{ERR_REASON(EC_R_INVALID_PRIVATE_KEY), "invalid private key"},
{ERR_REASON(EC_R_INVALID_TRINOMIAL_BASIS), "invalid trinomial basis"},
+ {ERR_REASON(EC_R_KDF_PARAMETER_ERROR), "kdf parameter error"},
{ERR_REASON(EC_R_KEYS_NOT_SET), "keys not set"},
{ERR_REASON(EC_R_MISSING_PARAMETERS), "missing parameters"},
{ERR_REASON(EC_R_MISSING_PRIVATE_KEY), "missing private key"},
@@ -290,10 +301,12 @@ static ERR_STRING_DATA EC_str_reasons[] = {
{ERR_REASON(EC_R_NO_FIELD_MOD), "no field mod"},
{ERR_REASON(EC_R_NO_PARAMETERS_SET), "no parameters set"},
{ERR_REASON(EC_R_PASSED_NULL_PARAMETER), "passed null parameter"},
+ {ERR_REASON(EC_R_PEER_KEY_ERROR), "peer key error"},
{ERR_REASON(EC_R_PKPARAMETERS2GROUP_FAILURE),
"pkparameters2group failure"},
{ERR_REASON(EC_R_POINT_AT_INFINITY), "point at infinity"},
{ERR_REASON(EC_R_POINT_IS_NOT_ON_CURVE), "point is not on curve"},
+ {ERR_REASON(EC_R_SHARED_INFO_ERROR), "shared info error"},
{ERR_REASON(EC_R_SLOT_FULL), "slot full"},
{ERR_REASON(EC_R_UNDEFINED_GENERATOR), "undefined generator"},
{ERR_REASON(EC_R_UNDEFINED_ORDER), "undefined order"},
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index d79ed1e4d92d..969fd147ef93 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -212,6 +212,13 @@ struct ec_group_st {
BIGNUM order, cofactor;
int curve_name; /* optional NID for named curve */
int asn1_flag; /* flag to control the asn1 encoding */
+ /*
+     * Kludge: the upper bit of asn1_flag is used to denote the structure
+     * version.  If it is set, the last field is present.  This is done
+     * for interoperation with the FIPS code.
+ */
+#define EC_GROUP_ASN1_FLAG_MASK 0x7fffffff
+#define EC_GROUP_VERSION(p) (p->asn1_flag&~EC_GROUP_ASN1_FLAG_MASK)
point_conversion_form_t asn1_form;
unsigned char *seed; /* optional seed for parameters (appears in
* ASN1) */
@@ -252,6 +259,7 @@ struct ec_group_st {
/* method-specific */
int (*field_mod_func) (BIGNUM *, const BIGNUM *, const BIGNUM *,
BN_CTX *);
+ BN_MONT_CTX *mont_data; /* data for ECDSA inverse */
} /* EC_GROUP */ ;
struct ec_key_st {
@@ -541,3 +549,20 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
unsigned char *digit, unsigned char in);
#endif
+int ec_precompute_mont_data(EC_GROUP *);
+
+#ifdef ECP_NISTZ256_ASM
+/** Returns a GFp EC_METHOD using Montgomery multiplication, with x86-64
+ * optimized P-256.  See http://eprint.iacr.org/2013/816.
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistz256_method(void);
+#endif
+
+#ifdef OPENSSL_FIPS
+EC_GROUP *FIPS_ec_group_new_curve_gfp(const BIGNUM *p, const BIGNUM *a,
+ const BIGNUM *b, BN_CTX *ctx);
+EC_GROUP *FIPS_ec_group_new_curve_gf2m(const BIGNUM *p, const BIGNUM *a,
+ const BIGNUM *b, BN_CTX *ctx);
+EC_GROUP *FIPS_ec_group_new_by_curve_name(int nid);
+#endif
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index e2275207ed22..3ffa112cc306 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -94,13 +94,14 @@ EC_GROUP *EC_GROUP_new(const EC_METHOD *meth)
ret->meth = meth;
ret->extra_data = NULL;
+ ret->mont_data = NULL;
ret->generator = NULL;
BN_init(&ret->order);
BN_init(&ret->cofactor);
ret->curve_name = 0;
- ret->asn1_flag = 0;
+ ret->asn1_flag = ~EC_GROUP_ASN1_FLAG_MASK;
ret->asn1_form = POINT_CONVERSION_UNCOMPRESSED;
ret->seed = NULL;
@@ -124,6 +125,9 @@ void EC_GROUP_free(EC_GROUP *group)
EC_EX_DATA_free_all_data(&group->extra_data);
+ if (EC_GROUP_VERSION(group) && group->mont_data)
+ BN_MONT_CTX_free(group->mont_data);
+
if (group->generator != NULL)
EC_POINT_free(group->generator);
BN_free(&group->order);
@@ -147,6 +151,9 @@ void EC_GROUP_clear_free(EC_GROUP *group)
EC_EX_DATA_clear_free_all_data(&group->extra_data);
+ if (EC_GROUP_VERSION(group) && group->mont_data)
+ BN_MONT_CTX_free(group->mont_data);
+
if (group->generator != NULL)
EC_POINT_clear_free(group->generator);
BN_clear_free(&group->order);
@@ -189,6 +196,22 @@ int EC_GROUP_copy(EC_GROUP *dest, const EC_GROUP *src)
return 0;
}
+ if (EC_GROUP_VERSION(src) && src->mont_data != NULL) {
+ if (dest->mont_data == NULL) {
+ dest->mont_data = BN_MONT_CTX_new();
+ if (dest->mont_data == NULL)
+ return 0;
+ }
+ if (!BN_MONT_CTX_copy(dest->mont_data, src->mont_data))
+ return 0;
+ } else {
+        /* src->mont_data == NULL */
+ if (EC_GROUP_VERSION(dest) && dest->mont_data != NULL) {
+ BN_MONT_CTX_free(dest->mont_data);
+ dest->mont_data = NULL;
+ }
+ }
+
if (src->generator != NULL) {
if (dest->generator == NULL) {
dest->generator = EC_POINT_new(dest);
@@ -295,6 +318,13 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
} else
BN_zero(&group->cofactor);
+ /*
+ * We ignore the return value because some groups have an order with
+ * factors of two, which makes the Montgomery setup fail.
+ * |group->mont_data| will be NULL in this case.
+ */
+ ec_precompute_mont_data(group);
+
return 1;
}
@@ -303,6 +333,11 @@ const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group)
return group->generator;
}
+BN_MONT_CTX *EC_GROUP_get_mont_data(const EC_GROUP *group)
+{
+ return EC_GROUP_VERSION(group) ? group->mont_data : NULL;
+}
+
int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx)
{
if (!BN_copy(order, &group->order))
@@ -332,12 +367,13 @@ int EC_GROUP_get_curve_name(const EC_GROUP *group)
void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag)
{
- group->asn1_flag = flag;
+ group->asn1_flag &= ~EC_GROUP_ASN1_FLAG_MASK;
+ group->asn1_flag |= flag & EC_GROUP_ASN1_FLAG_MASK;
}
int EC_GROUP_get_asn1_flag(const EC_GROUP *group)
{
- return group->asn1_flag;
+ return group->asn1_flag & EC_GROUP_ASN1_FLAG_MASK;
}
void EC_GROUP_set_point_conversion_form(EC_GROUP *group,
@@ -1057,3 +1093,42 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
return 0; /* cannot tell whether precomputation has
* been performed */
}
+
+/*
+ * ec_precompute_mont_data sets |group->mont_data| from |group->order| and
+ * returns one on success. On error it returns zero.
+ */
+int ec_precompute_mont_data(EC_GROUP *group)
+{
+ BN_CTX *ctx = BN_CTX_new();
+ int ret = 0;
+
+ if (!EC_GROUP_VERSION(group))
+ goto err;
+
+ if (group->mont_data) {
+ BN_MONT_CTX_free(group->mont_data);
+ group->mont_data = NULL;
+ }
+
+ if (ctx == NULL)
+ goto err;
+
+ group->mont_data = BN_MONT_CTX_new();
+ if (!group->mont_data)
+ goto err;
+
+ if (!BN_MONT_CTX_set(group->mont_data, &group->order, ctx)) {
+ BN_MONT_CTX_free(group->mont_data);
+ group->mont_data = NULL;
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+
+ if (ctx)
+ BN_CTX_free(ctx);
+ return ret;
+}
diff --git a/crypto/ec/ec_pmeth.c b/crypto/ec/ec_pmeth.c
index c189d3ffbf2a..b76749010c93 100644
--- a/crypto/ec/ec_pmeth.c
+++ b/crypto/ec/ec_pmeth.c
@@ -61,6 +61,7 @@
#include <openssl/asn1t.h>
#include <openssl/x509.h>
#include <openssl/ec.h>
+#include "ec_lcl.h"
#include <openssl/ecdsa.h>
#include <openssl/evp.h>
#include "evp_locl.h"
@@ -72,6 +73,19 @@ typedef struct {
EC_GROUP *gen_group;
/* message digest */
const EVP_MD *md;
+ /* Duplicate key if custom cofactor needed */
+ EC_KEY *co_key;
+ /* Cofactor mode */
+ signed char cofactor_mode;
+ /* KDF (if any) to use for ECDH */
+ char kdf_type;
+ /* Message digest to use for key derivation */
+ const EVP_MD *kdf_md;
+ /* User key material */
+ unsigned char *kdf_ukm;
+ size_t kdf_ukmlen;
+ /* KDF output length */
+ size_t kdf_outlen;
} EC_PKEY_CTX;
static int pkey_ec_init(EVP_PKEY_CTX *ctx)
@@ -83,6 +97,14 @@ static int pkey_ec_init(EVP_PKEY_CTX *ctx)
dctx->gen_group = NULL;
dctx->md = NULL;
+ dctx->cofactor_mode = -1;
+ dctx->co_key = NULL;
+ dctx->kdf_type = EVP_PKEY_ECDH_KDF_NONE;
+ dctx->kdf_md = NULL;
+ dctx->kdf_outlen = 0;
+ dctx->kdf_ukm = NULL;
+ dctx->kdf_ukmlen = 0;
+
ctx->data = dctx;
return 1;
@@ -101,6 +123,22 @@ static int pkey_ec_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
return 0;
}
dctx->md = sctx->md;
+
+ if (sctx->co_key) {
+ dctx->co_key = EC_KEY_dup(sctx->co_key);
+ if (!dctx->co_key)
+ return 0;
+ }
+ dctx->kdf_type = sctx->kdf_type;
+ dctx->kdf_md = sctx->kdf_md;
+ dctx->kdf_outlen = sctx->kdf_outlen;
+ if (sctx->kdf_ukm) {
+ dctx->kdf_ukm = BUF_memdup(sctx->kdf_ukm, sctx->kdf_ukmlen);
+ if (!dctx->kdf_ukm)
+ return 0;
+ } else
+ dctx->kdf_ukm = NULL;
+ dctx->kdf_ukmlen = sctx->kdf_ukmlen;
return 1;
}
@@ -110,6 +148,10 @@ static void pkey_ec_cleanup(EVP_PKEY_CTX *ctx)
if (dctx) {
if (dctx->gen_group)
EC_GROUP_free(dctx->gen_group);
+ if (dctx->co_key)
+ EC_KEY_free(dctx->co_key);
+ if (dctx->kdf_ukm)
+ OPENSSL_free(dctx->kdf_ukm);
OPENSSL_free(dctx);
}
}
@@ -168,18 +210,21 @@ static int pkey_ec_derive(EVP_PKEY_CTX *ctx, unsigned char *key,
int ret;
size_t outlen;
const EC_POINT *pubkey = NULL;
+ EC_KEY *eckey;
+ EC_PKEY_CTX *dctx = ctx->data;
if (!ctx->pkey || !ctx->peerkey) {
ECerr(EC_F_PKEY_EC_DERIVE, EC_R_KEYS_NOT_SET);
return 0;
}
+ eckey = dctx->co_key ? dctx->co_key : ctx->pkey->pkey.ec;
+
if (!key) {
const EC_GROUP *group;
- group = EC_KEY_get0_group(ctx->pkey->pkey.ec);
+ group = EC_KEY_get0_group(eckey);
*keylen = (EC_GROUP_get_degree(group) + 7) / 8;
return 1;
}
-
pubkey = EC_KEY_get0_public_key(ctx->peerkey->pkey.ec);
/*
@@ -189,12 +234,48 @@ static int pkey_ec_derive(EVP_PKEY_CTX *ctx, unsigned char *key,
outlen = *keylen;
- ret = ECDH_compute_key(key, outlen, pubkey, ctx->pkey->pkey.ec, 0);
- if (ret < 0)
- return ret;
+ ret = ECDH_compute_key(key, outlen, pubkey, eckey, 0);
+ if (ret <= 0)
+ return 0;
*keylen = ret;
return 1;
}
+
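+/*
+ * Derive the raw ECDH shared secret and, when a KDF is configured,
+ * post-process it with the X9.62 KDF to the requested output length.
+ */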
+static int pkey_ec_kdf_derive(EVP_PKEY_CTX *ctx,
+ unsigned char *key, size_t *keylen)
+{
+ EC_PKEY_CTX *dctx = ctx->data;
+ unsigned char *ktmp = NULL;
+ size_t ktmplen;
+ int rv = 0;
+ if (dctx->kdf_type == EVP_PKEY_ECDH_KDF_NONE)
+ return pkey_ec_derive(ctx, key, keylen);
+ if (!key) {
+ *keylen = dctx->kdf_outlen;
+ return 1;
+ }
+ if (*keylen != dctx->kdf_outlen)
+ return 0;
+ if (!pkey_ec_derive(ctx, NULL, &ktmplen))
+ return 0;
+ ktmp = OPENSSL_malloc(ktmplen);
+ if (!ktmp)
+ return 0;
+ if (!pkey_ec_derive(ctx, ktmp, &ktmplen))
+ goto err;
+    /* Run the X9.62 KDF over the raw shared secret */
+ if (!ECDH_KDF_X9_62(key, *keylen, ktmp, ktmplen,
+ dctx->kdf_ukm, dctx->kdf_ukmlen, dctx->kdf_md))
+ goto err;
+ rv = 1;
+
+ err:
+ if (ktmp) {
+ OPENSSL_cleanse(ktmp, ktmplen);
+ OPENSSL_free(ktmp);
+ }
+ return rv;
+}
#endif
static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
@@ -213,6 +294,90 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
dctx->gen_group = group;
return 1;
+ case EVP_PKEY_CTRL_EC_PARAM_ENC:
+ if (!dctx->gen_group) {
+ ECerr(EC_F_PKEY_EC_CTRL, EC_R_NO_PARAMETERS_SET);
+ return 0;
+ }
+ EC_GROUP_set_asn1_flag(dctx->gen_group, p1);
+ return 1;
+
+#ifndef OPENSSL_NO_ECDH
+ case EVP_PKEY_CTRL_EC_ECDH_COFACTOR:
+ if (p1 == -2) {
+ if (dctx->cofactor_mode != -1)
+ return dctx->cofactor_mode;
+ else {
+ EC_KEY *ec_key = ctx->pkey->pkey.ec;
+ return EC_KEY_get_flags(ec_key) & EC_FLAG_COFACTOR_ECDH ? 1 :
+ 0;
+ }
+ } else if (p1 < -1 || p1 > 1)
+ return -2;
+ dctx->cofactor_mode = p1;
+ if (p1 != -1) {
+ EC_KEY *ec_key = ctx->pkey->pkey.ec;
+ if (!ec_key->group)
+ return -2;
+ /* If cofactor is 1 cofactor mode does nothing */
+ if (BN_is_one(&ec_key->group->cofactor))
+ return 1;
+ if (!dctx->co_key) {
+ dctx->co_key = EC_KEY_dup(ec_key);
+ if (!dctx->co_key)
+ return 0;
+ }
+ if (p1)
+ EC_KEY_set_flags(dctx->co_key, EC_FLAG_COFACTOR_ECDH);
+ else
+ EC_KEY_clear_flags(dctx->co_key, EC_FLAG_COFACTOR_ECDH);
+ } else if (dctx->co_key) {
+ EC_KEY_free(dctx->co_key);
+ dctx->co_key = NULL;
+ }
+ return 1;
+#endif
+
+ case EVP_PKEY_CTRL_EC_KDF_TYPE:
+ if (p1 == -2)
+ return dctx->kdf_type;
+ if (p1 != EVP_PKEY_ECDH_KDF_NONE && p1 != EVP_PKEY_ECDH_KDF_X9_62)
+ return -2;
+ dctx->kdf_type = p1;
+ return 1;
+
+ case EVP_PKEY_CTRL_EC_KDF_MD:
+ dctx->kdf_md = p2;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_EC_KDF_MD:
+ *(const EVP_MD **)p2 = dctx->kdf_md;
+ return 1;
+
+ case EVP_PKEY_CTRL_EC_KDF_OUTLEN:
+ if (p1 <= 0)
+ return -2;
+ dctx->kdf_outlen = (size_t)p1;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_EC_KDF_OUTLEN:
+ *(int *)p2 = dctx->kdf_outlen;
+ return 1;
+
+ case EVP_PKEY_CTRL_EC_KDF_UKM:
+ if (dctx->kdf_ukm)
+ OPENSSL_free(dctx->kdf_ukm);
+ dctx->kdf_ukm = p2;
+ if (p2)
+ dctx->kdf_ukmlen = p1;
+ else
+ dctx->kdf_ukmlen = 0;
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_EC_KDF_UKM:
+ *(unsigned char **)p2 = dctx->kdf_ukm;
+ return dctx->kdf_ukmlen;
+
case EVP_PKEY_CTRL_MD:
if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
@@ -226,6 +391,10 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
dctx->md = p2;
return 1;
+ case EVP_PKEY_CTRL_GET_MD:
+ *(const EVP_MD **)p2 = dctx->md;
+ return 1;
+
case EVP_PKEY_CTRL_PEER_KEY:
/* Default behaviour is OK */
case EVP_PKEY_CTRL_DIGESTINIT:
@@ -244,7 +413,9 @@ static int pkey_ec_ctrl_str(EVP_PKEY_CTX *ctx,
{
if (!strcmp(type, "ec_paramgen_curve")) {
int nid;
- nid = OBJ_sn2nid(value);
+ nid = EC_curve_nist2nid(value);
+ if (nid == NID_undef)
+ nid = OBJ_sn2nid(value);
if (nid == NID_undef)
nid = OBJ_ln2nid(value);
if (nid == NID_undef) {
@@ -252,7 +423,28 @@ static int pkey_ec_ctrl_str(EVP_PKEY_CTX *ctx,
return 0;
}
return EVP_PKEY_CTX_set_ec_paramgen_curve_nid(ctx, nid);
+ } else if (!strcmp(type, "ec_param_enc")) {
+ int param_enc;
+ if (!strcmp(value, "explicit"))
+ param_enc = 0;
+ else if (!strcmp(value, "named_curve"))
+ param_enc = OPENSSL_EC_NAMED_CURVE;
+ else
+ return -2;
+ return EVP_PKEY_CTX_set_ec_param_enc(ctx, param_enc);
+ } else if (!strcmp(type, "ecdh_kdf_md")) {
+ const EVP_MD *md;
+ if (!(md = EVP_get_digestbyname(value))) {
+ ECerr(EC_F_PKEY_EC_CTRL_STR, EC_R_INVALID_DIGEST);
+ return 0;
+ }
+ return EVP_PKEY_CTX_set_ecdh_kdf_md(ctx, md);
+ } else if (!strcmp(type, "ecdh_cofactor_mode")) {
+ int co_mode;
+ co_mode = atoi(value);
+ return EVP_PKEY_CTX_set_ecdh_cofactor_mode(ctx, co_mode);
}
+
return -2;
}
@@ -279,7 +471,8 @@ static int pkey_ec_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
static int pkey_ec_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
{
EC_KEY *ec = NULL;
- if (ctx->pkey == NULL) {
+ EC_PKEY_CTX *dctx = ctx->data;
+ if (ctx->pkey == NULL && dctx->gen_group == NULL) {
ECerr(EC_F_PKEY_EC_KEYGEN, EC_R_NO_PARAMETERS_SET);
return 0;
}
@@ -287,9 +480,14 @@ static int pkey_ec_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
if (!ec)
return 0;
EVP_PKEY_assign_EC_KEY(pkey, ec);
- /* Note: if error return, pkey is freed by parent routine */
- if (!EVP_PKEY_copy_parameters(pkey, ctx->pkey))
- return 0;
+ if (ctx->pkey) {
+ /* Note: if error return, pkey is freed by parent routine */
+ if (!EVP_PKEY_copy_parameters(pkey, ctx->pkey))
+ return 0;
+ } else {
+ if (!EC_KEY_set_group(ec, dctx->gen_group))
+ return 0;
+ }
return EC_KEY_generate_key(pkey->pkey.ec);
}
@@ -322,7 +520,7 @@ const EVP_PKEY_METHOD ec_pkey_meth = {
0,
#ifndef OPENSSL_NO_ECDH
- pkey_ec_derive,
+ pkey_ec_kdf_derive,
#else
0,
#endif
diff --git a/crypto/ec/eck_prn.c b/crypto/ec/eck_prn.c
index 5ef12ec02444..df9b37a750d6 100644
--- a/crypto/ec/eck_prn.c
+++ b/crypto/ec/eck_prn.c
@@ -171,6 +171,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
if (EC_GROUP_get_asn1_flag(x)) {
/* the curve parameter are given by an asn1 OID */
int nid;
+ const char *nname;
if (!BIO_indent(bp, off, 128))
goto err;
@@ -183,6 +184,13 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
goto err;
if (BIO_printf(bp, "\n") <= 0)
goto err;
+ nname = EC_curve_nid2nist(nid);
+ if (nname) {
+ if (!BIO_indent(bp, off, 128))
+ goto err;
+ if (BIO_printf(bp, "NIST CURVE: %s\n", nname) <= 0)
+ goto err;
+ }
} else {
/* explicit parameters */
int is_char_two = 0;
diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c
index cc7634512d44..360b9a3516f6 100644
--- a/crypto/ec/ecp_nistp521.c
+++ b/crypto/ec/ecp_nistp521.c
@@ -1282,11 +1282,11 @@ static void point_add(felem x3, felem y3, felem z3,
felem_scalar128(tmp2, 2);
/* tmp2[i] < 17*2^121 */
felem_diff128(tmp, tmp2);
- /*-
- * tmp[i] < 2^127 - 2^69 + 17*2^122
- * = 2^126 - 2^122 - 2^6 - 2^2 - 1
- * < 2^127
- */
+ /*-
+ * tmp[i] < 2^127 - 2^69 + 17*2^122
+ * = 2^126 - 2^122 - 2^6 - 2^2 - 1
+ * < 2^127
+ */
felem_reduce(y_out, tmp);
copy_conditional(x_out, x2, z1_is_zero);
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
new file mode 100644
index 000000000000..ca44d0aaeec4
--- /dev/null
+++ b/crypto/ec/ecp_nistz256.c
@@ -0,0 +1,1521 @@
+/******************************************************************************
+ * *
+ * Copyright 2014 Intel Corporation *
+ * *
+ * Licensed under the Apache License, Version 2.0 (the "License"); *
+ * you may not use this file except in compliance with the License. *
+ * You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, software *
+ * distributed under the License is distributed on an "AS IS" BASIS, *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
+ * See the License for the specific language governing permissions and *
+ * limitations under the License. *
+ * *
+ ******************************************************************************
+ * *
+ * Developers and authors: *
+ * Shay Gueron (1, 2), and Vlad Krasnov (1) *
+ * (1) Intel Corporation, Israel Development Center *
+ * (2) University of Haifa *
+ * Reference: *
+ * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with *
+ * 256 Bit Primes" *
+ * *
+ ******************************************************************************/
+
+#include <string.h>
+
+#include <openssl/bn.h>
+#include <openssl/err.h>
+#include <openssl/ec.h>
+#include "cryptlib.h"
+
+#include "ec_lcl.h"
+
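+/*
+ * TOBN packs a 64-bit constant into BN_ULONG limbs: a single 64-bit limb
+ * on 64-bit targets, or two 32-bit limbs (least significant first)
+ * otherwise.
+ */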
+#if BN_BITS2 != 64
+# define TOBN(hi,lo) lo,hi
+#else
+# define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo)
+#endif
+
+#if defined(__GNUC__)
+# define ALIGN32 __attribute((aligned(32)))
+#elif defined(_MSC_VER)
+# define ALIGN32 __declspec(align(32))
+#else
+# define ALIGN32
+#endif
+
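+/* Advance p to the next N-byte boundary (always moves forward by 1..N bytes) */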
+#define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N)
+#define P256_LIMBS (256/BN_BITS2)
+
+typedef unsigned short u16;
+
+typedef struct {
+ BN_ULONG X[P256_LIMBS];
+ BN_ULONG Y[P256_LIMBS];
+ BN_ULONG Z[P256_LIMBS];
+} P256_POINT;
+
+typedef struct {
+ BN_ULONG X[P256_LIMBS];
+ BN_ULONG Y[P256_LIMBS];
+} P256_POINT_AFFINE;
+
+typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
+
+/* structure for precomputed multiples of the generator */
+typedef struct ec_pre_comp_st {
+ const EC_GROUP *group; /* Parent EC_GROUP object */
+ size_t w; /* Window size */
+ /*
+     * Constant-time access to the X and Y coordinates of the pre-computed
+     * generator multiples, in the Montgomery domain.  Pre-computed
+     * multiples are stored in affine form.
+ */
+ PRECOMP256_ROW *precomp;
+ void *precomp_storage;
+ int references;
+} EC_PRE_COMP;
+
+/* Functions implemented in assembly */
+/* Modular mul by 2: res = 2*a mod P */
+void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS]);
+/* Modular div by 2: res = a/2 mod P */
+void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS]);
+/* Modular mul by 3: res = 3*a mod P */
+void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS]);
+/* Modular add: res = a+b mod P */
+void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+/* Modular sub: res = a-b mod P */
+void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+/* Modular neg: res = -a mod P */
+void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
+/* Montgomery mul: res = a*b*2^-256 mod P */
+void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+/* Montgomery sqr: res = a*a*2^-256 mod P */
+void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS]);
+/* Convert a number out of the Montgomery domain by multiplying with 1 */
+void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG in[P256_LIMBS]);
+/* Convert a number into the Montgomery domain by multiplying with 2^512 mod P */
+void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG in[P256_LIMBS]);
+/* Functions that perform constant time access to the precomputed tables */
+void ecp_nistz256_select_w5(P256_POINT * val,
+ const P256_POINT * in_t, int index);
+void ecp_nistz256_select_w7(P256_POINT_AFFINE * val,
+ const P256_POINT_AFFINE * in_t, int index);
+
+/* One converted into the Montgomery domain, i.e. 2^256 mod P */
+static const BN_ULONG ONE[P256_LIMBS] = {
+ TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000),
+ TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe)
+};
+
+static void *ecp_nistz256_pre_comp_dup(void *);
+static void ecp_nistz256_pre_comp_free(void *);
+static void ecp_nistz256_pre_comp_clear_free(void *);
+static EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group);
+
+/* Precomputed tables for the default generator */
+#include "ecp_nistz256_table.c"
+
+/* Recode window to a signed digit, see ecp_nistputil.c for details */
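+/*
+ * A sketch of the recoding: the low bit of |in| is the top bit of the
+ * previous window, giving a one-bit overlap.  The signed digit is
+ * ceil(in/2), minus 2^5 (w5) or 2^7 (w7) when the top bit of |in| is set.
+ * The return value packs the digit's magnitude in the upper bits and its
+ * sign in bit 0.
+ */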
+static unsigned int _booth_recode_w5(unsigned int in)
+{
+ unsigned int s, d;
+
+ s = ~((in >> 5) - 1);
+ d = (1 << 6) - in - 1;
+ d = (d & s) | (in & ~s);
+ d = (d >> 1) + (d & 1);
+
+ return (d << 1) + (s & 1);
+}
+
+static unsigned int _booth_recode_w7(unsigned int in)
+{
+ unsigned int s, d;
+
+ s = ~((in >> 7) - 1);
+ d = (1 << 8) - in - 1;
+ d = (d & s) | (in & ~s);
+ d = (d >> 1) + (d & 1);
+
+ return (d << 1) + (s & 1);
+}
+
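+/*
+ * Constant-time conditional copy: dst = move ? src : dst.  |move| must be
+ * 0 or 1; it is expanded to an all-zeros/all-ones mask so that no
+ * secret-dependent branch is taken.
+ */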
+static void copy_conditional(BN_ULONG dst[P256_LIMBS],
+ const BN_ULONG src[P256_LIMBS], BN_ULONG move)
+{
+ BN_ULONG mask1 = -move;
+ BN_ULONG mask2 = ~mask1;
+
+ dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
+ dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
+ dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
+ dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
+ if (P256_LIMBS == 8) {
+ dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
+ dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
+ dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
+ dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
+ }
+}
+
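+/*
+ * Branchless test for zero: in | -in has its top bit set for any non-zero
+ * input, so complementing it and shifting that bit down yields 1 for zero
+ * and 0 otherwise.
+ */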
+static BN_ULONG is_zero(BN_ULONG in)
+{
+ in |= (0 - in);
+ in = ~in;
+ in &= BN_MASK2;
+ in >>= BN_BITS2 - 1;
+ return in;
+}
+
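+/* Constant-time equality test for two field elements */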
+static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS])
+{
+ BN_ULONG res;
+
+ res = a[0] ^ b[0];
+ res |= a[1] ^ b[1];
+ res |= a[2] ^ b[2];
+ res |= a[3] ^ b[3];
+ if (P256_LIMBS == 8) {
+ res |= a[4] ^ b[4];
+ res |= a[5] ^ b[5];
+ res |= a[6] ^ b[6];
+ res |= a[7] ^ b[7];
+ }
+
+ return is_zero(res);
+}
+
+static BN_ULONG is_one(const BN_ULONG a[P256_LIMBS])
+{
+ BN_ULONG res;
+
+ res = a[0] ^ ONE[0];
+ res |= a[1] ^ ONE[1];
+ res |= a[2] ^ ONE[2];
+ res |= a[3] ^ ONE[3];
+ if (P256_LIMBS == 8) {
+ res |= a[4] ^ ONE[4];
+ res |= a[5] ^ ONE[5];
+ res |= a[6] ^ ONE[6];
+ }
+
+ return is_zero(res);
+}
+
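+/* Copy a fixed-width little-endian limb array into a BIGNUM */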
+static int ecp_nistz256_set_words(BIGNUM *a, BN_ULONG words[P256_LIMBS])
+{
+ if (bn_wexpand(a, P256_LIMBS) == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_SET_WORDS, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ memcpy(a->d, words, sizeof(BN_ULONG) * P256_LIMBS);
+ a->top = P256_LIMBS;
+ bn_correct_top(a);
+ return 1;
+}
+
+#ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION
+void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
+void ecp_nistz256_point_add(P256_POINT *r,
+ const P256_POINT *a, const P256_POINT *b);
+void ecp_nistz256_point_add_affine(P256_POINT *r,
+ const P256_POINT *a,
+ const P256_POINT_AFFINE *b);
+#else
+/* Point double: r = 2*a */
+static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a)
+{
+ BN_ULONG S[P256_LIMBS];
+ BN_ULONG M[P256_LIMBS];
+ BN_ULONG Zsqr[P256_LIMBS];
+ BN_ULONG tmp0[P256_LIMBS];
+
+ const BN_ULONG *in_x = a->X;
+ const BN_ULONG *in_y = a->Y;
+ const BN_ULONG *in_z = a->Z;
+
+ BN_ULONG *res_x = r->X;
+ BN_ULONG *res_y = r->Y;
+ BN_ULONG *res_z = r->Z;
+
+ ecp_nistz256_mul_by_2(S, in_y);
+
+ ecp_nistz256_sqr_mont(Zsqr, in_z);
+
+ ecp_nistz256_sqr_mont(S, S);
+
+ ecp_nistz256_mul_mont(res_z, in_z, in_y);
+ ecp_nistz256_mul_by_2(res_z, res_z);
+
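+    /* M = 3*(X + Z^2)*(X - Z^2) = 3*X^2 + a*Z^4, exploiting a = -3 on P-256 */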
+ ecp_nistz256_add(M, in_x, Zsqr);
+ ecp_nistz256_sub(Zsqr, in_x, Zsqr);
+
+ ecp_nistz256_sqr_mont(res_y, S);
+ ecp_nistz256_div_by_2(res_y, res_y);
+
+ ecp_nistz256_mul_mont(M, M, Zsqr);
+ ecp_nistz256_mul_by_3(M, M);
+
+ ecp_nistz256_mul_mont(S, S, in_x);
+ ecp_nistz256_mul_by_2(tmp0, S);
+
+ ecp_nistz256_sqr_mont(res_x, M);
+
+ ecp_nistz256_sub(res_x, res_x, tmp0);
+ ecp_nistz256_sub(S, S, res_x);
+
+ ecp_nistz256_mul_mont(S, S, M);
+ ecp_nistz256_sub(res_y, S, res_y);
+}
+
+/* Point addition: r = a+b */
+static void ecp_nistz256_point_add(P256_POINT *r,
+ const P256_POINT *a, const P256_POINT *b)
+{
+ BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
+ BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS];
+ BN_ULONG Z1sqr[P256_LIMBS];
+ BN_ULONG Z2sqr[P256_LIMBS];
+ BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
+ BN_ULONG Hsqr[P256_LIMBS];
+ BN_ULONG Rsqr[P256_LIMBS];
+ BN_ULONG Hcub[P256_LIMBS];
+
+ BN_ULONG res_x[P256_LIMBS];
+ BN_ULONG res_y[P256_LIMBS];
+ BN_ULONG res_z[P256_LIMBS];
+
+ BN_ULONG in1infty, in2infty;
+
+ const BN_ULONG *in1_x = a->X;
+ const BN_ULONG *in1_y = a->Y;
+ const BN_ULONG *in1_z = a->Z;
+
+ const BN_ULONG *in2_x = b->X;
+ const BN_ULONG *in2_y = b->Y;
+ const BN_ULONG *in2_z = b->Z;
+
+ /* We encode infinity as (0,0), which is not on the curve,
+ * so it is OK. */
+ in1infty = (in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
+ in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3]);
+ if (P256_LIMBS == 8)
+ in1infty |= (in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
+ in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7]);
+
+ in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
+ in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]);
+ if (P256_LIMBS == 8)
+ in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
+ in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]);
+
+ in1infty = is_zero(in1infty);
+ in2infty = is_zero(in2infty);
+
+ ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */
+ ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
+
+ ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */
+ ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
+
+ ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */
+ ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
+ ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */
+
+ ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */
+ ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
+ ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */
+
+ /*
+ * This should not happen during sign/ecdh, so no constant time violation
+ */
+ if (is_equal(U1, U2) && !in1infty && !in2infty) {
+ if (is_equal(S1, S2)) {
+ ecp_nistz256_point_double(r, a);
+ return;
+ } else {
+ memset(r, 0, sizeof(*r));
+ return;
+ }
+ }
+
+ ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */
+ ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
+ ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */
+ ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
+ ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */
+
+ ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */
+ ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
+
+ ecp_nistz256_sub(res_x, Rsqr, Hsqr);
+ ecp_nistz256_sub(res_x, res_x, Hcub);
+
+ ecp_nistz256_sub(res_y, U2, res_x);
+
+ ecp_nistz256_mul_mont(S2, S1, Hcub);
+ ecp_nistz256_mul_mont(res_y, R, res_y);
+ ecp_nistz256_sub(res_y, res_y, S2);
+
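+    /* If a is infinity the result is b and vice versa, selected in constant time */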
+ copy_conditional(res_x, in2_x, in1infty);
+ copy_conditional(res_y, in2_y, in1infty);
+ copy_conditional(res_z, in2_z, in1infty);
+
+ copy_conditional(res_x, in1_x, in2infty);
+ copy_conditional(res_y, in1_y, in2infty);
+ copy_conditional(res_z, in1_z, in2infty);
+
+ memcpy(r->X, res_x, sizeof(res_x));
+ memcpy(r->Y, res_y, sizeof(res_y));
+ memcpy(r->Z, res_z, sizeof(res_z));
+}
+
+/* Point addition when b is known to be affine: r = a+b */
+static void ecp_nistz256_point_add_affine(P256_POINT *r,
+ const P256_POINT *a,
+ const P256_POINT_AFFINE *b)
+{
+ BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
+ BN_ULONG Z1sqr[P256_LIMBS];
+ BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
+ BN_ULONG Hsqr[P256_LIMBS];
+ BN_ULONG Rsqr[P256_LIMBS];
+ BN_ULONG Hcub[P256_LIMBS];
+
+ BN_ULONG res_x[P256_LIMBS];
+ BN_ULONG res_y[P256_LIMBS];
+ BN_ULONG res_z[P256_LIMBS];
+
+ BN_ULONG in1infty, in2infty;
+
+ const BN_ULONG *in1_x = a->X;
+ const BN_ULONG *in1_y = a->Y;
+ const BN_ULONG *in1_z = a->Z;
+
+ const BN_ULONG *in2_x = b->X;
+ const BN_ULONG *in2_y = b->Y;
+
+ /*
+ * In affine representation we encode infty as (0,0), which is not on the
+ * curve, so it is OK
+ */
+ in1infty = (in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
+ in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3]);
+ if (P256_LIMBS == 8)
+ in1infty |= (in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
+ in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7]);
+
+ in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
+ in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]);
+ if (P256_LIMBS == 8)
+ in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
+ in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]);
+
+ in1infty = is_zero(in1infty);
+ in2infty = is_zero(in2infty);
+
+ ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
+
+ ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
+ ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */
+
+ ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
+
+ ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
+
+ ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
+ ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */
+
+ ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */
+ ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */
+ ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */
+
+ ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */
+ ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
+
+ ecp_nistz256_sub(res_x, Rsqr, Hsqr);
+ ecp_nistz256_sub(res_x, res_x, Hcub);
+ ecp_nistz256_sub(H, U2, res_x);
+
+ ecp_nistz256_mul_mont(S2, in1_y, Hcub);
+ ecp_nistz256_mul_mont(H, H, R);
+ ecp_nistz256_sub(res_y, H, S2);
+
+ copy_conditional(res_x, in2_x, in1infty);
+ copy_conditional(res_x, in1_x, in2infty);
+
+ copy_conditional(res_y, in2_y, in1infty);
+ copy_conditional(res_y, in1_y, in2infty);
+
+ copy_conditional(res_z, ONE, in1infty);
+ copy_conditional(res_z, in1_z, in2infty);
+
+ memcpy(r->X, res_x, sizeof(res_x));
+ memcpy(r->Y, res_y, sizeof(res_y));
+ memcpy(r->Z, res_z, sizeof(res_z));
+}
+#endif
+
+/* r = in^-1 mod p */
+static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS],
+ const BN_ULONG in[P256_LIMBS])
+{
+ /*
+ * The modulus is p = ffffffff 00000001 00000000 00000000 00000000
+ * ffffffff ffffffff ffffffff. We use Fermat's Little Theorem (FLT)
+ * and raise to the power p - 2.
+ */
+ BN_ULONG p2[P256_LIMBS];
+ BN_ULONG p4[P256_LIMBS];
+ BN_ULONG p8[P256_LIMBS];
+ BN_ULONG p16[P256_LIMBS];
+ BN_ULONG p32[P256_LIMBS];
+ BN_ULONG res[P256_LIMBS];
+ int i;
+
+ ecp_nistz256_sqr_mont(res, in);
+ ecp_nistz256_mul_mont(p2, res, in); /* in^0x3 */
+
+ ecp_nistz256_sqr_mont(res, p2);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(p4, res, p2); /* in^0xf */
+
+ ecp_nistz256_sqr_mont(res, p4);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(p8, res, p4); /* in^0xff */
+
+ ecp_nistz256_sqr_mont(res, p8);
+ for (i = 0; i < 7; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(p16, res, p8); /* in^0xffff */
+
+ ecp_nistz256_sqr_mont(res, p16);
+ for (i = 0; i < 15; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(p32, res, p16); /* in^0xffffffff */
+
+ ecp_nistz256_sqr_mont(res, p32);
+ for (i = 0; i < 31; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, in);
+
+ for (i = 0; i < 32 * 4; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, p32);
+
+ for (i = 0; i < 32; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, p32);
+
+ for (i = 0; i < 16; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, p16);
+
+ for (i = 0; i < 8; i++)
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, p8);
+
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, p4);
+
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, p2);
+
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_sqr_mont(res, res);
+ ecp_nistz256_mul_mont(res, res, in);
+
+ memcpy(r, res, sizeof(res));
+}
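+
+/*
+ * Informal check of the chain above: the exponent is
+ *
+ * p - 2 = ffffffff00000001 0000000000000000 00000000ffffffff
+ * fffffffffffffffd
+ *
+ * Starting from p32 = in^(2^32 - 1), the tail of the function performs
+ * "shift left by n squarings, then multiply":
+ *
+ * shift 32, mul in -> ffffffff00000001
+ * shift 128, mul p32 -> ... 96 zero bits, then ffffffff
+ * shift 32, mul p32 -> ... ffffffff
+ * shift 16, mul p16 -> ... ffff
+ * shift 8, mul p8 -> ... ff
+ * shift 4, mul p4 -> ... f
+ * shift 2, mul p2 -> ... binary 11
+ * shift 2, mul in -> ... binary 01, i.e. a final hex digit of d
+ *
+ * for 255 squarings in total, reproducing p - 2 exactly.
+ */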
+
+/*
+ * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and
+ * returns one if it fits. Otherwise it returns zero.
+ */
+static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS],
+ const BIGNUM *in)
+{
+ if (in->top > P256_LIMBS)
+ return 0;
+
+ memset(out, 0, sizeof(BN_ULONG) * P256_LIMBS);
+ memcpy(out, in->d, sizeof(BN_ULONG) * in->top);
+ return 1;
+}
+
+/* r = sum(scalar[i]*point[i]) */
+static int ecp_nistz256_windowed_mul(const EC_GROUP *group,
+ P256_POINT *r,
+ const BIGNUM **scalar,
+ const EC_POINT **point,
+ int num, BN_CTX *ctx)
+{
+
+ int i, j, ret = 0;
+ unsigned int index;
+ unsigned char (*p_str)[33] = NULL;
+ const unsigned int window_size = 5;
+ const unsigned int mask = (1 << (window_size + 1)) - 1;
+ unsigned int wvalue;
+ BN_ULONG tmp[P256_LIMBS];
+ ALIGN32 P256_POINT h;
+ const BIGNUM **scalars = NULL;
+ P256_POINT (*table)[16] = NULL;
+ void *table_storage = NULL;
+
+ if ((table_storage =
+ OPENSSL_malloc(num * 16 * sizeof(P256_POINT) + 64)) == NULL
+ || (p_str =
+ OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL
+ || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE);
+ goto err;
+ } else {
+ table = (void *)ALIGNPTR(table_storage, 64);
+ }
+
+ for (i = 0; i < num; i++) {
+ P256_POINT *row = table[i];
+
+ /* This is an unusual input; we don't guarantee constant-timeness. */
+ if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) {
+ BIGNUM *mod;
+
+ if ((mod = BN_CTX_get(ctx)) == NULL)
+ goto err;
+ if (!BN_nnmod(mod, scalar[i], &group->order, ctx)) {
+ ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB);
+ goto err;
+ }
+ scalars[i] = mod;
+ } else
+ scalars[i] = scalar[i];
+
+ for (j = 0; j < scalars[i]->top * BN_BYTES; j += BN_BYTES) {
+ BN_ULONG d = scalars[i]->d[j / BN_BYTES];
+
+ p_str[i][j + 0] = d & 0xff;
+ p_str[i][j + 1] = (d >> 8) & 0xff;
+ p_str[i][j + 2] = (d >> 16) & 0xff;
+ p_str[i][j + 3] = (d >>= 24) & 0xff;
+ if (BN_BYTES == 8) {
+ d >>= 8;
+ p_str[i][j + 4] = d & 0xff;
+ p_str[i][j + 5] = (d >> 8) & 0xff;
+ p_str[i][j + 6] = (d >> 16) & 0xff;
+ p_str[i][j + 7] = (d >> 24) & 0xff;
+ }
+ }
+ for (; j < 33; j++)
+ p_str[i][j] = 0;
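+
+ /*
+ * Illustration (not normative): on a 64-bit build a limb d =
+ * 0x0807060504030201 is emitted as the bytes 01 02 03 04 05 06 07 08,
+ * i.e. the scalar becomes a 33-byte little-endian string. Byte 32 is
+ * always zero, so the two-byte window reads in the main loop below
+ * never index past the end of p_str.
+ */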
+
+ /*
+ * table[0] is implicitly (0,0,0), the point at infinity, therefore it
+ * is not stored. All other values are actually stored with an offset
+ * of -1 in table.
+ */
+
+ if (!ecp_nistz256_bignum_to_field_elem(row[1 - 1].X, &point[i]->X)
+ || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Y, &point[i]->Y)
+ || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Z, &point[i]->Z)) {
+ ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ ecp_nistz256_point_double(&row[ 2 - 1], &row[ 1 - 1]);
+ ecp_nistz256_point_add (&row[ 3 - 1], &row[ 2 - 1], &row[1 - 1]);
+ ecp_nistz256_point_double(&row[ 4 - 1], &row[ 2 - 1]);
+ ecp_nistz256_point_double(&row[ 6 - 1], &row[ 3 - 1]);
+ ecp_nistz256_point_double(&row[ 8 - 1], &row[ 4 - 1]);
+ ecp_nistz256_point_double(&row[12 - 1], &row[ 6 - 1]);
+ ecp_nistz256_point_add (&row[ 5 - 1], &row[ 4 - 1], &row[1 - 1]);
+ ecp_nistz256_point_add (&row[ 7 - 1], &row[ 6 - 1], &row[1 - 1]);
+ ecp_nistz256_point_add (&row[ 9 - 1], &row[ 8 - 1], &row[1 - 1]);
+ ecp_nistz256_point_add (&row[13 - 1], &row[12 - 1], &row[1 - 1]);
+ ecp_nistz256_point_double(&row[14 - 1], &row[ 7 - 1]);
+ ecp_nistz256_point_double(&row[10 - 1], &row[ 5 - 1]);
+ ecp_nistz256_point_add (&row[15 - 1], &row[14 - 1], &row[1 - 1]);
+ ecp_nistz256_point_add (&row[11 - 1], &row[10 - 1], &row[1 - 1]);
+ ecp_nistz256_point_add (&row[16 - 1], &row[15 - 1], &row[1 - 1]);
+ }
+
+ index = 255;
+
+ wvalue = p_str[0][(index - 1) / 8];
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+
+ ecp_nistz256_select_w5(r, table[0], _booth_recode_w5(wvalue) >> 1);
+
+ while (index >= 5) {
+ for (i = (index == 255 ? 1 : 0); i < num; i++) {
+ unsigned int off = (index - 1) / 8;
+
+ wvalue = p_str[i][off] | p_str[i][off + 1] << 8;
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+
+ wvalue = _booth_recode_w5(wvalue);
+
+ ecp_nistz256_select_w5(&h, table[i], wvalue >> 1);
+
+ ecp_nistz256_neg(tmp, h.Y);
+ copy_conditional(h.Y, tmp, (wvalue & 1));
+
+ ecp_nistz256_point_add(r, r, &h);
+ }
+
+ index -= window_size;
+
+ ecp_nistz256_point_double(r, r);
+ ecp_nistz256_point_double(r, r);
+ ecp_nistz256_point_double(r, r);
+ ecp_nistz256_point_double(r, r);
+ ecp_nistz256_point_double(r, r);
+ }
+
+ /* Final window */
+ for (i = 0; i < num; i++) {
+ wvalue = p_str[i][0];
+ wvalue = (wvalue << 1) & mask;
+
+ wvalue = _booth_recode_w5(wvalue);
+
+ ecp_nistz256_select_w5(&h, table[i], wvalue >> 1);
+
+ ecp_nistz256_neg(tmp, h.Y);
+ copy_conditional(h.Y, tmp, wvalue & 1);
+
+ ecp_nistz256_point_add(r, r, &h);
+ }
+
+ ret = 1;
+ err:
+ if (table_storage)
+ OPENSSL_free(table_storage);
+ if (p_str)
+ OPENSSL_free(p_str);
+ if (scalars)
+ OPENSSL_free(scalars);
+ return ret;
+}
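+
+/*
+ * Window bookkeeping for ecp_nistz256_windowed_mul (informal): every step
+ * reads an overlapping (window_size + 1)-bit slice of p_str and Booth-
+ * recodes it into a sign and a 5-bit digit. The main loop visits index =
+ * 255, 250, ..., 5 (51 iterations), performing five doublings after each
+ * batch of additions, and the final pass consumes the least significant
+ * window, so all 256 scalar bits are covered.
+ */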
+
+/* Coordinates of G, for which we have precomputed tables */
+static const BN_ULONG def_xG[P256_LIMBS] = {
+ TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601),
+ TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)
+};
+
+static const BN_ULONG def_yG[P256_LIMBS] = {
+ TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c),
+ TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85)
+};
+
+/*
+ * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256
+ * generator.
+ */
+static int ecp_nistz256_is_affine_G(const EC_POINT *generator)
+{
+ return (generator->X.top == P256_LIMBS) &&
+ (generator->Y.top == P256_LIMBS) &&
+ (generator->Z.top == (P256_LIMBS - P256_LIMBS / 8)) &&
+ is_equal(generator->X.d, def_xG) &&
+ is_equal(generator->Y.d, def_yG) && is_one(generator->Z.d);
+}
+
+static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
+{
+ /*
+ * We precompute a table for a Booth encoded exponent (wNAF) based
+ * computation. Each table holds 64 values so that the constant-time
+ * selection can always scan a full table, with an implicit value of
+ * infinity at index zero. We use a window of size 7, and therefore
+ * require ceil(256/7) = 37 tables.
+ */
+ BIGNUM *order;
+ EC_POINT *P = NULL, *T = NULL;
+ const EC_POINT *generator;
+ EC_PRE_COMP *pre_comp;
+ BN_CTX *new_ctx = NULL;
+ int i, j, k, ret = 0;
+ size_t w;
+
+ PRECOMP256_ROW *preComputedTable = NULL;
+ unsigned char *precomp_storage = NULL;
+
+ /* if there is an old EC_PRE_COMP object, throw it away */
+ EC_EX_DATA_free_data(&group->extra_data, ecp_nistz256_pre_comp_dup,
+ ecp_nistz256_pre_comp_free,
+ ecp_nistz256_pre_comp_clear_free);
+
+ generator = EC_GROUP_get0_generator(group);
+ if (generator == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR);
+ return 0;
+ }
+
+ if (ecp_nistz256_is_affine_G(generator)) {
+ /*
+ * No need to calculate tables for the standard generator because we
+ * have them statically.
+ */
+ return 1;
+ }
+
+ if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL)
+ return 0;
+
+ if (ctx == NULL) {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ goto err;
+ }
+
+ BN_CTX_start(ctx);
+ order = BN_CTX_get(ctx);
+
+ if (order == NULL)
+ goto err;
+
+ if (!EC_GROUP_get_order(group, order, ctx))
+ goto err;
+
+ if (BN_is_zero(order)) {
+ ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER);
+ goto err;
+ }
+
+ w = 7;
+
+ if ((precomp_storage =
+ OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE);
+ goto err;
+ } else {
+ preComputedTable = (void *)ALIGNPTR(precomp_storage, 64);
+ }
+
+ P = EC_POINT_new(group);
+ T = EC_POINT_new(group);
+ if (P == NULL || T == NULL)
+ goto err;
+
+ /*
+ * The zero entry is implicitly infinity, and we skip it, storing other
+ * values with -1 offset.
+ */
+ if (!EC_POINT_copy(T, generator))
+ goto err;
+
+ for (k = 0; k < 64; k++) {
+ if (!EC_POINT_copy(P, T))
+ goto err;
+ for (j = 0; j < 37; j++) {
+ /*
+ * It would be faster to use EC_POINTs_make_affine and
+ * make multiple points affine at the same time.
+ */
+ if (!EC_POINT_make_affine(group, P, ctx))
+ goto err;
+ if (!ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].X,
+ &P->X) ||
+ !ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].Y,
+ &P->Y)) {
+ ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE,
+ EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+ for (i = 0; i < 7; i++) {
+ if (!EC_POINT_dbl(group, P, P, ctx))
+ goto err;
+ }
+ }
+ if (!EC_POINT_add(group, T, T, generator, ctx))
+ goto err;
+ }
+
+ pre_comp->group = group;
+ pre_comp->w = w;
+ pre_comp->precomp = preComputedTable;
+ pre_comp->precomp_storage = precomp_storage;
+
+ precomp_storage = NULL;
+
+ if (!EC_EX_DATA_set_data(&group->extra_data, pre_comp,
+ ecp_nistz256_pre_comp_dup,
+ ecp_nistz256_pre_comp_free,
+ ecp_nistz256_pre_comp_clear_free)) {
+ goto err;
+ }
+
+ pre_comp = NULL;
+
+ ret = 1;
+
+ err:
+ if (ctx != NULL)
+ BN_CTX_end(ctx);
+ BN_CTX_free(new_ctx);
+
+ if (pre_comp)
+ ecp_nistz256_pre_comp_free(pre_comp);
+ if (precomp_storage)
+ OPENSSL_free(precomp_storage);
+ if (P)
+ EC_POINT_free(P);
+ if (T)
+ EC_POINT_free(T);
+ return ret;
+}
+
+/*
+ * Note that by default ECP_NISTZ256_AVX2 is undefined. While the AVX2 code
+ * processes 4 points in parallel, the corresponding serial operation is
+ * several times slower, because it uses 29x29=58-bit multiplication as
+ * opposed to 64x64=128-bit in the integer-only scalar case. As a result it
+ * doesn't provide a *significant* performance improvement. Note that just
+ * defining ECP_NISTZ256_AVX2 is not sufficient to make it work; you'd also
+ * need to compile the asm/ecp_nistz256-avx2.pl module.
+ */
+#if defined(ECP_NISTZ256_AVX2)
+# if !(defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64)) || \
+ !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
+# undef ECP_NISTZ256_AVX2
+# else
+/* Constant time access, loading four values, from four consecutive tables */
+void ecp_nistz256_avx2_select_w7(P256_POINT_AFFINE *val,
+ const P256_POINT_AFFINE *in_t, int index);
+void ecp_nistz256_avx2_multi_select_w7(void *result, const void *in, int index0,
+ int index1, int index2, int index3);
+void ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in);
+void ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4);
+void ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4,
+ const void *Bx4);
+void ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4,
+ const void *Bx4);
+void ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4);
+void ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4);
+void ecp_nistz256_avx2_set1(void *RESULTx4);
+int ecp_nistz_avx2_eligible(void);
+
+static void booth_recode_w7(unsigned char *sign,
+ unsigned char *digit, unsigned char in)
+{
+ unsigned char s, d;
+
+ s = ~((in >> 7) - 1);
+ d = (1 << 8) - in - 1;
+ d = (d & s) | (in & ~s);
+ d = (d >> 1) + (d & 1);
+
+ *sign = s & 1;
+ *digit = d;
+}
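+
+/*
+ * Worked example (a sketch): for in = 0x83 = 10000011b the top bit is
+ * set, so s = 0xff and d = 256 - 0x83 - 1 = 0x7c; then
+ * d = (0x7c >> 1) + (0x7c & 1) = 0x3e, yielding *sign = 1, *digit = 0x3e.
+ * That is, the window is treated as the negative digit -0x3e, with the
+ * borrowed +1 absorbed by the next (overlapping) window, as signed Booth
+ * recoding requires.
+ */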
+
+/*
+ * ecp_nistz256_avx2_mul_g performs multiplication by G, using only the
+ * precomputed table. It does 4 affine point additions in parallel,
+ * significantly speeding up multiplication by this fixed point.
+ */
+static void ecp_nistz256_avx2_mul_g(P256_POINT *r,
+ unsigned char p_str[33],
+ const P256_POINT_AFFINE(*preComputedTable)[64])
+{
+ const unsigned int window_size = 7;
+ const unsigned int mask = (1 << (window_size + 1)) - 1;
+ unsigned int wvalue;
+ /* Using 4 windows at a time */
+ unsigned char sign0, digit0;
+ unsigned char sign1, digit1;
+ unsigned char sign2, digit2;
+ unsigned char sign3, digit3;
+ unsigned int index = 0;
+ BN_ULONG tmp[P256_LIMBS];
+ int i;
+
+ ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 };
+ ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 };
+ ALIGN32 P256_POINT_AFFINE point_arr[P256_LIMBS];
+ ALIGN32 P256_POINT res_point_arr[P256_LIMBS];
+
+ /* Initial four windows */
+ wvalue = *((u16 *) & p_str[0]);
+ wvalue = (wvalue << 1) & mask;
+ index += window_size;
+ booth_recode_w7(&sign0, &digit0, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign1, &digit1, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign2, &digit2, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign3, &digit3, wvalue);
+
+ ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[0],
+ digit0, digit1, digit2, digit3);
+
+ ecp_nistz256_neg(tmp, point_arr[0].Y);
+ copy_conditional(point_arr[0].Y, tmp, sign0);
+ ecp_nistz256_neg(tmp, point_arr[1].Y);
+ copy_conditional(point_arr[1].Y, tmp, sign1);
+ ecp_nistz256_neg(tmp, point_arr[2].Y);
+ copy_conditional(point_arr[2].Y, tmp, sign2);
+ ecp_nistz256_neg(tmp, point_arr[3].Y);
+ copy_conditional(point_arr[3].Y, tmp, sign3);
+
+ ecp_nistz256_avx2_transpose_convert(aX4, point_arr);
+ ecp_nistz256_avx2_to_mont(aX4, aX4);
+ ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]);
+ ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]);
+
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign0, &digit0, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign1, &digit1, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign2, &digit2, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign3, &digit3, wvalue);
+
+ ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[4 * 1],
+ digit0, digit1, digit2, digit3);
+
+ ecp_nistz256_neg(tmp, point_arr[0].Y);
+ copy_conditional(point_arr[0].Y, tmp, sign0);
+ ecp_nistz256_neg(tmp, point_arr[1].Y);
+ copy_conditional(point_arr[1].Y, tmp, sign1);
+ ecp_nistz256_neg(tmp, point_arr[2].Y);
+ copy_conditional(point_arr[2].Y, tmp, sign2);
+ ecp_nistz256_neg(tmp, point_arr[3].Y);
+ copy_conditional(point_arr[3].Y, tmp, sign3);
+
+ ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
+ ecp_nistz256_avx2_to_mont(bX4, bX4);
+ ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]);
+ /* Optimized when both inputs are affine */
+ ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4);
+
+ for (i = 2; i < 9; i++) {
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign0, &digit0, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign1, &digit1, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign2, &digit2, wvalue);
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+ booth_recode_w7(&sign3, &digit3, wvalue);
+
+ ecp_nistz256_avx2_multi_select_w7(point_arr,
+ preComputedTable[4 * i],
+ digit0, digit1, digit2, digit3);
+
+ ecp_nistz256_neg(tmp, point_arr[0].Y);
+ copy_conditional(point_arr[0].Y, tmp, sign0);
+ ecp_nistz256_neg(tmp, point_arr[1].Y);
+ copy_conditional(point_arr[1].Y, tmp, sign1);
+ ecp_nistz256_neg(tmp, point_arr[2].Y);
+ copy_conditional(point_arr[2].Y, tmp, sign2);
+ ecp_nistz256_neg(tmp, point_arr[3].Y);
+ copy_conditional(point_arr[3].Y, tmp, sign3);
+
+ ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
+ ecp_nistz256_avx2_to_mont(bX4, bX4);
+ ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]);
+
+ ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4);
+ }
+
+ ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]);
+ ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]);
+ ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]);
+
+ ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4);
+ /* Last window is performed serially */
+ wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ booth_recode_w7(&sign0, &digit0, wvalue);
+ ecp_nistz256_avx2_select_w7((P256_POINT_AFFINE *) r,
+ preComputedTable[36], digit0);
+ ecp_nistz256_neg(tmp, r->Y);
+ copy_conditional(r->Y, tmp, sign0);
+ memcpy(r->Z, ONE, sizeof(ONE));
+ /* Sum the four windows */
+ ecp_nistz256_point_add(r, r, &res_point_arr[0]);
+ ecp_nistz256_point_add(r, r, &res_point_arr[1]);
+ ecp_nistz256_point_add(r, r, &res_point_arr[2]);
+ ecp_nistz256_point_add(r, r, &res_point_arr[3]);
+}
+# endif
+#endif
+
+static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group,
+ const P256_POINT_AFFINE *in,
+ BN_CTX *ctx)
+{
+ BIGNUM x, y;
+ BN_ULONG d_x[P256_LIMBS], d_y[P256_LIMBS];
+ int ret = 0;
+
+ memcpy(d_x, in->X, sizeof(d_x));
+ x.d = d_x;
+ x.dmax = x.top = P256_LIMBS;
+ x.neg = 0;
+ x.flags = BN_FLG_STATIC_DATA;
+
+ memcpy(d_y, in->Y, sizeof(d_y));
+ y.d = d_y;
+ y.dmax = y.top = P256_LIMBS;
+ y.neg = 0;
+ y.flags = BN_FLG_STATIC_DATA;
+
+ ret = EC_POINT_set_affine_coordinates_GFp(group, out, &x, &y, ctx);
+
+ return ret;
+}
+
+/* r = scalar*G + sum(scalars[i]*points[i]) */
+static int ecp_nistz256_points_mul(const EC_GROUP *group,
+ EC_POINT *r,
+ const BIGNUM *scalar,
+ size_t num,
+ const EC_POINT *points[],
+ const BIGNUM *scalars[], BN_CTX *ctx)
+{
+ int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0;
+ size_t j;
+ unsigned char p_str[33] = { 0 };
+ const PRECOMP256_ROW *preComputedTable = NULL;
+ const EC_PRE_COMP *pre_comp = NULL;
+ const EC_POINT *generator = NULL;
+ unsigned int index = 0;
+ BN_CTX *new_ctx = NULL;
+ const BIGNUM **new_scalars = NULL;
+ const EC_POINT **new_points = NULL;
+ const unsigned int window_size = 7;
+ const unsigned int mask = (1 << (window_size + 1)) - 1;
+ unsigned int wvalue;
+ ALIGN32 union {
+ P256_POINT p;
+ P256_POINT_AFFINE a;
+ } t, p;
+ BIGNUM *tmp_scalar;
+
+ if (group->meth != r->meth) {
+ ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+
+ if ((scalar == NULL) && (num == 0))
+ return EC_POINT_set_to_infinity(group, r);
+
+ for (j = 0; j < num; j++) {
+ if (group->meth != points[j]->meth) {
+ ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ }
+
+ if (ctx == NULL) {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ goto err;
+ }
+
+ BN_CTX_start(ctx);
+
+ if (scalar) {
+ generator = EC_GROUP_get0_generator(group);
+ if (generator == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR);
+ goto err;
+ }
+
+ /* look if we can use precomputed multiples of generator */
+ pre_comp =
+ EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup,
+ ecp_nistz256_pre_comp_free,
+ ecp_nistz256_pre_comp_clear_free);
+
+ if (pre_comp) {
+ /*
+ * If there is a precomputed table for the generator, check that
+ * it was generated with the same generator.
+ */
+ EC_POINT *pre_comp_generator = EC_POINT_new(group);
+ if (pre_comp_generator == NULL)
+ goto err;
+
+ if (!ecp_nistz256_set_from_affine
+ (pre_comp_generator, group, pre_comp->precomp[0], ctx)) {
+ EC_POINT_free(pre_comp_generator);
+ goto err;
+ }
+
+ if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx))
+ preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp;
+
+ EC_POINT_free(pre_comp_generator);
+ }
+
+ if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) {
+ /*
+ * If there is no precomputed data, but the generator
+ * is the default, a hardcoded table of precomputed
+ * data is used. This is because applications, such as
+ * Apache, do not use EC_KEY_precompute_mult.
+ */
+ preComputedTable = (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
+ }
+
+ if (preComputedTable) {
+ if ((BN_num_bits(scalar) > 256)
+ || BN_is_negative(scalar)) {
+ if ((tmp_scalar = BN_CTX_get(ctx)) == NULL)
+ goto err;
+
+ if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
+ ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB);
+ goto err;
+ }
+ scalar = tmp_scalar;
+ }
+
+ for (i = 0; i < scalar->top * BN_BYTES; i += BN_BYTES) {
+ BN_ULONG d = scalar->d[i / BN_BYTES];
+
+ p_str[i + 0] = d & 0xff;
+ p_str[i + 1] = (d >> 8) & 0xff;
+ p_str[i + 2] = (d >> 16) & 0xff;
+ p_str[i + 3] = (d >>= 24) & 0xff;
+ if (BN_BYTES == 8) {
+ d >>= 8;
+ p_str[i + 4] = d & 0xff;
+ p_str[i + 5] = (d >> 8) & 0xff;
+ p_str[i + 6] = (d >> 16) & 0xff;
+ p_str[i + 7] = (d >> 24) & 0xff;
+ }
+ }
+
+ for (; i < 33; i++)
+ p_str[i] = 0;
+
+#if defined(ECP_NISTZ256_AVX2)
+ if (ecp_nistz_avx2_eligible()) {
+ ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable);
+ } else
+#endif
+ {
+ /* First window */
+ wvalue = (p_str[0] << 1) & mask;
+ index += window_size;
+
+ wvalue = _booth_recode_w7(wvalue);
+
+ ecp_nistz256_select_w7(&p.a, preComputedTable[0], wvalue >> 1);
+
+ ecp_nistz256_neg(p.p.Z, p.p.Y);
+ copy_conditional(p.p.Y, p.p.Z, wvalue & 1);
+
+ memcpy(p.p.Z, ONE, sizeof(ONE));
+
+ for (i = 1; i < 37; i++) {
+ unsigned int off = (index - 1) / 8;
+ wvalue = p_str[off] | p_str[off + 1] << 8;
+ wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+ index += window_size;
+
+ wvalue = _booth_recode_w7(wvalue);
+
+ ecp_nistz256_select_w7(&t.a,
+ preComputedTable[i], wvalue >> 1);
+
+ ecp_nistz256_neg(t.p.Z, t.a.Y);
+ copy_conditional(t.a.Y, t.p.Z, wvalue & 1);
+
+ ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
+ }
+ }
+ } else {
+ p_is_infinity = 1;
+ no_precomp_for_generator = 1;
+ }
+ } else
+ p_is_infinity = 1;
+
+ if (no_precomp_for_generator) {
+ /*
+ * Without a precomputed table for the generator, it has to be
+ * handled like a normal point.
+ */
+ new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *));
+ if (!new_scalars) {
+ ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *));
+ if (!new_points) {
+ ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ memcpy(new_scalars, scalars, num * sizeof(BIGNUM *));
+ new_scalars[num] = scalar;
+ memcpy(new_points, points, num * sizeof(EC_POINT *));
+ new_points[num] = generator;
+
+ scalars = new_scalars;
+ points = new_points;
+ num++;
+ }
+
+ if (num) {
+ P256_POINT *out = &t.p;
+ if (p_is_infinity)
+ out = &p.p;
+
+ if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx))
+ goto err;
+
+ if (!p_is_infinity)
+ ecp_nistz256_point_add(&p.p, &p.p, out);
+ }
+
+ /* Not constant-time, but we're only operating on the public output. */
+ if (!ecp_nistz256_set_words(&r->X, p.p.X) ||
+ !ecp_nistz256_set_words(&r->Y, p.p.Y) ||
+ !ecp_nistz256_set_words(&r->Z, p.p.Z)) {
+ goto err;
+ }
+ r->Z_is_one = is_one(p.p.Z) & 1;
+
+ ret = 1;
+
+err:
+ if (ctx)
+ BN_CTX_end(ctx);
+ BN_CTX_free(new_ctx);
+ if (new_points)
+ OPENSSL_free(new_points);
+ if (new_scalars)
+ OPENSSL_free(new_scalars);
+ return ret;
+}
+
+static int ecp_nistz256_get_affine(const EC_GROUP *group,
+ const EC_POINT *point,
+ BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
+{
+ BN_ULONG z_inv2[P256_LIMBS];
+ BN_ULONG z_inv3[P256_LIMBS];
+ BN_ULONG x_aff[P256_LIMBS];
+ BN_ULONG y_aff[P256_LIMBS];
+ BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS];
+ BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS];
+
+ if (EC_POINT_is_at_infinity(group, point)) {
+ ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY);
+ return 0;
+ }
+
+ if (!ecp_nistz256_bignum_to_field_elem(point_x, &point->X) ||
+ !ecp_nistz256_bignum_to_field_elem(point_y, &point->Y) ||
+ !ecp_nistz256_bignum_to_field_elem(point_z, &point->Z)) {
+ ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE);
+ return 0;
+ }
+
+ ecp_nistz256_mod_inverse(z_inv3, point_z);
+ ecp_nistz256_sqr_mont(z_inv2, z_inv3);
+ ecp_nistz256_mul_mont(x_aff, z_inv2, point_x);
+
+ if (x != NULL) {
+ ecp_nistz256_from_mont(x_ret, x_aff);
+ if (!ecp_nistz256_set_words(x, x_ret))
+ return 0;
+ }
+
+ if (y != NULL) {
+ ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
+ ecp_nistz256_mul_mont(y_aff, z_inv3, point_y);
+ ecp_nistz256_from_mont(y_ret, y_aff);
+ if (!ecp_nistz256_set_words(y, y_ret))
+ return 0;
+ }
+
+ return 1;
+}
+
+static EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group)
+{
+ EC_PRE_COMP *ret = NULL;
+
+ if (!group)
+ return NULL;
+
+ ret = (EC_PRE_COMP *)OPENSSL_malloc(sizeof(EC_PRE_COMP));
+
+ if (!ret) {
+ ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
+ return ret;
+ }
+
+ ret->group = group;
+ ret->w = 6; /* default */
+ ret->precomp = NULL;
+ ret->precomp_storage = NULL;
+ ret->references = 1;
+ return ret;
+}
+
+static void *ecp_nistz256_pre_comp_dup(void *src_)
+{
+ EC_PRE_COMP *src = src_;
+
+ /* No need to actually copy; these objects never change. */
+ CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
+
+ return src_;
+}
+
+static void ecp_nistz256_pre_comp_free(void *pre_)
+{
+ int i;
+ EC_PRE_COMP *pre = pre_;
+
+ if (!pre)
+ return;
+
+ i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+ if (i > 0)
+ return;
+
+ if (pre->precomp_storage)
+ OPENSSL_free(pre->precomp_storage);
+
+ OPENSSL_free(pre);
+}
+
+static void ecp_nistz256_pre_comp_clear_free(void *pre_)
+{
+ int i;
+ EC_PRE_COMP *pre = pre_;
+
+ if (!pre)
+ return;
+
+ i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+ if (i > 0)
+ return;
+
+ if (pre->precomp_storage) {
+ OPENSSL_cleanse(pre->precomp,
+ 32 * sizeof(unsigned char) * (1 << pre->w) * 2 * 37);
+ OPENSSL_free(pre->precomp_storage);
+ }
+ OPENSSL_cleanse(pre, sizeof *pre);
+ OPENSSL_free(pre);
+}
+
+static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
+{
+ const EC_POINT *generator = EC_GROUP_get0_generator(group);
+ if (generator != NULL && ecp_nistz256_is_affine_G(generator)) {
+ /* There is a hard-coded table for the default generator. */
+ return 1;
+ }
+
+ return EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup,
+ ecp_nistz256_pre_comp_free,
+ ecp_nistz256_pre_comp_clear_free) != NULL;
+}
+
+const EC_METHOD *EC_GFp_nistz256_method(void)
+{
+ static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
+ NID_X9_62_prime_field,
+ ec_GFp_mont_group_init,
+ ec_GFp_mont_group_finish,
+ ec_GFp_mont_group_clear_finish,
+ ec_GFp_mont_group_copy,
+ ec_GFp_mont_group_set_curve,
+ ec_GFp_simple_group_get_curve,
+ ec_GFp_simple_group_get_degree,
+ ec_GFp_simple_group_check_discriminant,
+ ec_GFp_simple_point_init,
+ ec_GFp_simple_point_finish,
+ ec_GFp_simple_point_clear_finish,
+ ec_GFp_simple_point_copy,
+ ec_GFp_simple_point_set_to_infinity,
+ ec_GFp_simple_set_Jprojective_coordinates_GFp,
+ ec_GFp_simple_get_Jprojective_coordinates_GFp,
+ ec_GFp_simple_point_set_affine_coordinates,
+ ecp_nistz256_get_affine,
+ 0, 0, 0,
+ ec_GFp_simple_add,
+ ec_GFp_simple_dbl,
+ ec_GFp_simple_invert,
+ ec_GFp_simple_is_at_infinity,
+ ec_GFp_simple_is_on_curve,
+ ec_GFp_simple_cmp,
+ ec_GFp_simple_make_affine,
+ ec_GFp_simple_points_make_affine,
+ ecp_nistz256_points_mul, /* mul */
+ ecp_nistz256_mult_precompute, /* precompute_mult */
+ ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */
+ ec_GFp_mont_field_mul,
+ ec_GFp_mont_field_sqr,
+ 0, /* field_div */
+ ec_GFp_mont_field_encode,
+ ec_GFp_mont_field_decode,
+ ec_GFp_mont_field_set_to_one
+ };
+
+ return &ret;
+}
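+
+/*
+ * Illustrative use (a sketch, assuming this method is compiled in): a
+ * caller can build a group over it directly, although normally OpenSSL
+ * selects it automatically for the built-in P-256 curve:
+ *
+ * EC_GROUP *group = EC_GROUP_new(EC_GFp_nistz256_method());
+ * ...
+ * EC_GROUP_free(group);
+ */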
diff --git a/crypto/ec/ecp_nistz256_table.c b/crypto/ec/ecp_nistz256_table.c
new file mode 100644
index 000000000000..216d024e0120
--- /dev/null
+++ b/crypto/ec/ecp_nistz256_table.c
@@ -0,0 +1,9533 @@
+/*
+ * This is the precomputed constant time access table for the code in
+ * ecp_nistz256.c, for the default generator. The table consists of 37
+ * subtables, each subtable contains 64 affine points. The affine points are
+ * encoded as eight uint64's, four for the x coordinate and four for the y.
+ * Both values are in little-endian order. There are 37 tables because a
+ * signed, 6-bit wNAF form of the scalar is used and ceil(256/(6 + 1)) = 37.
+ * Within each table there are 64 values because the 6-bit wNAF value can
+ * take 64 values, ignoring the sign bit, which is implemented by performing
+ * a negation of the affine point when required. We would like to align it
+ * to 2MB in order to increase the chances of using a large page but that
+ * appears to lead to invalid ELF files being produced.
+ */
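+
+/*
+ * Layout sketch (informal, matching ecp_nistz256_mult_precompute): entry
+ * [j][k] holds the affine point (k + 1) * 2^(7*j) * G for j = 0..36 and
+ * k = 0..63. A Booth digit d from window j therefore selects entry
+ * [j][d - 1]; d == 0 stands for the implicit point at infinity, and
+ * negative digits are realized by negating y.
+ */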
+
+#if defined(__GNUC__)
+__attribute((aligned(4096)))
+#elif defined(_MSC_VER)
+__declspec(align(4096))
+#elif defined(__SUNPRO_C)
+# pragma align 4096(ecp_nistz256_precomputed)
+#endif
+static const BN_ULONG ecp_nistz256_precomputed[37][64 *
+ sizeof(P256_POINT_AFFINE) /
+ sizeof(BN_ULONG)] = {
+ {TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601),
+ TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6),
+ TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c),
+ TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85),
+ TOBN(0x850046d4, 0x10ddd64d), TOBN(0xaa6ae3c1, 0xa433827d),
+ TOBN(0x73220503, 0x8d1490d9), TOBN(0xf6bb32e4, 0x3dcf3a3b),
+ TOBN(0x2f3648d3, 0x61bee1a5), TOBN(0x152cd7cb, 0xeb236ff8),
+ TOBN(0x19a8fb0e, 0x92042dbe), TOBN(0x78c57751, 0x0a5b8a3b),
+ TOBN(0xffac3f90, 0x4eebc127), TOBN(0xb027f84a, 0x087d81fb),
+ TOBN(0x66ad77dd, 0x87cbbc98), TOBN(0x26936a3f, 0xb6ff747e),
+ TOBN(0xb04c5c1f, 0xc983a7eb), TOBN(0x583e47ad, 0x0861fe1a),
+ TOBN(0x78820831, 0x1a2ee98e), TOBN(0xd5f06a29, 0xe587cc07),
+ TOBN(0x74b0b50d, 0x46918dcc), TOBN(0x4650a6ed, 0xc623c173),
+ TOBN(0x0cdaacac, 0xe8100af2), TOBN(0x577362f5, 0x41b0176b),
+ TOBN(0x2d96f24c, 0xe4cbaba6), TOBN(0x17628471, 0xfad6f447),
+ TOBN(0x6b6c36de, 0xe5ddd22e), TOBN(0x84b14c39, 0x4c5ab863),
+ TOBN(0xbe1b8aae, 0xc45c61f5), TOBN(0x90ec649a, 0x94b9537d),
+ TOBN(0x941cb5aa, 0xd076c20c), TOBN(0xc9079605, 0x890523c8),
+ TOBN(0xeb309b4a, 0xe7ba4f10), TOBN(0x73c568ef, 0xe5eb882b),
+ TOBN(0x3540a987, 0x7e7a1f68), TOBN(0x73a076bb, 0x2dd1e916),
+ TOBN(0x40394737, 0x3e77664a), TOBN(0x55ae744f, 0x346cee3e),
+ TOBN(0xd50a961a, 0x5b17a3ad), TOBN(0x13074b59, 0x54213673),
+ TOBN(0x93d36220, 0xd377e44b), TOBN(0x299c2b53, 0xadff14b5),
+ TOBN(0xf424d44c, 0xef639f11), TOBN(0xa4c9916d, 0x4a07f75f),
+ TOBN(0x0746354e, 0xa0173b4f), TOBN(0x2bd20213, 0xd23c00f7),
+ TOBN(0xf43eaab5, 0x0c23bb08), TOBN(0x13ba5119, 0xc3123e03),
+ TOBN(0x2847d030, 0x3f5b9d4d), TOBN(0x6742f2f2, 0x5da67bdd),
+ TOBN(0xef933bdc, 0x77c94195), TOBN(0xeaedd915, 0x6e240867),
+ TOBN(0x27f14cd1, 0x9499a78f), TOBN(0x462ab5c5, 0x6f9b3455),
+ TOBN(0x8f90f02a, 0xf02cfc6b), TOBN(0xb763891e, 0xb265230d),
+ TOBN(0xf59da3a9, 0x532d4977), TOBN(0x21e3327d, 0xcf9eba15),
+ TOBN(0x123c7b84, 0xbe60bbf0), TOBN(0x56ec12f2, 0x7706df76),
+ TOBN(0x75c96e8f, 0x264e20e8), TOBN(0xabe6bfed, 0x59a7a841),
+ TOBN(0x2cc09c04, 0x44c8eb00), TOBN(0xe05b3080, 0xf0c4e16b),
+ TOBN(0x1eb7777a, 0xa45f3314), TOBN(0x56af7bed, 0xce5d45e3),
+ TOBN(0x2b6e019a, 0x88b12f1a), TOBN(0x086659cd, 0xfd835f9b),
+ TOBN(0x2c18dbd1, 0x9dc21ec8), TOBN(0x98f9868a, 0x0fcf8139),
+ TOBN(0x737d2cd6, 0x48250b49), TOBN(0xcc61c947, 0x24b3428f),
+ TOBN(0x0c2b4078, 0x80dd9e76), TOBN(0xc43a8991, 0x383fbe08),
+ TOBN(0x5f7d2d65, 0x779be5d2), TOBN(0x78719a54, 0xeb3b4ab5),
+ TOBN(0xea7d260a, 0x6245e404), TOBN(0x9de40795, 0x6e7fdfe0),
+ TOBN(0x1ff3a415, 0x8dac1ab5), TOBN(0x3e7090f1, 0x649c9073),
+ TOBN(0x1a768561, 0x2b944e88), TOBN(0x250f939e, 0xe57f61c8),
+ TOBN(0x0c0daa89, 0x1ead643d), TOBN(0x68930023, 0xe125b88e),
+ TOBN(0x04b71aa7, 0xd2697768), TOBN(0xabdedef5, 0xca345a33),
+ TOBN(0x2409d29d, 0xee37385e), TOBN(0x4ee1df77, 0xcb83e156),
+ TOBN(0x0cac12d9, 0x1cbb5b43), TOBN(0x170ed2f6, 0xca895637),
+ TOBN(0x28228cfa, 0x8ade6d66), TOBN(0x7ff57c95, 0x53238aca),
+ TOBN(0xccc42563, 0x4b2ed709), TOBN(0x0e356769, 0x856fd30d),
+ TOBN(0xbcbcd43f, 0x559e9811), TOBN(0x738477ac, 0x5395b759),
+ TOBN(0x35752b90, 0xc00ee17f), TOBN(0x68748390, 0x742ed2e3),
+ TOBN(0x7cd06422, 0xbd1f5bc1), TOBN(0xfbc08769, 0xc9e7b797),
+ TOBN(0xa242a35b, 0xb0cf664a), TOBN(0x126e48f7, 0x7f9707e3),
+ TOBN(0x1717bf54, 0xc6832660), TOBN(0xfaae7332, 0xfd12c72e),
+ TOBN(0x27b52db7, 0x995d586b), TOBN(0xbe29569e, 0x832237c2),
+ TOBN(0xe8e4193e, 0x2a65e7db), TOBN(0x152706dc, 0x2eaa1bbb),
+ TOBN(0x72bcd8b7, 0xbc60055b), TOBN(0x03cc23ee, 0x56e27e4b),
+ TOBN(0xee337424, 0xe4819370), TOBN(0xe2aa0e43, 0x0ad3da09),
+ TOBN(0x40b8524f, 0x6383c45d), TOBN(0xd7663554, 0x42a41b25),
+ TOBN(0x64efa6de, 0x778a4797), TOBN(0x2042170a, 0x7079adf4),
+ TOBN(0x808b0b65, 0x0bc6fb80), TOBN(0x5882e075, 0x3ffe2e6b),
+ TOBN(0xd5ef2f7c, 0x2c83f549), TOBN(0x54d63c80, 0x9103b723),
+ TOBN(0xf2f11bd6, 0x52a23f9b), TOBN(0x3670c319, 0x4b0b6587),
+ TOBN(0x55c4623b, 0xb1580e9e), TOBN(0x64edf7b2, 0x01efe220),
+ TOBN(0x97091dcb, 0xd53c5c9d), TOBN(0xf17624b6, 0xac0a177b),
+ TOBN(0xb0f13975, 0x2cfe2dff), TOBN(0xc1a35c0a, 0x6c7a574e),
+ TOBN(0x227d3146, 0x93e79987), TOBN(0x0575bf30, 0xe89cb80e),
+ TOBN(0x2f4e247f, 0x0d1883bb), TOBN(0xebd51226, 0x3274c3d0),
+ TOBN(0x5f3e51c8, 0x56ada97a), TOBN(0x4afc964d, 0x8f8b403e),
+ TOBN(0xa6f247ab, 0x412e2979), TOBN(0x675abd1b, 0x6f80ebda),
+ TOBN(0x66a2bd72, 0x5e485a1d), TOBN(0x4b2a5caf, 0x8f4f0b3c),
+ TOBN(0x2626927f, 0x1b847bba), TOBN(0x6c6fc7d9, 0x0502394d),
+ TOBN(0xfea912ba, 0xa5659ae8), TOBN(0x68363aba, 0x25e1a16e),
+ TOBN(0xb8842277, 0x752c41ac), TOBN(0xfe545c28, 0x2897c3fc),
+ TOBN(0x2d36e9e7, 0xdc4c696b), TOBN(0x5806244a, 0xfba977c5),
+ TOBN(0x85665e9b, 0xe39508c1), TOBN(0xf720ee25, 0x6d12597b),
+ TOBN(0x8a979129, 0xd2337a31), TOBN(0x5916868f, 0x0f862bdc),
+ TOBN(0x048099d9, 0x5dd283ba), TOBN(0xe2d1eeb6, 0xfe5bfb4e),
+ TOBN(0x82ef1c41, 0x7884005d), TOBN(0xa2d4ec17, 0xffffcbae),
+ TOBN(0x9161c53f, 0x8aa95e66), TOBN(0x5ee104e1, 0xc5fee0d0),
+ TOBN(0x562e4cec, 0xc135b208), TOBN(0x74e1b265, 0x4783f47d),
+ TOBN(0x6d2a506c, 0x5a3f3b30), TOBN(0xecead9f4, 0xc16762fc),
+ TOBN(0xf29dd4b2, 0xe286e5b9), TOBN(0x1b0fadc0, 0x83bb3c61),
+ TOBN(0x7a75023e, 0x7fac29a4), TOBN(0xc086d5f1, 0xc9477fa3),
+ TOBN(0x0fc61135, 0x2f6f3076), TOBN(0xc99ffa23, 0xe3912a9a),
+ TOBN(0x6a0b0685, 0xd2f8ba3d), TOBN(0xfdc777e8, 0xe93358a4),
+ TOBN(0x94a787bb, 0x35415f04), TOBN(0x640c2d6a, 0x4d23fea4),
+ TOBN(0x9de917da, 0x153a35b5), TOBN(0x793e8d07, 0x5d5cd074),
+ TOBN(0xf4f87653, 0x2de45068), TOBN(0x37c7a7e8, 0x9e2e1f6e),
+ TOBN(0xd0825fa2, 0xa3584069), TOBN(0xaf2cea7c, 0x1727bf42),
+ TOBN(0x0360a4fb, 0x9e4785a9), TOBN(0xe5fda49c, 0x27299f4a),
+ TOBN(0x48068e13, 0x71ac2f71), TOBN(0x83d0687b, 0x9077666f),
+ TOBN(0x6d3883b2, 0x15d02819), TOBN(0x6d0d7550, 0x40dd9a35),
+ TOBN(0x61d7cbf9, 0x1d2b469f), TOBN(0xf97b232f, 0x2efc3115),
+ TOBN(0xa551d750, 0xb24bcbc7), TOBN(0x11ea4949, 0x88a1e356),
+ TOBN(0x7669f031, 0x93cb7501), TOBN(0x595dc55e, 0xca737b8a),
+ TOBN(0xa4a319ac, 0xd837879f), TOBN(0x6fc1b49e, 0xed6b67b0),
+ TOBN(0xe3959933, 0x32f1f3af), TOBN(0x966742eb, 0x65432a2e),
+ TOBN(0x4b8dc9fe, 0xb4966228), TOBN(0x96cc6312, 0x43f43950),
+ TOBN(0x12068859, 0xc9b731ee), TOBN(0x7b948dc3, 0x56f79968),
+ TOBN(0x61e4ad32, 0xed1f8008), TOBN(0xe6c9267a, 0xd8b17538),
+ TOBN(0x1ac7c5eb, 0x857ff6fb), TOBN(0x994baaa8, 0x55f2fb10),
+ TOBN(0x84cf14e1, 0x1d248018), TOBN(0x5a39898b, 0x628ac508),
+ TOBN(0x14fde97b, 0x5fa944f5), TOBN(0xed178030, 0xd12e5ac7),
+ TOBN(0x042c2af4, 0x97e2feb4), TOBN(0xd36a42d7, 0xaebf7313),
+ TOBN(0x49d2c9eb, 0x084ffdd7), TOBN(0x9f8aa54b, 0x2ef7c76a),
+ TOBN(0x9200b7ba, 0x09895e70), TOBN(0x3bd0c66f, 0xddb7fb58),
+ TOBN(0x2d97d108, 0x78eb4cbb), TOBN(0x2d431068, 0xd84bde31),
+ TOBN(0x4b523eb7, 0x172ccd1f), TOBN(0x7323cb28, 0x30a6a892),
+ TOBN(0x97082ec0, 0xcfe153eb), TOBN(0xe97f6b6a, 0xf2aadb97),
+ TOBN(0x1d3d393e, 0xd1a83da1), TOBN(0xa6a7f9c7, 0x804b2a68),
+ TOBN(0x4a688b48, 0x2d0cb71e), TOBN(0xa9b4cc5f, 0x40585278),
+ TOBN(0x5e5db46a, 0xcb66e132), TOBN(0xf1be963a, 0x0d925880),
+ TOBN(0x944a7027, 0x0317b9e2), TOBN(0xe266f959, 0x48603d48),
+ TOBN(0x98db6673, 0x5c208899), TOBN(0x90472447, 0xa2fb18a3),
+ TOBN(0x8a966939, 0x777c619f), TOBN(0x3798142a, 0x2a3be21b),
+ TOBN(0xb4241cb1, 0x3298b343), TOBN(0xa3a14e49, 0xb44f65a1),
+ TOBN(0xc5f4d6cd, 0x3ac77acd), TOBN(0xd0288cb5, 0x52b6fc3c),
+ TOBN(0xd5cc8c2f, 0x1c040abc), TOBN(0xb675511e, 0x06bf9b4a),
+ TOBN(0xd667da37, 0x9b3aa441), TOBN(0x460d45ce, 0x51601f72),
+ TOBN(0xe2f73c69, 0x6755ff89), TOBN(0xdd3cf7e7, 0x473017e6),
+ TOBN(0x8ef5689d, 0x3cf7600d), TOBN(0x948dc4f8, 0xb1fc87b4),
+ TOBN(0xd9e9fe81, 0x4ea53299), TOBN(0x2d921ca2, 0x98eb6028),
+ TOBN(0xfaecedfd, 0x0c9803fc), TOBN(0xf38ae891, 0x4d7b4745),
+ TOBN(0xd8c5fccf, 0xc5e3a3d8), TOBN(0xbefd904c, 0x4079dfbf),
+ TOBN(0xbc6d6a58, 0xfead0197), TOBN(0x39227077, 0x695532a4),
+ TOBN(0x09e23e6d, 0xdbef42f5), TOBN(0x7e449b64, 0x480a9908),
+ TOBN(0x7b969c1a, 0xad9a2e40), TOBN(0x6231d792, 0x9591c2a4),
+ TOBN(0x87151456, 0x0f664534), TOBN(0x85ceae7c, 0x4b68f103),
+ TOBN(0xac09c4ae, 0x65578ab9), TOBN(0x33ec6868, 0xf044b10c),
+ TOBN(0x6ac4832b, 0x3a8ec1f1), TOBN(0x5509d128, 0x5847d5ef),
+ TOBN(0xf909604f, 0x763f1574), TOBN(0xb16c4303, 0xc32f63c4),
+ TOBN(0xb6ab2014, 0x7ca23cd3), TOBN(0xcaa7a5c6, 0xa391849d),
+ TOBN(0x5b0673a3, 0x75678d94), TOBN(0xc982ddd4, 0xdd303e64),
+ TOBN(0xfd7b000b, 0x5db6f971), TOBN(0xbba2cb1f, 0x6f876f92),
+ TOBN(0xc77332a3, 0x3c569426), TOBN(0xa159100c, 0x570d74f8),
+ TOBN(0xfd16847f, 0xdec67ef5), TOBN(0x742ee464, 0x233e76b7),
+ TOBN(0x0b8e4134, 0xefc2b4c8), TOBN(0xca640b86, 0x42a3e521),
+ TOBN(0x653a0190, 0x8ceb6aa9), TOBN(0x313c300c, 0x547852d5),
+ TOBN(0x24e4ab12, 0x6b237af7), TOBN(0x2ba90162, 0x8bb47af8),
+ TOBN(0x3d5e58d6, 0xa8219bb7), TOBN(0xc691d0bd, 0x1b06c57f),
+ TOBN(0x0ae4cb10, 0xd257576e), TOBN(0x3569656c, 0xd54a3dc3),
+ TOBN(0xe5ebaebd, 0x94cda03a), TOBN(0x934e82d3, 0x162bfe13),
+ TOBN(0x450ac0ba, 0xe251a0c6), TOBN(0x480b9e11, 0xdd6da526),
+ TOBN(0x00467bc5, 0x8cce08b5), TOBN(0xb636458c, 0x7f178d55),
+ TOBN(0xc5748bae, 0xa677d806), TOBN(0x2763a387, 0xdfa394eb),
+ TOBN(0xa12b448a, 0x7d3cebb6), TOBN(0xe7adda3e, 0x6f20d850),
+ TOBN(0xf63ebce5, 0x1558462c), TOBN(0x58b36143, 0x620088a8),
+ TOBN(0x8a2cc3ca, 0x4d63c0ee), TOBN(0x51233117, 0x0fe948ce),
+ TOBN(0x7463fd85, 0x222ef33b), TOBN(0xadf0c7dc, 0x7c603d6c),
+ TOBN(0x0ec32d3b, 0xfe7765e5), TOBN(0xccaab359, 0xbf380409),
+ TOBN(0xbdaa84d6, 0x8e59319c), TOBN(0xd9a4c280, 0x9c80c34d),
+ TOBN(0xa9d89488, 0xa059c142), TOBN(0x6f5ae714, 0xff0b9346),
+ TOBN(0x068f237d, 0x16fb3664), TOBN(0x5853e4c4, 0x363186ac),
+ TOBN(0xe2d87d23, 0x63c52f98), TOBN(0x2ec4a766, 0x81828876),
+ TOBN(0x47b864fa, 0xe14e7b1c), TOBN(0x0c0bc0e5, 0x69192408),
+ TOBN(0xe4d7681d, 0xb82e9f3e), TOBN(0x83200f0b, 0xdf25e13c),
+ TOBN(0x8909984c, 0x66f27280), TOBN(0x462d7b00, 0x75f73227),
+ TOBN(0xd90ba188, 0xf2651798), TOBN(0x74c6e18c, 0x36ab1c34),
+ TOBN(0xab256ea3, 0x5ef54359), TOBN(0x03466612, 0xd1aa702f),
+ TOBN(0x624d6049, 0x2ed22e91), TOBN(0x6fdfe0b5, 0x6f072822),
+ TOBN(0xeeca1115, 0x39ce2271), TOBN(0x98100a4f, 0xdb01614f),
+ TOBN(0xb6b0daa2, 0xa35c628f), TOBN(0xb6f94d2e, 0xc87e9a47),
+ TOBN(0xc6773259, 0x1d57d9ce), TOBN(0xf70bfeec, 0x03884a7b),
+ TOBN(0x5fb35ccf, 0xed2bad01), TOBN(0xa155cbe3, 0x1da6a5c7),
+ TOBN(0xc2e2594c, 0x30a92f8f), TOBN(0x649c89ce, 0x5bfafe43),
+ TOBN(0xd158667d, 0xe9ff257a), TOBN(0x9b359611, 0xf32c50ae),
+ TOBN(0x4b00b20b, 0x906014cf), TOBN(0xf3a8cfe3, 0x89bc7d3d),
+ TOBN(0x4ff23ffd, 0x248a7d06), TOBN(0x80c5bfb4, 0x878873fa),
+ TOBN(0xb7d9ad90, 0x05745981), TOBN(0x179c85db, 0x3db01994),
+ TOBN(0xba41b062, 0x61a6966c), TOBN(0x4d82d052, 0xeadce5a8),
+ TOBN(0x9e91cd3b, 0xa5e6a318), TOBN(0x47795f4f, 0x95b2dda0),
+ TOBN(0xecfd7c1f, 0xd55a897c), TOBN(0x009194ab, 0xb29110fb),
+ TOBN(0x5f0e2046, 0xe381d3b0), TOBN(0x5f3425f6, 0xa98dd291),
+ TOBN(0xbfa06687, 0x730d50da), TOBN(0x0423446c, 0x4b083b7f),
+ TOBN(0x397a247d, 0xd69d3417), TOBN(0xeb629f90, 0x387ba42a),
+ TOBN(0x1ee426cc, 0xd5cd79bf), TOBN(0x0032940b, 0x946c6e18),
+ TOBN(0x1b1e8ae0, 0x57477f58), TOBN(0xe94f7d34, 0x6d823278),
+ TOBN(0xc747cb96, 0x782ba21a), TOBN(0xc5254469, 0xf72b33a5),
+ TOBN(0x772ef6de, 0xc7f80c81), TOBN(0xd73acbfe, 0x2cd9e6b5),
+ TOBN(0x4075b5b1, 0x49ee90d9), TOBN(0x785c339a, 0xa06e9eba),
+ TOBN(0xa1030d5b, 0xabf825e0), TOBN(0xcec684c3, 0xa42931dc),
+ TOBN(0x42ab62c9, 0xc1586e63), TOBN(0x45431d66, 0x5ab43f2b),
+ TOBN(0x57c8b2c0, 0x55f7835d), TOBN(0x033da338, 0xc1b7f865),
+ TOBN(0x283c7513, 0xcaa76097), TOBN(0x0a624fa9, 0x36c83906),
+ TOBN(0x6b20afec, 0x715af2c7), TOBN(0x4b969974, 0xeba78bfd),
+ TOBN(0x220755cc, 0xd921d60e), TOBN(0x9b944e10, 0x7baeca13),
+ TOBN(0x04819d51, 0x5ded93d4), TOBN(0x9bbff86e, 0x6dddfd27),
+ TOBN(0x6b344130, 0x77adc612), TOBN(0xa7496529, 0xbbd803a0),
+ TOBN(0x1a1baaa7, 0x6d8805bd), TOBN(0xc8403902, 0x470343ad),
+ TOBN(0x39f59f66, 0x175adff1), TOBN(0x0b26d7fb, 0xb7d8c5b7),
+ TOBN(0xa875f5ce, 0x529d75e3), TOBN(0x85efc7e9, 0x41325cc2),
+ TOBN(0x21950b42, 0x1ff6acd3), TOBN(0xffe70484, 0x53dc6909),
+ TOBN(0xff4cd0b2, 0x28766127), TOBN(0xabdbe608, 0x4fb7db2b),
+ TOBN(0x837c9228, 0x5e1109e8), TOBN(0x26147d27, 0xf4645b5a),
+ TOBN(0x4d78f592, 0xf7818ed8), TOBN(0xd394077e, 0xf247fa36),
+ TOBN(0x0fb9c2d0, 0x488c171a), TOBN(0xa78bfbaa, 0x13685278),
+ TOBN(0xedfbe268, 0xd5b1fa6a), TOBN(0x0dceb8db, 0x2b7eaba7),
+ TOBN(0xbf9e8089, 0x9ae2b710), TOBN(0xefde7ae6, 0xa4449c96),
+ TOBN(0x43b7716b, 0xcc143a46), TOBN(0xd7d34194, 0xc3628c13),
+ TOBN(0x508cec1c, 0x3b3f64c9), TOBN(0xe20bc0ba, 0x1e5edf3f),
+ TOBN(0xda1deb85, 0x2f4318d4), TOBN(0xd20ebe0d, 0x5c3fa443),
+ TOBN(0x370b4ea7, 0x73241ea3), TOBN(0x61f1511c, 0x5e1a5f65),
+ TOBN(0x99a5e23d, 0x82681c62), TOBN(0xd731e383, 0xa2f54c2d),
+ TOBN(0x2692f36e, 0x83445904), TOBN(0x2e0ec469, 0xaf45f9c0),
+ TOBN(0x905a3201, 0xc67528b7), TOBN(0x88f77f34, 0xd0e5e542),
+ TOBN(0xf67a8d29, 0x5864687c), TOBN(0x23b92eae, 0x22df3562),
+ TOBN(0x5c27014b, 0x9bbec39e), TOBN(0x7ef2f226, 0x9c0f0f8d),
+ TOBN(0x97359638, 0x546c4d8d), TOBN(0x5f9c3fc4, 0x92f24679),
+ TOBN(0x912e8bed, 0xa8c8acd9), TOBN(0xec3a318d, 0x306634b0),
+ TOBN(0x80167f41, 0xc31cb264), TOBN(0x3db82f6f, 0x522113f2),
+ TOBN(0xb155bcd2, 0xdcafe197), TOBN(0xfba1da59, 0x43465283),
+ TOBN(0xa0425b8e, 0xb212cf53), TOBN(0x4f2e512e, 0xf8557c5f),
+ TOBN(0xc1286ff9, 0x25c4d56c), TOBN(0xbb8a0fea, 0xee26c851),
+ TOBN(0xc28f70d2, 0xe7d6107e), TOBN(0x7ee0c444, 0xe76265aa),
+ TOBN(0x3df277a4, 0x1d1936b1), TOBN(0x1a556e3f, 0xea9595eb),
+ TOBN(0x258bbbf9, 0xe7305683), TOBN(0x31eea5bf, 0x07ef5be6),
+ TOBN(0x0deb0e4a, 0x46c814c1), TOBN(0x5cee8449, 0xa7b730dd),
+ TOBN(0xeab495c5, 0xa0182bde), TOBN(0xee759f87, 0x9e27a6b4),
+ TOBN(0xc2cf6a68, 0x80e518ca), TOBN(0x25e8013f, 0xf14cf3f4),
+ TOBN(0x8fc44140, 0x7e8d7a14), TOBN(0xbb1ff3ca, 0x9556f36a),
+ TOBN(0x6a844385, 0x14600044), TOBN(0xba3f0c4a, 0x7451ae63),
+ TOBN(0xdfcac25b, 0x1f9af32a), TOBN(0x01e0db86, 0xb1f2214b),
+ TOBN(0x4e9a5bc2, 0xa4b596ac), TOBN(0x83927681, 0x026c2c08),
+ TOBN(0x3ec832e7, 0x7acaca28), TOBN(0x1bfeea57, 0xc7385b29),
+ TOBN(0x068212e3, 0xfd1eaf38), TOBN(0xc1329830, 0x6acf8ccc),
+ TOBN(0xb909f2db, 0x2aac9e59), TOBN(0x5748060d, 0xb661782a),
+ TOBN(0xc5ab2632, 0xc79b7a01), TOBN(0xda44c6c6, 0x00017626),
+ TOBN(0xf26c00e8, 0xa7ea82f0), TOBN(0x99cac80d, 0xe4299aaf),
+ TOBN(0xd66fe3b6, 0x7ed78be1), TOBN(0x305f725f, 0x648d02cd),
+ TOBN(0x33ed1bc4, 0x623fb21b), TOBN(0xfa70533e, 0x7a6319ad),
+ TOBN(0x17ab562d, 0xbe5ffb3e), TOBN(0x06374994, 0x56674741),
+ TOBN(0x69d44ed6, 0x5c46aa8e), TOBN(0x2100d5d3, 0xa8d063d1),
+ TOBN(0xcb9727ea, 0xa2d17c36), TOBN(0x4c2bab1b, 0x8add53b7),
+ TOBN(0xa084e90c, 0x15426704), TOBN(0x778afcd3, 0xa837ebea),
+ TOBN(0x6651f701, 0x7ce477f8), TOBN(0xa0624998, 0x46fb7a8b),
+ TOBN(0xdc1e6828, 0xed8a6e19), TOBN(0x33fc2336, 0x4189d9c7),
+ TOBN(0x026f8fe2, 0x671c39bc), TOBN(0xd40c4ccd, 0xbc6f9915),
+ TOBN(0xafa135bb, 0xf80e75ca), TOBN(0x12c651a0, 0x22adff2c),
+ TOBN(0xc40a04bd, 0x4f51ad96), TOBN(0x04820109, 0xbbe4e832),
+ TOBN(0x3667eb1a, 0x7f4c04cc), TOBN(0x59556621, 0xa9404f84),
+ TOBN(0x71cdf653, 0x7eceb50a), TOBN(0x994a44a6, 0x9b8335fa),
+ TOBN(0xd7faf819, 0xdbeb9b69), TOBN(0x473c5680, 0xeed4350d),
+ TOBN(0xb6658466, 0xda44bba2), TOBN(0x0d1bc780, 0x872bdbf3),
+ TOBN(0xe535f175, 0xa1962f91), TOBN(0x6ed7e061, 0xed58f5a7),
+ TOBN(0x177aa4c0, 0x2089a233), TOBN(0x0dbcb03a, 0xe539b413),
+ TOBN(0xe3dc424e, 0xbb32e38e), TOBN(0x6472e5ef, 0x6806701e),
+ TOBN(0xdd47ff98, 0x814be9ee), TOBN(0x6b60cfff, 0x35ace009),
+ TOBN(0xb8d3d931, 0x9ff91fe5), TOBN(0x039c4800, 0xf0518eed),
+ TOBN(0x95c37632, 0x9182cb26), TOBN(0x0763a434, 0x82fc568d),
+ TOBN(0x707c04d5, 0x383e76ba), TOBN(0xac98b930, 0x824e8197),
+ TOBN(0x92bf7c8f, 0x91230de0), TOBN(0x90876a01, 0x40959b70),
+ TOBN(0xdb6d96f3, 0x05968b80), TOBN(0x380a0913, 0x089f73b9),
+ TOBN(0x7da70b83, 0xc2c61e01), TOBN(0x95fb8394, 0x569b38c7),
+ TOBN(0x9a3c6512, 0x80edfe2f), TOBN(0x8f726bb9, 0x8faeaf82),
+ TOBN(0x8010a4a0, 0x78424bf8), TOBN(0x29672044, 0x0e844970)}
+ ,
+ {TOBN(0x63c5cb81, 0x7a2ad62a), TOBN(0x7ef2b6b9, 0xac62ff54),
+ TOBN(0x3749bba4, 0xb3ad9db5), TOBN(0xad311f2c, 0x46d5a617),
+ TOBN(0xb77a8087, 0xc2ff3b6d), TOBN(0xb46feaf3, 0x367834ff),
+ TOBN(0xf8aa266d, 0x75d6b138), TOBN(0xfa38d320, 0xec008188),
+ TOBN(0x486d8ffa, 0x696946fc), TOBN(0x50fbc6d8, 0xb9cba56d),
+ TOBN(0x7e3d423e, 0x90f35a15), TOBN(0x7c3da195, 0xc0dd962c),
+ TOBN(0xe673fdb0, 0x3cfd5d8b), TOBN(0x0704b7c2, 0x889dfca5),
+ TOBN(0xf6ce581f, 0xf52305aa), TOBN(0x399d49eb, 0x914d5e53),
+ TOBN(0x380a496d, 0x6ec293cd), TOBN(0x733dbda7, 0x8e7051f5),
+ TOBN(0x037e388d, 0xb849140a), TOBN(0xee4b32b0, 0x5946dbf6),
+ TOBN(0xb1c4fda9, 0xcae368d1), TOBN(0x5001a7b0, 0xfdb0b2f3),
+ TOBN(0x6df59374, 0x2e3ac46e), TOBN(0x4af675f2, 0x39b3e656),
+ TOBN(0x44e38110, 0x39949296), TOBN(0x5b63827b, 0x361db1b5),
+ TOBN(0x3e5323ed, 0x206eaff5), TOBN(0x942370d2, 0xc21f4290),
+ TOBN(0xf2caaf2e, 0xe0d985a1), TOBN(0x192cc64b, 0x7239846d),
+ TOBN(0x7c0b8f47, 0xae6312f8), TOBN(0x7dc61f91, 0x96620108),
+ TOBN(0xb830fb5b, 0xc2da7de9), TOBN(0xd0e643df, 0x0ff8d3be),
+ TOBN(0x31ee77ba, 0x188a9641), TOBN(0x4e8aa3aa, 0xbcf6d502),
+ TOBN(0xf9fb6532, 0x9a49110f), TOBN(0xd18317f6, 0x2dd6b220),
+ TOBN(0x7e3ced41, 0x52c3ea5a), TOBN(0x0d296a14, 0x7d579c4a),
+ TOBN(0x35d6a53e, 0xed4c3717), TOBN(0x9f8240cf, 0x3d0ed2a3),
+ TOBN(0x8c0d4d05, 0xe5543aa5), TOBN(0x45d5bbfb, 0xdd33b4b4),
+ TOBN(0xfa04cc73, 0x137fd28e), TOBN(0x862ac6ef, 0xc73b3ffd),
+ TOBN(0x403ff9f5, 0x31f51ef2), TOBN(0x34d5e0fc, 0xbc73f5a2),
+ TOBN(0xf2526820, 0x08913f4f), TOBN(0xea20ed61, 0xeac93d95),
+ TOBN(0x51ed38b4, 0x6ca6b26c), TOBN(0x8662dcbc, 0xea4327b0),
+ TOBN(0x6daf295c, 0x725d2aaa), TOBN(0xbad2752f, 0x8e52dcda),
+ TOBN(0x2210e721, 0x0b17dacc), TOBN(0xa37f7912, 0xd51e8232),
+ TOBN(0x4f7081e1, 0x44cc3add), TOBN(0xd5ffa1d6, 0x87be82cf),
+ TOBN(0x89890b6c, 0x0edd6472), TOBN(0xada26e1a, 0x3ed17863),
+ TOBN(0x276f2715, 0x63483caa), TOBN(0xe6924cd9, 0x2f6077fd),
+ TOBN(0x05a7fe98, 0x0a466e3c), TOBN(0xf1c794b0, 0xb1902d1f),
+ TOBN(0xe5213688, 0x82a8042c), TOBN(0xd931cfaf, 0xcd278298),
+ TOBN(0x069a0ae0, 0xf597a740), TOBN(0x0adbb3f3, 0xeb59107c),
+ TOBN(0x983e951e, 0x5eaa8eb8), TOBN(0xe663a8b5, 0x11b48e78),
+ TOBN(0x1631cc0d, 0x8a03f2c5), TOBN(0x7577c11e, 0x11e271e2),
+ TOBN(0x33b2385c, 0x08369a90), TOBN(0x2990c59b, 0x190eb4f8),
+ TOBN(0x819a6145, 0xc68eac80), TOBN(0x7a786d62, 0x2ec4a014),
+ TOBN(0x33faadbe, 0x20ac3a8d), TOBN(0x31a21781, 0x5aba2d30),
+ TOBN(0x209d2742, 0xdba4f565), TOBN(0xdb2ce9e3, 0x55aa0fbb),
+ TOBN(0x8cef334b, 0x168984df), TOBN(0xe81dce17, 0x33879638),
+ TOBN(0xf6e6949c, 0x263720f0), TOBN(0x5c56feaf, 0xf593cbec),
+ TOBN(0x8bff5601, 0xfde58c84), TOBN(0x74e24117, 0x2eccb314),
+ TOBN(0xbcf01b61, 0x4c9a8a78), TOBN(0xa233e35e, 0x544c9868),
+ TOBN(0xb3156bf3, 0x8bd7aff1), TOBN(0x1b5ee4cb, 0x1d81b146),
+ TOBN(0x7ba1ac41, 0xd628a915), TOBN(0x8f3a8f9c, 0xfd89699e),
+ TOBN(0x7329b9c9, 0xa0748be7), TOBN(0x1d391c95, 0xa92e621f),
+ TOBN(0xe51e6b21, 0x4d10a837), TOBN(0xd255f53a, 0x4947b435),
+ TOBN(0x07669e04, 0xf1788ee3), TOBN(0xc14f27af, 0xa86938a2),
+ TOBN(0x8b47a334, 0xe93a01c0), TOBN(0xff627438, 0xd9366808),
+ TOBN(0x7a0985d8, 0xca2a5965), TOBN(0x3d9a5542, 0xd6e9b9b3),
+ TOBN(0xc23eb80b, 0x4cf972e8), TOBN(0x5c1c33bb, 0x4fdf72fd),
+ TOBN(0x0c4a58d4, 0x74a86108), TOBN(0xf8048a8f, 0xee4c5d90),
+ TOBN(0xe3c7c924, 0xe86d4c80), TOBN(0x28c889de, 0x056a1e60),
+ TOBN(0x57e2662e, 0xb214a040), TOBN(0xe8c48e98, 0x37e10347),
+ TOBN(0x87742862, 0x80ac748a), TOBN(0xf1c24022, 0x186b06f2),
+ TOBN(0xac2dd4c3, 0x5f74040a), TOBN(0x409aeb71, 0xfceac957),
+ TOBN(0x4fbad782, 0x55c4ec23), TOBN(0xb359ed61, 0x8a7b76ec),
+ TOBN(0x12744926, 0xed6f4a60), TOBN(0xe21e8d7f, 0x4b912de3),
+ TOBN(0xe2575a59, 0xfc705a59), TOBN(0x72f1d4de, 0xed2dbc0e),
+ TOBN(0x3d2b24b9, 0xeb7926b8), TOBN(0xbff88cb3, 0xcdbe5509),
+ TOBN(0xd0f399af, 0xe4dd640b), TOBN(0x3c5fe130, 0x2f76ed45),
+ TOBN(0x6f3562f4, 0x3764fb3d), TOBN(0x7b5af318, 0x3151b62d),
+ TOBN(0xd5bd0bc7, 0xd79ce5f3), TOBN(0xfdaf6b20, 0xec66890f),
+ TOBN(0x735c67ec, 0x6063540c), TOBN(0x50b259c2, 0xe5f9cb8f),
+ TOBN(0xb8734f9a, 0x3f99c6ab), TOBN(0xf8cc13d5, 0xa3a7bc85),
+ TOBN(0x80c1b305, 0xc5217659), TOBN(0xfe5364d4, 0x4ec12a54),
+ TOBN(0xbd87045e, 0x681345fe), TOBN(0x7f8efeb1, 0x582f897f),
+ TOBN(0xe8cbf1e5, 0xd5923359), TOBN(0xdb0cea9d, 0x539b9fb0),
+ TOBN(0x0c5b34cf, 0x49859b98), TOBN(0x5e583c56, 0xa4403cc6),
+ TOBN(0x11fc1a2d, 0xd48185b7), TOBN(0xc93fbc7e, 0x6e521787),
+ TOBN(0x47e7a058, 0x05105b8b), TOBN(0x7b4d4d58, 0xdb8260c8),
+ TOBN(0xe33930b0, 0x46eb842a), TOBN(0x8e844a9a, 0x7bdae56d),
+ TOBN(0x34ef3a9e, 0x13f7fdfc), TOBN(0xb3768f82, 0x636ca176),
+ TOBN(0x2821f4e0, 0x4e09e61c), TOBN(0x414dc3a1, 0xa0c7cddc),
+ TOBN(0xd5379437, 0x54945fcd), TOBN(0x151b6eef, 0xb3555ff1),
+ TOBN(0xb31bd613, 0x6339c083), TOBN(0x39ff8155, 0xdfb64701),
+ TOBN(0x7c3388d2, 0xe29604ab), TOBN(0x1e19084b, 0xa6b10442),
+ TOBN(0x17cf54c0, 0xeccd47ef), TOBN(0x89693385, 0x4a5dfb30),
+ TOBN(0x69d023fb, 0x47daf9f6), TOBN(0x9222840b, 0x7d91d959),
+ TOBN(0x439108f5, 0x803bac62), TOBN(0x0b7dd91d, 0x379bd45f),
+ TOBN(0xd651e827, 0xca63c581), TOBN(0x5c5d75f6, 0x509c104f),
+ TOBN(0x7d5fc738, 0x1f2dc308), TOBN(0x20faa7bf, 0xd98454be),
+ TOBN(0x95374bee, 0xa517b031), TOBN(0xf036b9b1, 0x642692ac),
+ TOBN(0xc5106109, 0x39842194), TOBN(0xb7e2353e, 0x49d05295),
+ TOBN(0xfc8c1d5c, 0xefb42ee0), TOBN(0xe04884eb, 0x08ce811c),
+ TOBN(0xf1f75d81, 0x7419f40e), TOBN(0x5b0ac162, 0xa995c241),
+ TOBN(0x120921bb, 0xc4c55646), TOBN(0x713520c2, 0x8d33cf97),
+ TOBN(0xb4a65a5c, 0xe98c5100), TOBN(0x6cec871d, 0x2ddd0f5a),
+ TOBN(0x251f0b7f, 0x9ba2e78b), TOBN(0x224a8434, 0xce3a2a5f),
+ TOBN(0x26827f61, 0x25f5c46f), TOBN(0x6a22bedc, 0x48545ec0),
+ TOBN(0x25ae5fa0, 0xb1bb5cdc), TOBN(0xd693682f, 0xfcb9b98f),
+ TOBN(0x32027fe8, 0x91e5d7d3), TOBN(0xf14b7d17, 0x73a07678),
+ TOBN(0xf88497b3, 0xc0dfdd61), TOBN(0xf7c2eec0, 0x2a8c4f48),
+ TOBN(0xaa5573f4, 0x3756e621), TOBN(0xc013a240, 0x1825b948),
+ TOBN(0x1c03b345, 0x63878572), TOBN(0xa0472bea, 0x653a4184),
+ TOBN(0xf4222e27, 0x0ac69a80), TOBN(0x34096d25, 0xf51e54f6),
+ TOBN(0x00a648cb, 0x8fffa591), TOBN(0x4e87acdc, 0x69b6527f),
+ TOBN(0x0575e037, 0xe285ccb4), TOBN(0x188089e4, 0x50ddcf52),
+ TOBN(0xaa96c9a8, 0x870ff719), TOBN(0x74a56cd8, 0x1fc7e369),
+ TOBN(0x41d04ee2, 0x1726931a), TOBN(0x0bbbb2c8, 0x3660ecfd),
+ TOBN(0xa6ef6de5, 0x24818e18), TOBN(0xe421cc51, 0xe7d57887),
+ TOBN(0xf127d208, 0xbea87be6), TOBN(0x16a475d3, 0xb1cdd682),
+ TOBN(0x9db1b684, 0x439b63f7), TOBN(0x5359b3db, 0xf0f113b6),
+ TOBN(0xdfccf1de, 0x8bf06e31), TOBN(0x1fdf8f44, 0xdd383901),
+ TOBN(0x10775cad, 0x5017e7d2), TOBN(0xdfc3a597, 0x58d11eef),
+ TOBN(0x6ec9c8a0, 0xb1ecff10), TOBN(0xee6ed6cc, 0x28400549),
+ TOBN(0xb5ad7bae, 0x1b4f8d73), TOBN(0x61b4f11d, 0xe00aaab9),
+ TOBN(0x7b32d69b, 0xd4eff2d7), TOBN(0x88ae6771, 0x4288b60f),
+ TOBN(0x159461b4, 0x37a1e723), TOBN(0x1f3d4789, 0x570aae8c),
+ TOBN(0x869118c0, 0x7f9871da), TOBN(0x35fbda78, 0xf635e278),
+ TOBN(0x738f3641, 0xe1541dac), TOBN(0x6794b13a, 0xc0dae45f),
+ TOBN(0x065064ac, 0x09cc0917), TOBN(0x27c53729, 0xc68540fd),
+ TOBN(0x0d2d4c8e, 0xef227671), TOBN(0xd23a9f80, 0xa1785a04),
+ TOBN(0x98c59528, 0x52650359), TOBN(0xfa09ad01, 0x74a1acad),
+ TOBN(0x082d5a29, 0x0b55bf5c), TOBN(0xa40f1c67, 0x419b8084),
+ TOBN(0x3a5c752e, 0xdcc18770), TOBN(0x4baf1f2f, 0x8825c3a5),
+ TOBN(0xebd63f74, 0x21b153ed), TOBN(0xa2383e47, 0xb2f64723),
+ TOBN(0xe7bf620a, 0x2646d19a), TOBN(0x56cb44ec, 0x03c83ffd),
+ TOBN(0xaf7267c9, 0x4f6be9f1), TOBN(0x8b2dfd7b, 0xc06bb5e9),
+ TOBN(0xb87072f2, 0xa672c5c7), TOBN(0xeacb11c8, 0x0d53c5e2),
+ TOBN(0x22dac29d, 0xff435932), TOBN(0x37bdb99d, 0x4408693c),
+ TOBN(0xf6e62fb6, 0x2899c20f), TOBN(0x3535d512, 0x447ece24),
+ TOBN(0xfbdc6b88, 0xff577ce3), TOBN(0x726693bd, 0x190575f2),
+ TOBN(0x6772b0e5, 0xab4b35a2), TOBN(0x1d8b6001, 0xf5eeaacf),
+ TOBN(0x728f7ce4, 0x795b9580), TOBN(0x4a20ed2a, 0x41fb81da),
+ TOBN(0x9f685cd4, 0x4fec01e6), TOBN(0x3ed7ddcc, 0xa7ff50ad),
+ TOBN(0x460fd264, 0x0c2d97fd), TOBN(0x3a241426, 0xeb82f4f9),
+ TOBN(0x17d1df2c, 0x6a8ea820), TOBN(0xb2b50d3b, 0xf22cc254),
+ TOBN(0x03856cba, 0xb7291426), TOBN(0x87fd26ae, 0x04f5ee39),
+ TOBN(0x9cb696cc, 0x02bee4ba), TOBN(0x53121804, 0x06820fd6),
+ TOBN(0xa5dfc269, 0x0212e985), TOBN(0x666f7ffa, 0x160f9a09),
+ TOBN(0xc503cd33, 0xbccd9617), TOBN(0x365dede4, 0xba7730a3),
+ TOBN(0x798c6355, 0x5ddb0786), TOBN(0xa6c3200e, 0xfc9cd3bc),
+ TOBN(0x060ffb2c, 0xe5e35efd), TOBN(0x99a4e25b, 0x5555a1c1),
+ TOBN(0x11d95375, 0xf70b3751), TOBN(0x0a57354a, 0x160e1bf6),
+ TOBN(0xecb3ae4b, 0xf8e4b065), TOBN(0x07a834c4, 0x2e53022b),
+ TOBN(0x1cd300b3, 0x8692ed96), TOBN(0x16a6f792, 0x61ee14ec),
+ TOBN(0x8f1063c6, 0x6a8649ed), TOBN(0xfbcdfcfe, 0x869f3e14),
+ TOBN(0x2cfb97c1, 0x00a7b3ec), TOBN(0xcea49b3c, 0x7130c2f1),
+ TOBN(0x462d044f, 0xe9d96488), TOBN(0x4b53d52e, 0x8182a0c1),
+ TOBN(0x84b6ddd3, 0x0391e9e9), TOBN(0x80ab7b48, 0xb1741a09),
+ TOBN(0xec0e15d4, 0x27d3317f), TOBN(0x8dfc1ddb, 0x1a64671e),
+ TOBN(0x93cc5d5f, 0xd49c5b92), TOBN(0xc995d53d, 0x3674a331),
+ TOBN(0x302e41ec, 0x090090ae), TOBN(0x2278a0cc, 0xedb06830),
+ TOBN(0x1d025932, 0xfbc99690), TOBN(0x0c32fbd2, 0xb80d68da),
+ TOBN(0xd79146da, 0xf341a6c1), TOBN(0xae0ba139, 0x1bef68a0),
+ TOBN(0xc6b8a563, 0x8d774b3a), TOBN(0x1cf307bd, 0x880ba4d7),
+ TOBN(0xc033bdc7, 0x19803511), TOBN(0xa9f97b3b, 0x8888c3be),
+ TOBN(0x3d68aebc, 0x85c6d05e), TOBN(0xc3b88a9d, 0x193919eb),
+ TOBN(0x2d300748, 0xc48b0ee3), TOBN(0x7506bc7c, 0x07a746c1),
+ TOBN(0xfc48437c, 0x6e6d57f3), TOBN(0x5bd71587, 0xcfeaa91a),
+ TOBN(0xa4ed0408, 0xc1bc5225), TOBN(0xd0b946db, 0x2719226d),
+ TOBN(0x109ecd62, 0x758d2d43), TOBN(0x75c8485a, 0x2751759b),
+ TOBN(0xb0b75f49, 0x9ce4177a), TOBN(0x4fa61a1e, 0x79c10c3d),
+ TOBN(0xc062d300, 0xa167fcd7), TOBN(0x4df3874c, 0x750f0fa8),
+ TOBN(0x29ae2cf9, 0x83dfedc9), TOBN(0xf8437134, 0x8d87631a),
+ TOBN(0xaf571711, 0x7429c8d2), TOBN(0x18d15867, 0x146d9272),
+ TOBN(0x83053ecf, 0x69769bb7), TOBN(0xc55eb856, 0xc479ab82),
+ TOBN(0x5ef7791c, 0x21b0f4b2), TOBN(0xaa5956ba, 0x3d491525),
+ TOBN(0x407a96c2, 0x9fe20eba), TOBN(0xf27168bb, 0xe52a5ad3),
+ TOBN(0x43b60ab3, 0xbf1d9d89), TOBN(0xe45c51ef, 0x710e727a),
+ TOBN(0xdfca5276, 0x099b4221), TOBN(0x8dc6407c, 0x2557a159),
+ TOBN(0x0ead8335, 0x91035895), TOBN(0x0a9db957, 0x9c55dc32),
+ TOBN(0xe40736d3, 0xdf61bc76), TOBN(0x13a619c0, 0x3f778cdb),
+ TOBN(0x6dd921a4, 0xc56ea28f), TOBN(0x76a52433, 0x2fa647b4),
+ TOBN(0x23591891, 0xac5bdc5d), TOBN(0xff4a1a72, 0xbac7dc01),
+ TOBN(0x9905e261, 0x62df8453), TOBN(0x3ac045df, 0xe63b265f),
+ TOBN(0x8a3f341b, 0xad53dba7), TOBN(0x8ec269cc, 0x837b625a),
+ TOBN(0xd71a2782, 0x3ae31189), TOBN(0x8fb4f9a3, 0x55e96120),
+ TOBN(0x804af823, 0xff9875cf), TOBN(0x23224f57, 0x5d442a9b),
+ TOBN(0x1c4d3b9e, 0xecc62679), TOBN(0x91da22fb, 0xa0e7ddb1),
+ TOBN(0xa370324d, 0x6c04a661), TOBN(0x9710d3b6, 0x5e376d17),
+ TOBN(0xed8c98f0, 0x3044e357), TOBN(0xc364ebbe, 0x6422701c),
+ TOBN(0x347f5d51, 0x7733d61c), TOBN(0xd55644b9, 0xcea826c3),
+ TOBN(0x80c6e0ad, 0x55a25548), TOBN(0x0aa7641d, 0x844220a7),
+ TOBN(0x1438ec81, 0x31810660), TOBN(0x9dfa6507, 0xde4b4043),
+ TOBN(0x10b515d8, 0xcc3e0273), TOBN(0x1b6066dd, 0x28d8cfb2),
+ TOBN(0xd3b04591, 0x9c9efebd), TOBN(0x425d4bdf, 0xa21c1ff4),
+ TOBN(0x5fe5af19, 0xd57607d3), TOBN(0xbbf773f7, 0x54481084),
+ TOBN(0x8435bd69, 0x94b03ed1), TOBN(0xd9ad1de3, 0x634cc546),
+ TOBN(0x2cf423fc, 0x00e420ca), TOBN(0xeed26d80, 0xa03096dd),
+ TOBN(0xd7f60be7, 0xa4db09d2), TOBN(0xf47f569d, 0x960622f7),
+ TOBN(0xe5925fd7, 0x7296c729), TOBN(0xeff2db26, 0x26ca2715),
+ TOBN(0xa6fcd014, 0xb913e759), TOBN(0x53da4786, 0x8ff4de93),
+ TOBN(0x14616d79, 0xc32068e1), TOBN(0xb187d664, 0xccdf352e),
+ TOBN(0xf7afb650, 0x1dc90b59), TOBN(0x8170e943, 0x7daa1b26),
+ TOBN(0xc8e3bdd8, 0x700c0a84), TOBN(0x6e8d345f, 0x6482bdfa),
+ TOBN(0x84cfbfa1, 0xc5c5ea50), TOBN(0xd3baf14c, 0x67960681),
+ TOBN(0x26398403, 0x0dd50942), TOBN(0xe4b7839c, 0x4716a663),
+ TOBN(0xd5f1f794, 0xe7de6dc0), TOBN(0x5cd0f4d4, 0x622aa7ce),
+ TOBN(0x5295f3f1, 0x59acfeec), TOBN(0x8d933552, 0x953e0607),
+ TOBN(0xc7db8ec5, 0x776c5722), TOBN(0xdc467e62, 0x2b5f290c),
+ TOBN(0xd4297e70, 0x4ff425a9), TOBN(0x4be924c1, 0x0cf7bb72),
+ TOBN(0x0d5dc5ae, 0xa1892131), TOBN(0x8bf8a8e3, 0xa705c992),
+ TOBN(0x73a0b064, 0x7a305ac5), TOBN(0x00c9ca4e, 0x9a8c77a8),
+ TOBN(0x5dfee80f, 0x83774bdd), TOBN(0x63131602, 0x85734485),
+ TOBN(0xa1b524ae, 0x914a69a9), TOBN(0xebc2ffaf, 0xd4e300d7),
+ TOBN(0x52c93db7, 0x7cfa46a5), TOBN(0x71e6161f, 0x21653b50),
+ TOBN(0x3574fc57, 0xa4bc580a), TOBN(0xc09015dd, 0xe1bc1253),
+ TOBN(0x4b7b47b2, 0xd174d7aa), TOBN(0x4072d8e8, 0xf3a15d04),
+ TOBN(0xeeb7d47f, 0xd6fa07ed), TOBN(0x6f2b9ff9, 0xedbdafb1),
+ TOBN(0x18c51615, 0x3760fe8a), TOBN(0x7a96e6bf, 0xf06c6c13),
+ TOBN(0x4d7a0410, 0x0ea2d071), TOBN(0xa1914e9b, 0x0be2a5ce),
+ TOBN(0x5726e357, 0xd8a3c5cf), TOBN(0x1197ecc3, 0x2abb2b13),
+ TOBN(0x6c0d7f7f, 0x31ae88dd), TOBN(0x15b20d1a, 0xfdbb3efe),
+ TOBN(0xcd06aa26, 0x70584039), TOBN(0x2277c969, 0xa7dc9747),
+ TOBN(0xbca69587, 0x7855d815), TOBN(0x899ea238, 0x5188b32a),
+ TOBN(0x37d9228b, 0x760c1c9d), TOBN(0xc7efbb11, 0x9b5c18da),
+ TOBN(0x7f0d1bc8, 0x19f6dbc5), TOBN(0x4875384b, 0x07e6905b),
+ TOBN(0xc7c50baa, 0x3ba8cd86), TOBN(0xb0ce40fb, 0xc2905de0),
+ TOBN(0x70840673, 0x7a231952), TOBN(0xa912a262, 0xcf43de26),
+ TOBN(0x9c38ddcc, 0xeb5b76c1), TOBN(0x746f5285, 0x26fc0ab4),
+ TOBN(0x52a63a50, 0xd62c269f), TOBN(0x60049c55, 0x99458621),
+ TOBN(0xe7f48f82, 0x3c2f7c9e), TOBN(0x6bd99043, 0x917d5cf3),
+ TOBN(0xeb1317a8, 0x8701f469), TOBN(0xbd3fe2ed, 0x9a449fe0),
+ TOBN(0x421e79ca, 0x12ef3d36), TOBN(0x9ee3c36c, 0x3e7ea5de),
+ TOBN(0xe48198b5, 0xcdff36f7), TOBN(0xaff4f967, 0xc6b82228),
+ TOBN(0x15e19dd0, 0xc47adb7e), TOBN(0x45699b23, 0x032e7dfa),
+ TOBN(0x40680c8b, 0x1fae026a), TOBN(0x5a347a48, 0x550dbf4d),
+ TOBN(0xe652533b, 0x3cef0d7d), TOBN(0xd94f7b18, 0x2bbb4381),
+ TOBN(0x838752be, 0x0e80f500), TOBN(0x8e6e2488, 0x9e9c9bfb),
+ TOBN(0xc9751697, 0x16caca6a), TOBN(0x866c49d8, 0x38531ad9),
+ TOBN(0xc917e239, 0x7151ade1), TOBN(0x2d016ec1, 0x6037c407),
+ TOBN(0xa407ccc9, 0x00eac3f9), TOBN(0x835f6280, 0xe2ed4748),
+ TOBN(0xcc54c347, 0x1cc98e0d), TOBN(0x0e969937, 0xdcb572eb),
+ TOBN(0x1b16c8e8, 0x8f30c9cb), TOBN(0xa606ae75, 0x373c4661),
+ TOBN(0x47aa689b, 0x35502cab), TOBN(0xf89014ae, 0x4d9bb64f),
+ TOBN(0x202f6a9c, 0x31c71f7b), TOBN(0x01f95aa3, 0x296ffe5c),
+ TOBN(0x5fc06014, 0x53cec3a3), TOBN(0xeb991237, 0x5f498a45),
+ TOBN(0xae9a935e, 0x5d91ba87), TOBN(0xc6ac6281, 0x0b564a19),
+ TOBN(0x8a8fe81c, 0x3bd44e69), TOBN(0x7c8b467f, 0x9dd11d45),
+ TOBN(0xf772251f, 0xea5b8e69), TOBN(0xaeecb3bd, 0xc5b75fbc),
+ TOBN(0x1aca3331, 0x887ff0e5), TOBN(0xbe5d49ff, 0x19f0a131),
+ TOBN(0x582c13aa, 0xe5c8646f), TOBN(0xdbaa12e8, 0x20e19980),
+ TOBN(0x8f40f31a, 0xf7abbd94), TOBN(0x1f13f5a8, 0x1dfc7663),
+ TOBN(0x5d81f1ee, 0xaceb4fc0), TOBN(0x36256002, 0x5e6f0f42),
+ TOBN(0x4b67d6d7, 0x751370c8), TOBN(0x2608b698, 0x03e80589),
+ TOBN(0xcfc0d2fc, 0x05268301), TOBN(0xa6943d39, 0x40309212),
+ TOBN(0x192a90c2, 0x1fd0e1c2), TOBN(0xb209f113, 0x37f1dc76),
+ TOBN(0xefcc5e06, 0x97bf1298), TOBN(0xcbdb6730, 0x219d639e),
+ TOBN(0xd009c116, 0xb81e8c6f), TOBN(0xa3ffdde3, 0x1a7ce2e5),
+ TOBN(0xc53fbaaa, 0xa914d3ba), TOBN(0x836d500f, 0x88df85ee),
+ TOBN(0xd98dc71b, 0x66ee0751), TOBN(0x5a3d7005, 0x714516fd),
+ TOBN(0x21d3634d, 0x39eedbba), TOBN(0x35cd2e68, 0x0455a46d),
+ TOBN(0xc8cafe65, 0xf9d7eb0c), TOBN(0xbda3ce9e, 0x00cefb3e),
+ TOBN(0xddc17a60, 0x2c9cf7a4), TOBN(0x01572ee4, 0x7bcb8773),
+ TOBN(0xa92b2b01, 0x8c7548df), TOBN(0x732fd309, 0xa84600e3),
+ TOBN(0xe22109c7, 0x16543a40), TOBN(0x9acafd36, 0xfede3c6c),
+ TOBN(0xfb206852, 0x6824e614), TOBN(0x2a4544a9, 0xda25dca0),
+ TOBN(0x25985262, 0x91d60b06), TOBN(0x281b7be9, 0x28753545),
+ TOBN(0xec667b1a, 0x90f13b27), TOBN(0x33a83aff, 0x940e2eb4),
+ TOBN(0x80009862, 0xd5d721d5), TOBN(0x0c3357a3, 0x5bd3a182),
+ TOBN(0x27f3a83b, 0x7aa2cda4), TOBN(0xb58ae74e, 0xf6f83085),
+ TOBN(0x2a911a81, 0x2e6dad6b), TOBN(0xde286051, 0xf43d6c5b),
+ TOBN(0x4bdccc41, 0xf996c4d8), TOBN(0xe7312ec0, 0x0ae1e24e)}
+ ,
+ {TOBN(0xf8d112e7, 0x6e6485b3), TOBN(0x4d3e24db, 0x771c52f8),
+ TOBN(0x48e3ee41, 0x684a2f6d), TOBN(0x7161957d, 0x21d95551),
+ TOBN(0x19631283, 0xcdb12a6c), TOBN(0xbf3fa882, 0x2e50e164),
+ TOBN(0xf6254b63, 0x3166cc73), TOBN(0x3aefa7ae, 0xaee8cc38),
+ TOBN(0x79b0fe62, 0x3b36f9fd), TOBN(0x26543b23, 0xfde19fc0),
+ TOBN(0x136e64a0, 0x958482ef), TOBN(0x23f63771, 0x9b095825),
+ TOBN(0x14cfd596, 0xb6a1142e), TOBN(0x5ea6aac6, 0x335aac0b),
+ TOBN(0x86a0e8bd, 0xf3081dd5), TOBN(0x5fb89d79, 0x003dc12a),
+ TOBN(0xf615c33a, 0xf72e34d4), TOBN(0x0bd9ea40, 0x110eec35),
+ TOBN(0x1c12bc5b, 0xc1dea34e), TOBN(0x686584c9, 0x49ae4699),
+ TOBN(0x13ad95d3, 0x8c97b942), TOBN(0x4609561a, 0x4e5c7562),
+ TOBN(0x9e94a4ae, 0xf2737f89), TOBN(0xf57594c6, 0x371c78b6),
+ TOBN(0x0f0165fc, 0xe3779ee3), TOBN(0xe00e7f9d, 0xbd495d9e),
+ TOBN(0x1fa4efa2, 0x20284e7a), TOBN(0x4564bade, 0x47ac6219),
+ TOBN(0x90e6312a, 0xc4708e8e), TOBN(0x4f5725fb, 0xa71e9adf),
+ TOBN(0xe95f55ae, 0x3d684b9f), TOBN(0x47f7ccb1, 0x1e94b415),
+ TOBN(0x7322851b, 0x8d946581), TOBN(0xf0d13133, 0xbdf4a012),
+ TOBN(0xa3510f69, 0x6584dae0), TOBN(0x03a7c171, 0x3c9f6c6d),
+ TOBN(0x5be97f38, 0xe475381a), TOBN(0xca1ba422, 0x85823334),
+ TOBN(0xf83cc5c7, 0x0be17dda), TOBN(0x158b1494, 0x0b918c0f),
+ TOBN(0xda3a77e5, 0x522e6b69), TOBN(0x69c908c3, 0xbbcd6c18),
+ TOBN(0x1f1b9e48, 0xd924fd56), TOBN(0x37c64e36, 0xaa4bb3f7),
+ TOBN(0x5a4fdbdf, 0xee478d7d), TOBN(0xba75c8bc, 0x0193f7a0),
+ TOBN(0x84bc1e84, 0x56cd16df), TOBN(0x1fb08f08, 0x46fad151),
+ TOBN(0x8a7cabf9, 0x842e9f30), TOBN(0xa331d4bf, 0x5eab83af),
+ TOBN(0xd272cfba, 0x017f2a6a), TOBN(0x27560abc, 0x83aba0e3),
+ TOBN(0x94b83387, 0x0e3a6b75), TOBN(0x25c6aea2, 0x6b9f50f5),
+ TOBN(0x803d691d, 0xb5fdf6d0), TOBN(0x03b77509, 0xe6333514),
+ TOBN(0x36178903, 0x61a341c1), TOBN(0x3604dc60, 0x0cfd6142),
+ TOBN(0x022295eb, 0x8533316c), TOBN(0x3dbde4ac, 0x44af2922),
+ TOBN(0x898afc5d, 0x1c7eef69), TOBN(0x58896805, 0xd14f4fa1),
+ TOBN(0x05002160, 0x203c21ca), TOBN(0x6f0d1f30, 0x40ef730b),
+ TOBN(0x8e8c44d4, 0x196224f8), TOBN(0x75a4ab95, 0x374d079d),
+ TOBN(0x79085ecc, 0x7d48f123), TOBN(0x56f04d31, 0x1bf65ad8),
+ TOBN(0xe220bf1c, 0xbda602b2), TOBN(0x73ee1742, 0xf9612c69),
+ TOBN(0x76008fc8, 0x084fd06b), TOBN(0x4000ef9f, 0xf11380d1),
+ TOBN(0x48201b4b, 0x12cfe297), TOBN(0x3eee129c, 0x292f74e5),
+ TOBN(0xe1fe114e, 0xc9e874e8), TOBN(0x899b055c, 0x92c5fc41),
+ TOBN(0x4e477a64, 0x3a39c8cf), TOBN(0x82f09efe, 0x78963cc9),
+ TOBN(0x6fd3fd8f, 0xd333f863), TOBN(0x85132b2a, 0xdc949c63),
+ TOBN(0x7e06a3ab, 0x516eb17b), TOBN(0x73bec06f, 0xd2c7372b),
+ TOBN(0xe4f74f55, 0xba896da6), TOBN(0xbb4afef8, 0x8e9eb40f),
+ TOBN(0x2d75bec8, 0xe61d66b0), TOBN(0x02bda4b4, 0xef29300b),
+ TOBN(0x8bbaa8de, 0x026baa5a), TOBN(0xff54befd, 0xa07f4440),
+ TOBN(0xbd9b8b1d, 0xbe7a2af3), TOBN(0xec51caa9, 0x4fb74a72),
+ TOBN(0xb9937a4b, 0x63879697), TOBN(0x7c9a9d20, 0xec2687d5),
+ TOBN(0x1773e44f, 0x6ef5f014), TOBN(0x8abcf412, 0xe90c6900),
+ TOBN(0x387bd022, 0x8142161e), TOBN(0x50393755, 0xfcb6ff2a),
+ TOBN(0x9813fd56, 0xed6def63), TOBN(0x53cf6482, 0x7d53106c),
+ TOBN(0x991a35bd, 0x431f7ac1), TOBN(0xf1e274dd, 0x63e65faf),
+ TOBN(0xf63ffa3c, 0x44cc7880), TOBN(0x411a426b, 0x7c256981),
+ TOBN(0xb698b9fd, 0x93a420e0), TOBN(0x89fdddc0, 0xae53f8fe),
+ TOBN(0x766e0722, 0x32398baa), TOBN(0x205fee42, 0x5cfca031),
+ TOBN(0xa49f5341, 0x7a029cf2), TOBN(0xa88c68b8, 0x4023890d),
+ TOBN(0xbc275041, 0x7337aaa8), TOBN(0x9ed364ad, 0x0eb384f4),
+ TOBN(0xe0816f85, 0x29aba92f), TOBN(0x2e9e1941, 0x04e38a88),
+ TOBN(0x57eef44a, 0x3dafd2d5), TOBN(0x35d1fae5, 0x97ed98d8),
+ TOBN(0x50628c09, 0x2307f9b1), TOBN(0x09d84aae, 0xd6cba5c6),
+ TOBN(0x67071bc7, 0x88aaa691), TOBN(0x2dea57a9, 0xafe6cb03),
+ TOBN(0xdfe11bb4, 0x3d78ac01), TOBN(0x7286418c, 0x7fd7aa51),
+ TOBN(0xfabf7709, 0x77f7195a), TOBN(0x8ec86167, 0xadeb838f),
+ TOBN(0xea1285a8, 0xbb4f012d), TOBN(0xd6883503, 0x9a3eab3f),
+ TOBN(0xee5d24f8, 0x309004c2), TOBN(0xa96e4b76, 0x13ffe95e),
+ TOBN(0x0cdffe12, 0xbd223ea4), TOBN(0x8f5c2ee5, 0xb6739a53),
+ TOBN(0x5cb4aaa5, 0xdd968198), TOBN(0xfa131c52, 0x72413a6c),
+ TOBN(0x53d46a90, 0x9536d903), TOBN(0xb270f0d3, 0x48606d8e),
+ TOBN(0x518c7564, 0xa053a3bc), TOBN(0x088254b7, 0x1a86caef),
+ TOBN(0xb3ba8cb4, 0x0ab5efd0), TOBN(0x5c59900e, 0x4605945d),
+ TOBN(0xecace1dd, 0xa1887395), TOBN(0x40960f36, 0x932a65de),
+ TOBN(0x9611ff5c, 0x3aa95529), TOBN(0xc58215b0, 0x7c1e5a36),
+ TOBN(0xd48c9b58, 0xf0e1a524), TOBN(0xb406856b, 0xf590dfb8),
+ TOBN(0xc7605e04, 0x9cd95662), TOBN(0x0dd036ee, 0xa33ecf82),
+ TOBN(0xa50171ac, 0xc33156b3), TOBN(0xf09d24ea, 0x4a80172e),
+ TOBN(0x4e1f72c6, 0x76dc8eef), TOBN(0xe60caadc, 0x5e3d44ee),
+ TOBN(0x006ef8a6, 0x979b1d8f), TOBN(0x60908a1c, 0x97788d26),
+ TOBN(0x6e08f95b, 0x266feec0), TOBN(0x618427c2, 0x22e8c94e),
+ TOBN(0x3d613339, 0x59145a65), TOBN(0xcd9bc368, 0xfa406337),
+ TOBN(0x82d11be3, 0x2d8a52a0), TOBN(0xf6877b27, 0x97a1c590),
+ TOBN(0x837a819b, 0xf5cbdb25), TOBN(0x2a4fd1d8, 0xde090249),
+ TOBN(0x622a7de7, 0x74990e5f), TOBN(0x840fa5a0, 0x7945511b),
+ TOBN(0x30b974be, 0x6558842d), TOBN(0x70df8c64, 0x17f3d0a6),
+ TOBN(0x7c803520, 0x7542e46d), TOBN(0x7251fe7f, 0xe4ecc823),
+ TOBN(0xe59134cb, 0x5e9aac9a), TOBN(0x11bb0934, 0xf0045d71),
+ TOBN(0x53e5d9b5, 0xdbcb1d4e), TOBN(0x8d97a905, 0x92defc91),
+ TOBN(0xfe289327, 0x7946d3f9), TOBN(0xe132bd24, 0x07472273),
+ TOBN(0xeeeb510c, 0x1eb6ae86), TOBN(0x777708c5, 0xf0595067),
+ TOBN(0x18e2c8cd, 0x1297029e), TOBN(0x2c61095c, 0xbbf9305e),
+ TOBN(0xe466c258, 0x6b85d6d9), TOBN(0x8ac06c36, 0xda1ea530),
+ TOBN(0xa365dc39, 0xa1304668), TOBN(0xe4a9c885, 0x07f89606),
+ TOBN(0x65a4898f, 0xacc7228d), TOBN(0x3e2347ff, 0x84ca8303),
+ TOBN(0xa5f6fb77, 0xea7d23a3), TOBN(0x2fac257d, 0x672a71cd),
+ TOBN(0x6908bef8, 0x7e6a44d3), TOBN(0x8ff87566, 0x891d3d7a),
+ TOBN(0xe58e90b3, 0x6b0cf82e), TOBN(0x6438d246, 0x2615b5e7),
+ TOBN(0x07b1f8fc, 0x669c145a), TOBN(0xb0d8b2da, 0x36f1e1cb),
+ TOBN(0x54d5dadb, 0xd9184c4d), TOBN(0x3dbb18d5, 0xf93d9976),
+ TOBN(0x0a3e0f56, 0xd1147d47), TOBN(0x2afa8c8d, 0xa0a48609),
+ TOBN(0x275353e8, 0xbc36742c), TOBN(0x898f427e, 0xeea0ed90),
+ TOBN(0x26f4947e, 0x3e477b00), TOBN(0x8ad8848a, 0x308741e3),
+ TOBN(0x6c703c38, 0xd74a2a46), TOBN(0x5e3e05a9, 0x9ba17ba2),
+ TOBN(0xc1fa6f66, 0x4ab9a9e4), TOBN(0x474a2d9a, 0x3841d6ec),
+ TOBN(0x871239ad, 0x653ae326), TOBN(0x14bcf72a, 0xa74cbb43),
+ TOBN(0x8737650e, 0x20d4c083), TOBN(0x3df86536, 0x110ed4af),
+ TOBN(0xd2d86fe7, 0xb53ca555), TOBN(0x688cb00d, 0xabd5d538),
+ TOBN(0xcf81bda3, 0x1ad38468), TOBN(0x7ccfe3cc, 0xf01167b6),
+ TOBN(0xcf4f47e0, 0x6c4c1fe6), TOBN(0x557e1f1a, 0x298bbb79),
+ TOBN(0xf93b974f, 0x30d45a14), TOBN(0x174a1d2d, 0x0baf97c4),
+ TOBN(0x7a003b30, 0xc51fbf53), TOBN(0xd8940991, 0xee68b225),
+ TOBN(0x5b0aa7b7, 0x1c0f4173), TOBN(0x975797c9, 0xa20a7153),
+ TOBN(0x26e08c07, 0xe3533d77), TOBN(0xd7222e6a, 0x2e341c99),
+ TOBN(0x9d60ec3d, 0x8d2dc4ed), TOBN(0xbdfe0d8f, 0x7c476cf8),
+ TOBN(0x1fe59ab6, 0x1d056605), TOBN(0xa9ea9df6, 0x86a8551f),
+ TOBN(0x8489941e, 0x47fb8d8c), TOBN(0xfeb874eb, 0x4a7f1b10),
+ TOBN(0xfe5fea86, 0x7ee0d98f), TOBN(0x201ad34b, 0xdbf61864),
+ TOBN(0x45d8fe47, 0x37c031d4), TOBN(0xd5f49fae, 0x795f0822),
+ TOBN(0xdb0fb291, 0xc7f4a40c), TOBN(0x2e69d9c1, 0x730ddd92),
+ TOBN(0x754e1054, 0x49d76987), TOBN(0x8a24911d, 0x7662db87),
+ TOBN(0x61fc1810, 0x60a71676), TOBN(0xe852d1a8, 0xf66a8ad1),
+ TOBN(0x172bbd65, 0x6417231e), TOBN(0x0d6de7bd, 0x3babb11f),
+ TOBN(0x6fde6f88, 0xc8e347f8), TOBN(0x1c587547, 0x9bd99cc3),
+ TOBN(0x78e54ed0, 0x34076950), TOBN(0x97f0f334, 0x796e83ba),
+ TOBN(0xe4dbe1ce, 0x4924867a), TOBN(0xbd5f51b0, 0x60b84917),
+ TOBN(0x37530040, 0x3cb09a79), TOBN(0xdb3fe0f8, 0xff1743d8),
+ TOBN(0xed7894d8, 0x556fa9db), TOBN(0xfa262169, 0x23412fbf),
+ TOBN(0x563be0db, 0xba7b9291), TOBN(0x6ca8b8c0, 0x0c9fb234),
+ TOBN(0xed406aa9, 0xbd763802), TOBN(0xc21486a0, 0x65303da1),
+ TOBN(0x61ae291e, 0xc7e62ec4), TOBN(0x622a0492, 0xdf99333e),
+ TOBN(0x7fd80c9d, 0xbb7a8ee0), TOBN(0xdc2ed3bc, 0x6c01aedb),
+ TOBN(0x35c35a12, 0x08be74ec), TOBN(0xd540cb1a, 0x469f671f),
+ TOBN(0xd16ced4e, 0xcf84f6c7), TOBN(0x8561fb9c, 0x2d090f43),
+ TOBN(0x7e693d79, 0x6f239db4), TOBN(0xa736f928, 0x77bd0d94),
+ TOBN(0x07b4d929, 0x2c1950ee), TOBN(0xda177543, 0x56dc11b3),
+ TOBN(0xa5dfbbaa, 0x7a6a878e), TOBN(0x1c70cb29, 0x4decb08a),
+ TOBN(0xfba28c8b, 0x6f0f7c50), TOBN(0xa8eba2b8, 0x854dcc6d),
+ TOBN(0x5ff8e89a, 0x36b78642), TOBN(0x070c1c8e, 0xf6873adf),
+ TOBN(0xbbd3c371, 0x6484d2e4), TOBN(0xfb78318f, 0x0d414129),
+ TOBN(0x2621a39c, 0x6ad93b0b), TOBN(0x979d74c2, 0xa9e917f7),
+ TOBN(0xfc195647, 0x61fb0428), TOBN(0x4d78954a, 0xbee624d4),
+ TOBN(0xb94896e0, 0xb8ae86fd), TOBN(0x6667ac0c, 0xc91c8b13),
+ TOBN(0x9f180512, 0x43bcf832), TOBN(0xfbadf8b7, 0xa0010137),
+ TOBN(0xc69b4089, 0xb3ba8aa7), TOBN(0xfac4bacd, 0xe687ce85),
+ TOBN(0x9164088d, 0x977eab40), TOBN(0x51f4c5b6, 0x2760b390),
+ TOBN(0xd238238f, 0x340dd553), TOBN(0x358566c3, 0xdb1d31c9),
+ TOBN(0x3a5ad69e, 0x5068f5ff), TOBN(0xf31435fc, 0xdaff6b06),
+ TOBN(0xae549a5b, 0xd6debff0), TOBN(0x59e5f0b7, 0x75e01331),
+ TOBN(0x5d492fb8, 0x98559acf), TOBN(0x96018c2e, 0x4db79b50),
+ TOBN(0x55f4a48f, 0x609f66aa), TOBN(0x1943b3af, 0x4900a14f),
+ TOBN(0xc22496df, 0x15a40d39), TOBN(0xb2a44684, 0x4c20f7c5),
+ TOBN(0x76a35afa, 0x3b98404c), TOBN(0xbec75725, 0xff5d1b77),
+ TOBN(0xb67aa163, 0xbea06444), TOBN(0x27e95bb2, 0xf724b6f2),
+ TOBN(0x3c20e3e9, 0xd238c8ab), TOBN(0x1213754e, 0xddd6ae17),
+ TOBN(0x8c431020, 0x716e0f74), TOBN(0x6679c82e, 0xffc095c2),
+ TOBN(0x2eb3adf4, 0xd0ac2932), TOBN(0x2cc970d3, 0x01bb7a76),
+ TOBN(0x70c71f2f, 0x740f0e66), TOBN(0x545c616b, 0x2b6b23cc),
+ TOBN(0x4528cfcb, 0xb40a8bd7), TOBN(0xff839633, 0x2ab27722),
+ TOBN(0x049127d9, 0x025ac99a), TOBN(0xd314d4a0, 0x2b63e33b),
+ TOBN(0xc8c310e7, 0x28d84519), TOBN(0x0fcb8983, 0xb3bc84ba),
+ TOBN(0x2cc52261, 0x38634818), TOBN(0x501814f4, 0xb44c2e0b),
+ TOBN(0xf7e181aa, 0x54dfdba3), TOBN(0xcfd58ff0, 0xe759718c),
+ TOBN(0xf90cdb14, 0xd3b507a8), TOBN(0x57bd478e, 0xc50bdad8),
+ TOBN(0x29c197e2, 0x50e5f9aa), TOBN(0x4db6eef8, 0xe40bc855),
+ TOBN(0x2cc8f21a, 0xd1fc0654), TOBN(0xc71cc963, 0x81269d73),
+ TOBN(0xecfbb204, 0x077f49f9), TOBN(0xdde92571, 0xca56b793),
+ TOBN(0x9abed6a3, 0xf97ad8f7), TOBN(0xe6c19d3f, 0x924de3bd),
+ TOBN(0x8dce92f4, 0xa140a800), TOBN(0x85f44d1e, 0x1337af07),
+ TOBN(0x5953c08b, 0x09d64c52), TOBN(0xa1b5e49f, 0xf5df9749),
+ TOBN(0x336a8fb8, 0x52735f7d), TOBN(0xb332b6db, 0x9add676b),
+ TOBN(0x558b88a0, 0xb4511aa4), TOBN(0x09788752, 0xdbd5cc55),
+ TOBN(0x16b43b9c, 0xd8cd52bd), TOBN(0x7f0bc5a0, 0xc2a2696b),
+ TOBN(0x146e12d4, 0xc11f61ef), TOBN(0x9ce10754, 0x3a83e79e),
+ TOBN(0x08ec73d9, 0x6cbfca15), TOBN(0x09ff29ad, 0x5b49653f),
+ TOBN(0xe31b72bd, 0xe7da946e), TOBN(0xebf9eb3b, 0xee80a4f2),
+ TOBN(0xd1aabd08, 0x17598ce4), TOBN(0x18b5fef4, 0x53f37e80),
+ TOBN(0xd5d5cdd3, 0x5958cd79), TOBN(0x3580a1b5, 0x1d373114),
+ TOBN(0xa36e4c91, 0xfa935726), TOBN(0xa38c534d, 0xef20d760),
+ TOBN(0x7088e40a, 0x2ff5845b), TOBN(0xe5bb40bd, 0xbd78177f),
+ TOBN(0x4f06a7a8, 0x857f9920), TOBN(0xe3cc3e50, 0xe968f05d),
+ TOBN(0x1d68b7fe, 0xe5682d26), TOBN(0x5206f76f, 0xaec7f87c),
+ TOBN(0x41110530, 0x041951ab), TOBN(0x58ec52c1, 0xd4b5a71a),
+ TOBN(0xf3488f99, 0x0f75cf9a), TOBN(0xf411951f, 0xba82d0d5),
+ TOBN(0x27ee75be, 0x618895ab), TOBN(0xeae060d4, 0x6d8aab14),
+ TOBN(0x9ae1df73, 0x7fb54dc2), TOBN(0x1f3e391b, 0x25963649),
+ TOBN(0x242ec32a, 0xfe055081), TOBN(0x5bd450ef, 0x8491c9bd),
+ TOBN(0x367efc67, 0x981eb389), TOBN(0xed7e1928, 0x3a0550d5),
+ TOBN(0x362e776b, 0xab3ce75c), TOBN(0xe890e308, 0x1f24c523),
+ TOBN(0xb961b682, 0xfeccef76), TOBN(0x8b8e11f5, 0x8bba6d92),
+ TOBN(0x8f2ccc4c, 0x2b2375c4), TOBN(0x0d7f7a52, 0xe2f86cfa),
+ TOBN(0xfd94d30a, 0x9efe5633), TOBN(0x2d8d246b, 0x5451f934),
+ TOBN(0x2234c6e3, 0x244e6a00), TOBN(0xde2b5b0d, 0xddec8c50),
+ TOBN(0x2ce53c5a, 0xbf776f5b), TOBN(0x6f724071, 0x60357b05),
+ TOBN(0xb2593717, 0x71bf3f7a), TOBN(0x87d2501c, 0x440c4a9f),
+ TOBN(0x440552e1, 0x87b05340), TOBN(0xb7bf7cc8, 0x21624c32),
+ TOBN(0x4155a6ce, 0x22facddb), TOBN(0x5a4228cb, 0x889837ef),
+ TOBN(0xef87d6d6, 0xfd4fd671), TOBN(0xa233687e, 0xc2daa10e),
+ TOBN(0x75622244, 0x03c0eb96), TOBN(0x7632d184, 0x8bf19be6),
+ TOBN(0x05d0f8e9, 0x40735ff4), TOBN(0x3a3e6e13, 0xc00931f1),
+ TOBN(0x31ccde6a, 0xdafe3f18), TOBN(0xf381366a, 0xcfe51207),
+ TOBN(0x24c222a9, 0x60167d92), TOBN(0x62f9d6f8, 0x7529f18c),
+ TOBN(0x412397c0, 0x0353b114), TOBN(0x334d89dc, 0xef808043),
+ TOBN(0xd9ec63ba, 0x2a4383ce), TOBN(0xcec8e937, 0x5cf92ba0),
+ TOBN(0xfb8b4288, 0xc8be74c0), TOBN(0x67d6912f, 0x105d4391),
+ TOBN(0x7b996c46, 0x1b913149), TOBN(0x36aae2ef, 0x3a4e02da),
+ TOBN(0xb68aa003, 0x972de594), TOBN(0x284ec70d, 0x4ec6d545),
+ TOBN(0xf3d2b2d0, 0x61391d54), TOBN(0x69c5d5d6, 0xfe114e92),
+ TOBN(0xbe0f00b5, 0xb4482dff), TOBN(0xe1596fa5, 0xf5bf33c5),
+ TOBN(0x10595b56, 0x96a71cba), TOBN(0x944938b2, 0xfdcadeb7),
+ TOBN(0xa282da4c, 0xfccd8471), TOBN(0x98ec05f3, 0x0d37bfe1),
+ TOBN(0xe171ce1b, 0x0698304a), TOBN(0x2d691444, 0x21bdf79b),
+ TOBN(0xd0cd3b74, 0x1b21dec1), TOBN(0x712ecd8b, 0x16a15f71),
+ TOBN(0x8d4c00a7, 0x00fd56e1), TOBN(0x02ec9692, 0xf9527c18),
+ TOBN(0x21c44937, 0x4a3e42e1), TOBN(0x9176fbab, 0x1392ae0a),
+ TOBN(0x8726f1ba, 0x44b7b618), TOBN(0xb4d7aae9, 0xf1de491c),
+ TOBN(0xf91df7b9, 0x07b582c0), TOBN(0x7e116c30, 0xef60aa3a),
+ TOBN(0x99270f81, 0x466265d7), TOBN(0xb15b6fe2, 0x4df7adf0),
+ TOBN(0xfe33b2d3, 0xf9738f7f), TOBN(0x48553ab9, 0xd6d70f95),
+ TOBN(0x2cc72ac8, 0xc21e94db), TOBN(0x795ac38d, 0xbdc0bbee),
+ TOBN(0x0a1be449, 0x2e40478f), TOBN(0x81bd3394, 0x052bde55),
+ TOBN(0x63c8dbe9, 0x56b3c4f2), TOBN(0x017a99cf, 0x904177cc),
+ TOBN(0x947bbddb, 0x4d010fc1), TOBN(0xacf9b00b, 0xbb2c9b21),
+ TOBN(0x2970bc8d, 0x47173611), TOBN(0x1a4cbe08, 0xac7d756f),
+ TOBN(0x06d9f4aa, 0x67d541a2), TOBN(0xa3e8b689, 0x59c2cf44),
+ TOBN(0xaad066da, 0x4d88f1dd), TOBN(0xc604f165, 0x7ad35dea),
+ TOBN(0x7edc0720, 0x4478ca67), TOBN(0xa10dfae0, 0xba02ce06),
+ TOBN(0xeceb1c76, 0xaf36f4e4), TOBN(0x994b2292, 0xaf3f8f48),
+ TOBN(0xbf9ed77b, 0x77c8a68c), TOBN(0x74f544ea, 0x51744c9d),
+ TOBN(0x82d05bb9, 0x8113a757), TOBN(0x4ef2d2b4, 0x8a9885e4),
+ TOBN(0x1e332be5, 0x1aa7865f), TOBN(0x22b76b18, 0x290d1a52),
+ TOBN(0x308a2310, 0x44351683), TOBN(0x9d861896, 0xa3f22840),
+ TOBN(0x5959ddcd, 0x841ed947), TOBN(0x0def0c94, 0x154b73bf),
+ TOBN(0xf0105417, 0x4c7c15e0), TOBN(0x539bfb02, 0x3a277c32),
+ TOBN(0xe699268e, 0xf9dccf5f), TOBN(0x9f5796a5, 0x0247a3bd),
+ TOBN(0x8b839de8, 0x4f157269), TOBN(0xc825c1e5, 0x7a30196b),
+ TOBN(0x6ef0aabc, 0xdc8a5a91), TOBN(0xf4a8ce6c, 0x498b7fe6),
+ TOBN(0x1cce35a7, 0x70cbac78), TOBN(0x83488e9b, 0xf6b23958),
+ TOBN(0x0341a070, 0xd76cb011), TOBN(0xda6c9d06, 0xae1b2658),
+ TOBN(0xb701fb30, 0xdd648c52), TOBN(0x994ca02c, 0x52fb9fd1),
+ TOBN(0x06933117, 0x6f563086), TOBN(0x3d2b8100, 0x17856bab),
+ TOBN(0xe89f48c8, 0x5963a46e), TOBN(0x658ab875, 0xa99e61c7),
+ TOBN(0x6e296f87, 0x4b8517b4), TOBN(0x36c4fcdc, 0xfc1bc656),
+ TOBN(0xde5227a1, 0xa3906def), TOBN(0x9fe95f57, 0x62418945),
+ TOBN(0x20c91e81, 0xfdd96cde), TOBN(0x5adbe47e, 0xda4480de),
+ TOBN(0xa009370f, 0x396de2b6), TOBN(0x98583d4b, 0xf0ecc7bd),
+ TOBN(0xf44f6b57, 0xe51d0672), TOBN(0x03d6b078, 0x556b1984),
+ TOBN(0x27dbdd93, 0xb0b64912), TOBN(0x9b3a3434, 0x15687b09),
+ TOBN(0x0dba6461, 0x51ec20a9), TOBN(0xec93db7f, 0xff28187c),
+ TOBN(0x00ff8c24, 0x66e48bdd), TOBN(0x2514f2f9, 0x11ccd78e),
+ TOBN(0xeba11f4f, 0xe1250603), TOBN(0x8a22cd41, 0x243fa156),
+ TOBN(0xa4e58df4, 0xb283e4c6), TOBN(0x78c29859, 0x8b39783f),
+ TOBN(0x5235aee2, 0xa5259809), TOBN(0xc16284b5, 0x0e0227dd),
+ TOBN(0xa5f57916, 0x1338830d), TOBN(0x6d4b8a6b, 0xd2123fca),
+ TOBN(0x236ea68a, 0xf9c546f8), TOBN(0xc1d36873, 0xfa608d36),
+ TOBN(0xcd76e495, 0x8d436d13), TOBN(0xd4d9c221, 0x8fb080af),
+ TOBN(0x665c1728, 0xe8ad3fb5), TOBN(0xcf1ebe4d, 0xb3d572e0),
+ TOBN(0xa7a8746a, 0x584c5e20), TOBN(0x267e4ea1, 0xb9dc7035),
+ TOBN(0x593a15cf, 0xb9548c9b), TOBN(0x5e6e2135, 0x4bd012f3),
+ TOBN(0xdf31cc6a, 0x8c8f936e), TOBN(0x8af84d04, 0xb5c241dc),
+ TOBN(0x63990a6f, 0x345efb86), TOBN(0x6fef4e61, 0xb9b962cb)}
+ ,
+ {TOBN(0xf6368f09, 0x25722608), TOBN(0x131260db, 0x131cf5c6),
+ TOBN(0x40eb353b, 0xfab4f7ac), TOBN(0x85c78880, 0x37eee829),
+ TOBN(0x4c1581ff, 0xc3bdf24e), TOBN(0x5bff75cb, 0xf5c3c5a8),
+ TOBN(0x35e8c83f, 0xa14e6f40), TOBN(0xb81d1c0f, 0x0295e0ca),
+ TOBN(0xfcde7cc8, 0xf43a730f), TOBN(0xe89b6f3c, 0x33ab590e),
+ TOBN(0xc823f529, 0xad03240b), TOBN(0x82b79afe, 0x98bea5db),
+ TOBN(0x568f2856, 0x962fe5de), TOBN(0x0c590adb, 0x60c591f3),
+ TOBN(0x1fc74a14, 0x4a28a858), TOBN(0x3b662498, 0xb3203f4c),
+ TOBN(0x91e3cf0d, 0x6c39765a), TOBN(0xa2db3acd, 0xac3cca0b),
+ TOBN(0x288f2f08, 0xcb953b50), TOBN(0x2414582c, 0xcf43cf1a),
+ TOBN(0x8dec8bbc, 0x60eee9a8), TOBN(0x54c79f02, 0x729aa042),
+ TOBN(0xd81cd5ec, 0x6532f5d5), TOBN(0xa672303a, 0xcf82e15f),
+ TOBN(0x376aafa8, 0x719c0563), TOBN(0xcd8ad2dc, 0xbc5fc79f),
+ TOBN(0x303fdb9f, 0xcb750cd3), TOBN(0x14ff052f, 0x4418b08e),
+ TOBN(0xf75084cf, 0x3e2d6520), TOBN(0x7ebdf0f8, 0x144ed509),
+ TOBN(0xf43bf0f2, 0xd3f25b98), TOBN(0x86ad71cf, 0xa354d837),
+ TOBN(0xb827fe92, 0x26f43572), TOBN(0xdfd3ab5b, 0x5d824758),
+ TOBN(0x315dd23a, 0x539094c1), TOBN(0x85c0e37a, 0x66623d68),
+ TOBN(0x575c7972, 0x7be19ae0), TOBN(0x616a3396, 0xdf0d36b5),
+ TOBN(0xa1ebb3c8, 0x26b1ff7e), TOBN(0x635b9485, 0x140ad453),
+ TOBN(0x92bf3cda, 0xda430c0b), TOBN(0x4702850e, 0x3a96dac6),
+ TOBN(0xc91cf0a5, 0x15ac326a), TOBN(0x95de4f49, 0xab8c25e4),
+ TOBN(0xb01bad09, 0xe265c17c), TOBN(0x24e45464, 0x087b3881),
+ TOBN(0xd43e583c, 0xe1fac5ca), TOBN(0xe17cb318, 0x6ead97a6),
+ TOBN(0x6cc39243, 0x74dcec46), TOBN(0x33cfc02d, 0x54c2b73f),
+ TOBN(0x82917844, 0xf26cd99c), TOBN(0x8819dd95, 0xd1773f89),
+ TOBN(0x09572aa6, 0x0871f427), TOBN(0x8e0cf365, 0xf6f01c34),
+ TOBN(0x7fa52988, 0xbff1f5af), TOBN(0x4eb357ea, 0xe75e8e50),
+ TOBN(0xd9d0c8c4, 0x868af75d), TOBN(0xd7325cff, 0x45c8c7ea),
+ TOBN(0xab471996, 0xcc81ecb0), TOBN(0xff5d55f3, 0x611824ed),
+ TOBN(0xbe314541, 0x1977a0ee), TOBN(0x5085c4c5, 0x722038c6),
+ TOBN(0x2d5335bf, 0xf94bb495), TOBN(0x894ad8a6, 0xc8e2a082),
+ TOBN(0x5c3e2341, 0xada35438), TOBN(0xf4a9fc89, 0x049b8c4e),
+ TOBN(0xbeeb355a, 0x9f17cf34), TOBN(0x3f311e0e, 0x6c91fe10),
+ TOBN(0xc2d20038, 0x92ab9891), TOBN(0x257bdcc1, 0x3e8ce9a9),
+ TOBN(0x1b2d9789, 0x88c53bee), TOBN(0x927ce89a, 0xcdba143a),
+ TOBN(0xb0a32cca, 0x523db280), TOBN(0x5c889f8a, 0x50d43783),
+ TOBN(0x503e04b3, 0x4897d16f), TOBN(0x8cdb6e78, 0x08f5f2e8),
+ TOBN(0x6ab91cf0, 0x179c8e74), TOBN(0xd8874e52, 0x48211d60),
+ TOBN(0xf948d4d5, 0xea851200), TOBN(0x4076d41e, 0xe6f9840a),
+ TOBN(0xc20e263c, 0x47b517ea), TOBN(0x79a448fd, 0x30685e5e),
+ TOBN(0xe55f6f78, 0xf90631a0), TOBN(0x88a790b1, 0xa79e6346),
+ TOBN(0x62160c7d, 0x80969fe8), TOBN(0x54f92fd4, 0x41491bb9),
+ TOBN(0xa6645c23, 0x5c957526), TOBN(0xf44cc5ae, 0xbea3ce7b),
+ TOBN(0xf7628327, 0x8b1e68b7), TOBN(0xc731ad7a, 0x303f29d3),
+ TOBN(0xfe5a9ca9, 0x57d03ecb), TOBN(0x96c0d50c, 0x41bc97a7),
+ TOBN(0xc4669fe7, 0x9b4f7f24), TOBN(0xfdd781d8, 0x3d9967ef),
+ TOBN(0x7892c7c3, 0x5d2c208d), TOBN(0x8bf64f7c, 0xae545cb3),
+ TOBN(0xc01f862c, 0x467be912), TOBN(0xf4c85ee9, 0xc73d30cc),
+ TOBN(0x1fa6f4be, 0x6ab83ec7), TOBN(0xa07a3c1c, 0x4e3e3cf9),
+ TOBN(0x87f8ef45, 0x0c00beb3), TOBN(0x30e2c2b3, 0x000d4c3e),
+ TOBN(0x1aa00b94, 0xfe08bf5b), TOBN(0x32c133aa, 0x9224ef52),
+ TOBN(0x38df16bb, 0x32e5685d), TOBN(0x68a9e069, 0x58e6f544),
+ TOBN(0x495aaff7, 0xcdc5ebc6), TOBN(0xf894a645, 0x378b135f),
+ TOBN(0xf316350a, 0x09e27ecf), TOBN(0xeced201e, 0x58f7179d),
+ TOBN(0x2eec273c, 0xe97861ba), TOBN(0x47ec2cae, 0xd693be2e),
+ TOBN(0xfa4c97c4, 0xf68367ce), TOBN(0xe4f47d0b, 0xbe5a5755),
+ TOBN(0x17de815d, 0xb298a979), TOBN(0xd7eca659, 0xc177dc7d),
+ TOBN(0x20fdbb71, 0x49ded0a3), TOBN(0x4cb2aad4, 0xfb34d3c5),
+ TOBN(0x2cf31d28, 0x60858a33), TOBN(0x3b6873ef, 0xa24aa40f),
+ TOBN(0x540234b2, 0x2c11bb37), TOBN(0x2d0366dd, 0xed4c74a3),
+ TOBN(0xf9a968da, 0xeec5f25d), TOBN(0x36601068, 0x67b63142),
+ TOBN(0x07cd6d2c, 0x68d7b6d4), TOBN(0xa8f74f09, 0x0c842942),
+ TOBN(0xe2751404, 0x7768b1ee), TOBN(0x4b5f7e89, 0xfe62aee4),
+ TOBN(0xc6a77177, 0x89070d26), TOBN(0xa1f28e4e, 0xdd1c8bc7),
+ TOBN(0xea5f4f06, 0x469e1f17), TOBN(0x78fc242a, 0xfbdb78e0),
+ TOBN(0xc9c7c592, 0x8b0588f1), TOBN(0xb6b7a0fd, 0x1535921e),
+ TOBN(0xcc5bdb91, 0xbde5ae35), TOBN(0xb42c485e, 0x12ff1864),
+ TOBN(0xa1113e13, 0xdbab98aa), TOBN(0xde9d469b, 0xa17b1024),
+ TOBN(0x23f48b37, 0xc0462d3a), TOBN(0x3752e537, 0x7c5c078d),
+ TOBN(0xe3a86add, 0x15544eb9), TOBN(0xf013aea7, 0x80fba279),
+ TOBN(0x8b5bb76c, 0xf22001b5), TOBN(0xe617ba14, 0xf02891ab),
+ TOBN(0xd39182a6, 0x936219d3), TOBN(0x5ce1f194, 0xae51cb19),
+ TOBN(0xc78f8598, 0xbf07a74c), TOBN(0x6d7158f2, 0x22cbf1bc),
+ TOBN(0x3b846b21, 0xe300ce18), TOBN(0x35fba630, 0x2d11275d),
+ TOBN(0x5fe25c36, 0xa0239b9b), TOBN(0xd8beb35d, 0xdf05d940),
+ TOBN(0x4db02bb0, 0x1f7e320d), TOBN(0x0641c364, 0x6da320ea),
+ TOBN(0x6d95fa5d, 0x821389a3), TOBN(0x92699748, 0x8fcd8e3d),
+ TOBN(0x316fef17, 0xceb6c143), TOBN(0x67fcb841, 0xd933762b),
+ TOBN(0xbb837e35, 0x118b17f8), TOBN(0x4b92552f, 0x9fd24821),
+ TOBN(0xae6bc70e, 0x46aca793), TOBN(0x1cf0b0e4, 0xe579311b),
+ TOBN(0x8dc631be, 0x5802f716), TOBN(0x099bdc6f, 0xbddbee4d),
+ TOBN(0xcc352bb2, 0x0caf8b05), TOBN(0xf74d505a, 0x72d63df2),
+ TOBN(0xb9876d4b, 0x91c4f408), TOBN(0x1ce18473, 0x9e229b2d),
+ TOBN(0x49507597, 0x83abdb4a), TOBN(0x850fbcb6, 0xdee84b18),
+ TOBN(0x6325236e, 0x609e67dc), TOBN(0x04d831d9, 0x9336c6d8),
+ TOBN(0x8deaae3b, 0xfa12d45d), TOBN(0xe425f8ce, 0x4746e246),
+ TOBN(0x8004c175, 0x24f5f31e), TOBN(0xaca16d8f, 0xad62c3b7),
+ TOBN(0x0dc15a6a, 0x9152f934), TOBN(0xf1235e5d, 0xed0e12c1),
+ TOBN(0xc33c06ec, 0xda477dac), TOBN(0x76be8732, 0xb2ea0006),
+ TOBN(0xcf3f7831, 0x0c0cd313), TOBN(0x3c524553, 0xa614260d),
+ TOBN(0x31a756f8, 0xcab22d15), TOBN(0x03ee10d1, 0x77827a20),
+ TOBN(0xd1e059b2, 0x1994ef20), TOBN(0x2a653b69, 0x638ae318),
+ TOBN(0x70d5eb58, 0x2f699010), TOBN(0x279739f7, 0x09f5f84a),
+ TOBN(0x5da4663c, 0x8b799336), TOBN(0xfdfdf14d, 0x203c37eb),
+ TOBN(0x32d8a9dc, 0xa1dbfb2d), TOBN(0xab40cff0, 0x77d48f9b),
+ TOBN(0xc018b383, 0xd20b42d5), TOBN(0xf9a810ef, 0x9f78845f),
+ TOBN(0x40af3753, 0xbdba9df0), TOBN(0xb90bdcfc, 0x131dfdf9),
+ TOBN(0x18720591, 0xf01ab782), TOBN(0xc823f211, 0x6af12a88),
+ TOBN(0xa51b80f3, 0x0dc14401), TOBN(0xde248f77, 0xfb2dfbe3),
+ TOBN(0xef5a44e5, 0x0cafe751), TOBN(0x73997c9c, 0xd4dcd221),
+ TOBN(0x32fd86d1, 0xde854024), TOBN(0xd5b53adc, 0xa09b84bb),
+ TOBN(0x008d7a11, 0xdcedd8d1), TOBN(0x406bd1c8, 0x74b32c84),
+ TOBN(0x5d4472ff, 0x05dde8b1), TOBN(0x2e25f2cd, 0xfce2b32f),
+ TOBN(0xbec0dd5e, 0x29dfc254), TOBN(0x4455fcf6, 0x2b98b267),
+ TOBN(0x0b4d43a5, 0xc72df2ad), TOBN(0xea70e6be, 0x48a75397),
+ TOBN(0x2aad6169, 0x5820f3bf), TOBN(0xf410d2dd, 0x9e37f68f),
+ TOBN(0x70fb7dba, 0x7be5ac83), TOBN(0x636bb645, 0x36ec3eec),
+ TOBN(0x27104ea3, 0x9754e21c), TOBN(0xbc87a3e6, 0x8d63c373),
+ TOBN(0x483351d7, 0x4109db9a), TOBN(0x0fa724e3, 0x60134da7),
+ TOBN(0x9ff44c29, 0xb0720b16), TOBN(0x2dd0cf13, 0x06aceead),
+ TOBN(0x5942758c, 0xe26929a6), TOBN(0x96c5db92, 0xb766a92b),
+ TOBN(0xcec7d4c0, 0x5f18395e), TOBN(0xd3f22744, 0x1f80d032),
+ TOBN(0x7a68b37a, 0xcb86075b), TOBN(0x074764dd, 0xafef92db),
+ TOBN(0xded1e950, 0x7bc7f389), TOBN(0xc580c850, 0xb9756460),
+ TOBN(0xaeeec2a4, 0x7da48157), TOBN(0x3f0b4e7f, 0x82c587b3),
+ TOBN(0x231c6de8, 0xa9f19c53), TOBN(0x5717bd73, 0x6974e34e),
+ TOBN(0xd9e1d216, 0xf1508fa9), TOBN(0x9f112361, 0xdadaa124),
+ TOBN(0x80145e31, 0x823b7348), TOBN(0x4dd8f0d5, 0xac634069),
+ TOBN(0xe3d82fc7, 0x2297c258), TOBN(0x276fcfee, 0x9cee7431),
+ TOBN(0x8eb61b5e, 0x2bc0aea9), TOBN(0x4f668fd5, 0xde329431),
+ TOBN(0x03a32ab1, 0x38e4b87e), TOBN(0xe1374517, 0x73d0ef0b),
+ TOBN(0x1a46f7e6, 0x853ac983), TOBN(0xc3bdf42e, 0x68e78a57),
+ TOBN(0xacf20785, 0x2ea96dd1), TOBN(0xa10649b9, 0xf1638460),
+ TOBN(0xf2369f0b, 0x879fbbed), TOBN(0x0ff0ae86, 0xda9d1869),
+ TOBN(0x5251d759, 0x56766f45), TOBN(0x4984d8c0, 0x2be8d0fc),
+ TOBN(0x7ecc95a6, 0xd21008f0), TOBN(0x29bd54a0, 0x3a1a1c49),
+ TOBN(0xab9828c5, 0xd26c50f3), TOBN(0x32c0087c, 0x51d0d251),
+ TOBN(0x9bac3ce6, 0x0c1cdb26), TOBN(0xcd94d947, 0x557ca205),
+ TOBN(0x1b1bd598, 0x9db1fdcd), TOBN(0x0eda0108, 0xa3d8b149),
+ TOBN(0x95066610, 0x56152fcc), TOBN(0xc2f037e6, 0xe7192b33),
+ TOBN(0xdeffb41a, 0xc92e05a4), TOBN(0x1105f6c2, 0xc2f6c62e),
+ TOBN(0x68e73500, 0x8733913c), TOBN(0xcce86163, 0x3f3adc40),
+ TOBN(0xf407a942, 0x38a278e9), TOBN(0xd13c1b9d, 0x2ab21292),
+ TOBN(0x93ed7ec7, 0x1c74cf5c), TOBN(0x8887dc48, 0xf1a4c1b4),
+ TOBN(0x3830ff30, 0x4b3a11f1), TOBN(0x358c5a3c, 0x58937cb6),
+ TOBN(0x027dc404, 0x89022829), TOBN(0x40e93977, 0x3b798f79),
+ TOBN(0x90ad3337, 0x38be6ead), TOBN(0x9c23f6bc, 0xf34c0a5d),
+ TOBN(0xd1711a35, 0xfbffd8bb), TOBN(0x60fcfb49, 0x1949d3dd),
+ TOBN(0x09c8ef4b, 0x7825d93a), TOBN(0x24233cff, 0xa0a8c968),
+ TOBN(0x67ade46c, 0xe6d982af), TOBN(0xebb6bf3e, 0xe7544d7c),
+ TOBN(0xd6b9ba76, 0x3d8bd087), TOBN(0x46fe382d, 0x4dc61280),
+ TOBN(0xbd39a7e8, 0xb5bdbd75), TOBN(0xab381331, 0xb8f228fe),
+ TOBN(0x0709a77c, 0xce1c4300), TOBN(0x6a247e56, 0xf337ceac),
+ TOBN(0x8f34f21b, 0x636288be), TOBN(0x9dfdca74, 0xc8a7c305),
+ TOBN(0x6decfd1b, 0xea919e04), TOBN(0xcdf2688d, 0x8e1991f8),
+ TOBN(0xe607df44, 0xd0f8a67e), TOBN(0xd985df4b, 0x0b58d010),
+ TOBN(0x57f834c5, 0x0c24f8f4), TOBN(0xe976ef56, 0xa0bf01ae),
+ TOBN(0x536395ac, 0xa1c32373), TOBN(0x351027aa, 0x734c0a13),
+ TOBN(0xd2f1b5d6, 0x5e6bd5bc), TOBN(0x2b539e24, 0x223debed),
+ TOBN(0xd4994cec, 0x0eaa1d71), TOBN(0x2a83381d, 0x661dcf65),
+ TOBN(0x5f1aed2f, 0x7b54c740), TOBN(0x0bea3fa5, 0xd6dda5ee),
+ TOBN(0x9d4fb684, 0x36cc6134), TOBN(0x8eb9bbf3, 0xc0a443dd),
+ TOBN(0xfc500e2e, 0x383b7d2a), TOBN(0x7aad621c, 0x5b775257),
+ TOBN(0x69284d74, 0x0a8f7cc0), TOBN(0xe820c2ce, 0x07562d65),
+ TOBN(0xbf9531b9, 0x499758ee), TOBN(0x73e95ca5, 0x6ee0cc2d),
+ TOBN(0xf61790ab, 0xfbaf50a5), TOBN(0xdf55e76b, 0x684e0750),
+ TOBN(0xec516da7, 0xf176b005), TOBN(0x575553bb, 0x7a2dddc7),
+ TOBN(0x37c87ca3, 0x553afa73), TOBN(0x315f3ffc, 0x4d55c251),
+ TOBN(0xe846442a, 0xaf3e5d35), TOBN(0x61b91149, 0x6495ff28),
+ TOBN(0x23cc95d3, 0xfa326dc3), TOBN(0x1df4da1f, 0x18fc2cea),
+ TOBN(0x24bf9adc, 0xd0a37d59), TOBN(0xb6710053, 0x320d6e1e),
+ TOBN(0x96f9667e, 0x618344d1), TOBN(0xcc7ce042, 0xa06445af),
+ TOBN(0xa02d8514, 0xd68dbc3a), TOBN(0x4ea109e4, 0x280b5a5b),
+ TOBN(0x5741a7ac, 0xb40961bf), TOBN(0x4ada5937, 0x6aa56bfa),
+ TOBN(0x7feb9145, 0x02b765d1), TOBN(0x561e97be, 0xe6ad1582),
+ TOBN(0xbbc4a5b6, 0xda3982f5), TOBN(0x0c2659ed, 0xb546f468),
+ TOBN(0xb8e7e6aa, 0x59612d20), TOBN(0xd83dfe20, 0xac19e8e0),
+ TOBN(0x8530c45f, 0xb835398c), TOBN(0x6106a8bf, 0xb38a41c2),
+ TOBN(0x21e8f9a6, 0x35f5dcdb), TOBN(0x39707137, 0xcae498ed),
+ TOBN(0x70c23834, 0xd8249f00), TOBN(0x9f14b58f, 0xab2537a0),
+ TOBN(0xd043c365, 0x5f61c0c2), TOBN(0xdc5926d6, 0x09a194a7),
+ TOBN(0xddec0339, 0x8e77738a), TOBN(0xd07a63ef, 0xfba46426),
+ TOBN(0x2e58e79c, 0xee7f6e86), TOBN(0xe59b0459, 0xff32d241),
+ TOBN(0xc5ec84e5, 0x20fa0338), TOBN(0x97939ac8, 0xeaff5ace),
+ TOBN(0x0310a4e3, 0xb4a38313), TOBN(0x9115fba2, 0x8f9d9885),
+ TOBN(0x8dd710c2, 0x5fadf8c3), TOBN(0x66be38a2, 0xce19c0e2),
+ TOBN(0xd42a279c, 0x4cfe5022), TOBN(0x597bb530, 0x0e24e1b8),
+ TOBN(0x3cde86b7, 0xc153ca7f), TOBN(0xa8d30fb3, 0x707d63bd),
+ TOBN(0xac905f92, 0xbd60d21e), TOBN(0x98e7ffb6, 0x7b9a54ab),
+ TOBN(0xd7147df8, 0xe9726a30), TOBN(0xb5e216ff, 0xafce3533),
+ TOBN(0xb550b799, 0x2ff1ec40), TOBN(0x6b613b87, 0xa1e953fd),
+ TOBN(0x87b88dba, 0x792d5610), TOBN(0x2ee1270a, 0xa190fbe1),
+ TOBN(0x02f4e2dc, 0x2ef581da), TOBN(0x016530e4, 0xeff82a95),
+ TOBN(0xcbb93dfd, 0x8fd6ee89), TOBN(0x16d3d986, 0x46848fff),
+ TOBN(0x600eff24, 0x1da47adf), TOBN(0x1b9754a0, 0x0ad47a71),
+ TOBN(0x8f9266df, 0x70c33b98), TOBN(0xaadc87ae, 0xdf34186e),
+ TOBN(0x0d2ce8e1, 0x4ad24132), TOBN(0x8a47cbfc, 0x19946eba),
+ TOBN(0x47feeb66, 0x62b5f3af), TOBN(0xcefab561, 0x0abb3734),
+ TOBN(0x449de60e, 0x19f35cb1), TOBN(0x39f8db14, 0x157f0eb9),
+ TOBN(0xffaecc5b, 0x3c61bfd6), TOBN(0xa5a4d41d, 0x41216703),
+ TOBN(0x7f8fabed, 0x224e1cc2), TOBN(0x0d5a8186, 0x871ad953),
+ TOBN(0xf10774f7, 0xd22da9a9), TOBN(0x45b8a678, 0xcc8a9b0d),
+ TOBN(0xd9c2e722, 0xbdc32cff), TOBN(0xbf71b5f5, 0x337202a5),
+ TOBN(0x95c57f2f, 0x69fc4db9), TOBN(0xb6dad34c, 0x765d01e1),
+ TOBN(0x7e0bd13f, 0xcb904635), TOBN(0x61751253, 0x763a588c),
+ TOBN(0xd85c2997, 0x81af2c2d), TOBN(0xc0f7d9c4, 0x81b9d7da),
+ TOBN(0x838a34ae, 0x08533e8d), TOBN(0x15c4cb08, 0x311d8311),
+ TOBN(0x97f83285, 0x8e121e14), TOBN(0xeea7dc1e, 0x85000a5f),
+ TOBN(0x0c6059b6, 0x5d256274), TOBN(0xec9beace, 0xb95075c0),
+ TOBN(0x173daad7, 0x1df97828), TOBN(0xbf851cb5, 0xa8937877),
+ TOBN(0xb083c594, 0x01646f3c), TOBN(0x3bad30cf, 0x50c6d352),
+ TOBN(0xfeb2b202, 0x496bbcea), TOBN(0x3cf9fd4f, 0x18a1e8ba),
+ TOBN(0xd26de7ff, 0x1c066029), TOBN(0x39c81e9e, 0x4e9ed4f8),
+ TOBN(0xd8be0cb9, 0x7b390d35), TOBN(0x01df2bbd, 0x964aab27),
+ TOBN(0x3e8c1a65, 0xc3ef64f8), TOBN(0x567291d1, 0x716ed1dd),
+ TOBN(0x95499c6c, 0x5f5406d3), TOBN(0x71fdda39, 0x5ba8e23f),
+ TOBN(0xcfeb320e, 0xd5096ece), TOBN(0xbe7ba92b, 0xca66dd16),
+ TOBN(0x4608d36b, 0xc6fb5a7d), TOBN(0xe3eea15a, 0x6d2dd0e0),
+ TOBN(0x75b0a3eb, 0x8f97a36a), TOBN(0xf59814cc, 0x1c83de1e),
+ TOBN(0x56c9c5b0, 0x1c33c23f), TOBN(0xa96c1da4, 0x6faa4136),
+ TOBN(0x46bf2074, 0xde316551), TOBN(0x3b866e7b, 0x1f756c8f),
+ TOBN(0x727727d8, 0x1495ed6b), TOBN(0xb2394243, 0xb682dce7),
+ TOBN(0x8ab8454e, 0x758610f3), TOBN(0xc243ce84, 0x857d72a4),
+ TOBN(0x7b320d71, 0xdbbf370f), TOBN(0xff9afa37, 0x78e0f7ca),
+ TOBN(0x0119d1e0, 0xea7b523f), TOBN(0xb997f8cb, 0x058c7d42),
+ TOBN(0x285bcd2a, 0x37bbb184), TOBN(0x51dcec49, 0xa45d1fa6),
+ TOBN(0x6ade3b64, 0xe29634cb), TOBN(0x080c94a7, 0x26b86ef1),
+ TOBN(0xba583db1, 0x2283fbe3), TOBN(0x902bddc8, 0x5a9315ed),
+ TOBN(0x07c1ccb3, 0x86964bec), TOBN(0x78f4eacf, 0xb6258301),
+ TOBN(0x4bdf3a49, 0x56f90823), TOBN(0xba0f5080, 0x741d777b),
+ TOBN(0x091d71c3, 0xf38bf760), TOBN(0x9633d50f, 0x9b625b02),
+ TOBN(0x03ecb743, 0xb8c9de61), TOBN(0xb4751254, 0x5de74720),
+ TOBN(0x9f9defc9, 0x74ce1cb2), TOBN(0x774a4f6a, 0x00bd32ef),
+ TOBN(0xaca385f7, 0x73848f22), TOBN(0x53dad716, 0xf3f8558e),
+ TOBN(0xab7b34b0, 0x93c471f9), TOBN(0xf530e069, 0x19644bc7),
+ TOBN(0x3d9fb1ff, 0xdd59d31a), TOBN(0x4382e0df, 0x08daa795),
+ TOBN(0x165c6f4b, 0xd5cc88d7), TOBN(0xeaa392d5, 0x4a18c900),
+ TOBN(0x94203c67, 0x648024ee), TOBN(0x188763f2, 0x8c2fabcd),
+ TOBN(0xa80f87ac, 0xbbaec835), TOBN(0x632c96e0, 0xf29d8d54),
+ TOBN(0x29b0a60e, 0x4c00a95e), TOBN(0x2ef17f40, 0xe011e9fa),
+ TOBN(0xf6c0e1d1, 0x15b77223), TOBN(0xaaec2c62, 0x14b04e32),
+ TOBN(0xd35688d8, 0x3d84e58c), TOBN(0x2af5094c, 0x958571db),
+ TOBN(0x4fff7e19, 0x760682a6), TOBN(0x4cb27077, 0xe39a407c),
+ TOBN(0x0f59c547, 0x4ff0e321), TOBN(0x169f34a6, 0x1b34c8ff),
+ TOBN(0x2bff1096, 0x52bc1ba7), TOBN(0xa25423b7, 0x83583544),
+ TOBN(0x5d55d5d5, 0x0ac8b782), TOBN(0xff6622ec, 0x2db3c892),
+ TOBN(0x48fce741, 0x6b8bb642), TOBN(0x31d6998c, 0x69d7e3dc),
+ TOBN(0xdbaf8004, 0xcadcaed0), TOBN(0x801b0142, 0xd81d053c),
+ TOBN(0x94b189fc, 0x59630ec6), TOBN(0x120e9934, 0xaf762c8e),
+ TOBN(0x53a29aa4, 0xfdc6a404), TOBN(0x19d8e01e, 0xa1909948),
+ TOBN(0x3cfcabf1, 0xd7e89681), TOBN(0x3321a50d, 0x4e132d37),
+ TOBN(0xd0496863, 0xe9a86111), TOBN(0x8c0cde61, 0x06a3bc65),
+ TOBN(0xaf866c49, 0xfc9f8eef), TOBN(0x2066350e, 0xff7f5141),
+ TOBN(0x4f8a4689, 0xe56ddfbd), TOBN(0xea1b0c07, 0xfe32983a),
+ TOBN(0x2b317462, 0x873cb8cb), TOBN(0x658deddc, 0x2d93229f),
+ TOBN(0x65efaf4d, 0x0f64ef58), TOBN(0xfe43287d, 0x730cc7a8),
+ TOBN(0xaebc0c72, 0x3d047d70), TOBN(0x92efa539, 0xd92d26c9),
+ TOBN(0x06e78457, 0x94b56526), TOBN(0x415cb80f, 0x0961002d),
+ TOBN(0x89e5c565, 0x76dcb10f), TOBN(0x8bbb6982, 0xff9259fe),
+ TOBN(0x4fe8795b, 0x9abc2668), TOBN(0xb5d4f534, 0x1e678fb1),
+ TOBN(0x6601f3be, 0x7b7da2b9), TOBN(0x98da59e2, 0xa13d6805),
+ TOBN(0x190d8ea6, 0x01799a52), TOBN(0xa20cec41, 0xb86d2952),
+ TOBN(0x3062ffb2, 0x7fff2a7c), TOBN(0x741b32e5, 0x79f19d37),
+ TOBN(0xf80d8181, 0x4eb57d47), TOBN(0x7a2d0ed4, 0x16aef06b),
+ TOBN(0x09735fb0, 0x1cecb588), TOBN(0x1641caaa, 0xc6061f5b)}
+ ,
+ {TOBN(0x7f99824f, 0x20151427), TOBN(0x206828b6, 0x92430206),
+ TOBN(0xaa9097d7, 0xe1112357), TOBN(0xacf9a2f2, 0x09e414ec),
+ TOBN(0xdbdac9da, 0x27915356), TOBN(0x7e0734b7, 0x001efee3),
+ TOBN(0x54fab5bb, 0xd2b288e2), TOBN(0x4c630fc4, 0xf62dd09c),
+ TOBN(0x8537107a, 0x1ac2703b), TOBN(0xb49258d8, 0x6bc857b5),
+ TOBN(0x57df14de, 0xbcdaccd1), TOBN(0x24ab68d7, 0xc4ae8529),
+ TOBN(0x7ed8b5d4, 0x734e59d0), TOBN(0x5f8740c8, 0xc495cc80),
+ TOBN(0x84aedd5a, 0x291db9b3), TOBN(0x80b360f8, 0x4fb995be),
+ TOBN(0xae915f5d, 0x5fa067d1), TOBN(0x4134b57f, 0x9668960c),
+ TOBN(0xbd3656d6, 0xa48edaac), TOBN(0xdac1e3e4, 0xfc1d7436),
+ TOBN(0x674ff869, 0xd81fbb26), TOBN(0x449ed3ec, 0xb26c33d4),
+ TOBN(0x85138705, 0xd94203e8), TOBN(0xccde538b, 0xbeeb6f4a),
+ TOBN(0x55d5c68d, 0xa61a76fa), TOBN(0x598b441d, 0xca1554dc),
+ TOBN(0xd39923b9, 0x773b279c), TOBN(0x33331d3c, 0x36bf9efc),
+ TOBN(0x2d4c848e, 0x298de399), TOBN(0xcfdb8e77, 0xa1a27f56),
+ TOBN(0x94c855ea, 0x57b8ab70), TOBN(0xdcdb9dae, 0x6f7879ba),
+ TOBN(0x7bdff8c2, 0x019f2a59), TOBN(0xb3ce5bb3, 0xcb4fbc74),
+ TOBN(0xea907f68, 0x8a9173dd), TOBN(0x6cd3d0d3, 0x95a75439),
+ TOBN(0x92ecc4d6, 0xefed021c), TOBN(0x09a9f9b0, 0x6a77339a),
+ TOBN(0x87ca6b15, 0x7188c64a), TOBN(0x10c29968, 0x44899158),
+ TOBN(0x5859a229, 0xed6e82ef), TOBN(0x16f338e3, 0x65ebaf4e),
+ TOBN(0x0cd31387, 0x5ead67ae), TOBN(0x1c73d228, 0x54ef0bb4),
+ TOBN(0x4cb55131, 0x74a5c8c7), TOBN(0x01cd2970, 0x7f69ad6a),
+ TOBN(0xa04d00dd, 0xe966f87e), TOBN(0xd96fe447, 0x0b7b0321),
+ TOBN(0x342ac06e, 0x88fbd381), TOBN(0x02cd4a84, 0x5c35a493),
+ TOBN(0xe8fa89de, 0x54f1bbcd), TOBN(0x341d6367, 0x2575ed4c),
+ TOBN(0xebe357fb, 0xd238202b), TOBN(0x600b4d1a, 0xa984ead9),
+ TOBN(0xc35c9f44, 0x52436ea0), TOBN(0x96fe0a39, 0xa370751b),
+ TOBN(0x4c4f0736, 0x7f636a38), TOBN(0x9f943fb7, 0x0e76d5cb),
+ TOBN(0xb03510ba, 0xa8b68b8b), TOBN(0xc246780a, 0x9ed07a1f),
+ TOBN(0x3c051415, 0x6d549fc2), TOBN(0xc2953f31, 0x607781ca),
+ TOBN(0x955e2c69, 0xd8d95413), TOBN(0xb300fadc, 0x7bd282e3),
+ TOBN(0x81fe7b50, 0x87e9189f), TOBN(0xdb17375c, 0xf42dda27),
+ TOBN(0x22f7d896, 0xcf0a5904), TOBN(0xa0e57c5a, 0xebe348e6),
+ TOBN(0xa61011d3, 0xf40e3c80), TOBN(0xb1189321, 0x8db705c5),
+ TOBN(0x4ed9309e, 0x50fedec3), TOBN(0xdcf14a10, 0x4d6d5c1d),
+ TOBN(0x056c265b, 0x55691342), TOBN(0xe8e08504, 0x91049dc7),
+ TOBN(0x131329f5, 0xc9bae20a), TOBN(0x96c8b3e8, 0xd9dccdb4),
+ TOBN(0x8c5ff838, 0xfb4ee6b4), TOBN(0xfc5a9aeb, 0x41e8ccf0),
+ TOBN(0x7417b764, 0xfae050c6), TOBN(0x0953c3d7, 0x00452080),
+ TOBN(0x21372682, 0x38dfe7e8), TOBN(0xea417e15, 0x2bb79d4b),
+ TOBN(0x59641f1c, 0x76e7cf2d), TOBN(0x271e3059, 0xea0bcfcc),
+ TOBN(0x624c7dfd, 0x7253ecbd), TOBN(0x2f552e25, 0x4fca6186),
+ TOBN(0xcbf84ecd, 0x4d866e9c), TOBN(0x73967709, 0xf68d4610),
+ TOBN(0xa14b1163, 0xc27901b4), TOBN(0xfd9236e0, 0x899b8bf3),
+ TOBN(0x42b091ec, 0xcbc6da0a), TOBN(0xbb1dac6f, 0x5ad1d297),
+ TOBN(0x80e61d53, 0xa91cf76e), TOBN(0x4110a412, 0xd31f1ee7),
+ TOBN(0x2d87c3ba, 0x13efcf77), TOBN(0x1f374bb4, 0xdf450d76),
+ TOBN(0x5e78e2f2, 0x0d188dab), TOBN(0xe3968ed0, 0xf4b885ef),
+ TOBN(0x46c0568e, 0x7314570f), TOBN(0x31616338, 0x01170521),
+ TOBN(0x18e1e7e2, 0x4f0c8afe), TOBN(0x4caa75ff, 0xdeea78da),
+ TOBN(0x82db67f2, 0x7c5d8a51), TOBN(0x36a44d86, 0x6f505370),
+ TOBN(0xd72c5bda, 0x0333974f), TOBN(0x5db516ae, 0x27a70146),
+ TOBN(0x34705281, 0x210ef921), TOBN(0xbff17a8f, 0x0c9c38e5),
+ TOBN(0x78f4814e, 0x12476da1), TOBN(0xc1e16613, 0x33c16980),
+ TOBN(0x9e5b386f, 0x424d4bca), TOBN(0x4c274e87, 0xc85740de),
+ TOBN(0xb6a9b88d, 0x6c2f5226), TOBN(0x14d1b944, 0x550d7ca8),
+ TOBN(0x580c85fc, 0x1fc41709), TOBN(0xc1da368b, 0x54c6d519),
+ TOBN(0x2b0785ce, 0xd5113cf7), TOBN(0x0670f633, 0x5a34708f),
+ TOBN(0x46e23767, 0x15cc3f88), TOBN(0x1b480cfa, 0x50c72c8f),
+ TOBN(0x20288602, 0x4147519a), TOBN(0xd0981eac, 0x26b372f0),
+ TOBN(0xa9d4a7ca, 0xa785ebc8), TOBN(0xd953c50d, 0xdbdf58e9),
+ TOBN(0x9d6361cc, 0xfd590f8f), TOBN(0x72e9626b, 0x44e6c917),
+ TOBN(0x7fd96110, 0x22eb64cf), TOBN(0x863ebb7e, 0x9eb288f3),
+ TOBN(0x6e6ab761, 0x6aca8ee7), TOBN(0x97d10b39, 0xd7b40358),
+ TOBN(0x1687d377, 0x1e5feb0d), TOBN(0xc83e50e4, 0x8265a27a),
+ TOBN(0x8f75a9fe, 0xc954b313), TOBN(0xcc2e8f47, 0x310d1f61),
+ TOBN(0xf5ba81c5, 0x6557d0e0), TOBN(0x25f9680c, 0x3eaf6207),
+ TOBN(0xf95c6609, 0x4354080b), TOBN(0x5225bfa5, 0x7bf2fe1c),
+ TOBN(0xc5c004e2, 0x5c7d98fa), TOBN(0x3561bf1c, 0x019aaf60),
+ TOBN(0x5e6f9f17, 0xba151474), TOBN(0xdec2f934, 0xb04f6eca),
+ TOBN(0x64e368a1, 0x269acb1e), TOBN(0x1332d9e4, 0x0cdda493),
+ TOBN(0x60d6cf69, 0xdf23de05), TOBN(0x66d17da2, 0x009339a0),
+ TOBN(0x9fcac985, 0x0a693923), TOBN(0xbcf057fc, 0xed7c6a6d),
+ TOBN(0xc3c5c8c5, 0xf0b5662c), TOBN(0x25318dd8, 0xdcba4f24),
+ TOBN(0x60e8cb75, 0x082b69ff), TOBN(0x7c23b3ee, 0x1e728c01),
+ TOBN(0x15e10a0a, 0x097e4403), TOBN(0xcb3d0a86, 0x19854665),
+ TOBN(0x88d8e211, 0xd67d4826), TOBN(0xb39af66e, 0x0b9d2839),
+ TOBN(0xa5f94588, 0xbd475ca8), TOBN(0xe06b7966, 0xc077b80b),
+ TOBN(0xfedb1485, 0xda27c26c), TOBN(0xd290d33a, 0xfe0fd5e0),
+ TOBN(0xa40bcc47, 0xf34fb0fa), TOBN(0xb4760cc8, 0x1fb1ab09),
+ TOBN(0x8fca0993, 0xa273bfe3), TOBN(0x13e4fe07, 0xf70b213c),
+ TOBN(0x3bcdb992, 0xfdb05163), TOBN(0x8c484b11, 0x0c2b19b6),
+ TOBN(0x1acb815f, 0xaaf2e3e2), TOBN(0xc6905935, 0xb89ff1b4),
+ TOBN(0xb2ad6f9d, 0x586e74e1), TOBN(0x488883ad, 0x67b80484),
+ TOBN(0x758aa2c7, 0x369c3ddb), TOBN(0x8ab74e69, 0x9f9afd31),
+ TOBN(0x10fc2d28, 0x5e21beb1), TOBN(0x3484518a, 0x318c42f9),
+ TOBN(0x377427dc, 0x53cf40c3), TOBN(0x9de0781a, 0x391bc1d9),
+ TOBN(0x8faee858, 0x693807e1), TOBN(0xa3865327, 0x4e81ccc7),
+ TOBN(0x02c30ff2, 0x6f835b84), TOBN(0xb604437b, 0x0d3d38d4),
+ TOBN(0xb3fc8a98, 0x5ca1823d), TOBN(0xb82f7ec9, 0x03be0324),
+ TOBN(0xee36d761, 0xcf684a33), TOBN(0x5a01df0e, 0x9f29bf7d),
+ TOBN(0x686202f3, 0x1306583d), TOBN(0x05b10da0, 0x437c622e),
+ TOBN(0xbf9aaa0f, 0x076a7bc8), TOBN(0x25e94efb, 0x8f8f4e43),
+ TOBN(0x8a35c9b7, 0xfa3dc26d), TOBN(0xe0e5fb93, 0x96ff03c5),
+ TOBN(0xa77e3843, 0xebc394ce), TOBN(0xcede6595, 0x8361de60),
+ TOBN(0xd27c22f6, 0xa1993545), TOBN(0xab01cc36, 0x24d671ba),
+ TOBN(0x63fa2877, 0xa169c28e), TOBN(0x925ef904, 0x2eb08376),
+ TOBN(0x3b2fa3cf, 0x53aa0b32), TOBN(0xb27beb5b, 0x71c49d7a),
+ TOBN(0xb60e1834, 0xd105e27f), TOBN(0xd6089788, 0x4f68570d),
+ TOBN(0x23094ce0, 0xd6fbc2ac), TOBN(0x738037a1, 0x815ff551),
+ TOBN(0xda73b1bb, 0x6bef119c), TOBN(0xdcf6c430, 0xeef506ba),
+ TOBN(0x00e4fe7b, 0xe3ef104a), TOBN(0xebdd9a2c, 0x0a065628),
+ TOBN(0x853a81c3, 0x8792043e), TOBN(0x22ad6ece, 0xb3b59108),
+ TOBN(0x9fb813c0, 0x39cd297d), TOBN(0x8ec7e16e, 0x05bda5d9),
+ TOBN(0x2834797c, 0x0d104b96), TOBN(0xcc11a2e7, 0x7c511510),
+ TOBN(0x96ca5a53, 0x96ee6380), TOBN(0x054c8655, 0xcea38742),
+ TOBN(0xb5946852, 0xd54dfa7d), TOBN(0x97c422e7, 0x1f4ab207),
+ TOBN(0xbf907509, 0x0c22b540), TOBN(0x2cde42aa, 0xb7c267d4),
+ TOBN(0xba18f9ed, 0x5ab0d693), TOBN(0x3ba62aa6, 0x6e4660d9),
+ TOBN(0xb24bf97b, 0xab9ea96a), TOBN(0x5d039642, 0xe3b60e32),
+ TOBN(0x4e6a4506, 0x7c4d9bd5), TOBN(0x666c5b9e, 0x7ed4a6a4),
+ TOBN(0xfa3fdcd9, 0x8edbd7cc), TOBN(0x4660bb87, 0xc6ccd753),
+ TOBN(0x9ae90820, 0x21e6b64f), TOBN(0x8a56a713, 0xb36bfb3f),
+ TOBN(0xabfce096, 0x5726d47f), TOBN(0x9eed01b2, 0x0b1a9a7f),
+ TOBN(0x30e9cad4, 0x4eb74a37), TOBN(0x7b2524cc, 0x53e9666d),
+ TOBN(0x6a29683b, 0x8f4b002f), TOBN(0xc2200d7a, 0x41f4fc20),
+ TOBN(0xcf3af47a, 0x3a338acc), TOBN(0x6539a4fb, 0xe7128975),
+ TOBN(0xcec31c14, 0xc33c7fcf), TOBN(0x7eb6799b, 0xc7be322b),
+ TOBN(0x119ef4e9, 0x6646f623), TOBN(0x7b7a26a5, 0x54d7299b),
+ TOBN(0xcb37f08d, 0x403f46f2), TOBN(0x94b8fc43, 0x1a0ec0c7),
+ TOBN(0xbb8514e3, 0xc332142f), TOBN(0xf3ed2c33, 0xe80d2a7a),
+ TOBN(0x8d2080af, 0xb639126c), TOBN(0xf7b6be60, 0xe3553ade),
+ TOBN(0x3950aa9f, 0x1c7e2b09), TOBN(0x847ff958, 0x6410f02b),
+ TOBN(0x877b7cf5, 0x678a31b0), TOBN(0xd50301ae, 0x3998b620),
+ TOBN(0x734257c5, 0xc00fb396), TOBN(0xf9fb18a0, 0x04e672a6),
+ TOBN(0xff8bd8eb, 0xe8758851), TOBN(0x1e64e4c6, 0x5d99ba44),
+ TOBN(0x4b8eaedf, 0x7dfd93b7), TOBN(0xba2f2a98, 0x04e76b8c),
+ TOBN(0x7d790cba, 0xe8053433), TOBN(0xc8e725a0, 0x3d2c9585),
+ TOBN(0x58c5c476, 0xcdd8f5ed), TOBN(0xd106b952, 0xefa9fe1d),
+ TOBN(0x3c5c775b, 0x0eff13a9), TOBN(0x242442ba, 0xe057b930),
+ TOBN(0xe9f458d4, 0xc9b70cbd), TOBN(0x69b71448, 0xa3cdb89a),
+ TOBN(0x41ee46f6, 0x0e2ed742), TOBN(0x573f1045, 0x40067493),
+ TOBN(0xb1e154ff, 0x9d54c304), TOBN(0x2ad0436a, 0x8d3a7502),
+ TOBN(0xee4aaa2d, 0x431a8121), TOBN(0xcd38b3ab, 0x886f11ed),
+ TOBN(0x57d49ea6, 0x034a0eb7), TOBN(0xd2b773bd, 0xf7e85e58),
+ TOBN(0x4a559ac4, 0x9b5c1f14), TOBN(0xc444be1a, 0x3e54df2b),
+ TOBN(0x13aad704, 0xeda41891), TOBN(0xcd927bec, 0x5eb5c788),
+ TOBN(0xeb3c8516, 0xe48c8a34), TOBN(0x1b7ac812, 0x4b546669),
+ TOBN(0x1815f896, 0x594df8ec), TOBN(0x87c6a79c, 0x79227865),
+ TOBN(0xae02a2f0, 0x9b56ddbd), TOBN(0x1339b5ac, 0x8a2f1cf3),
+ TOBN(0xf2b569c7, 0x839dff0d), TOBN(0xb0b9e864, 0xfee9a43d),
+ TOBN(0x4ff8ca41, 0x77bb064e), TOBN(0x145a2812, 0xfd249f63),
+ TOBN(0x3ab7beac, 0xf86f689a), TOBN(0x9bafec27, 0x01d35f5e),
+ TOBN(0x28054c65, 0x4265aa91), TOBN(0xa4b18304, 0x035efe42),
+ TOBN(0x6887b0e6, 0x9639dec7), TOBN(0xf4b8f6ad, 0x3d52aea5),
+ TOBN(0xfb9293cc, 0x971a8a13), TOBN(0x3f159e5d, 0x4c934d07),
+ TOBN(0x2c50e9b1, 0x09acbc29), TOBN(0x08eb65e6, 0x7154d129),
+ TOBN(0x4feff589, 0x30b75c3e), TOBN(0x0bb82fe2, 0x94491c93),
+ TOBN(0xd8ac377a, 0x89af62bb), TOBN(0xd7b51490, 0x9685e49f),
+ TOBN(0xabca9a7b, 0x04497f19), TOBN(0x1b35ed0a, 0x1a7ad13f),
+ TOBN(0x6b601e21, 0x3ec86ed6), TOBN(0xda91fcb9, 0xce0c76f1),
+ TOBN(0x9e28507b, 0xd7ab27e1), TOBN(0x7c19a555, 0x63945b7b),
+ TOBN(0x6b43f0a1, 0xaafc9827), TOBN(0x443b4fbd, 0x3aa55b91),
+ TOBN(0x962b2e65, 0x6962c88f), TOBN(0x139da8d4, 0xce0db0ca),
+ TOBN(0xb93f05dd, 0x1b8d6c4f), TOBN(0x779cdff7, 0x180b9824),
+ TOBN(0xbba23fdd, 0xae57c7b7), TOBN(0x345342f2, 0x1b932522),
+ TOBN(0xfd9c80fe, 0x556d4aa3), TOBN(0xa03907ba, 0x6525bb61),
+ TOBN(0x38b010e1, 0xff218933), TOBN(0xc066b654, 0xaa52117b),
+ TOBN(0x8e141920, 0x94f2e6ea), TOBN(0x66a27dca, 0x0d32f2b2),
+ TOBN(0x69c7f993, 0x048b3717), TOBN(0xbf5a989a, 0xb178ae1c),
+ TOBN(0x49fa9058, 0x564f1d6b), TOBN(0x27ec6e15, 0xd31fde4e),
+ TOBN(0x4cce0373, 0x7276e7fc), TOBN(0x64086d79, 0x89d6bf02),
+ TOBN(0x5a72f046, 0x4ccdd979), TOBN(0x909c3566, 0x47775631),
+ TOBN(0x1c07bc6b, 0x75dd7125), TOBN(0xb4c6bc97, 0x87a0428d),
+ TOBN(0x507ece52, 0xfdeb6b9d), TOBN(0xfca56512, 0xb2c95432),
+ TOBN(0x15d97181, 0xd0e8bd06), TOBN(0x384dd317, 0xc6bb46ea),
+ TOBN(0x5441ea20, 0x3952b624), TOBN(0xbcf70dee, 0x4e7dc2fb),
+ TOBN(0x372b016e, 0x6628e8c3), TOBN(0x07a0d667, 0xb60a7522),
+ TOBN(0xcf05751b, 0x0a344ee2), TOBN(0x0ec09a48, 0x118bdeec),
+ TOBN(0x6e4b3d4e, 0xd83dce46), TOBN(0x43a6316d, 0x99d2fc6e),
+ TOBN(0xa99d8989, 0x56cf044c), TOBN(0x7c7f4454, 0xae3e5fb7),
+ TOBN(0xb2e6b121, 0xfbabbe92), TOBN(0x281850fb, 0xe1330076),
+ TOBN(0x093581ec, 0x97890015), TOBN(0x69b1dded, 0x75ff77f5),
+ TOBN(0x7cf0b18f, 0xab105105), TOBN(0x953ced31, 0xa89ccfef),
+ TOBN(0x3151f85f, 0xeb914009), TOBN(0x3c9f1b87, 0x88ed48ad),
+ TOBN(0xc9aba1a1, 0x4a7eadcb), TOBN(0x928e7501, 0x522e71cf),
+ TOBN(0xeaede727, 0x3a2e4f83), TOBN(0x467e10d1, 0x1ce3bbd3),
+ TOBN(0xf3442ac3, 0xb955dcf0), TOBN(0xba96307d, 0xd3d5e527),
+ TOBN(0xf763a10e, 0xfd77f474), TOBN(0x5d744bd0, 0x6a6e1ff0),
+ TOBN(0xd287282a, 0xa777899e), TOBN(0xe20eda8f, 0xd03f3cde),
+ TOBN(0x6a7e75bb, 0x50b07d31), TOBN(0x0b7e2a94, 0x6f379de4),
+ TOBN(0x31cb64ad, 0x19f593cf), TOBN(0x7b1a9e4f, 0x1e76ef1d),
+ TOBN(0xe18c9c9d, 0xb62d609c), TOBN(0x439bad6d, 0xe779a650),
+ TOBN(0x219d9066, 0xe032f144), TOBN(0x1db632b8, 0xe8b2ec6a),
+ TOBN(0xff0d0fd4, 0xfda12f78), TOBN(0x56fb4c2d, 0x2a25d265),
+ TOBN(0x5f4e2ee1, 0x255a03f1), TOBN(0x61cd6af2, 0xe96af176),
+ TOBN(0xe0317ba8, 0xd068bc97), TOBN(0x927d6bab, 0x264b988e),
+ TOBN(0xa18f07e0, 0xe90fb21e), TOBN(0x00fd2b80, 0xbba7fca1),
+ TOBN(0x20387f27, 0x95cd67b5), TOBN(0x5b89a4e7, 0xd39707f7),
+ TOBN(0x8f83ad3f, 0x894407ce), TOBN(0xa0025b94, 0x6c226132),
+ TOBN(0xc79563c7, 0xf906c13b), TOBN(0x5f548f31, 0x4e7bb025),
+ TOBN(0x2b4c6b8f, 0xeac6d113), TOBN(0xa67e3f9c, 0x0e813c76),
+ TOBN(0x3982717c, 0x3fe1f4b9), TOBN(0x58865819, 0x26d8050e),
+ TOBN(0x99f3640c, 0xf7f06f20), TOBN(0xdc610216, 0x2a66ebc2),
+ TOBN(0x52f2c175, 0x767a1e08), TOBN(0x05660e1a, 0x5999871b),
+ TOBN(0x6b0f1762, 0x6d3c4693), TOBN(0xf0e7d627, 0x37ed7bea),
+ TOBN(0xc51758c7, 0xb75b226d), TOBN(0x40a88628, 0x1f91613b),
+ TOBN(0x889dbaa7, 0xbbb38ce0), TOBN(0xe0404b65, 0xbddcad81),
+ TOBN(0xfebccd3a, 0x8bc9671f), TOBN(0xfbf9a357, 0xee1f5375),
+ TOBN(0x5dc169b0, 0x28f33398), TOBN(0xb07ec11d, 0x72e90f65),
+ TOBN(0xae7f3b4a, 0xfaab1eb1), TOBN(0xd970195e, 0x5f17538a),
+ TOBN(0x52b05cbe, 0x0181e640), TOBN(0xf5debd62, 0x2643313d),
+ TOBN(0x76148154, 0x5df31f82), TOBN(0x23e03b33, 0x3a9e13c5),
+ TOBN(0xff758949, 0x4fde0c1f), TOBN(0xbf8a1abe, 0xe5b6ec20),
+ TOBN(0x702278fb, 0x87e1db6c), TOBN(0xc447ad7a, 0x35ed658f),
+ TOBN(0x48d4aa38, 0x03d0ccf2), TOBN(0x80acb338, 0x819a7c03),
+ TOBN(0x9bc7c89e, 0x6e17cecc), TOBN(0x46736b8b, 0x03be1d82),
+ TOBN(0xd65d7b60, 0xc0432f96), TOBN(0xddebe7a3, 0xdeb5442f),
+ TOBN(0x79a25307, 0x7dff69a2), TOBN(0x37a56d94, 0x02cf3122),
+ TOBN(0x8bab8aed, 0xf2350d0a), TOBN(0x13c3f276, 0x037b0d9a),
+ TOBN(0xc664957c, 0x44c65cae), TOBN(0x88b44089, 0xc2e71a88),
+ TOBN(0xdb88e5a3, 0x5cb02664), TOBN(0x5d4c0bf1, 0x8686c72e),
+ TOBN(0xea3d9b62, 0xa682d53e), TOBN(0x9b605ef4, 0x0b2ad431),
+ TOBN(0x71bac202, 0xc69645d0), TOBN(0xa115f03a, 0x6a1b66e7),
+ TOBN(0xfe2c563a, 0x158f4dc4), TOBN(0xf715b3a0, 0x4d12a78c),
+ TOBN(0x8f7f0a48, 0xd413213a), TOBN(0x2035806d, 0xc04becdb),
+ TOBN(0xecd34a99, 0x5d8587f5), TOBN(0x4d8c3079, 0x9f6d3a71),
+ TOBN(0x1b2a2a67, 0x8d95a8f6), TOBN(0xc58c9d7d, 0xf2110d0d),
+ TOBN(0xdeee81d5, 0xcf8fba3f), TOBN(0xa42be3c0, 0x0c7cdf68),
+ TOBN(0x2126f742, 0xd43b5eaa), TOBN(0x054a0766, 0xdfa59b85),
+ TOBN(0x9d0d5e36, 0x126bfd45), TOBN(0xa1f8fbd7, 0x384f8a8f),
+ TOBN(0x317680f5, 0xd563fccc), TOBN(0x48ca5055, 0xf280a928),
+ TOBN(0xe00b81b2, 0x27b578cf), TOBN(0x10aad918, 0x2994a514),
+ TOBN(0xd9e07b62, 0xb7bdc953), TOBN(0x9f0f6ff2, 0x5bc086dd),
+ TOBN(0x09d1ccff, 0x655eee77), TOBN(0x45475f79, 0x5bef7df1),
+ TOBN(0x3faa28fa, 0x86f702cc), TOBN(0x92e60905, 0x0f021f07),
+ TOBN(0xe9e62968, 0x7f8fa8c6), TOBN(0xbd71419a, 0xf036ea2c),
+ TOBN(0x171ee1cc, 0x6028da9a), TOBN(0x5352fe1a, 0xc251f573),
+ TOBN(0xf8ff236e, 0x3fa997f4), TOBN(0xd831b6c9, 0xa5749d5f),
+ TOBN(0x7c872e1d, 0xe350e2c2), TOBN(0xc56240d9, 0x1e0ce403),
+ TOBN(0xf9deb077, 0x6974f5cb), TOBN(0x7d50ba87, 0x961c3728),
+ TOBN(0xd6f89426, 0x5a3a2518), TOBN(0xcf817799, 0xc6303d43),
+ TOBN(0x510a0471, 0x619e5696), TOBN(0xab049ff6, 0x3a5e307b),
+ TOBN(0xe4cdf9b0, 0xfeb13ec7), TOBN(0xd5e97117, 0x9d8ff90c),
+ TOBN(0xf6f64d06, 0x9afa96af), TOBN(0x00d0bf5e, 0x9d2012a2),
+ TOBN(0xe63f301f, 0x358bcdc0), TOBN(0x07689e99, 0x0a9d47f8),
+ TOBN(0x1f689e2f, 0x4f43d43a), TOBN(0x4d542a16, 0x90920904),
+ TOBN(0xaea293d5, 0x9ca0a707), TOBN(0xd061fe45, 0x8ac68065),
+ TOBN(0x1033bf1b, 0x0090008c), TOBN(0x29749558, 0xc08a6db6),
+ TOBN(0x74b5fc59, 0xc1d5d034), TOBN(0xf712e9f6, 0x67e215e0),
+ TOBN(0xfd520cbd, 0x860200e6), TOBN(0x0229acb4, 0x3ea22588),
+ TOBN(0x9cd1e14c, 0xfff0c82e), TOBN(0x87684b62, 0x59c69e73),
+ TOBN(0xda85e61c, 0x96ccb989), TOBN(0x2d5dbb02, 0xa3d06493),
+ TOBN(0xf22ad33a, 0xe86b173c), TOBN(0xe8e41ea5, 0xa79ff0e3),
+ TOBN(0x01d2d725, 0xdd0d0c10), TOBN(0x31f39088, 0x032d28f9),
+ TOBN(0x7b3f71e1, 0x7829839e), TOBN(0x0cf691b4, 0x4502ae58),
+ TOBN(0xef658dbd, 0xbefc6115), TOBN(0xa5cd6ee5, 0xb3ab5314),
+ TOBN(0x206c8d7b, 0x5f1d2347), TOBN(0x794645ba, 0x4cc2253a),
+ TOBN(0xd517d8ff, 0x58389e08), TOBN(0x4fa20dee, 0x9f847288),
+ TOBN(0xeba072d8, 0xd797770a), TOBN(0x7360c91d, 0xbf429e26),
+ TOBN(0x7200a3b3, 0x80af8279), TOBN(0x6a1c9150, 0x82dadce3),
+ TOBN(0x0ee6d3a7, 0xc35d8794), TOBN(0x042e6558, 0x0356bae5),
+ TOBN(0x9f59698d, 0x643322fd), TOBN(0x9379ae15, 0x50a61967),
+ TOBN(0x64b9ae62, 0xfcc9981e), TOBN(0xaed3d631, 0x6d2934c6),
+ TOBN(0x2454b302, 0x5e4e65eb), TOBN(0xab09f647, 0xf9950428)}
+ ,
+ {TOBN(0xb2083a12, 0x22248acc), TOBN(0x1f6ec0ef, 0x3264e366),
+ TOBN(0x5659b704, 0x5afdee28), TOBN(0x7a823a40, 0xe6430bb5),
+ TOBN(0x24592a04, 0xe1900a79), TOBN(0xcde09d4a, 0xc9ee6576),
+ TOBN(0x52b6463f, 0x4b5ea54a), TOBN(0x1efe9ed3, 0xd3ca65a7),
+ TOBN(0xe27a6dbe, 0x305406dd), TOBN(0x8eb7dc7f, 0xdd5d1957),
+ TOBN(0xf54a6876, 0x387d4d8f), TOBN(0x9c479409, 0xc7762de4),
+ TOBN(0xbe4d5b5d, 0x99b30778), TOBN(0x25380c56, 0x6e793682),
+ TOBN(0x602d37f3, 0xdac740e3), TOBN(0x140deabe, 0x1566e4ae),
+ TOBN(0x4481d067, 0xafd32acf), TOBN(0xd8f0fcca, 0xe1f71ccf),
+ TOBN(0xd208dd0c, 0xb596f2da), TOBN(0xd049d730, 0x9aad93f9),
+ TOBN(0xc79f263d, 0x42ab580e), TOBN(0x09411bb1, 0x23f707b4),
+ TOBN(0x8cfde1ff, 0x835e0eda), TOBN(0x72707490, 0x90f03402),
+ TOBN(0xeaee6126, 0xc49a861e), TOBN(0x024f3b65, 0xe14f0d06),
+ TOBN(0x51a3f1e8, 0xc69bfc17), TOBN(0xc3c3a8e9, 0xa7686381),
+ TOBN(0x3400752c, 0xb103d4c8), TOBN(0x02bc4613, 0x9218b36b),
+ TOBN(0xc67f75eb, 0x7651504a), TOBN(0xd6848b56, 0xd02aebfa),
+ TOBN(0xbd9802e6, 0xc30fa92b), TOBN(0x5a70d96d, 0x9a552784),
+ TOBN(0x9085c4ea, 0x3f83169b), TOBN(0xfa9423bb, 0x06908228),
+ TOBN(0x2ffebe12, 0xfe97a5b9), TOBN(0x85da6049, 0x71b99118),
+ TOBN(0x9cbc2f7f, 0x63178846), TOBN(0xfd96bc70, 0x9153218e),
+ TOBN(0x958381db, 0x1782269b), TOBN(0xae34bf79, 0x2597e550),
+ TOBN(0xbb5c6064, 0x5f385153), TOBN(0x6f0e96af, 0xe3088048),
+ TOBN(0xbf6a0215, 0x77884456), TOBN(0xb3b5688c, 0x69310ea7),
+ TOBN(0x17c94295, 0x04fad2de), TOBN(0xe020f0e5, 0x17896d4d),
+ TOBN(0x730ba0ab, 0x0976505f), TOBN(0x567f6813, 0x095e2ec5),
+ TOBN(0x47062010, 0x6331ab71), TOBN(0x72cfa977, 0x41d22b9f),
+ TOBN(0x33e55ead, 0x8a2373da), TOBN(0xa8d0d5f4, 0x7ba45a68),
+ TOBN(0xba1d8f9c, 0x03029d15), TOBN(0x8f34f1cc, 0xfc55b9f3),
+ TOBN(0xcca4428d, 0xbbe5a1a9), TOBN(0x8187fd5f, 0x3126bd67),
+ TOBN(0x0036973a, 0x48105826), TOBN(0xa39b6663, 0xb8bd61a0),
+ TOBN(0x6d42deef, 0x2d65a808), TOBN(0x4969044f, 0x94636b19),
+ TOBN(0xf611ee47, 0xdd5d564c), TOBN(0x7b2f3a49, 0xd2873077),
+ TOBN(0x94157d45, 0x300eb294), TOBN(0x2b2a656e, 0x169c1494),
+ TOBN(0xc000dd76, 0xd3a47aa9), TOBN(0xa2864e4f, 0xa6243ea4),
+ TOBN(0x82716c47, 0xdb89842e), TOBN(0x12dfd7d7, 0x61479fb7),
+ TOBN(0x3b9a2c56, 0xe0b2f6dc), TOBN(0x46be862a, 0xd7f85d67),
+ TOBN(0x03b0d8dd, 0x0f82b214), TOBN(0x460c34f9, 0xf103cbc6),
+ TOBN(0xf32e5c03, 0x18d79e19), TOBN(0x8b8888ba, 0xa84117f8),
+ TOBN(0x8f3c37dc, 0xc0722677), TOBN(0x10d21be9, 0x1c1c0f27),
+ TOBN(0xd47c8468, 0xe0f7a0c6), TOBN(0x9bf02213, 0xadecc0e0),
+ TOBN(0x0baa7d12, 0x42b48b99), TOBN(0x1bcb665d, 0x48424096),
+ TOBN(0x8b847cd6, 0xebfb5cfb), TOBN(0x87c2ae56, 0x9ad4d10d),
+ TOBN(0xf1cbb122, 0x0de36726), TOBN(0xe7043c68, 0x3fdfbd21),
+ TOBN(0x4bd0826a, 0x4e79d460), TOBN(0x11f5e598, 0x4bd1a2cb),
+ TOBN(0x97554160, 0xb7fe7b6e), TOBN(0x7d16189a, 0x400a3fb2),
+ TOBN(0xd73e9bea, 0xe328ca1e), TOBN(0x0dd04b97, 0xe793d8cc),
+ TOBN(0xa9c83c9b, 0x506db8cc), TOBN(0x5cd47aae, 0xcf38814c),
+ TOBN(0x26fc430d, 0xb64b45e6), TOBN(0x079b5499, 0xd818ea84),
+ TOBN(0xebb01102, 0xc1c24a3b), TOBN(0xca24e568, 0x1c161c1a),
+ TOBN(0x103eea69, 0x36f00a4a), TOBN(0x9ad76ee8, 0x76176c7b),
+ TOBN(0x97451fc2, 0x538e0ff7), TOBN(0x94f89809, 0x6604b3b0),
+ TOBN(0x6311436e, 0x3249cfd7), TOBN(0x27b4a7bd, 0x41224f69),
+ TOBN(0x03b5d21a, 0xe0ac2941), TOBN(0x279b0254, 0xc2d31937),
+ TOBN(0x3307c052, 0xcac992d0), TOBN(0x6aa7cb92, 0xefa8b1f3),
+ TOBN(0x5a182580, 0x0d37c7a5), TOBN(0x13380c37, 0x342d5422),
+ TOBN(0x92ac2d66, 0xd5d2ef92), TOBN(0x035a70c9, 0x030c63c6),
+ TOBN(0xc16025dd, 0x4ce4f152), TOBN(0x1f419a71, 0xf9df7c06),
+ TOBN(0x6d5b2214, 0x91e4bb14), TOBN(0xfc43c6cc, 0x839fb4ce),
+ TOBN(0x49f06591, 0x925d6b2d), TOBN(0x4b37d9d3, 0x62186598),
+ TOBN(0x8c54a971, 0xd01b1629), TOBN(0xe1a9c29f, 0x51d50e05),
+ TOBN(0x5109b785, 0x71ba1861), TOBN(0x48b22d5c, 0xd0c8f93d),
+ TOBN(0xe8fa84a7, 0x8633bb93), TOBN(0x53fba6ba, 0x5aebbd08),
+ TOBN(0x7ff27df3, 0xe5eea7d8), TOBN(0x521c8796, 0x68ca7158),
+ TOBN(0xb9d5133b, 0xce6f1a05), TOBN(0x2d50cd53, 0xfd0ebee4),
+ TOBN(0xc82115d6, 0xc5a3ef16), TOBN(0x993eff9d, 0xba079221),
+ TOBN(0xe4da2c5e, 0x4b5da81c), TOBN(0x9a89dbdb, 0x8033fd85),
+ TOBN(0x60819ebf, 0x2b892891), TOBN(0x53902b21, 0x5d14a4d5),
+ TOBN(0x6ac35051, 0xd7fda421), TOBN(0xcc6ab885, 0x61c83284),
+ TOBN(0x14eba133, 0xf74cff17), TOBN(0x240aaa03, 0xecb813f2),
+ TOBN(0xcfbb6540, 0x6f665bee), TOBN(0x084b1fe4, 0xa425ad73),
+ TOBN(0x009d5d16, 0xd081f6a6), TOBN(0x35304fe8, 0xeef82c90),
+ TOBN(0xf20346d5, 0xaa9eaa22), TOBN(0x0ada9f07, 0xac1c91e3),
+ TOBN(0xa6e21678, 0x968a6144), TOBN(0x54c1f77c, 0x07b31a1e),
+ TOBN(0xd6bb787e, 0x5781fbe1), TOBN(0x61bd2ee0, 0xe31f1c4a),
+ TOBN(0xf25aa1e9, 0x781105fc), TOBN(0x9cf2971f, 0x7b2f8e80),
+ TOBN(0x26d15412, 0xcdff919b), TOBN(0x01db4ebe, 0x34bc896e),
+ TOBN(0x7d9b3e23, 0xb40df1cf), TOBN(0x59337373, 0x94e971b4),
+ TOBN(0xbf57bd14, 0x669cf921), TOBN(0x865daedf, 0x0c1a1064),
+ TOBN(0x3eb70bd3, 0x83279125), TOBN(0xbc3d5b9f, 0x34ecdaab),
+ TOBN(0x91e3ed7e, 0x5f755caf), TOBN(0x49699f54, 0xd41e6f02),
+ TOBN(0x185770e1, 0xd4a7a15b), TOBN(0x08f3587a, 0xeaac87e7),
+ TOBN(0x352018db, 0x473133ea), TOBN(0x674ce719, 0x04fd30fc),
+ TOBN(0x7b8d9835, 0x088b3e0e), TOBN(0x7a0356a9, 0x5d0d47a1),
+ TOBN(0x9d9e7659, 0x6474a3c4), TOBN(0x61ea48a7, 0xff66966c),
+ TOBN(0x30417758, 0x0f3e4834), TOBN(0xfdbb21c2, 0x17a9afcb),
+ TOBN(0x756fa17f, 0x2f9a67b3), TOBN(0x2a6b2421, 0xa245c1a8),
+ TOBN(0x64be2794, 0x4af02291), TOBN(0xade465c6, 0x2a5804fe),
+ TOBN(0x8dffbd39, 0xa6f08fd7), TOBN(0xc4efa84c, 0xaa14403b),
+ TOBN(0xa1b91b2a, 0x442b0f5c), TOBN(0xb748e317, 0xcf997736),
+ TOBN(0x8d1b62bf, 0xcee90e16), TOBN(0x907ae271, 0x0b2078c0),
+ TOBN(0xdf31534b, 0x0c9bcddd), TOBN(0x043fb054, 0x39adce83),
+ TOBN(0x99031043, 0xd826846a), TOBN(0x61a9c0d6, 0xb144f393),
+ TOBN(0xdab48046, 0x47718427), TOBN(0xdf17ff9b, 0x6e830f8b),
+ TOBN(0x408d7ee8, 0xe49a1347), TOBN(0x6ac71e23, 0x91c1d4ae),
+ TOBN(0xc8cbb9fd, 0x1defd73c), TOBN(0x19840657, 0xbbbbfec5),
+ TOBN(0x39db1cb5, 0x9e7ef8ea), TOBN(0x78aa8296, 0x64105f30),
+ TOBN(0xa3d9b7f0, 0xa3738c29), TOBN(0x0a2f235a, 0xbc3250a3),
+ TOBN(0x55e506f6, 0x445e4caf), TOBN(0x0974f73d, 0x33475f7a),
+ TOBN(0xd37dbba3, 0x5ba2f5a8), TOBN(0x542c6e63, 0x6af40066),
+ TOBN(0x26d99b53, 0xc5d73e2c), TOBN(0x06060d7d, 0x6c3ca33e),
+ TOBN(0xcdbef1c2, 0x065fef4a), TOBN(0x77e60f7d, 0xfd5b92e3),
+ TOBN(0xd7c549f0, 0x26708350), TOBN(0x201b3ad0, 0x34f121bf),
+ TOBN(0x5fcac2a1, 0x0334fc14), TOBN(0x8a9a9e09, 0x344552f6),
+ TOBN(0x7dd8a1d3, 0x97653082), TOBN(0x5fc0738f, 0x79d4f289),
+ TOBN(0x787d244d, 0x17d2d8c3), TOBN(0xeffc6345, 0x70830684),
+ TOBN(0x5ddb96dd, 0xe4f73ae5), TOBN(0x8efb14b1, 0x172549a5),
+ TOBN(0x6eb73eee, 0x2245ae7a), TOBN(0xbca4061e, 0xea11f13e),
+ TOBN(0xb577421d, 0x30b01f5d), TOBN(0xaa688b24, 0x782e152c),
+ TOBN(0x67608e71, 0xbd3502ba), TOBN(0x4ef41f24, 0xb4de75a0),
+ TOBN(0xb08dde5e, 0xfd6125e5), TOBN(0xde484825, 0xa409543f),
+ TOBN(0x1f198d98, 0x65cc2295), TOBN(0x428a3771, 0x6e0edfa2),
+ TOBN(0x4f9697a2, 0xadf35fc7), TOBN(0x01a43c79, 0xf7cac3c7),
+ TOBN(0xb05d7059, 0x0fd3659a), TOBN(0x8927f30c, 0xbb7f2d9a),
+ TOBN(0x4023d1ac, 0x8cf984d3), TOBN(0x32125ed3, 0x02897a45),
+ TOBN(0xfb572dad, 0x3d414205), TOBN(0x73000ef2, 0xe3fa82a9),
+ TOBN(0x4c0868e9, 0xf10a5581), TOBN(0x5b61fc67, 0x6b0b3ca5),
+ TOBN(0xc1258d5b, 0x7cae440c), TOBN(0x21c08b41, 0x402b7531),
+ TOBN(0xf61a8955, 0xde932321), TOBN(0x3568faf8, 0x2d1408af),
+ TOBN(0x71b15e99, 0x9ecf965b), TOBN(0xf14ed248, 0xe917276f),
+ TOBN(0xc6f4caa1, 0x820cf9e2), TOBN(0x681b20b2, 0x18d83c7e),
+ TOBN(0x6cde738d, 0xc6c01120), TOBN(0x71db0813, 0xae70e0db),
+ TOBN(0x95fc0644, 0x74afe18c), TOBN(0x34619053, 0x129e2be7),
+ TOBN(0x80615cea, 0xdb2a3b15), TOBN(0x0a49a19e, 0xdb4c7073),
+ TOBN(0x0e1b84c8, 0x8fd2d367), TOBN(0xd74bf462, 0x033fb8aa),
+ TOBN(0x889f6d65, 0x533ef217), TOBN(0x7158c7e4, 0xc3ca2e87),
+ TOBN(0xfb670dfb, 0xdc2b4167), TOBN(0x75910a01, 0x844c257f),
+ TOBN(0xf336bf07, 0xcf88577d), TOBN(0x22245250, 0xe45e2ace),
+ TOBN(0x2ed92e8d, 0x7ca23d85), TOBN(0x29f8be4c, 0x2b812f58),
+ TOBN(0xdd9ebaa7, 0x076fe12b), TOBN(0x3f2400cb, 0xae1537f9),
+ TOBN(0x1aa93528, 0x17bdfb46), TOBN(0xc0f98430, 0x67883b41),
+ TOBN(0x5590ede1, 0x0170911d), TOBN(0x7562f5bb, 0x34d4b17f),
+ TOBN(0xe1fa1df2, 0x1826b8d2), TOBN(0xb40b796a, 0x6bd80d59),
+ TOBN(0xd65bf197, 0x3467ba92), TOBN(0x8c9b46db, 0xf70954b0),
+ TOBN(0x97c8a0f3, 0x0e78f15d), TOBN(0xa8f3a69a, 0x85a4c961),
+ TOBN(0x4242660f, 0x61e4ce9b), TOBN(0xbf06aab3, 0x6ea6790c),
+ TOBN(0xc6706f8e, 0xec986416), TOBN(0x9e56dec1, 0x9a9fc225),
+ TOBN(0x527c46f4, 0x9a9898d9), TOBN(0xd799e77b, 0x5633cdef),
+ TOBN(0x24eacc16, 0x7d9e4297), TOBN(0xabb61cea, 0x6b1cb734),
+ TOBN(0xbee2e8a7, 0xf778443c), TOBN(0x3bb42bf1, 0x29de2fe6),
+ TOBN(0xcbed86a1, 0x3003bb6f), TOBN(0xd3918e6c, 0xd781cdf6),
+ TOBN(0x4bee3271, 0x9a5103f1), TOBN(0x5243efc6, 0xf50eac06),
+ TOBN(0xb8e122cb, 0x6adcc119), TOBN(0x1b7faa84, 0xc0b80a08),
+ TOBN(0x32c3d1bd, 0x6dfcd08c), TOBN(0x129dec4e, 0x0be427de),
+ TOBN(0x98ab679c, 0x1d263c83), TOBN(0xafc83cb7, 0xcef64eff),
+ TOBN(0x85eb6088, 0x2fa6be76), TOBN(0x892585fb, 0x1328cbfe),
+ TOBN(0xc154d3ed, 0xcf618dda), TOBN(0xc44f601b, 0x3abaf26e),
+ TOBN(0x7bf57d0b, 0x2be1fdfd), TOBN(0xa833bd2d, 0x21137fee),
+ TOBN(0x9353af36, 0x2db591a8), TOBN(0xc76f26dc, 0x5562a056),
+ TOBN(0x1d87e47d, 0x3fdf5a51), TOBN(0x7afb5f93, 0x55c9cab0),
+ TOBN(0x91bbf58f, 0x89e0586e), TOBN(0x7c72c018, 0x0d843709),
+ TOBN(0xa9a5aafb, 0x99b5c3dc), TOBN(0xa48a0f1d, 0x3844aeb0),
+ TOBN(0x7178b7dd, 0xb667e482), TOBN(0x453985e9, 0x6e23a59a),
+ TOBN(0x4a54c860, 0x01b25dd8), TOBN(0x0dd37f48, 0xfb897c8a),
+ TOBN(0x5f8aa610, 0x0ea90cd9), TOBN(0xc8892c68, 0x16d5830d),
+ TOBN(0xeb4befc0, 0xef514ca5), TOBN(0x478eb679, 0xe72c9ee6),
+ TOBN(0x9bca20da, 0xdbc40d5f), TOBN(0xf015de21, 0xdde4f64a),
+ TOBN(0xaa6a4de0, 0xeaf4b8a5), TOBN(0x68cfd9ca, 0x4bc60e32),
+ TOBN(0x668a4b01, 0x7fd15e70), TOBN(0xd9f0694a, 0xf27dc09d),
+ TOBN(0xf6c3cad5, 0xba708bcd), TOBN(0x5cd2ba69, 0x5bb95c2a),
+ TOBN(0xaa28c1d3, 0x33c0a58f), TOBN(0x23e274e3, 0xabc77870),
+ TOBN(0x44c3692d, 0xdfd20a4a), TOBN(0x091c5fd3, 0x81a66653),
+ TOBN(0x6c0bb691, 0x09a0757d), TOBN(0x9072e8b9, 0x667343ea),
+ TOBN(0x31d40eb0, 0x80848bec), TOBN(0x95bd480a, 0x79fd36cc),
+ TOBN(0x01a77c61, 0x65ed43f5), TOBN(0xafccd127, 0x2e0d40bf),
+ TOBN(0xeccfc82d, 0x1cc1884b), TOBN(0xc85ac201, 0x5d4753b4),
+ TOBN(0xc7a6caac, 0x658e099f), TOBN(0xcf46369e, 0x04b27390),
+ TOBN(0xe2e7d049, 0x506467ea), TOBN(0x481b63a2, 0x37cdeccc),
+ TOBN(0x4029abd8, 0xed80143a), TOBN(0x28bfe3c7, 0xbcb00b88),
+ TOBN(0x3bec1009, 0x0643d84a), TOBN(0x885f3668, 0xabd11041),
+ TOBN(0xdb02432c, 0xf83a34d6), TOBN(0x32f7b360, 0x719ceebe),
+ TOBN(0xf06c7837, 0xdad1fe7a), TOBN(0x60a157a9, 0x5441a0b0),
+ TOBN(0x704970e9, 0xe2d47550), TOBN(0xcd2bd553, 0x271b9020),
+ TOBN(0xff57f82f, 0x33e24a0b), TOBN(0x9cbee23f, 0xf2565079),
+ TOBN(0x16353427, 0xeb5f5825), TOBN(0x276feec4, 0xe948d662),
+ TOBN(0xd1b62bc6, 0xda10032b), TOBN(0x718351dd, 0xf0e72a53),
+ TOBN(0x93452076, 0x2420e7ba), TOBN(0x96368fff, 0x3a00118d),
+ TOBN(0x00ce2d26, 0x150a49e4), TOBN(0x0c28b636, 0x3f04706b),
+ TOBN(0xbad65a46, 0x58b196d0), TOBN(0x6c8455fc, 0xec9f8b7c),
+ TOBN(0xe90c895f, 0x2d71867e), TOBN(0x5c0be31b, 0xedf9f38c),
+ TOBN(0x2a37a15e, 0xd8f6ec04), TOBN(0x239639e7, 0x8cd85251),
+ TOBN(0xd8975315, 0x9c7c4c6b), TOBN(0x603aa3c0, 0xd7409af7),
+ TOBN(0xb8d53d0c, 0x007132fb), TOBN(0x68d12af7, 0xa6849238),
+ TOBN(0xbe0607e7, 0xbf5d9279), TOBN(0x9aa50055, 0xaada74ce),
+ TOBN(0xe81079cb, 0xba7e8ccb), TOBN(0x610c71d1, 0xa5f4ff5e),
+ TOBN(0x9e2ee1a7, 0x5aa07093), TOBN(0xca84004b, 0xa75da47c),
+ TOBN(0x074d3951, 0x3de75401), TOBN(0xf938f756, 0xbb311592),
+ TOBN(0x96197618, 0x00a43421), TOBN(0x39a25362, 0x07bc78c8),
+ TOBN(0x278f710a, 0x0a171276), TOBN(0xb28446ea, 0x8d1a8f08),
+ TOBN(0x184781bf, 0xe3b6a661), TOBN(0x7751cb1d, 0xe6d279f7),
+ TOBN(0xf8ff95d6, 0xc59eb662), TOBN(0x186d90b7, 0x58d3dea7),
+ TOBN(0x0e4bb6c1, 0xdfb4f754), TOBN(0x5c5cf56b, 0x2b2801dc),
+ TOBN(0xc561e452, 0x1f54564d), TOBN(0xb4fb8c60, 0xf0dd7f13),
+ TOBN(0xf8849630, 0x33ff98c7), TOBN(0x9619fffa, 0xcf17769c),
+ TOBN(0xf8090bf6, 0x1bfdd80a), TOBN(0x14d9a149, 0x422cfe63),
+ TOBN(0xb354c360, 0x6f6df9ea), TOBN(0xdbcf770d, 0x218f17ea),
+ TOBN(0x207db7c8, 0x79eb3480), TOBN(0x213dbda8, 0x559b6a26),
+ TOBN(0xac4c200b, 0x29fc81b3), TOBN(0xebc3e09f, 0x171d87c1),
+ TOBN(0x91799530, 0x1481aa9e), TOBN(0x051b92e1, 0x92e114fa),
+ TOBN(0xdf8f92e9, 0xecb5537f), TOBN(0x44b1b2cc, 0x290c7483),
+ TOBN(0xa711455a, 0x2adeb016), TOBN(0x964b6856, 0x81a10c2c),
+ TOBN(0x4f159d99, 0xcec03623), TOBN(0x05532225, 0xef3271ea),
+ TOBN(0xb231bea3, 0xc5ee4849), TOBN(0x57a54f50, 0x7094f103),
+ TOBN(0x3e2d421d, 0x9598b352), TOBN(0xe865a49c, 0x67412ab4),
+ TOBN(0xd2998a25, 0x1cc3a912), TOBN(0x5d092808, 0x0c74d65d),
+ TOBN(0x73f45908, 0x4088567a), TOBN(0xeb6b280e, 0x1f214a61),
+ TOBN(0x8c9adc34, 0xcaf0c13d), TOBN(0x39d12938, 0xf561fb80),
+ TOBN(0xb2dc3a5e, 0xbc6edfb4), TOBN(0x7485b1b1, 0xfe4d210e),
+ TOBN(0x062e0400, 0xe186ae72), TOBN(0x91e32d5c, 0x6eeb3b88),
+ TOBN(0x6df574d7, 0x4be59224), TOBN(0xebc88ccc, 0x716d55f3),
+ TOBN(0x26c2e6d0, 0xcad6ed33), TOBN(0xc6e21e7d, 0x0d3e8b10),
+ TOBN(0x2cc5840e, 0x5bcc36bb), TOBN(0x9292445e, 0x7da74f69),
+ TOBN(0x8be8d321, 0x4e5193a8), TOBN(0x3ec23629, 0x8df06413),
+ TOBN(0xc7e9ae85, 0xb134defa), TOBN(0x6073b1d0, 0x1bb2d475),
+ TOBN(0xb9ad615e, 0x2863c00d), TOBN(0x9e29493d, 0x525f4ac4),
+ TOBN(0xc32b1dea, 0x4e9acf4f), TOBN(0x3e1f01c8, 0xa50db88d),
+ TOBN(0xb05d70ea, 0x04da916c), TOBN(0x714b0d0a, 0xd865803e),
+ TOBN(0x4bd493fc, 0x9920cb5e), TOBN(0x5b44b1f7, 0x92c7a3ac),
+ TOBN(0xa2a77293, 0xbcec9235), TOBN(0x5ee06e87, 0xcd378553),
+ TOBN(0xceff8173, 0xda621607), TOBN(0x2bb03e4c, 0x99f5d290),
+ TOBN(0x2945106a, 0xa6f734ac), TOBN(0xb5056604, 0xd25c4732),
+ TOBN(0x5945920c, 0xe079afee), TOBN(0x686e17a0, 0x6789831f),
+ TOBN(0x5966bee8, 0xb74a5ae5), TOBN(0x38a673a2, 0x1e258d46),
+ TOBN(0xbd1cc1f2, 0x83141c95), TOBN(0x3b2ecf4f, 0x0e96e486),
+ TOBN(0xcd3aa896, 0x74e5fc78), TOBN(0x415ec10c, 0x2482fa7a),
+ TOBN(0x15234419, 0x80503380), TOBN(0x513d917a, 0xd314b392),
+ TOBN(0xb0b52f4e, 0x63caecae), TOBN(0x07bf22ad, 0x2dc7780b),
+ TOBN(0xe761e8a1, 0xe4306839), TOBN(0x1b3be962, 0x5dd7feaa),
+ TOBN(0x4fe728de, 0x74c778f1), TOBN(0xf1fa0bda, 0x5e0070f6),
+ TOBN(0x85205a31, 0x6ec3f510), TOBN(0x2c7e4a14, 0xd2980475),
+ TOBN(0xde3c19c0, 0x6f30ebfd), TOBN(0xdb1c1f38, 0xd4b7e644),
+ TOBN(0xfe291a75, 0x5dce364a), TOBN(0xb7b22a3c, 0x058f5be3),
+ TOBN(0x2cd2c302, 0x37fea38c), TOBN(0x2930967a, 0x2e17be17),
+ TOBN(0x87f009de, 0x0c061c65), TOBN(0xcb014aac, 0xedc6ed44),
+ TOBN(0x49bd1cb4, 0x3bafb1eb), TOBN(0x81bd8b5c, 0x282d3688),
+ TOBN(0x1cdab87e, 0xf01a17af), TOBN(0x21f37ac4, 0xe710063b),
+ TOBN(0x5a6c5676, 0x42fc8193), TOBN(0xf4753e70, 0x56a6015c),
+ TOBN(0x020f795e, 0xa15b0a44), TOBN(0x8f37c8d7, 0x8958a958),
+ TOBN(0x63b7e89b, 0xa4b675b5), TOBN(0xb4fb0c0c, 0x0fc31aea),
+ TOBN(0xed95e639, 0xa7ff1f2e), TOBN(0x9880f5a3, 0x619614fb),
+ TOBN(0xdeb6ff02, 0x947151ab), TOBN(0x5bc5118c, 0xa868dcdb),
+ TOBN(0xd8da2055, 0x4c20cea5), TOBN(0xcac2776e, 0x14c4d69a),
+ TOBN(0xcccb22c1, 0x622d599b), TOBN(0xa4ddb653, 0x68a9bb50),
+ TOBN(0x2c4ff151, 0x1b4941b4), TOBN(0xe1ff19b4, 0x6efba588),
+ TOBN(0x35034363, 0xc48345e0), TOBN(0x45542e3d, 0x1e29dfc4),
+ TOBN(0xf197cb91, 0x349f7aed), TOBN(0x3b2b5a00, 0x8fca8420),
+ TOBN(0x7c175ee8, 0x23aaf6d8), TOBN(0x54dcf421, 0x35af32b6),
+ TOBN(0x0ba14307, 0x27d6561e), TOBN(0x879d5ee4, 0xd175b1e2),
+ TOBN(0xc7c43673, 0x99807db5), TOBN(0x77a54455, 0x9cd55bcd),
+ TOBN(0xe6c2ff13, 0x0105c072), TOBN(0x18f7a99f, 0x8dda7da4),
+ TOBN(0x4c301820, 0x0e2d35c1), TOBN(0x06a53ca0, 0xd9cc6c82),
+ TOBN(0xaa21cc1e, 0xf1aa1d9e), TOBN(0x32414334, 0x4a75b1e8),
+ TOBN(0x2a6d1328, 0x0ebe9fdc), TOBN(0x16bd173f, 0x98a4755a),
+ TOBN(0xfbb9b245, 0x2133ffd9), TOBN(0x39a8b2f1, 0x830f1a20),
+ TOBN(0x484bc97d, 0xd5a1f52a), TOBN(0xd6aebf56, 0xa40eddf8),
+ TOBN(0x32257acb, 0x76ccdac6), TOBN(0xaf4d36ec, 0x1586ff27),
+ TOBN(0x8eaa8863, 0xf8de7dd1), TOBN(0x0045d5cf, 0x88647c16)}
+ ,
+ {TOBN(0xa6f3d574, 0xc005979d), TOBN(0xc2072b42, 0x6a40e350),
+ TOBN(0xfca5c156, 0x8de2ecf9), TOBN(0xa8c8bf5b, 0xa515344e),
+ TOBN(0x97aee555, 0x114df14a), TOBN(0xd4374a4d, 0xfdc5ec6b),
+ TOBN(0x754cc28f, 0x2ca85418), TOBN(0x71cb9e27, 0xd3c41f78),
+ TOBN(0x89105079, 0x03605c39), TOBN(0xf0843d9e, 0xa142c96c),
+ TOBN(0xf3744934, 0x16923684), TOBN(0x732caa2f, 0xfa0a2893),
+ TOBN(0xb2e8c270, 0x61160170), TOBN(0xc32788cc, 0x437fbaa3),
+ TOBN(0x39cd818e, 0xa6eda3ac), TOBN(0xe2e94239, 0x9e2b2e07),
+ TOBN(0x6967d39b, 0x0260e52a), TOBN(0xd42585cc, 0x90653325),
+ TOBN(0x0d9bd605, 0x21ca7954), TOBN(0x4fa20877, 0x81ed57b3),
+ TOBN(0x60c1eff8, 0xe34a0bbe), TOBN(0x56b0040c, 0x84f6ef64),
+ TOBN(0x28be2b24, 0xb1af8483), TOBN(0xb2278163, 0xf5531614),
+ TOBN(0x8df27545, 0x5922ac1c), TOBN(0xa7b3ef5c, 0xa52b3f63),
+ TOBN(0x8e77b214, 0x71de57c4), TOBN(0x31682c10, 0x834c008b),
+ TOBN(0xc76824f0, 0x4bd55d31), TOBN(0xb6d1c086, 0x17b61c71),
+ TOBN(0x31db0903, 0xc2a5089d), TOBN(0x9c092172, 0x184e5d3f),
+ TOBN(0xdd7ced5b, 0xc00cc638), TOBN(0x1a2015eb, 0x61278fc2),
+ TOBN(0x2e8e5288, 0x6a37f8d6), TOBN(0xc457786f, 0xe79933ad),
+ TOBN(0xb3fe4cce, 0x2c51211a), TOBN(0xad9b10b2, 0x24c20498),
+ TOBN(0x90d87a4f, 0xd28db5e5), TOBN(0x698cd105, 0x3aca2fc3),
+ TOBN(0x4f112d07, 0xe91b536d), TOBN(0xceb982f2, 0x9eba09d6),
+ TOBN(0x3c157b2c, 0x197c396f), TOBN(0xe23c2d41, 0x7b66eb24),
+ TOBN(0x480c57d9, 0x3f330d37), TOBN(0xb3a4c8a1, 0x79108deb),
+ TOBN(0x702388de, 0xcb199ce5), TOBN(0x0b019211, 0xb944a8d4),
+ TOBN(0x24f2a692, 0x840bb336), TOBN(0x7c353bdc, 0xa669fa7b),
+ TOBN(0xda20d6fc, 0xdec9c300), TOBN(0x625fbe2f, 0xa13a4f17),
+ TOBN(0xa2b1b61a, 0xdbc17328), TOBN(0x008965bf, 0xa9515621),
+ TOBN(0x49690939, 0xc620ff46), TOBN(0x182dd27d, 0x8717e91c),
+ TOBN(0x5ace5035, 0xea6c3997), TOBN(0x54259aaa, 0xc2610bef),
+ TOBN(0xef18bb3f, 0x3c80dd39), TOBN(0x6910b95b, 0x5fc3fa39),
+ TOBN(0xfce2f510, 0x43e09aee), TOBN(0xced56c9f, 0xa7675665),
+ TOBN(0x10e265ac, 0xd872db61), TOBN(0x6982812e, 0xae9fce69),
+ TOBN(0x29be11c6, 0xce800998), TOBN(0x72bb1752, 0xb90360d9),
+ TOBN(0x2c193197, 0x5a4ad590), TOBN(0x2ba2f548, 0x9fc1dbc0),
+ TOBN(0x7fe4eebb, 0xe490ebe0), TOBN(0x12a0a4cd, 0x7fae11c0),
+ TOBN(0x7197cf81, 0xe903ba37), TOBN(0xcf7d4aa8, 0xde1c6dd8),
+ TOBN(0x92af6bf4, 0x3fd5684c), TOBN(0x2b26eecf, 0x80360aa1),
+ TOBN(0xbd960f30, 0x00546a82), TOBN(0x407b3c43, 0xf59ad8fe),
+ TOBN(0x86cae5fe, 0x249c82ba), TOBN(0x9e0faec7, 0x2463744c),
+ TOBN(0x87f551e8, 0x94916272), TOBN(0x033f9344, 0x6ceb0615),
+ TOBN(0x1e5eb0d1, 0x8be82e84), TOBN(0x89967f0e, 0x7a582fef),
+ TOBN(0xbcf687d5, 0xa6e921fa), TOBN(0xdfee4cf3, 0xd37a09ba),
+ TOBN(0x94f06965, 0xb493c465), TOBN(0x638b9a1c, 0x7635c030),
+ TOBN(0x76667864, 0x66f05e9f), TOBN(0xccaf6808, 0xc04da725),
+ TOBN(0xca2eb690, 0x768fccfc), TOBN(0xf402d37d, 0xb835b362),
+ TOBN(0x0efac0d0, 0xe2fdfcce), TOBN(0xefc9cdef, 0xb638d990),
+ TOBN(0x2af12b72, 0xd1669a8b), TOBN(0x33c536bc, 0x5774ccbd),
+ TOBN(0x30b21909, 0xfb34870e), TOBN(0xc38fa2f7, 0x7df25aca),
+ TOBN(0x74c5f02b, 0xbf81f3f5), TOBN(0x0525a5ae, 0xaf7e4581),
+ TOBN(0x88d2aaba, 0x433c54ae), TOBN(0xed9775db, 0x806a56c5),
+ TOBN(0xd320738a, 0xc0edb37d), TOBN(0x25fdb6ee, 0x66cc1f51),
+ TOBN(0xac661d17, 0x10600d76), TOBN(0x931ec1f3, 0xbdd1ed76),
+ TOBN(0x65c11d62, 0x19ee43f1), TOBN(0x5cd57c3e, 0x60829d97),
+ TOBN(0xd26c91a3, 0x984be6e8), TOBN(0xf08d9309, 0x8b0c53bd),
+ TOBN(0x94bc9e5b, 0xc016e4ea), TOBN(0xd3916839, 0x11d43d2b),
+ TOBN(0x886c5ad7, 0x73701155), TOBN(0xe0377626, 0x20b00715),
+ TOBN(0x7f01c9ec, 0xaa80ba59), TOBN(0x3083411a, 0x68538e51),
+ TOBN(0x970370f1, 0xe88128af), TOBN(0x625cc3db, 0x91dec14b),
+ TOBN(0xfef9666c, 0x01ac3107), TOBN(0xb2a8d577, 0xd5057ac3),
+ TOBN(0xb0f26299, 0x92be5df7), TOBN(0xf579c8e5, 0x00353924),
+ TOBN(0xb8fa3d93, 0x1341ed7a), TOBN(0x4223272c, 0xa7b59d49),
+ TOBN(0x3dcb1947, 0x83b8c4a4), TOBN(0x4e413c01, 0xed1302e4),
+ TOBN(0x6d999127, 0xe17e44ce), TOBN(0xee86bf75, 0x33b3adfb),
+ TOBN(0xf6902fe6, 0x25aa96ca), TOBN(0xb73540e4, 0xe5aae47d),
+ TOBN(0x32801d7b, 0x1b4a158c), TOBN(0xe571c99e, 0x27e2a369),
+ TOBN(0x40cb76c0, 0x10d9f197), TOBN(0xc308c289, 0x3167c0ae),
+ TOBN(0xa6ef9dd3, 0xeb7958f2), TOBN(0xa7226dfc, 0x300879b1),
+ TOBN(0x6cd0b362, 0x7edf0636), TOBN(0x4efbce6c, 0x7bc37eed),
+ TOBN(0x75f92a05, 0x8d699021), TOBN(0x586d4c79, 0x772566e3),
+ TOBN(0x378ca5f1, 0x761ad23a), TOBN(0x650d86fc, 0x1465a8ac),
+ TOBN(0x7a4ed457, 0x842ba251), TOBN(0x6b65e3e6, 0x42234933),
+ TOBN(0xaf1543b7, 0x31aad657), TOBN(0xa4cefe98, 0xcbfec369),
+ TOBN(0xb587da90, 0x9f47befb), TOBN(0x6562e9fb, 0x41312d13),
+ TOBN(0xa691ea59, 0xeff1cefe), TOBN(0xcc30477a, 0x05fc4cf6),
+ TOBN(0xa1632461, 0x0b0ffd3d), TOBN(0xa1f16f3b, 0x5b355956),
+ TOBN(0x5b148d53, 0x4224ec24), TOBN(0xdc834e7b, 0xf977012a),
+ TOBN(0x7bfc5e75, 0xb2c69dbc), TOBN(0x3aa77a29, 0x03c3da6c),
+ TOBN(0xde0df03c, 0xca910271), TOBN(0xcbd5ca4a, 0x7806dc55),
+ TOBN(0xe1ca5807, 0x6db476cb), TOBN(0xfde15d62, 0x5f37a31e),
+ TOBN(0xf49af520, 0xf41af416), TOBN(0x96c5c5b1, 0x7d342db5),
+ TOBN(0x155c43b7, 0xeb4ceb9b), TOBN(0x2e993010, 0x4e77371a),
+ TOBN(0x1d2987da, 0x675d43af), TOBN(0xef2bc1c0, 0x8599fd72),
+ TOBN(0x96894b7b, 0x9342f6b2), TOBN(0x201eadf2, 0x7c8e71f0),
+ TOBN(0xf3479d9f, 0x4a1f3efc), TOBN(0xe0f8a742, 0x702a9704),
+ TOBN(0xeafd44b6, 0xb3eba40c), TOBN(0xf9739f29, 0xc1c1e0d0),
+ TOBN(0x0091471a, 0x619d505e), TOBN(0xc15f9c96, 0x9d7c263e),
+ TOBN(0x5be47285, 0x83afbe33), TOBN(0xa3b6d6af, 0x04f1e092),
+ TOBN(0xe76526b9, 0x751a9d11), TOBN(0x2ec5b26d, 0x9a4ae4d2),
+ TOBN(0xeb66f4d9, 0x02f6fb8d), TOBN(0x4063c561, 0x96912164),
+ TOBN(0xeb7050c1, 0x80ef3000), TOBN(0x288d1c33, 0xeaa5b3f0),
+ TOBN(0xe87c68d6, 0x07806fd8), TOBN(0xb2f7f9d5, 0x4bbbf50f),
+ TOBN(0x25972f3a, 0xac8d6627), TOBN(0xf8547774, 0x10e8c13b),
+ TOBN(0xcc50ef6c, 0x872b4a60), TOBN(0xab2a34a4, 0x4613521b),
+ TOBN(0x39c5c190, 0x983e15d1), TOBN(0x61dde5df, 0x59905512),
+ TOBN(0xe417f621, 0x9f2275f3), TOBN(0x0750c8b6, 0x451d894b),
+ TOBN(0x75b04ab9, 0x78b0bdaa), TOBN(0x3bfd9fd4, 0x458589bd),
+ TOBN(0xf1013e30, 0xee9120b6), TOBN(0x2b51af93, 0x23a4743e),
+ TOBN(0xea96ffae, 0x48d14d9e), TOBN(0x71dc0dbe, 0x698a1d32),
+ TOBN(0x914962d2, 0x0180cca4), TOBN(0x1ae60677, 0xc3568963),
+ TOBN(0x8cf227b1, 0x437bc444), TOBN(0xc650c83b, 0xc9962c7a),
+ TOBN(0x23c2c7dd, 0xfe7ccfc4), TOBN(0xf925c89d, 0x1b929d48),
+ TOBN(0x4460f74b, 0x06783c33), TOBN(0xac2c8d49, 0xa590475a),
+ TOBN(0xfb40b407, 0xb807bba0), TOBN(0x9d1e362d, 0x69ff8f3a),
+ TOBN(0xa33e9681, 0xcbef64a4), TOBN(0x67ece5fa, 0x332fb4b2),
+ TOBN(0x6900a99b, 0x739f10e3), TOBN(0xc3341ca9, 0xff525925),
+ TOBN(0xee18a626, 0xa9e2d041), TOBN(0xa5a83685, 0x29580ddd),
+ TOBN(0xf3470c81, 0x9d7de3cd), TOBN(0xedf02586, 0x2062cf9c),
+ TOBN(0xf43522fa, 0xc010edb0), TOBN(0x30314135, 0x13a4b1ae),
+ TOBN(0xc792e02a, 0xdb22b94b), TOBN(0x993d8ae9, 0xa1eaa45b),
+ TOBN(0x8aad6cd3, 0xcd1e1c63), TOBN(0x89529ca7, 0xc5ce688a),
+ TOBN(0x2ccee3aa, 0xe572a253), TOBN(0xe02b6438, 0x02a21efb),
+ TOBN(0xa7091b6e, 0xc9430358), TOBN(0x06d1b1fa, 0x9d7db504),
+ TOBN(0x58846d32, 0xc4744733), TOBN(0x40517c71, 0x379f9e34),
+ TOBN(0x2f65655f, 0x130ef6ca), TOBN(0x526e4488, 0xf1f3503f),
+ TOBN(0x8467bd17, 0x7ee4a976), TOBN(0x1d9dc913, 0x921363d1),
+ TOBN(0xd8d24c33, 0xb069e041), TOBN(0x5eb5da0a, 0x2cdf7f51),
+ TOBN(0x1c0f3cb1, 0x197b994f), TOBN(0x3c95a6c5, 0x2843eae9),
+ TOBN(0x7766ffc9, 0xa6097ea5), TOBN(0x7bea4093, 0xd723b867),
+ TOBN(0xb48e1f73, 0x4db378f9), TOBN(0x70025b00, 0xe37b77ac),
+ TOBN(0x943dc8e7, 0xaf24ad46), TOBN(0xb98a15ac, 0x16d00a85),
+ TOBN(0x3adc38ba, 0x2743b004), TOBN(0xb1c7f4f7, 0x334415ee),
+ TOBN(0xea43df8f, 0x1e62d05a), TOBN(0x32618905, 0x9d76a3b6),
+ TOBN(0x2fbd0bb5, 0xa23a0f46), TOBN(0x5bc971db, 0x6a01918c),
+ TOBN(0x7801d94a, 0xb4743f94), TOBN(0xb94df65e, 0x676ae22b),
+ TOBN(0xaafcbfab, 0xaf95894c), TOBN(0x7b9bdc07, 0x276b2241),
+ TOBN(0xeaf98362, 0x5bdda48b), TOBN(0x5977faf2, 0xa3fcb4df),
+ TOBN(0xbed042ef, 0x052c4b5b), TOBN(0x9fe87f71, 0x067591f0),
+ TOBN(0xc89c73ca, 0x22f24ec7), TOBN(0x7d37fa9e, 0xe64a9f1b),
+ TOBN(0x2710841a, 0x15562627), TOBN(0x2c01a613, 0xc243b034),
+ TOBN(0x1d135c56, 0x2bc68609), TOBN(0xc2ca1715, 0x8b03f1f6),
+ TOBN(0xc9966c2d, 0x3eb81d82), TOBN(0xc02abf4a, 0x8f6df13e),
+ TOBN(0x77b34bd7, 0x8f72b43b), TOBN(0xaff6218f, 0x360c82b0),
+ TOBN(0x0aa5726c, 0x8d55b9d2), TOBN(0xdc0adbe9, 0x99e9bffb),
+ TOBN(0x9097549c, 0xefb9e72a), TOBN(0x16755712, 0x9dfb3111),
+ TOBN(0xdd8bf984, 0xf26847f9), TOBN(0xbcb8e387, 0xdfb30cb7),
+ TOBN(0xc1fd32a7, 0x5171ef9c), TOBN(0x977f3fc7, 0x389b363f),
+ TOBN(0x116eaf2b, 0xf4babda0), TOBN(0xfeab68bd, 0xf7113c8e),
+ TOBN(0xd1e3f064, 0xb7def526), TOBN(0x1ac30885, 0xe0b3fa02),
+ TOBN(0x1c5a6e7b, 0x40142d9d), TOBN(0x839b5603, 0x30921c0b),
+ TOBN(0x48f301fa, 0x36a116a3), TOBN(0x380e1107, 0xcfd9ee6d),
+ TOBN(0x7945ead8, 0x58854be1), TOBN(0x4111c12e, 0xcbd4d49d),
+ TOBN(0xece3b1ec, 0x3a29c2ef), TOBN(0x6356d404, 0x8d3616f5),
+ TOBN(0x9f0d6a8f, 0x594d320e), TOBN(0x0989316d, 0xf651ccd2),
+ TOBN(0x6c32117a, 0x0f8fdde4), TOBN(0x9abe5cc5, 0xa26a9bbc),
+ TOBN(0xcff560fb, 0x9723f671), TOBN(0x21b2a12d, 0x7f3d593c),
+ TOBN(0xe4cb18da, 0x24ba0696), TOBN(0x186e2220, 0xc3543384),
+ TOBN(0x722f64e0, 0x88312c29), TOBN(0x94282a99, 0x17dc7752),
+ TOBN(0x62467bbf, 0x5a85ee89), TOBN(0xf435c650, 0xf10076a0),
+ TOBN(0xc9ff1539, 0x43b3a50b), TOBN(0x7132130c, 0x1a53efbc),
+ TOBN(0x31bfe063, 0xf7b0c5b7), TOBN(0xb0179a7d, 0x4ea994cc),
+ TOBN(0x12d064b3, 0xc85f455b), TOBN(0x47259328, 0x8f6e0062),
+ TOBN(0xf64e590b, 0xb875d6d9), TOBN(0x22dd6225, 0xad92bcc7),
+ TOBN(0xb658038e, 0xb9c3bd6d), TOBN(0x00cdb0d6, 0xfbba27c8),
+ TOBN(0x0c681337, 0x1062c45d), TOBN(0xd8515b8c, 0x2d33407d),
+ TOBN(0xcb8f699e, 0x8cbb5ecf), TOBN(0x8c4347f8, 0xc608d7d8),
+ TOBN(0x2c11850a, 0xbb3e00db), TOBN(0x20a8dafd, 0xecb49d19),
+ TOBN(0xbd781480, 0x45ee2f40), TOBN(0x75e354af, 0x416b60cf),
+ TOBN(0xde0b58a1, 0x8d49a8c4), TOBN(0xe40e94e2, 0xfa359536),
+ TOBN(0xbd4fa59f, 0x62accd76), TOBN(0x05cf466a, 0x8c762837),
+ TOBN(0xb5abda99, 0x448c277b), TOBN(0x5a9e01bf, 0x48b13740),
+ TOBN(0x9d457798, 0x326aad8d), TOBN(0xbdef4954, 0xc396f7e7),
+ TOBN(0x6fb274a2, 0xc253e292), TOBN(0x2800bf0a, 0x1cfe53e7),
+ TOBN(0x22426d31, 0x44438fd4), TOBN(0xef233923, 0x5e259f9a),
+ TOBN(0x4188503c, 0x03f66264), TOBN(0x9e5e7f13, 0x7f9fdfab),
+ TOBN(0x565eb76c, 0x5fcc1aba), TOBN(0xea632548, 0x59b5bff8),
+ TOBN(0x5587c087, 0xaab6d3fa), TOBN(0x92b639ea, 0x6ce39c1b),
+ TOBN(0x0706e782, 0x953b135c), TOBN(0x7308912e, 0x425268ef),
+ TOBN(0x599e92c7, 0x090e7469), TOBN(0x83b90f52, 0x9bc35e75),
+ TOBN(0x4750b3d0, 0x244975b3), TOBN(0xf3a44358, 0x11965d72),
+ TOBN(0x179c6774, 0x9c8dc751), TOBN(0xff18cdfe, 0xd23d9ff0),
+ TOBN(0xc4013833, 0x2028e247), TOBN(0x96e280e2, 0xf3bfbc79),
+ TOBN(0xf60417bd, 0xd0880a84), TOBN(0x263c9f3d, 0x2a568151),
+ TOBN(0x36be15b3, 0x2d2ce811), TOBN(0x846dc0c2, 0xf8291d21),
+ TOBN(0x5cfa0ecb, 0x789fcfdb), TOBN(0x45a0beed, 0xd7535b9a),
+ TOBN(0xec8e9f07, 0x96d69af1), TOBN(0x31a7c5b8, 0x599ab6dc),
+ TOBN(0xd36d45ef, 0xf9e2e09f), TOBN(0x3cf49ef1, 0xdcee954b),
+ TOBN(0x6be34cf3, 0x086cff9b), TOBN(0x88dbd491, 0x39a3360f),
+ TOBN(0x1e96b8cc, 0x0dbfbd1d), TOBN(0xc1e5f7bf, 0xcb7e2552),
+ TOBN(0x0547b214, 0x28819d98), TOBN(0xc770dd9c, 0x7aea9dcb),
+ TOBN(0xaef0d4c7, 0x041d68c8), TOBN(0xcc2b9818, 0x13cb9ba8),
+ TOBN(0x7fc7bc76, 0xfe86c607), TOBN(0x6b7b9337, 0x502a9a95),
+ TOBN(0x1948dc27, 0xd14dab63), TOBN(0x249dd198, 0xdae047be),
+ TOBN(0xe8356584, 0xa981a202), TOBN(0x3531dd18, 0x3a893387),
+ TOBN(0x1be11f90, 0xc85c7209), TOBN(0x93d2fe1e, 0xe2a52b5a),
+ TOBN(0x8225bfe2, 0xec6d6b97), TOBN(0x9cf6d6f4, 0xbd0aa5de),
+ TOBN(0x911459cb, 0x54779f5f), TOBN(0x5649cddb, 0x86aeb1f3),
+ TOBN(0x32133579, 0x3f26ce5a), TOBN(0xc289a102, 0x550f431e),
+ TOBN(0x559dcfda, 0x73b84c6f), TOBN(0x84973819, 0xee3ac4d7),
+ TOBN(0xb51e55e6, 0xf2606a82), TOBN(0xe25f7061, 0x90f2fb57),
+ TOBN(0xacef6c2a, 0xb1a4e37c), TOBN(0x864e359d, 0x5dcf2706),
+ TOBN(0x479e6b18, 0x7ce57316), TOBN(0x2cab2500, 0x3a96b23d),
+ TOBN(0xed489862, 0x8ef16df7), TOBN(0x2056538c, 0xef3758b5),
+ TOBN(0xa7df865e, 0xf15d3101), TOBN(0x80c5533a, 0x61b553d7),
+ TOBN(0x366e1997, 0x4ed14294), TOBN(0x6620741f, 0xb3c0bcd6),
+ TOBN(0x21d1d9c4, 0xedc45418), TOBN(0x005b859e, 0xc1cc4a9d),
+ TOBN(0xdf01f630, 0xa1c462f0), TOBN(0x15d06cf3, 0xf26820c7),
+ TOBN(0x9f7f24ee, 0x3484be47), TOBN(0x2ff33e96, 0x4a0c902f),
+ TOBN(0x00bdf457, 0x5a0bc453), TOBN(0x2378dfaf, 0x1aa238db),
+ TOBN(0x272420ec, 0x856720f2), TOBN(0x2ad9d95b, 0x96797291),
+ TOBN(0xd1242cc6, 0x768a1558), TOBN(0x2e287f8b, 0x5cc86aa8),
+ TOBN(0x796873d0, 0x990cecaa), TOBN(0xade55f81, 0x675d4080),
+ TOBN(0x2645eea3, 0x21f0cd84), TOBN(0x7a1efa0f, 0xb4e17d02),
+ TOBN(0xf6858420, 0x037cc061), TOBN(0x682e05f0, 0xd5d43e12),
+ TOBN(0x59c36994, 0x27218710), TOBN(0x85cbba4d, 0x3f7cd2fc),
+ TOBN(0x726f9729, 0x7a3cd22a), TOBN(0x9f8cd5dc, 0x4a628397),
+ TOBN(0x17b93ab9, 0xc23165ed), TOBN(0xff5f5dbf, 0x122823d4),
+ TOBN(0xc1e4e4b5, 0x654a446d), TOBN(0xd1a9496f, 0x677257ba),
+ TOBN(0x6387ba94, 0xde766a56), TOBN(0x23608bc8, 0x521ec74a),
+ TOBN(0x16a522d7, 0x6688c4d4), TOBN(0x9d6b4282, 0x07373abd),
+ TOBN(0xa62f07ac, 0xb42efaa3), TOBN(0xf73e00f7, 0xe3b90180),
+ TOBN(0x36175fec, 0x49421c3e), TOBN(0xc4e44f9b, 0x3dcf2678),
+ TOBN(0x76df436b, 0x7220f09f), TOBN(0x172755fb, 0x3aa8b6cf),
+ TOBN(0xbab89d57, 0x446139cc), TOBN(0x0a0a6e02, 0x5fe0208f),
+ TOBN(0xcdbb63e2, 0x11e5d399), TOBN(0x33ecaa12, 0xa8977f0b),
+ TOBN(0x59598b21, 0xf7c42664), TOBN(0xb3e91b32, 0xab65d08a),
+ TOBN(0x035822ee, 0xf4502526), TOBN(0x1dcf0176, 0x720a82a9),
+ TOBN(0x50f8598f, 0x3d589e02), TOBN(0xdf0478ff, 0xb1d63d2c),
+ TOBN(0x8b8068bd, 0x1571cd07), TOBN(0x30c3aa4f, 0xd79670cd),
+ TOBN(0x25e8fd4b, 0x941ade7f), TOBN(0x3d1debdc, 0x32790011),
+ TOBN(0x65b6dcbd, 0x3a3f9ff0), TOBN(0x282736a4, 0x793de69c),
+ TOBN(0xef69a0c3, 0xd41d3bd3), TOBN(0xb533b8c9, 0x07a26bde),
+ TOBN(0xe2801d97, 0xdb2edf9f), TOBN(0xdc4a8269, 0xe1877af0),
+ TOBN(0x6c1c5851, 0x3d590dbe), TOBN(0x84632f6b, 0xee4e9357),
+ TOBN(0xd36d36b7, 0x79b33374), TOBN(0xb46833e3, 0x9bbca2e6),
+ TOBN(0x37893913, 0xf7fc0586), TOBN(0x385315f7, 0x66bf4719),
+ TOBN(0x72c56293, 0xb31855dc), TOBN(0xd1416d4e, 0x849061fe),
+ TOBN(0xbeb3ab78, 0x51047213), TOBN(0x447f6e61, 0xf040c996),
+ TOBN(0xd06d310d, 0x638b1d0c), TOBN(0xe28a413f, 0xbad1522e),
+ TOBN(0x685a76cb, 0x82003f86), TOBN(0x610d07f7, 0x0bcdbca3),
+ TOBN(0x6ff66021, 0x9ca4c455), TOBN(0x7df39b87, 0xcea10eec),
+ TOBN(0xb9255f96, 0xe22db218), TOBN(0x8cc6d9eb, 0x08a34c44),
+ TOBN(0xcd4ffb86, 0x859f9276), TOBN(0x8fa15eb2, 0x50d07335),
+ TOBN(0xdf553845, 0xcf2c24b5), TOBN(0x89f66a9f, 0x52f9c3ba),
+ TOBN(0x8f22b5b9, 0xe4a7ceb3), TOBN(0xaffef809, 0x0e134686),
+ TOBN(0x3e53e1c6, 0x8eb8fac2), TOBN(0x93c1e4eb, 0x28aec98e),
+ TOBN(0xb6b91ec5, 0x32a43bcb), TOBN(0x2dbfa947, 0xb2d74a51),
+ TOBN(0xe065d190, 0xca84bad7), TOBN(0xfb13919f, 0xad58e65c),
+ TOBN(0x3c41718b, 0xf1cb6e31), TOBN(0x688969f0, 0x06d05c3f),
+ TOBN(0xd4f94ce7, 0x21264d45), TOBN(0xfdfb65e9, 0x7367532b),
+ TOBN(0x5b1be8b1, 0x0945a39d), TOBN(0x229f789c, 0x2b8baf3b),
+ TOBN(0xd8f41f3e, 0x6f49f15d), TOBN(0x678ce828, 0x907f0792),
+ TOBN(0xc69ace82, 0xfca6e867), TOBN(0x106451ae, 0xd01dcc89),
+ TOBN(0x1bb4f7f0, 0x19fc32d2), TOBN(0x64633dfc, 0xb00c52d2),
+ TOBN(0x8f13549a, 0xad9ea445), TOBN(0x99a3bf50, 0xfb323705),
+ TOBN(0x0c9625a2, 0x534d4dbc), TOBN(0x45b8f1d1, 0xc2a2fea3),
+ TOBN(0x76ec21a1, 0xa530fc1a), TOBN(0x4bac9c2a, 0x9e5bd734),
+ TOBN(0x5996d76a, 0x7b4e3587), TOBN(0x0045cdee, 0x1182d9e3),
+ TOBN(0x1aee24b9, 0x1207f13d), TOBN(0x66452e97, 0x97345a41),
+ TOBN(0x16e5b054, 0x9f950cd0), TOBN(0x9cc72fb1, 0xd7fdd075),
+ TOBN(0x6edd61e7, 0x66249663), TOBN(0xde4caa4d, 0xf043cccb),
+ TOBN(0x11b1f57a, 0x55c7ac17), TOBN(0x779cbd44, 0x1a85e24d),
+ TOBN(0x78030f86, 0xe46081e7), TOBN(0xfd4a6032, 0x8e20f643),
+ TOBN(0xcc7a6488, 0x0a750c0f), TOBN(0x39bacfe3, 0x4e548e83),
+ TOBN(0x3d418c76, 0x0c110f05), TOBN(0x3e4daa4c, 0xb1f11588),
+ TOBN(0x2733e7b5, 0x5ffc69ff), TOBN(0x46f147bc, 0x92053127),
+ TOBN(0x885b2434, 0xd722df94), TOBN(0x6a444f65, 0xe6fc6b7c)}
+ ,
+ {TOBN(0x7a1a465a, 0xc3f16ea8), TOBN(0x115a461d, 0xb2f1d11c),
+ TOBN(0x4767dd95, 0x6c68a172), TOBN(0x3392f2eb, 0xd13a4698),
+ TOBN(0xc7a99ccd, 0xe526cdc7), TOBN(0x8e537fdc, 0x22292b81),
+ TOBN(0x76d8cf69, 0xa6d39198), TOBN(0xffc5ff43, 0x2446852d),
+ TOBN(0x97b14f7e, 0xa90567e6), TOBN(0x513257b7, 0xb6ae5cb7),
+ TOBN(0x85454a3c, 0x9f10903d), TOBN(0xd8d2c9ad, 0x69bc3724),
+ TOBN(0x38da9324, 0x6b29cb44), TOBN(0xb540a21d, 0x77c8cbac),
+ TOBN(0x9bbfe435, 0x01918e42), TOBN(0xfffa707a, 0x56c3614e),
+ TOBN(0x0ce4e3f1, 0xd4e353b7), TOBN(0x062d8a14, 0xef46b0a0),
+ TOBN(0x6408d5ab, 0x574b73fd), TOBN(0xbc41d1c9, 0xd3273ffd),
+ TOBN(0x3538e1e7, 0x6be77800), TOBN(0x71fe8b37, 0xc5655031),
+ TOBN(0x1cd91621, 0x6b9b331a), TOBN(0xad825d0b, 0xbb388f73),
+ TOBN(0x56c2e05b, 0x1cb76219), TOBN(0x0ec0bf91, 0x71567e7e),
+ TOBN(0xe7076f86, 0x61c4c910), TOBN(0xd67b085b, 0xbabc04d9),
+ TOBN(0x9fb90459, 0x5e93a96a), TOBN(0x7526c1ea, 0xfbdc249a),
+ TOBN(0x0d44d367, 0xecdd0bb7), TOBN(0x95399917, 0x9dc0d695),
+ TOBN(0x61360ee9, 0x9e240d18), TOBN(0x057cdcac, 0xb4b94466),
+ TOBN(0xe7667cd1, 0x2fe5325c), TOBN(0x1fa297b5, 0x21974e3b),
+ TOBN(0xfa4081e7, 0xdb083d76), TOBN(0x31993be6, 0xf206bd15),
+ TOBN(0x8949269b, 0x14c19f8c), TOBN(0x21468d72, 0xa9d92357),
+ TOBN(0x2ccbc583, 0xa4c506ec), TOBN(0x957ed188, 0xd1acfe97),
+ TOBN(0x8baed833, 0x12f1aea2), TOBN(0xef2a6cb4, 0x8325362d),
+ TOBN(0x130dde42, 0x8e195c43), TOBN(0xc842025a, 0x0e6050c6),
+ TOBN(0x2da972a7, 0x08686a5d), TOBN(0xb52999a1, 0xe508b4a8),
+ TOBN(0xd9f090b9, 0x10a5a8bd), TOBN(0xca91d249, 0x096864da),
+ TOBN(0x8e6a93be, 0x3f67dbc1), TOBN(0xacae6fba, 0xf5f4764c),
+ TOBN(0x1563c6e0, 0xd21411a0), TOBN(0x28fa787f, 0xda0a4ad8),
+ TOBN(0xd524491c, 0x908c8030), TOBN(0x1257ba0e, 0x4c795f07),
+ TOBN(0x83f49167, 0xceca9754), TOBN(0x426d2cf6, 0x4b7939a0),
+ TOBN(0x2555e355, 0x723fd0bf), TOBN(0xa96e6d06, 0xc4f144e2),
+ TOBN(0x4768a8dd, 0x87880e61), TOBN(0x15543815, 0xe508e4d5),
+ TOBN(0x09d7e772, 0xb1b65e15), TOBN(0x63439dd6, 0xac302fa0),
+ TOBN(0xb93f802f, 0xc14e35c2), TOBN(0x71735b7c, 0x4341333c),
+ TOBN(0x03a25104, 0x16d4f362), TOBN(0x3f4d069b, 0xbf433c8e),
+ TOBN(0x0d83ae01, 0xf78f5a7c), TOBN(0x50a8ffbe, 0x7c4eed07),
+ TOBN(0xc74f8906, 0x76e10f83), TOBN(0x7d080966, 0x9ddaf8e1),
+ TOBN(0xb11df8e1, 0x698e04cc), TOBN(0x877be203, 0x169005c8),
+ TOBN(0x32749e8c, 0x4f3c6179), TOBN(0x2dbc9d0a, 0x7853fc05),
+ TOBN(0x187d4f93, 0x9454d937), TOBN(0xe682ce9d, 0xb4800e1b),
+ TOBN(0xa9129ad8, 0x165e68e8), TOBN(0x0fe29735, 0xbe7f785b),
+ TOBN(0x5303f40c, 0x5b9e02b7), TOBN(0xa37c9692, 0x35ee04e8),
+ TOBN(0x5f46cc20, 0x34d6632b), TOBN(0x55ef72b2, 0x96ac545b),
+ TOBN(0xabec5c1f, 0x7b91b062), TOBN(0x0a79e1c7, 0xbb33e821),
+ TOBN(0xbb04b428, 0x3a9f4117), TOBN(0x0de1f28f, 0xfd2a475a),
+ TOBN(0x31019ccf, 0x3a4434b4), TOBN(0xa3458111, 0x1a7954dc),
+ TOBN(0xa9dac80d, 0xe34972a7), TOBN(0xb043d054, 0x74f6b8dd),
+ TOBN(0x021c319e, 0x11137b1a), TOBN(0x00a754ce, 0xed5cc03f),
+ TOBN(0x0aa2c794, 0xcbea5ad4), TOBN(0x093e67f4, 0x70c015b6),
+ TOBN(0x72cdfee9, 0xc97e3f6b), TOBN(0xc10bcab4, 0xb6da7461),
+ TOBN(0x3b02d2fc, 0xb59806b9), TOBN(0x85185e89, 0xa1de6f47),
+ TOBN(0x39e6931f, 0x0eb6c4d4), TOBN(0x4d4440bd, 0xd4fa5b04),
+ TOBN(0x5418786e, 0x34be7eb8), TOBN(0x6380e521, 0x9d7259bc),
+ TOBN(0x20ac0351, 0xd598d710), TOBN(0x272c4166, 0xcb3a4da4),
+ TOBN(0xdb82fe1a, 0xca71de1f), TOBN(0x746e79f2, 0xd8f54b0f),
+ TOBN(0x6e7fc736, 0x4b573e9b), TOBN(0x75d03f46, 0xfd4b5040),
+ TOBN(0x5c1cc36d, 0x0b98d87b), TOBN(0x513ba3f1, 0x1f472da1),
+ TOBN(0x79d0af26, 0xabb177dd), TOBN(0xf82ab568, 0x7891d564),
+ TOBN(0x2b6768a9, 0x72232173), TOBN(0xefbb3bb0, 0x8c1f6619),
+ TOBN(0xb29c11db, 0xa6d18358), TOBN(0x519e2797, 0xb0916d3a),
+ TOBN(0xd4dc18f0, 0x9188e290), TOBN(0x648e86e3, 0x98b0ca7f),
+ TOBN(0x859d3145, 0x983c38b5), TOBN(0xb14f176c, 0x637abc8b),
+ TOBN(0x2793fb9d, 0xcaff7be6), TOBN(0xebe5a55f, 0x35a66a5a),
+ TOBN(0x7cec1dcd, 0x9f87dc59), TOBN(0x7c595cd3, 0xfbdbf560),
+ TOBN(0x5b543b22, 0x26eb3257), TOBN(0x69080646, 0xc4c935fd),
+ TOBN(0x7f2e4403, 0x81e9ede3), TOBN(0x243c3894, 0xcaf6df0a),
+ TOBN(0x7c605bb1, 0x1c073b11), TOBN(0xcd06a541, 0xba6a4a62),
+ TOBN(0x29168949, 0x49d4e2e5), TOBN(0x33649d07, 0x4af66880),
+ TOBN(0xbfc0c885, 0xe9a85035), TOBN(0xb4e52113, 0xfc410f4b),
+ TOBN(0xdca3b706, 0x78a6513b), TOBN(0x92ea4a2a, 0x9edb1943),
+ TOBN(0x02642216, 0xdb6e2dd8), TOBN(0x9b45d0b4, 0x9fd57894),
+ TOBN(0x114e70db, 0xc69d11ae), TOBN(0x1477dd19, 0x4c57595f),
+ TOBN(0xbc2208b4, 0xec77c272), TOBN(0x95c5b4d7, 0xdb68f59c),
+ TOBN(0xb8c4fc63, 0x42e532b7), TOBN(0x386ba422, 0x9ae35290),
+ TOBN(0xfb5dda42, 0xd201ecbc), TOBN(0x2353dc8b, 0xa0e38fd6),
+ TOBN(0x9a0b85ea, 0x68f7e978), TOBN(0x96ec5682, 0x2ad6d11f),
+ TOBN(0x5e279d6c, 0xe5f6886d), TOBN(0xd3fe03cd, 0x3cb1914d),
+ TOBN(0xfe541fa4, 0x7ea67c77), TOBN(0x952bd2af, 0xe3ea810c),
+ TOBN(0x791fef56, 0x8d01d374), TOBN(0xa3a1c621, 0x0f11336e),
+ TOBN(0x5ad0d5a9, 0xc7ec6d79), TOBN(0xff7038af, 0x3225c342),
+ TOBN(0x003c6689, 0xbc69601b), TOBN(0x25059bc7, 0x45e8747d),
+ TOBN(0xfa4965b2, 0xf2086fbf), TOBN(0xf6840ea6, 0x86916078),
+ TOBN(0xd7ac7620, 0x70081d6c), TOBN(0xe600da31, 0xb5328645),
+ TOBN(0x01916f63, 0x529b8a80), TOBN(0xe80e4858, 0x2d7d6f3e),
+ TOBN(0x29eb0fe8, 0xd664ca7c), TOBN(0xf017637b, 0xe7b43b0c),
+ TOBN(0x9a75c806, 0x76cb2566), TOBN(0x8f76acb1, 0xb24892d9),
+ TOBN(0x7ae7b9cc, 0x1f08fe45), TOBN(0x19ef7329, 0x6a4907d8),
+ TOBN(0x2db4ab71, 0x5f228bf0), TOBN(0xf3cdea39, 0x817032d7),
+ TOBN(0x0b1f482e, 0xdcabe3c0), TOBN(0x3baf76b4, 0xbb86325c),
+ TOBN(0xd49065e0, 0x10089465), TOBN(0x3bab5d29, 0x8e77c596),
+ TOBN(0x7636c3a6, 0x193dbd95), TOBN(0xdef5d294, 0xb246e499),
+ TOBN(0xb22c58b9, 0x286b2475), TOBN(0xa0b93939, 0xcd80862b),
+ TOBN(0x3002c83a, 0xf0992388), TOBN(0x6de01f9b, 0xeacbe14c),
+ TOBN(0x6aac688e, 0xadd70482), TOBN(0x708de92a, 0x7b4a4e8a),
+ TOBN(0x75b6dd73, 0x758a6eef), TOBN(0xea4bf352, 0x725b3c43),
+ TOBN(0x10041f2c, 0x87912868), TOBN(0xb1b1be95, 0xef09297a),
+ TOBN(0x19ae23c5, 0xa9f3860a), TOBN(0xc4f0f839, 0x515dcf4b),
+ TOBN(0x3c7ecca3, 0x97f6306a), TOBN(0x744c44ae, 0x68a3a4b0),
+ TOBN(0x69cd13a0, 0xb3a1d8a2), TOBN(0x7cad0a1e, 0x5256b578),
+ TOBN(0xea653fcd, 0x33791d9e), TOBN(0x9cc2a05d, 0x74b2e05f),
+ TOBN(0x73b391dc, 0xfd7affa2), TOBN(0xddb7091e, 0xb6b05442),
+ TOBN(0xc71e27bf, 0x8538a5c6), TOBN(0x195c63dd, 0x89abff17),
+ TOBN(0xfd315285, 0x1b71e3da), TOBN(0x9cbdfda7, 0xfa680fa0),
+ TOBN(0x9db876ca, 0x849d7eab), TOBN(0xebe2764b, 0x3c273271),
+ TOBN(0x663357e3, 0xf208dcea), TOBN(0x8c5bd833, 0x565b1b70),
+ TOBN(0xccc3b4f5, 0x9837fc0d), TOBN(0x9b641ba8, 0xa79cf00f),
+ TOBN(0x7428243d, 0xdfdf3990), TOBN(0x83a594c4, 0x020786b1),
+ TOBN(0xb712451a, 0x526c4502), TOBN(0x9d39438e, 0x6adb3f93),
+ TOBN(0xfdb261e3, 0xe9ff0ccd), TOBN(0x80344e3c, 0xe07af4c3),
+ TOBN(0x75900d7c, 0x2fa4f126), TOBN(0x08a3b865, 0x5c99a232),
+ TOBN(0x2478b6bf, 0xdb25e0c3), TOBN(0x482cc2c2, 0x71db2edf),
+ TOBN(0x37df7e64, 0x5f321bb8), TOBN(0x8a93821b, 0x9a8005b4),
+ TOBN(0x3fa2f10c, 0xcc8c1958), TOBN(0x0d332218, 0x2c269d0a),
+ TOBN(0x20ab8119, 0xe246b0e6), TOBN(0xb39781e4, 0xd349fd17),
+ TOBN(0xd293231e, 0xb31aa100), TOBN(0x4b779c97, 0xbb032168),
+ TOBN(0x4b3f19e1, 0xc8470500), TOBN(0x45b7efe9, 0x0c4c869d),
+ TOBN(0xdb84f38a, 0xa1a6bbcc), TOBN(0x3b59cb15, 0xb2fddbc1),
+ TOBN(0xba5514df, 0x3fd165e8), TOBN(0x499fd6a9, 0x061f8811),
+ TOBN(0x72cd1fe0, 0xbfef9f00), TOBN(0x120a4bb9, 0x79ad7e8a),
+ TOBN(0xf2ffd095, 0x5f4a5ac5), TOBN(0xcfd174f1, 0x95a7a2f0),
+ TOBN(0xd42301ba, 0x9d17baf1), TOBN(0xd2fa487a, 0x77f22089),
+ TOBN(0x9cb09efe, 0xb1dc77e1), TOBN(0xe9566939, 0x21c99682),
+ TOBN(0x8c546901, 0x6c6067bb), TOBN(0xfd378574, 0x61c24456),
+ TOBN(0x2b6a6cbe, 0x81796b33), TOBN(0x62d550f6, 0x58e87f8b),
+ TOBN(0x1b763e1c, 0x7f1b01b4), TOBN(0x4b93cfea, 0x1b1b5e12),
+ TOBN(0xb9345238, 0x1d531696), TOBN(0x57201c00, 0x88cdde69),
+ TOBN(0xdde92251, 0x9a86afc7), TOBN(0xe3043895, 0xbd35cea8),
+ TOBN(0x7608c1e1, 0x8555970d), TOBN(0x8267dfa9, 0x2535935e),
+ TOBN(0xd4c60a57, 0x322ea38b), TOBN(0xe0bf7977, 0x804ef8b5),
+ TOBN(0x1a0dab28, 0xc06fece4), TOBN(0xd405991e, 0x94e7b49d),
+ TOBN(0xc542b6d2, 0x706dab28), TOBN(0xcb228da3, 0xa91618fb),
+ TOBN(0x224e4164, 0x107d1cea), TOBN(0xeb9fdab3, 0xd0f5d8f1),
+ TOBN(0xc02ba386, 0x0d6e41cd), TOBN(0x676a72c5, 0x9b1f7146),
+ TOBN(0xffd6dd98, 0x4d6cb00b), TOBN(0xcef9c5ca, 0xde2e8d7c),
+ TOBN(0xa1bbf5d7, 0x641c7936), TOBN(0x1b95b230, 0xee8f772e),
+ TOBN(0xf765a92e, 0xe8ac25b1), TOBN(0xceb04cfc, 0x3a18b7c6),
+ TOBN(0x27944cef, 0x0acc8966), TOBN(0xcbb3c957, 0x434c1004),
+ TOBN(0x9c9971a1, 0xa43ff93c), TOBN(0x5bc2db17, 0xa1e358a9),
+ TOBN(0x45b4862e, 0xa8d9bc82), TOBN(0x70ebfbfb, 0x2201e052),
+ TOBN(0xafdf64c7, 0x92871591), TOBN(0xea5bcae6, 0xb42d0219),
+ TOBN(0xde536c55, 0x2ad8f03c), TOBN(0xcd6c3f4d, 0xa76aa33c),
+ TOBN(0xbeb5f623, 0x0bca6de3), TOBN(0xdd20dd99, 0xb1e706fd),
+ TOBN(0x90b3ff9d, 0xac9059d4), TOBN(0x2d7b2902, 0x7ccccc4e),
+ TOBN(0x8a090a59, 0xce98840f), TOBN(0xa5d947e0, 0x8410680a),
+ TOBN(0x49ae346a, 0x923379a5), TOBN(0x7dbc84f9, 0xb28a3156),
+ TOBN(0xfd40d916, 0x54a1aff2), TOBN(0xabf318ba, 0x3a78fb9b),
+ TOBN(0x50152ed8, 0x3029f95e), TOBN(0x9fc1dd77, 0xc58ad7fa),
+ TOBN(0x5fa57915, 0x13595c17), TOBN(0xb9504668, 0x8f62b3a9),
+ TOBN(0x907b5b24, 0xff3055b0), TOBN(0x2e995e35, 0x9a84f125),
+ TOBN(0x87dacf69, 0x7e9bbcfb), TOBN(0x95d0c1d6, 0xe86d96e3),
+ TOBN(0x65726e3c, 0x2d95a75c), TOBN(0x2c3c9001, 0xacd27f21),
+ TOBN(0x1deab561, 0x6c973f57), TOBN(0x108b7e2c, 0xa5221643),
+ TOBN(0x5fee9859, 0xc4ef79d4), TOBN(0xbd62b88a, 0x40d4b8c6),
+ TOBN(0xb4dd29c4, 0x197c75d6), TOBN(0x266a6df2, 0xb7076feb),
+ TOBN(0x9512d0ea, 0x4bf2df11), TOBN(0x1320c24f, 0x6b0cc9ec),
+ TOBN(0x6bb1e0e1, 0x01a59596), TOBN(0x8317c5bb, 0xeff9aaac),
+ TOBN(0x65bb405e, 0x385aa6c9), TOBN(0x613439c1, 0x8f07988f),
+ TOBN(0xd730049f, 0x16a66e91), TOBN(0xe97f2820, 0xfa1b0e0d),
+ TOBN(0x4131e003, 0x304c28ea), TOBN(0x820ab732, 0x526bac62),
+ TOBN(0xb2ac9ef9, 0x28714423), TOBN(0x54ecfffa, 0xadb10cb2),
+ TOBN(0x8781476e, 0xf886a4cc), TOBN(0x4b2c87b5, 0xdb2f8d49),
+ TOBN(0xe857cd20, 0x0a44295d), TOBN(0x707d7d21, 0x58c6b044),
+ TOBN(0xae8521f9, 0xf596757c), TOBN(0x87448f03, 0x67b2b714),
+ TOBN(0x13a9bc45, 0x5ebcd58d), TOBN(0x79bcced9, 0x9122d3c1),
+ TOBN(0x3c644247, 0x9e076642), TOBN(0x0cf22778, 0x2df4767d),
+ TOBN(0x5e61aee4, 0x71d444b6), TOBN(0x211236bf, 0xc5084a1d),
+ TOBN(0x7e15bc9a, 0x4fd3eaf6), TOBN(0x68df2c34, 0xab622bf5),
+ TOBN(0x9e674f0f, 0x59bf4f36), TOBN(0xf883669b, 0xd7f34d73),
+ TOBN(0xc48ac1b8, 0x31497b1d), TOBN(0x323b925d, 0x5106703b),
+ TOBN(0x22156f42, 0x74082008), TOBN(0xeffc521a, 0xc8482bcb),
+ TOBN(0x5c6831bf, 0x12173479), TOBN(0xcaa2528f, 0xc4739490),
+ TOBN(0x84d2102a, 0x8f1b3c4d), TOBN(0xcf64dfc1, 0x2d9bec0d),
+ TOBN(0x433febad, 0x78a546ef), TOBN(0x1f621ec3, 0x7b73cef1),
+ TOBN(0x6aecd627, 0x37338615), TOBN(0x162082ab, 0x01d8edf6),
+ TOBN(0x833a8119, 0x19e86b66), TOBN(0x6023a251, 0xd299b5db),
+ TOBN(0xf5bb0c3a, 0xbbf04b89), TOBN(0x6735eb69, 0xae749a44),
+ TOBN(0xd0e058c5, 0x4713de3b), TOBN(0xfdf2593e, 0x2c3d4ccd),
+ TOBN(0x1b8f414e, 0xfdd23667), TOBN(0xdd52aaca, 0xfa2015ee),
+ TOBN(0x3e31b517, 0xbd9625ff), TOBN(0x5ec9322d, 0x8db5918c),
+ TOBN(0xbc73ac85, 0xa96f5294), TOBN(0x82aa5bf3, 0x61a0666a),
+ TOBN(0x49755810, 0xbf08ac42), TOBN(0xd21cdfd5, 0x891cedfc),
+ TOBN(0x918cb57b, 0x67f8be10), TOBN(0x365d1a7c, 0x56ffa726),
+ TOBN(0x2435c504, 0x6532de93), TOBN(0xc0fc5e10, 0x2674cd02),
+ TOBN(0x6e51fcf8, 0x9cbbb142), TOBN(0x1d436e5a, 0xafc50692),
+ TOBN(0x766bffff, 0x3fbcae22), TOBN(0x3148c2fd, 0xfd55d3b8),
+ TOBN(0x52c7fdc9, 0x233222fa), TOBN(0x89ff1092, 0xe419fb6b),
+ TOBN(0x3cd6db99, 0x25254977), TOBN(0x2e85a161, 0x1cf12ca7),
+ TOBN(0xadd2547c, 0xdc810bc9), TOBN(0xea3f458f, 0x9d257c22),
+ TOBN(0x642c1fbe, 0x27d6b19b), TOBN(0xed07e6b5, 0x140481a6),
+ TOBN(0x6ada1d42, 0x86d2e0f8), TOBN(0xe5920122, 0x0e8a9fd5),
+ TOBN(0x02c936af, 0x708c1b49), TOBN(0x60f30fee, 0x2b4bfaff),
+ TOBN(0x6637ad06, 0x858e6a61), TOBN(0xce4c7767, 0x3fd374d0),
+ TOBN(0x39d54b2d, 0x7188defb), TOBN(0xa8c9d250, 0xf56a6b66),
+ TOBN(0x58fc0f5e, 0xb24fe1dc), TOBN(0x9eaf9dee, 0x6b73f24c),
+ TOBN(0xa90d588b, 0x33650705), TOBN(0xde5b62c5, 0xaf2ec729),
+ TOBN(0x5c72cfae, 0xd3c2b36e), TOBN(0x868c19d5, 0x034435da),
+ TOBN(0x88605f93, 0xe17ee145), TOBN(0xaa60c4ee, 0x77a5d5b1),
+ TOBN(0xbcf5bfd2, 0x3b60c472), TOBN(0xaf4ef13c, 0xeb1d3049),
+ TOBN(0x373f44fc, 0xe13895c9), TOBN(0xf29b382f, 0x0cbc9822),
+ TOBN(0x1bfcb853, 0x73efaef6), TOBN(0xcf56ac9c, 0xa8c96f40),
+ TOBN(0xd7adf109, 0x7a191e24), TOBN(0x98035f44, 0xbf8a8dc2),
+ TOBN(0xf40a71b9, 0x1e750c84), TOBN(0xc57f7b0c, 0x5dc6c469),
+ TOBN(0x49a0e79c, 0x6fbc19c1), TOBN(0x6b0f5889, 0xa48ebdb8),
+ TOBN(0x5d3fd084, 0xa07c4e9f), TOBN(0xc3830111, 0xab27de14),
+ TOBN(0x0e4929fe, 0x33e08dcc), TOBN(0xf4a5ad24, 0x40bb73a3),
+ TOBN(0xde86c2bf, 0x490f97ca), TOBN(0x288f09c6, 0x67a1ce18),
+ TOBN(0x364bb886, 0x1844478d), TOBN(0x7840fa42, 0xceedb040),
+ TOBN(0x1269fdd2, 0x5a631b37), TOBN(0x94761f1e, 0xa47c8b7d),
+ TOBN(0xfc0c2e17, 0x481c6266), TOBN(0x85e16ea2, 0x3daa5fa7),
+ TOBN(0xccd86033, 0x92491048), TOBN(0x0c2f6963, 0xf4d402d7),
+ TOBN(0x6336f7df, 0xdf6a865c), TOBN(0x0a2a463c, 0xb5c02a87),
+ TOBN(0xb0e29be7, 0xbf2f12ee), TOBN(0xf0a22002, 0x66bad988),
+ TOBN(0x27f87e03, 0x9123c1d7), TOBN(0x21669c55, 0x328a8c98),
+ TOBN(0x186b9803, 0x92f14529), TOBN(0xd3d056cc, 0x63954df3),
+ TOBN(0x2f03fd58, 0x175a46f6), TOBN(0x63e34ebe, 0x11558558),
+ TOBN(0xe13fedee, 0x5b80cfa5), TOBN(0xe872a120, 0xd401dbd1),
+ TOBN(0x52657616, 0xe8a9d667), TOBN(0xbc8da4b6, 0xe08d6693),
+ TOBN(0x370fb9bb, 0x1b703e75), TOBN(0x6773b186, 0xd4338363),
+ TOBN(0x18dad378, 0xecef7bff), TOBN(0xaac787ed, 0x995677da),
+ TOBN(0x4801ea8b, 0x0437164b), TOBN(0xf430ad20, 0x73fe795e),
+ TOBN(0xb164154d, 0x8ee5eb73), TOBN(0x0884ecd8, 0x108f7c0e),
+ TOBN(0x0e6ec096, 0x5f520698), TOBN(0x640631fe, 0x44f7b8d9),
+ TOBN(0x92fd34fc, 0xa35a68b9), TOBN(0x9c5a4b66, 0x4d40cf4e),
+ TOBN(0x949454bf, 0x80b6783d), TOBN(0x80e701fe, 0x3a320a10),
+ TOBN(0x8d1a564a, 0x1a0a39b2), TOBN(0x1436d53d, 0x320587db),
+ TOBN(0xf5096e6d, 0x6556c362), TOBN(0xbc23a3c0, 0xe2455d7e),
+ TOBN(0x3a7aee54, 0x807230f9), TOBN(0x9ba1cfa6, 0x22ae82fd),
+ TOBN(0x833a057a, 0x99c5d706), TOBN(0x8be85f4b, 0x842315c9),
+ TOBN(0xd083179a, 0x66a72f12), TOBN(0x2fc77d5d, 0xcdcc73cd),
+ TOBN(0x22b88a80, 0x5616ee30), TOBN(0xfb09548f, 0xe7ab1083),
+ TOBN(0x8ad6ab0d, 0x511270cd), TOBN(0x61f6c57a, 0x6924d9ab),
+ TOBN(0xa0f7bf72, 0x90aecb08), TOBN(0x849f87c9, 0x0df784a4),
+ TOBN(0x27c79c15, 0xcfaf1d03), TOBN(0xbbf9f675, 0xc463face),
+ TOBN(0x91502c65, 0x765ba543), TOBN(0x18ce3cac, 0x42ea60dd),
+ TOBN(0xe5cee6ac, 0x6e43ecb3), TOBN(0x63e4e910, 0x68f2aeeb),
+ TOBN(0x26234fa3, 0xc85932ee), TOBN(0x96883e8b, 0x4c90c44d),
+ TOBN(0x29b9e738, 0xa18a50f6), TOBN(0xbfc62b2a, 0x3f0420df),
+ TOBN(0xd22a7d90, 0x6d3e1fa9), TOBN(0x17115618, 0xfe05b8a3),
+ TOBN(0x2a0c9926, 0xbb2b9c01), TOBN(0xc739fcc6, 0xe07e76a2),
+ TOBN(0x540e9157, 0x165e439a), TOBN(0x06353a62, 0x6a9063d8),
+ TOBN(0x84d95594, 0x61e927a3), TOBN(0x013b9b26, 0xe2e0be7f),
+ TOBN(0x4feaec3b, 0x973497f1), TOBN(0x15c0f94e, 0x093ebc2d),
+ TOBN(0x6af5f227, 0x33af0583), TOBN(0x0c2af206, 0xc61f3340),
+ TOBN(0xd25dbdf1, 0x4457397c), TOBN(0x2e8ed017, 0xcabcbae0),
+ TOBN(0xe3010938, 0xc2815306), TOBN(0xbaa99337, 0xe8c6cd68),
+ TOBN(0x08513182, 0x3b0ec7de), TOBN(0x1e1b822b, 0x58df05df),
+ TOBN(0x5c14842f, 0xa5c3b683), TOBN(0x98fe977e, 0x3eba34ce),
+ TOBN(0xfd2316c2, 0x0d5e8873), TOBN(0xe48d839a, 0xbd0d427d),
+ TOBN(0x495b2218, 0x623fc961), TOBN(0x24ee56e7, 0xb46fba5e),
+ TOBN(0x9184a55b, 0x91e4de58), TOBN(0xa7488ca5, 0xdfdea288),
+ TOBN(0xa723862e, 0xa8dcc943), TOBN(0x92d762b2, 0x849dc0fc),
+ TOBN(0x3c444a12, 0x091ff4a9), TOBN(0x581113fa, 0x0cada274),
+ TOBN(0xb9de0a45, 0x30d8eae2), TOBN(0x5e0fcd85, 0xdf6b41ea),
+ TOBN(0x6233ea68, 0xc094dbb5), TOBN(0xb77d062e, 0xd968d410),
+ TOBN(0x3e719bbc, 0x58b3002d), TOBN(0x68e7dd3d, 0x3dc49d58),
+ TOBN(0x8d825740, 0x013a5e58), TOBN(0x21311747, 0x3c9e3c1b),
+ TOBN(0x0cb0a2a7, 0x7c99b6ab), TOBN(0x5c48a3b3, 0xc2f888f2)}
+ ,
+ {TOBN(0xc7913e91, 0x991724f3), TOBN(0x5eda799c, 0x39cbd686),
+ TOBN(0xddb595c7, 0x63d4fc1e), TOBN(0x6b63b80b, 0xac4fed54),
+ TOBN(0x6ea0fc69, 0x7e5fb516), TOBN(0x737708ba, 0xd0f1c964),
+ TOBN(0x9628745f, 0x11a92ca5), TOBN(0x61f37958, 0x9a86967a),
+ TOBN(0x9af39b2c, 0xaa665072), TOBN(0x78322fa4, 0xefd324ef),
+ TOBN(0x3d153394, 0xc327bd31), TOBN(0x81d5f271, 0x3129dab0),
+ TOBN(0xc72e0c42, 0xf48027f5), TOBN(0xaa40cdbc, 0x8536e717),
+ TOBN(0xf45a657a, 0x2d369d0f), TOBN(0xb03bbfc4, 0xea7f74e6),
+ TOBN(0x46a8c418, 0x0d738ded), TOBN(0x6f1a5bb0, 0xe0de5729),
+ TOBN(0xf10230b9, 0x8ba81675), TOBN(0x32c6f30c, 0x112b33d4),
+ TOBN(0x7559129d, 0xd8fffb62), TOBN(0x6a281b47, 0xb459bf05),
+ TOBN(0x77c1bd3a, 0xfa3b6776), TOBN(0x0709b380, 0x7829973a),
+ TOBN(0x8c26b232, 0xa3326505), TOBN(0x38d69272, 0xee1d41bf),
+ TOBN(0x0459453e, 0xffe32afa), TOBN(0xce8143ad, 0x7cb3ea87),
+ TOBN(0x932ec1fa, 0x7e6ab666), TOBN(0x6cd2d230, 0x22286264),
+ TOBN(0x459a46fe, 0x6736f8ed), TOBN(0x50bf0d00, 0x9eca85bb),
+ TOBN(0x0b825852, 0x877a21ec), TOBN(0x300414a7, 0x0f537a94),
+ TOBN(0x3f1cba40, 0x21a9a6a2), TOBN(0x50824eee, 0x76943c00),
+ TOBN(0xa0dbfcec, 0xf83cba5d), TOBN(0xf9538148, 0x93b4f3c0),
+ TOBN(0x61744162, 0x48f24dd7), TOBN(0x5322d64d, 0xe4fb09dd),
+ TOBN(0x57447384, 0x3d9325f3), TOBN(0xa9bef2d0, 0xf371cb84),
+ TOBN(0x77d2188b, 0xa61e36c5), TOBN(0xbbd6a7d7, 0xc602df72),
+ TOBN(0xba3aa902, 0x8f61bc0b), TOBN(0xf49085ed, 0x6ed0b6a1),
+ TOBN(0x8bc625d6, 0xae6e8298), TOBN(0x832b0b1d, 0xa2e9c01d),
+ TOBN(0xa337c447, 0xf1f0ced1), TOBN(0x800cc793, 0x9492dd2b),
+ TOBN(0x4b93151d, 0xbea08efa), TOBN(0x820cf3f8, 0xde0a741e),
+ TOBN(0xff1982dc, 0x1c0f7d13), TOBN(0xef921960, 0x84dde6ca),
+ TOBN(0x1ad7d972, 0x45f96ee3), TOBN(0x319c8dbe, 0x29dea0c7),
+ TOBN(0xd3ea3871, 0x7b82b99b), TOBN(0x75922d4d, 0x470eb624),
+ TOBN(0x8f66ec54, 0x3b95d466), TOBN(0x66e673cc, 0xbee1e346),
+ TOBN(0x6afe67c4, 0xb5f2b89a), TOBN(0x3de9c1e6, 0x290e5cd3),
+ TOBN(0x8c278bb6, 0x310a2ada), TOBN(0x420fa384, 0x0bdb323b),
+ TOBN(0x0ae1d63b, 0x0eb919b0), TOBN(0xd74ee51d, 0xa74b9620),
+ TOBN(0x395458d0, 0xa674290c), TOBN(0x324c930f, 0x4620a510),
+ TOBN(0x2d1f4d19, 0xfbac27d4), TOBN(0x4086e8ca, 0x9bedeeac),
+ TOBN(0x0cdd211b, 0x9b679ab8), TOBN(0x5970167d, 0x7090fec4),
+ TOBN(0x3420f2c9, 0xfaf1fc63), TOBN(0x616d333a, 0x328c8bb4),
+ TOBN(0x7d65364c, 0x57f1fe4a), TOBN(0x9343e877, 0x55e5c73a),
+ TOBN(0x5795176b, 0xe970e78c), TOBN(0xa36ccebf, 0x60533627),
+ TOBN(0xfc7c7380, 0x09cdfc1b), TOBN(0xb39a2afe, 0xb3fec326),
+ TOBN(0xb7ff1ba1, 0x6224408a), TOBN(0xcc856e92, 0x247cfc5e),
+ TOBN(0x01f102e7, 0xc18bc493), TOBN(0x4613ab74, 0x2091c727),
+ TOBN(0xaa25e89c, 0xc420bf2b), TOBN(0x00a53176, 0x90337ec2),
+ TOBN(0xd2be9f43, 0x7d025fc7), TOBN(0x3316fb85, 0x6e6fe3dc),
+ TOBN(0x27520af5, 0x9ac50814), TOBN(0xfdf95e78, 0x9a8e4223),
+ TOBN(0xb7e7df2a, 0x56bec5a0), TOBN(0xf7022f7d, 0xdf159e5d),
+ TOBN(0x93eeeab1, 0xcac1fe8f), TOBN(0x8040188c, 0x37451168),
+ TOBN(0x7ee8aa8a, 0xd967dce6), TOBN(0xfa0e79e7, 0x3abc9299),
+ TOBN(0x67332cfc, 0x2064cfd1), TOBN(0x339c31de, 0xb0651934),
+ TOBN(0x719b28d5, 0x2a3bcbea), TOBN(0xee74c82b, 0x9d6ae5c6),
+ TOBN(0x0927d05e, 0xbaf28ee6), TOBN(0x82cecf2c, 0x9d719028),
+ TOBN(0x0b0d353e, 0xddb30289), TOBN(0xfe4bb977, 0xfddb2e29),
+ TOBN(0xbb5bb990, 0x640bfd9e), TOBN(0xd226e277, 0x82f62108),
+ TOBN(0x4bf00985, 0x02ffdd56), TOBN(0x7756758a, 0x2ca1b1b5),
+ TOBN(0xc32b62a3, 0x5285fe91), TOBN(0xedbc546a, 0x8c9cd140),
+ TOBN(0x1e47a013, 0xaf5cb008), TOBN(0xbca7e720, 0x073ce8f2),
+ TOBN(0xe10b2ab8, 0x17a91cae), TOBN(0xb89aab65, 0x08e27f63),
+ TOBN(0x7b3074a7, 0xdba3ddf9), TOBN(0x1c20ce09, 0x330c2972),
+ TOBN(0x6b9917b4, 0x5fcf7e33), TOBN(0xe6793743, 0x945ceb42),
+ TOBN(0x18fc2215, 0x5c633d19), TOBN(0xad1adb3c, 0xc7485474),
+ TOBN(0x646f9679, 0x6424c49b), TOBN(0xf888dfe8, 0x67c241c9),
+ TOBN(0xe12d4b93, 0x24f68b49), TOBN(0x9a6b62d8, 0xa571df20),
+ TOBN(0x81b4b26d, 0x179483cb), TOBN(0x666f9632, 0x9511fae2),
+ TOBN(0xd281b3e4, 0xd53aa51f), TOBN(0x7f96a765, 0x7f3dbd16),
+ TOBN(0xa7f8b5bf, 0x074a30ce), TOBN(0xd7f52107, 0x005a32e6),
+ TOBN(0x6f9e0907, 0x50237ed4), TOBN(0x2f21da47, 0x8096fa2b),
+ TOBN(0xf3e19cb4, 0xeec863a0), TOBN(0xd18f77fd, 0x9527620a),
+ TOBN(0x9505c81c, 0x407c1cf8), TOBN(0x9998db4e, 0x1b6ec284),
+ TOBN(0x7e3389e5, 0xc247d44d), TOBN(0x12507141, 0x3f4f3d80),
+ TOBN(0xd4ba0110, 0x4a78a6c7), TOBN(0x312874a0, 0x767720be),
+ TOBN(0xded059a6, 0x75944370), TOBN(0xd6123d90, 0x3b2c0bdd),
+ TOBN(0xa56b717b, 0x51c108e3), TOBN(0x9bb7940e, 0x070623e9),
+ TOBN(0x794e2d59, 0x84ac066c), TOBN(0xf5954a92, 0xe68c69a0),
+ TOBN(0x28c52458, 0x4fd99dcc), TOBN(0x60e639fc, 0xb1012517),
+ TOBN(0xc2e60125, 0x7de79248), TOBN(0xe9ef6404, 0xf12fc6d7),
+ TOBN(0x4c4f2808, 0x2a3b5d32), TOBN(0x865ad32e, 0xc768eb8a),
+ TOBN(0xac02331b, 0x13fb70b6), TOBN(0x037b44c1, 0x95599b27),
+ TOBN(0x1a860fc4, 0x60bd082c), TOBN(0xa2e25745, 0xc980cd01),
+ TOBN(0xee3387a8, 0x1da0263e), TOBN(0x931bfb95, 0x2d10f3d6),
+ TOBN(0x5b687270, 0xa1f24a32), TOBN(0xf140e65d, 0xca494b86),
+ TOBN(0x4f4ddf91, 0xb2f1ac7a), TOBN(0xf99eaabb, 0x760fee27),
+ TOBN(0x57f4008a, 0x49c228e5), TOBN(0x090be440, 0x1cf713bb),
+ TOBN(0xac91fbe4, 0x5004f022), TOBN(0xd838c2c2, 0x569e1af6),
+ TOBN(0xd6c7d20b, 0x0f1daaa5), TOBN(0xaa063ac1, 0x1bbb02c0),
+ TOBN(0x0938a422, 0x59558a78), TOBN(0x5343c669, 0x8435da2f),
+ TOBN(0x96f67b18, 0x034410dc), TOBN(0x7cc1e424, 0x84510804),
+ TOBN(0x86a1543f, 0x16dfbb7d), TOBN(0x921fa942, 0x5b5bd592),
+ TOBN(0x9dcccb6e, 0xb33dd03c), TOBN(0x8581ddd9, 0xb843f51e),
+ TOBN(0x54935fcb, 0x81d73c9e), TOBN(0x6d07e979, 0x0a5e97ab),
+ TOBN(0x4dc7b30a, 0xcf3a6bab), TOBN(0x147ab1f3, 0x170bee11),
+ TOBN(0x0aaf8e3d, 0x9fafdee4), TOBN(0xfab3dbcb, 0x538a8b95),
+ TOBN(0x405df4b3, 0x6ef13871), TOBN(0xf1f4e9cb, 0x088d5a49),
+ TOBN(0x9bcd24d3, 0x66b33f1d), TOBN(0x3b97b820, 0x5ce445c0),
+ TOBN(0xe2926549, 0xba93ff61), TOBN(0xd9c341ce, 0x4dafe616),
+ TOBN(0xfb30a76e, 0x16efb6f3), TOBN(0xdf24b8ca, 0x605b953c),
+ TOBN(0x8bd52afe, 0xc2fffb9f), TOBN(0xbbac5ff7, 0xe19d0b96),
+ TOBN(0x43c01b87, 0x459afccd), TOBN(0x6bd45143, 0xb7432652),
+ TOBN(0x84734530, 0x55b5d78e), TOBN(0x81088fdb, 0x1554ba7d),
+ TOBN(0xada0a52c, 0x1e269375), TOBN(0xf9f037c4, 0x2dc5ec10),
+ TOBN(0xc0660607, 0x94bfbc11), TOBN(0xc0a630bb, 0xc9c40d2f),
+ TOBN(0x5efc797e, 0xab64c31e), TOBN(0xffdb1dab, 0x74507144),
+ TOBN(0xf6124287, 0x1ca6790c), TOBN(0xe9609d81, 0xe69bf1bf),
+ TOBN(0xdb898595, 0x00d24fc9), TOBN(0x9c750333, 0xe51fb417),
+ TOBN(0x51830a91, 0xfef7bbde), TOBN(0x0ce67dc8, 0x945f585c),
+ TOBN(0x9a730ed4, 0x4763eb50), TOBN(0x24a0e221, 0xc1ab0d66),
+ TOBN(0x643b6393, 0x648748f3), TOBN(0x1982daa1, 0x6d3c6291),
+ TOBN(0x6f00a9f7, 0x8bbc5549), TOBN(0x7a1783e1, 0x7f36384e),
+ TOBN(0xe8346323, 0xde977f50), TOBN(0x91ab688d, 0xb245502a),
+ TOBN(0x331ab6b5, 0x6d0bdd66), TOBN(0x0a6ef32e, 0x64b71229),
+ TOBN(0x1028150e, 0xfe7c352f), TOBN(0x27e04350, 0xce7b39d3),
+ TOBN(0x2a3c8acd, 0xc1070c82), TOBN(0xfb2034d3, 0x80c9feef),
+ TOBN(0x2d729621, 0x709f3729), TOBN(0x8df290bf, 0x62cb4549),
+ TOBN(0x02f99f33, 0xfc2e4326), TOBN(0x3b30076d, 0x5eddf032),
+ TOBN(0xbb21f8cf, 0x0c652fb5), TOBN(0x314fb49e, 0xed91cf7b),
+ TOBN(0xa013eca5, 0x2f700750), TOBN(0x2b9e3c23, 0x712a4575),
+ TOBN(0xe5355557, 0xaf30fbb0), TOBN(0x1ada3516, 0x7c77e771),
+ TOBN(0x45f6ecb2, 0x7b135670), TOBN(0xe85d19df, 0x7cfc202e),
+ TOBN(0x0f1b50c7, 0x58d1be9f), TOBN(0x5ebf2c0a, 0xead2e344),
+ TOBN(0x1531fe4e, 0xabc199c9), TOBN(0xc7032592, 0x56bab0ae),
+ TOBN(0x16ab2e48, 0x6c1fec54), TOBN(0x0f87fda8, 0x04280188),
+ TOBN(0xdc9f46fc, 0x609e4a74), TOBN(0x2a44a143, 0xba667f91),
+ TOBN(0xbc3d8b95, 0xb4d83436), TOBN(0xa01e4bd0, 0xc7bd2958),
+ TOBN(0x7b182932, 0x73483c90), TOBN(0xa79c6aa1, 0xa7c7b598),
+ TOBN(0xbf3983c6, 0xeaaac07e), TOBN(0x8f18181e, 0x96e0d4e6),
+ TOBN(0x8553d37c, 0x051af62b), TOBN(0xe9a998eb, 0x0bf94496),
+ TOBN(0xe0844f9f, 0xb0d59aa1), TOBN(0x983fd558, 0xe6afb813),
+ TOBN(0x9670c0ca, 0x65d69804), TOBN(0x732b22de, 0x6ea5ff2d),
+ TOBN(0xd7640ba9, 0x5fd8623b), TOBN(0x9f619163, 0xa6351782),
+ TOBN(0x0bfc27ee, 0xacee5043), TOBN(0xae419e73, 0x2eb10f02),
+ TOBN(0x19c028d1, 0x8943fb05), TOBN(0x71f01cf7, 0xff13aa2a),
+ TOBN(0x7790737e, 0x8887a132), TOBN(0x67513309, 0x66318410),
+ TOBN(0x9819e8a3, 0x7ddb795e), TOBN(0xfecb8ef5, 0xdad100b2),
+ TOBN(0x59f74a22, 0x3021926a), TOBN(0xb7c28a49, 0x6f9b4c1c),
+ TOBN(0xed1a733f, 0x912ad0ab), TOBN(0x42a910af, 0x01a5659c),
+ TOBN(0x3842c6e0, 0x7bd68cab), TOBN(0x2b57fa38, 0x76d70ac8),
+ TOBN(0x8a6707a8, 0x3c53aaeb), TOBN(0x62c1c510, 0x65b4db18),
+ TOBN(0x8de2c1fb, 0xb2d09dc7), TOBN(0xc3dfed12, 0x266bd23b),
+ TOBN(0x927d039b, 0xd5b27db6), TOBN(0x2fb2f0f1, 0x103243da),
+ TOBN(0xf855a07b, 0x80be7399), TOBN(0xed9327ce, 0x1f9f27a8),
+ TOBN(0xa0bd99c7, 0x729bdef7), TOBN(0x2b67125e, 0x28250d88),
+ TOBN(0x784b26e8, 0x8670ced7), TOBN(0xe3dfe41f, 0xc31bd3b4),
+ TOBN(0x9e353a06, 0xbcc85cbc), TOBN(0x302e2909, 0x60178a9d),
+ TOBN(0x860abf11, 0xa6eac16e), TOBN(0x76447000, 0xaa2b3aac),
+ TOBN(0x46ff9d19, 0x850afdab), TOBN(0x35bdd6a5, 0xfdb2d4c1),
+ TOBN(0xe82594b0, 0x7e5c9ce9), TOBN(0x0f379e53, 0x20af346e),
+ TOBN(0x608b31e3, 0xbc65ad4a), TOBN(0x710c6b12, 0x267c4826),
+ TOBN(0x51c966f9, 0x71954cf1), TOBN(0xb1cec793, 0x0d0aa215),
+ TOBN(0x1f155989, 0x86bd23a8), TOBN(0xae2ff99c, 0xf9452e86),
+ TOBN(0xd8dd953c, 0x340ceaa2), TOBN(0x26355275, 0x2e2e9333),
+ TOBN(0x15d4e5f9, 0x8586f06d), TOBN(0xd6bf94a8, 0xf7cab546),
+ TOBN(0x33c59a0a, 0xb76a9af0), TOBN(0x52740ab3, 0xba095af7),
+ TOBN(0xc444de8a, 0x24389ca0), TOBN(0xcc6f9863, 0x706da0cb),
+ TOBN(0xb5a741a7, 0x6b2515cf), TOBN(0x71c41601, 0x9585c749),
+ TOBN(0x78350d4f, 0xe683de97), TOBN(0x31d61524, 0x63d0b5f5),
+ TOBN(0x7a0cc5e1, 0xfbce090b), TOBN(0xaac927ed, 0xfbcb2a5b),
+ TOBN(0xe920de49, 0x20d84c35), TOBN(0x8c06a0b6, 0x22b4de26),
+ TOBN(0xd34dd58b, 0xafe7ddf3), TOBN(0x55851fed, 0xc1e6e55b),
+ TOBN(0xd1395616, 0x960696e7), TOBN(0x940304b2, 0x5f22705f),
+ TOBN(0x6f43f861, 0xb0a2a860), TOBN(0xcf121282, 0x0e7cc981),
+ TOBN(0x12186212, 0x0ab64a96), TOBN(0x09215b9a, 0xb789383c),
+ TOBN(0x311eb305, 0x37387c09), TOBN(0xc5832fce, 0xf03ee760),
+ TOBN(0x30358f58, 0x32f7ea19), TOBN(0xe01d3c34, 0x91d53551),
+ TOBN(0x1ca5ee41, 0xda48ea80), TOBN(0x34e71e8e, 0xcf4fa4c1),
+ TOBN(0x312abd25, 0x7af1e1c7), TOBN(0xe3afcdeb, 0x2153f4a5),
+ TOBN(0x9d5c84d7, 0x00235e9a), TOBN(0x0308d3f4, 0x8c4c836f),
+ TOBN(0xc0a66b04, 0x89332de5), TOBN(0x610dd399, 0x89e566ef),
+ TOBN(0xf8eea460, 0xd1ac1635), TOBN(0x84cbb3fb, 0x20a2c0df),
+ TOBN(0x40afb488, 0xe74a48c5), TOBN(0x29738198, 0xd326b150),
+ TOBN(0x2a17747f, 0xa6d74081), TOBN(0x60ea4c05, 0x55a26214),
+ TOBN(0x53514bb4, 0x1f88c5fe), TOBN(0xedd64567, 0x7e83426c),
+ TOBN(0xd5d6cbec, 0x96460b25), TOBN(0xa12fd0ce, 0x68dc115e),
+ TOBN(0xc5bc3ed2, 0x697840ea), TOBN(0x969876a8, 0xa6331e31),
+ TOBN(0x60c36217, 0x472ff580), TOBN(0xf4229705, 0x4ad41393),
+ TOBN(0x4bd99ef0, 0xa03b8b92), TOBN(0x501c7317, 0xc144f4f6),
+ TOBN(0x159009b3, 0x18464945), TOBN(0x6d5e594c, 0x74c5c6be),
+ TOBN(0x2d587011, 0x321a3660), TOBN(0xd1e184b1, 0x3898d022),
+ TOBN(0x5ba04752, 0x4c6a7e04), TOBN(0x47fa1e2b, 0x45550b65),
+ TOBN(0x9419daf0, 0x48c0a9a5), TOBN(0x66362953, 0x7c243236),
+ TOBN(0xcd0744b1, 0x5cb12a88), TOBN(0x561b6f9a, 0x2b646188),
+ TOBN(0x599415a5, 0x66c2c0c0), TOBN(0xbe3f0859, 0x0f83f09a),
+ TOBN(0x9141c5be, 0xb92041b8), TOBN(0x01ae38c7, 0x26477d0d),
+ TOBN(0xca8b71f3, 0xd12c7a94), TOBN(0xfab5b31f, 0x765c70db),
+ TOBN(0x76ae7492, 0x487443e9), TOBN(0x8595a310, 0x990d1349),
+ TOBN(0xf8dbeda8, 0x7d460a37), TOBN(0x7f7ad082, 0x1e45a38f),
+ TOBN(0xed1d4db6, 0x1059705a), TOBN(0xa3dd492a, 0xe6b9c697),
+ TOBN(0x4b92ee3a, 0x6eb38bd5), TOBN(0xbab2609d, 0x67cc0bb7),
+ TOBN(0x7fc4fe89, 0x6e70ee82), TOBN(0xeff2c56e, 0x13e6b7e3),
+ TOBN(0x9b18959e, 0x34d26fca), TOBN(0x2517ab66, 0x889d6b45),
+ TOBN(0xf167b4e0, 0xbdefdd4f), TOBN(0x69958465, 0xf366e401),
+ TOBN(0x5aa368ab, 0xa73bbec0), TOBN(0x12148709, 0x7b240c21),
+ TOBN(0x378c3233, 0x18969006), TOBN(0xcb4d73ce, 0xe1fe53d1),
+ TOBN(0x5f50a80e, 0x130c4361), TOBN(0xd67f5951, 0x7ef5212b),
+ TOBN(0xf145e21e, 0x9e70c72e), TOBN(0xb2e52e29, 0x5566d2fb),
+ TOBN(0x44eaba4a, 0x032397f5), TOBN(0x5e56937b, 0x7e31a7de),
+ TOBN(0x68dcf517, 0x456c61e1), TOBN(0xbc2e954a, 0xa8b0a388),
+ TOBN(0xe3552fa7, 0x60a8b755), TOBN(0x03442dae, 0x73ad0cde),
+ TOBN(0x37ffe747, 0xceb26210), TOBN(0x983545e8, 0x787baef9),
+ TOBN(0x8b8c8535, 0x86a3de31), TOBN(0xc621dbcb, 0xfacd46db),
+ TOBN(0x82e442e9, 0x59266fbb), TOBN(0xa3514c37, 0x339d471c),
+ TOBN(0x3a11b771, 0x62cdad96), TOBN(0xf0cb3b3c, 0xecf9bdf0),
+ TOBN(0x3fcbdbce, 0x478e2135), TOBN(0x7547b5cf, 0xbda35342),
+ TOBN(0xa97e81f1, 0x8a677af6), TOBN(0xc8c2bf83, 0x28817987),
+ TOBN(0xdf07eaaf, 0x45580985), TOBN(0xc68d1f05, 0xc93b45cb),
+ TOBN(0x106aa2fe, 0xc77b4cac), TOBN(0x4c1d8afc, 0x04a7ae86),
+ TOBN(0xdb41c3fd, 0x9eb45ab2), TOBN(0x5b234b5b, 0xd4b22e74),
+ TOBN(0xda253dec, 0xf215958a), TOBN(0x67e0606e, 0xa04edfa0),
+ TOBN(0xabbbf070, 0xef751b11), TOBN(0xf352f175, 0xf6f06dce),
+ TOBN(0xdfc4b6af, 0x6839f6b4), TOBN(0x53ddf9a8, 0x9959848e),
+ TOBN(0xda49c379, 0xc21520b0), TOBN(0x90864ff0, 0xdbd5d1b6),
+ TOBN(0x2f055d23, 0x5f49c7f7), TOBN(0xe51e4e6a, 0xa796b2d8),
+ TOBN(0xc361a67f, 0x5c9dc340), TOBN(0x5ad53c37, 0xbca7c620),
+ TOBN(0xda1d6588, 0x32c756d0), TOBN(0xad60d911, 0x8bb67e13),
+ TOBN(0xd6c47bdf, 0x0eeec8c6), TOBN(0x4a27fec1, 0x078a1821),
+ TOBN(0x081f7415, 0xc3099524), TOBN(0x8effdf0b, 0x82cd8060),
+ TOBN(0xdb70ec1c, 0x65842df8), TOBN(0x8821b358, 0xd319a901),
+ TOBN(0x72ee56ee, 0xde42b529), TOBN(0x5bb39592, 0x236e4286),
+ TOBN(0xd1183316, 0xfd6f7140), TOBN(0xf9fadb5b, 0xbd8e81f7),
+ TOBN(0x701d5e0c, 0x5a02d962), TOBN(0xfdee4dbf, 0x1b601324),
+ TOBN(0xbed17407, 0x35d7620e), TOBN(0x04e3c2c3, 0xf48c0012),
+ TOBN(0x9ee29da7, 0x3455449a), TOBN(0x562cdef4, 0x91a836c4),
+ TOBN(0x8f682a5f, 0x47701097), TOBN(0x617125d8, 0xff88d0c2),
+ TOBN(0x948fda24, 0x57bb86dd), TOBN(0x348abb8f, 0x289f7286),
+ TOBN(0xeb10eab5, 0x99d94bbd), TOBN(0xd51ba28e, 0x4684d160),
+ TOBN(0xabe0e51c, 0x30c8f41a), TOBN(0x66588b45, 0x13254f4a),
+ TOBN(0x147ebf01, 0xfad097a5), TOBN(0x49883ea8, 0x610e815d),
+ TOBN(0xe44d60ba, 0x8a11de56), TOBN(0xa970de6e, 0x827a7a6d),
+ TOBN(0x2be41424, 0x5e17fc19), TOBN(0xd833c657, 0x01214057),
+ TOBN(0x1375813b, 0x363e723f), TOBN(0x6820bb88, 0xe6a52e9b),
+ TOBN(0x7e7f6970, 0xd875d56a), TOBN(0xd6a0a9ac, 0x51fbf6bf),
+ TOBN(0x54ba8790, 0xa3083c12), TOBN(0xebaeb23d, 0x6ae7eb64),
+ TOBN(0xa8685c3a, 0xb99a907a), TOBN(0xf1e74550, 0x026bf40b),
+ TOBN(0x7b73a027, 0xc802cd9e), TOBN(0x9a8a927c, 0x4fef4635),
+ TOBN(0xe1b6f60c, 0x08191224), TOBN(0xc4126ebb, 0xde4ec091),
+ TOBN(0xe1dff4dc, 0x4ae38d84), TOBN(0xde3f57db, 0x4f2ef985),
+ TOBN(0x34964337, 0xd446a1dd), TOBN(0x7bf217a0, 0x859e77f6),
+ TOBN(0x8ff10527, 0x8e1d13f5), TOBN(0xa304ef03, 0x74eeae27),
+ TOBN(0xfc6f5e47, 0xd19dfa5a), TOBN(0xdb007de3, 0x7fad982b),
+ TOBN(0x28205ad1, 0x613715f5), TOBN(0x251e6729, 0x7889529e),
+ TOBN(0x72705184, 0x1ae98e78), TOBN(0xf818537d, 0x271cac32),
+ TOBN(0xc8a15b7e, 0xb7f410f5), TOBN(0xc474356f, 0x81f62393),
+ TOBN(0x92dbdc5a, 0xc242316b), TOBN(0xabe060ac, 0xdbf4aff5),
+ TOBN(0x6e8c38fe, 0x909a8ec6), TOBN(0x43e514e5, 0x6116cb94),
+ TOBN(0x2078fa38, 0x07d784f9), TOBN(0x1161a880, 0xf4b5b357),
+ TOBN(0x5283ce79, 0x13adea3d), TOBN(0x0756c3e6, 0xcc6a910b),
+ TOBN(0x60bcfe01, 0xaaa79697), TOBN(0x04a73b29, 0x56391db1),
+ TOBN(0xdd8dad47, 0x189b45a0), TOBN(0xbfac0dd0, 0x48d5b8d9),
+ TOBN(0x34ab3af5, 0x7d3d2ec2), TOBN(0x6fa2fc2d, 0x207bd3af),
+ TOBN(0x9ff40092, 0x66550ded), TOBN(0x719b3e87, 0x1fd5b913),
+ TOBN(0xa573a496, 0x6d17fbc7), TOBN(0x0cd1a70a, 0x73d2b24e),
+ TOBN(0x34e2c5ca, 0xb2676937), TOBN(0xe7050b06, 0xbf669f21),
+ TOBN(0xfbe948b6, 0x1ede9046), TOBN(0xa0530051, 0x97662659),
+ TOBN(0x58cbd4ed, 0xf10124c5), TOBN(0xde2646e4, 0xdd6c06c8),
+ TOBN(0x332f8108, 0x8cad38c0), TOBN(0x471b7e90, 0x6bd68ae2),
+ TOBN(0x56ac3fb2, 0x0d8e27a3), TOBN(0xb54660db, 0x136b4b0d),
+ TOBN(0x123a1e11, 0xa6fd8de4), TOBN(0x44dbffea, 0xa37799ef),
+ TOBN(0x4540b977, 0xce6ac17c), TOBN(0x495173a8, 0xaf60acef)}
+ ,
+ {TOBN(0x9ebb284d, 0x391c2a82), TOBN(0xbcdd4863, 0x158308e8),
+ TOBN(0x006f16ec, 0x83f1edca), TOBN(0xa13e2c37, 0x695dc6c8),
+ TOBN(0x2ab756f0, 0x4a057a87), TOBN(0xa8765500, 0xa6b48f98),
+ TOBN(0x4252face, 0x68651c44), TOBN(0xa52b540b, 0xe1765e02),
+ TOBN(0x4f922fc5, 0x16a0d2bb), TOBN(0x0d5cc16c, 0x1a623499),
+ TOBN(0x9241cf3a, 0x57c62c8b), TOBN(0x2f5e6961, 0xfd1b667f),
+ TOBN(0x5c15c70b, 0xf5a01797), TOBN(0x3d20b44d, 0x60956192),
+ TOBN(0x04911b37, 0x071fdb52), TOBN(0xf648f916, 0x8d6f0f7b),
+ TOBN(0x6dc1acaf, 0xe60b7cf7), TOBN(0x25860a50, 0x84a9d869),
+ TOBN(0x56fc6f09, 0xe7ba8ac4), TOBN(0x828c5bd0, 0x6148d29e),
+ TOBN(0xac6b435e, 0xdc55ae5f), TOBN(0xa527f56c, 0xc0117411),
+ TOBN(0x94d5045e, 0xfd24342c), TOBN(0x2c4c0a35, 0x70b67c0d),
+ TOBN(0x027cc8b8, 0xfac61d9a), TOBN(0x7d25e062, 0xe3c6fe8a),
+ TOBN(0xe08805bf, 0xe5bff503), TOBN(0x13271e6c, 0x6ff632f7),
+ TOBN(0x55dca6c0, 0x232f76a5), TOBN(0x8957c32d, 0x701ef426),
+ TOBN(0xee728bcb, 0xa10a5178), TOBN(0x5ea60411, 0xb62c5173),
+ TOBN(0xfc4e964e, 0xd0b8892b), TOBN(0x9ea17683, 0x9301bb74),
+ TOBN(0x6265c5ae, 0xfcc48626), TOBN(0xe60cf82e, 0xbb3e9102),
+ TOBN(0x57adf797, 0xd4df5531), TOBN(0x235b59a1, 0x8deeefe2),
+ TOBN(0x60adcf58, 0x3f306eb1), TOBN(0x105c2753, 0x3d09492d),
+ TOBN(0x4090914b, 0xb5def996), TOBN(0x1cb69c83, 0x233dd1e7),
+ TOBN(0xc1e9c1d3, 0x9b3d5e76), TOBN(0x1f3338ed, 0xfccf6012),
+ TOBN(0xb1e95d0d, 0x2f5378a8), TOBN(0xacf4c2c7, 0x2f00cd21),
+ TOBN(0x6e984240, 0xeb5fe290), TOBN(0xd66c038d, 0x248088ae),
+ TOBN(0x804d264a, 0xf94d70cf), TOBN(0xbdb802ef, 0x7314bf7e),
+ TOBN(0x8fb54de2, 0x4333ed02), TOBN(0x740461e0, 0x285635d9),
+ TOBN(0x4113b2c8, 0x365e9383), TOBN(0xea762c83, 0x3fdef652),
+ TOBN(0x4eec6e2e, 0x47b956c1), TOBN(0xa3d814be, 0x65620fa4),
+ TOBN(0x9ad5462b, 0xb4d8bc50), TOBN(0x181c0b16, 0xa9195770),
+ TOBN(0xebd4fe1c, 0x78412a68), TOBN(0xae0341bc, 0xc0dff48c),
+ TOBN(0xb6bc45cf, 0x7003e866), TOBN(0xf11a6dea, 0x8a24a41b),
+ TOBN(0x5407151a, 0xd04c24c2), TOBN(0x62c9d27d, 0xda5b7b68),
+ TOBN(0x2e964235, 0x88cceff6), TOBN(0x8594c54f, 0x8b07ed69),
+ TOBN(0x1578e73c, 0xc84d0d0d), TOBN(0x7b4e1055, 0xff532868),
+ TOBN(0xa348c0d5, 0xb5ec995a), TOBN(0xbf4b9d55, 0x14289a54),
+ TOBN(0x9ba155a6, 0x58fbd777), TOBN(0x186ed7a8, 0x1a84491d),
+ TOBN(0xd4992b30, 0x614c0900), TOBN(0xda98d121, 0xbd00c24b),
+ TOBN(0x7f534dc8, 0x7ec4bfa1), TOBN(0x4a5ff674, 0x37dc34bc),
+ TOBN(0x68c196b8, 0x1d7ea1d7), TOBN(0x38cf2893, 0x80a6d208),
+ TOBN(0xfd56cd09, 0xe3cbbd6e), TOBN(0xec72e27e, 0x4205a5b6),
+ TOBN(0x15ea68f5, 0xa44f77f7), TOBN(0x7aa5f9fd, 0xb43c52bc),
+ TOBN(0x86ff676f, 0x94f0e609), TOBN(0xa4cde963, 0x2e2d432b),
+ TOBN(0x8cafa0c0, 0xeee470af), TOBN(0x84137d0e, 0x8a3f5ec8),
+ TOBN(0xebb40411, 0xfaa31231), TOBN(0xa239c13f, 0x6f7f7ccf),
+ TOBN(0x32865719, 0xa8afd30b), TOBN(0x86798328, 0x8a826dce),
+ TOBN(0xdf04e891, 0xc4a8fbe0), TOBN(0xbb6b6e1b, 0xebf56ad3),
+ TOBN(0x0a695b11, 0x471f1ff0), TOBN(0xd76c3389, 0xbe15baf0),
+ TOBN(0x018edb95, 0xbe96c43e), TOBN(0xf2beaaf4, 0x90794158),
+ TOBN(0x152db09e, 0xc3076a27), TOBN(0x5e82908e, 0xe416545d),
+ TOBN(0xa2c41272, 0x356d6f2e), TOBN(0xdc9c9642, 0x31fd74e1),
+ TOBN(0x66ceb88d, 0x519bf615), TOBN(0xe29ecd76, 0x05a2274e),
+ TOBN(0x3a0473c4, 0xbf5e2fa0), TOBN(0x6b6eb671, 0x64284e67),
+ TOBN(0xe8b97932, 0xb88756dd), TOBN(0xed4e8652, 0xf17e3e61),
+ TOBN(0xc2dd1499, 0x3ee1c4a4), TOBN(0xc0aaee17, 0x597f8c0e),
+ TOBN(0x15c4edb9, 0x6c168af3), TOBN(0x6563c7bf, 0xb39ae875),
+ TOBN(0xadfadb6f, 0x20adb436), TOBN(0xad55e8c9, 0x9a042ac0),
+ TOBN(0x975a1ed8, 0xb76da1f5), TOBN(0x10dfa466, 0xa58acb94),
+ TOBN(0x8dd7f7e3, 0xac060282), TOBN(0x6813e66a, 0x572a051e),
+ TOBN(0xb4ccae1e, 0x350cb901), TOBN(0xb653d656, 0x50cb7822),
+ TOBN(0x42484710, 0xdfab3b87), TOBN(0xcd7ee537, 0x9b670fd0),
+ TOBN(0x0a50b12e, 0x523b8bf6), TOBN(0x8009eb5b, 0x8f910c1b),
+ TOBN(0xf535af82, 0x4a167588), TOBN(0x0f835f9c, 0xfb2a2abd),
+ TOBN(0xf59b2931, 0x2afceb62), TOBN(0xc797df2a, 0x169d383f),
+ TOBN(0xeb3f5fb0, 0x66ac02b0), TOBN(0x029d4c6f, 0xdaa2d0ca),
+ TOBN(0xd4059bc1, 0xafab4bc5), TOBN(0x833f5c6f, 0x56783247),
+ TOBN(0xb5346630, 0x8d2d3605), TOBN(0x83387891, 0xd34d8433),
+ TOBN(0xd973b30f, 0xadd9419a), TOBN(0xbcca1099, 0xafe3fce8),
+ TOBN(0x08178315, 0x0809aac6), TOBN(0x01b7f21a, 0x540f0f11),
+ TOBN(0x65c29219, 0x909523c8), TOBN(0xa62f648f, 0xa3a1c741),
+ TOBN(0x88598d4f, 0x60c9e55a), TOBN(0xbce9141b, 0x0e4f347a),
+ TOBN(0x9af97d84, 0x35f9b988), TOBN(0x0210da62, 0x320475b6),
+ TOBN(0x3c076e22, 0x9191476c), TOBN(0x7520dbd9, 0x44fc7834),
+ TOBN(0x6a6b2cfe, 0xc1ab1bbd), TOBN(0xef8a65be, 0xdc650938),
+ TOBN(0x72855540, 0x805d7bc4), TOBN(0xda389396, 0xed11fdfd),
+ TOBN(0xa9d5bd36, 0x74660876), TOBN(0x11d67c54, 0xb45dff35),
+ TOBN(0x6af7d148, 0xa4f5da94), TOBN(0xbb8d4c3f, 0xc0bbeb31),
+ TOBN(0x87a7ebd1, 0xe0a1b12a), TOBN(0x1e4ef88d, 0x770ba95f),
+ TOBN(0x8c33345c, 0xdc2ae9cb), TOBN(0xcecf1276, 0x01cc8403),
+ TOBN(0x687c012e, 0x1b39b80f), TOBN(0xfd90d0ad, 0x35c33ba4),
+ TOBN(0xa3ef5a67, 0x5c9661c2), TOBN(0x368fc88e, 0xe017429e),
+ TOBN(0xd30c6761, 0x196a2fa2), TOBN(0x931b9817, 0xbd5b312e),
+ TOBN(0xba01000c, 0x72f54a31), TOBN(0xa203d2c8, 0x66eaa541),
+ TOBN(0xf2abdee0, 0x98939db3), TOBN(0xe37d6c2c, 0x3e606c02),
+ TOBN(0xf2921574, 0x521ff643), TOBN(0x2781b3c4, 0xd7e2fca3),
+ TOBN(0x664300b0, 0x7850ec06), TOBN(0xac5a38b9, 0x7d3a10cf),
+ TOBN(0x9233188d, 0xe34ab39d), TOBN(0xe77057e4, 0x5072cbb9),
+ TOBN(0xbcf0c042, 0xb59e78df), TOBN(0x4cfc91e8, 0x1d97de52),
+ TOBN(0x4661a26c, 0x3ee0ca4a), TOBN(0x5620a4c1, 0xfb8507bc),
+ TOBN(0x4b44d4aa, 0x049f842c), TOBN(0xceabc5d5, 0x1540e82b),
+ TOBN(0x306710fd, 0x15c6f156), TOBN(0xbe5ae52b, 0x63db1d72),
+ TOBN(0x06f1e7e6, 0x334957f1), TOBN(0x57e388f0, 0x31144a70),
+ TOBN(0xfb69bb2f, 0xdf96447b), TOBN(0x0f78ebd3, 0x73e38a12),
+ TOBN(0xb8222605, 0x2b7ce542), TOBN(0xe6d4ce99, 0x7472bde1),
+ TOBN(0x53e16ebe, 0x09d2f4da), TOBN(0x180ff42e, 0x53b92b2e),
+ TOBN(0xc59bcc02, 0x2c34a1c6), TOBN(0x3803d6f9, 0x422c46c2),
+ TOBN(0x18aff74f, 0x5c14a8a2), TOBN(0x55aebf80, 0x10a08b28),
+ TOBN(0x66097d58, 0x7135593f), TOBN(0x32e6eff7, 0x2be570cd),
+ TOBN(0x584e6a10, 0x2a8c860d), TOBN(0xcd185890, 0xa2eb4163),
+ TOBN(0x7ceae99d, 0x6d97e134), TOBN(0xd42c6b70, 0xdd8447ce),
+ TOBN(0x59ddbb4a, 0xb8c50273), TOBN(0x03c612df, 0x3cf34e1e),
+ TOBN(0x84b9ca15, 0x04b6c5a0), TOBN(0x35216f39, 0x18f0e3a3),
+ TOBN(0x3ec2d2bc, 0xbd986c00), TOBN(0x8bf546d9, 0xd19228fe),
+ TOBN(0xd1c655a4, 0x4cd623c3), TOBN(0x366ce718, 0x502b8e5a),
+ TOBN(0x2cfc84b4, 0xeea0bfe7), TOBN(0xe01d5cee, 0xcf443e8e),
+ TOBN(0x8ec045d9, 0x036520f8), TOBN(0xdfb3c3d1, 0x92d40e98),
+ TOBN(0x0bac4cce, 0xcc559a04), TOBN(0x35eccae5, 0x240ea6b1),
+ TOBN(0x180b32db, 0xf8a5a0ac), TOBN(0x547972a5, 0xeb699700),
+ TOBN(0xa3765801, 0xca26bca0), TOBN(0x57e09d0e, 0xa647f25a),
+ TOBN(0xb956970e, 0x2fdd23cc), TOBN(0xb80288bc, 0x5682e971),
+ TOBN(0xe6e6d91e, 0x9ae86ebc), TOBN(0x0564c83f, 0x8c9f1939),
+ TOBN(0x551932a2, 0x39560368), TOBN(0xe893752b, 0x049c28e2),
+ TOBN(0x0b03cee5, 0xa6a158c3), TOBN(0xe12d656b, 0x04964263),
+ TOBN(0x4b47554e, 0x63e3bc1d), TOBN(0xc719b6a2, 0x45044ff7),
+ TOBN(0x4f24d30a, 0xe48daa07), TOBN(0xa3f37556, 0xc8c1edc3),
+ TOBN(0x9a47bf76, 0x0700d360), TOBN(0xbb1a1824, 0x822ae4e2),
+ TOBN(0x22e275a3, 0x89f1fb4c), TOBN(0x72b1aa23, 0x9968c5f5),
+ TOBN(0xa75feaca, 0xbe063f64), TOBN(0x9b392f43, 0xbce47a09),
+ TOBN(0xd4241509, 0x1ad07aca), TOBN(0x4b0c591b, 0x8d26cd0f),
+ TOBN(0x2d42ddfd, 0x92f1169a), TOBN(0x63aeb1ac, 0x4cbf2392),
+ TOBN(0x1de9e877, 0x0691a2af), TOBN(0xebe79af7, 0xd98021da),
+ TOBN(0xcfdf2a4e, 0x40e50acf), TOBN(0xf0a98ad7, 0xaf01d665),
+ TOBN(0xefb640bf, 0x1831be1f), TOBN(0x6fe8bd2f, 0x80e9ada0),
+ TOBN(0x94c103a1, 0x6cafbc91), TOBN(0x170f8759, 0x8308e08c),
+ TOBN(0x5de2d2ab, 0x9780ff4f), TOBN(0x666466bc, 0x45b201f2),
+ TOBN(0x58af2010, 0xf5b343bc), TOBN(0x0f2e400a, 0xf2f142fe),
+ TOBN(0x3483bfde, 0xa85f4bdf), TOBN(0xf0b1d093, 0x03bfeaa9),
+ TOBN(0x2ea01b95, 0xc7081603), TOBN(0xe943e4c9, 0x3dba1097),
+ TOBN(0x47be92ad, 0xb438f3a6), TOBN(0x00bb7742, 0xe5bf6636),
+ TOBN(0x136b7083, 0x824297b4), TOBN(0x9d0e5580, 0x5584455f),
+ TOBN(0xab48cedc, 0xf1c7d69e), TOBN(0x53a9e481, 0x2a256e76),
+ TOBN(0x0402b0e0, 0x65eb2413), TOBN(0xdadbbb84, 0x8fc407a7),
+ TOBN(0xa65cd5a4, 0x8d7f5492), TOBN(0x21d44293, 0x74bae294),
+ TOBN(0x66917ce6, 0x3b5f1cc4), TOBN(0x37ae52ea, 0xce872e62),
+ TOBN(0xbb087b72, 0x2905f244), TOBN(0x12077086, 0x1e6af74f),
+ TOBN(0x4b644e49, 0x1058edea), TOBN(0x827510e3, 0xb638ca1d),
+ TOBN(0x8cf2b704, 0x6038591c), TOBN(0xffc8b47a, 0xfe635063),
+ TOBN(0x3ae220e6, 0x1b4d5e63), TOBN(0xbd864742, 0x9d961b4b),
+ TOBN(0x610c107e, 0x9bd16bed), TOBN(0x4270352a, 0x1127147b),
+ TOBN(0x7d17ffe6, 0x64cfc50e), TOBN(0x50dee01a, 0x1e36cb42),
+ TOBN(0x068a7622, 0x35dc5f9a), TOBN(0x9a08d536, 0xdf53f62c),
+ TOBN(0x4ed71457, 0x6be5f7de), TOBN(0xd93006f8, 0xc2263c9e),
+ TOBN(0xe073694c, 0xcacacb36), TOBN(0x2ff7a5b4, 0x3ae118ab),
+ TOBN(0x3cce53f1, 0xcd871236), TOBN(0xf156a39d, 0xc2aa6d52),
+ TOBN(0x9cc5f271, 0xb198d76d), TOBN(0xbc615b6f, 0x81383d39),
+ TOBN(0xa54538e8, 0xde3eee6b), TOBN(0x58c77538, 0xab910d91),
+ TOBN(0x31e5bdbc, 0x58d278bd), TOBN(0x3cde4adf, 0xb963acae),
+ TOBN(0xb1881fd2, 0x5302169c), TOBN(0x8ca60fa0, 0xa989ed8b),
+ TOBN(0xa1999458, 0xff96a0ee), TOBN(0xc1141f03, 0xac6c283d),
+ TOBN(0x7677408d, 0x6dfafed3), TOBN(0x33a01653, 0x39661588),
+ TOBN(0x3c9c15ec, 0x0b726fa0), TOBN(0x090cfd93, 0x6c9b56da),
+ TOBN(0xe34f4bae, 0xa3c40af5), TOBN(0x3469eadb, 0xd21129f1),
+ TOBN(0xcc51674a, 0x1e207ce8), TOBN(0x1e293b24, 0xc83b1ef9),
+ TOBN(0x17173d13, 0x1e6c0bb4), TOBN(0x19004695, 0x90776d35),
+ TOBN(0xe7980e34, 0x6de6f922), TOBN(0x873554cb, 0xf4dd9a22),
+ TOBN(0x0316c627, 0xcbf18a51), TOBN(0x4d93651b, 0x3032c081),
+ TOBN(0x207f2771, 0x3946834d), TOBN(0x2c08d7b4, 0x30cdbf80),
+ TOBN(0x137a4fb4, 0x86df2a61), TOBN(0xa1ed9c07, 0xecf7b4a2),
+ TOBN(0xb2e460e2, 0x7bd042ff), TOBN(0xb7f5e2fa, 0x5f62f5ec),
+ TOBN(0x7aa6ec6b, 0xcc2423b7), TOBN(0x75ce0a7f, 0xba63eea7),
+ TOBN(0x67a45fb1, 0xf250a6e1), TOBN(0x93bc919c, 0xe53cdc9f),
+ TOBN(0x9271f56f, 0x871942df), TOBN(0x2372ff6f, 0x7859ad66),
+ TOBN(0x5f4c2b96, 0x33cb1a78), TOBN(0xe3e29101, 0x5838aa83),
+ TOBN(0xa7ed1611, 0xe4e8110c), TOBN(0x2a2d70d5, 0x330198ce),
+ TOBN(0xbdf132e8, 0x6720efe0), TOBN(0xe61a8962, 0x66a471bf),
+ TOBN(0x796d3a85, 0x825808bd), TOBN(0x51dc3cb7, 0x3fd6e902),
+ TOBN(0x643c768a, 0x916219d1), TOBN(0x36cd7685, 0xa2ad7d32),
+ TOBN(0xe3db9d05, 0xb22922a4), TOBN(0x6494c87e, 0xdba29660),
+ TOBN(0xf0ac91df, 0xbcd2ebc7), TOBN(0x4deb57a0, 0x45107f8d),
+ TOBN(0x42271f59, 0xc3d12a73), TOBN(0x5f71687c, 0xa5c2c51d),
+ TOBN(0xcb1f50c6, 0x05797bcb), TOBN(0x29ed0ed9, 0xd6d34eb0),
+ TOBN(0xe5fe5b47, 0x4683c2eb), TOBN(0x4956eeb5, 0x97447c46),
+ TOBN(0x5b163a43, 0x71207167), TOBN(0x93fa2fed, 0x0248c5ef),
+ TOBN(0x67930af2, 0x31f63950), TOBN(0xa77797c1, 0x14caa2c9),
+ TOBN(0x526e80ee, 0x27ac7e62), TOBN(0xe1e6e626, 0x58b28aec),
+ TOBN(0x636178b0, 0xb3c9fef0), TOBN(0xaf7752e0, 0x6d5f90be),
+ TOBN(0x94ecaf18, 0xeece51cf), TOBN(0x2864d0ed, 0xca806e1f),
+ TOBN(0x6de2e383, 0x97c69134), TOBN(0x5a42c316, 0xeb291293),
+ TOBN(0xc7779219, 0x6a60bae0), TOBN(0xa24de346, 0x6b7599d1),
+ TOBN(0x49d374aa, 0xb75d4941), TOBN(0x98900586, 0x2d501ff0),
+ TOBN(0x9f16d40e, 0xeb7974cf), TOBN(0x1033860b, 0xcdd8c115),
+ TOBN(0xb6c69ac8, 0x2094cec3), TOBN(0x9976fb88, 0x403b770c),
+ TOBN(0x1dea026c, 0x4859590d), TOBN(0xb6acbb46, 0x8562d1fd),
+ TOBN(0x7cd6c461, 0x44569d85), TOBN(0xc3190a36, 0x97f0891d),
+ TOBN(0xc6f53195, 0x48d5a17d), TOBN(0x7d919966, 0xd749abc8),
+ TOBN(0x65104837, 0xdd1c8a20), TOBN(0x7e5410c8, 0x2f683419),
+ TOBN(0x958c3ca8, 0xbe94022e), TOBN(0x605c3197, 0x6145dac2),
+ TOBN(0x3fc07501, 0x01683d54), TOBN(0x1d7127c5, 0x595b1234),
+ TOBN(0x10b8f87c, 0x9481277f), TOBN(0x677db2a8, 0xe65a1adb),
+ TOBN(0xec2fccaa, 0xddce3345), TOBN(0x2a6811b7, 0x012a4350),
+ TOBN(0x96760ff1, 0xac598bdc), TOBN(0x054d652a, 0xd1bf4128),
+ TOBN(0x0a1151d4, 0x92a21005), TOBN(0xad7f3971, 0x33110fdf),
+ TOBN(0x8c95928c, 0x1960100f), TOBN(0x6c91c825, 0x7bf03362),
+ TOBN(0xc8c8b2a2, 0xce309f06), TOBN(0xfdb27b59, 0xca27204b),
+ TOBN(0xd223eaa5, 0x0848e32e), TOBN(0xb93e4b2e, 0xe7bfaf1e),
+ TOBN(0xc5308ae6, 0x44aa3ded), TOBN(0x317a666a, 0xc015d573),
+ TOBN(0xc888ce23, 0x1a979707), TOBN(0xf141c1e6, 0x0d5c4958),
+ TOBN(0xb53b7de5, 0x61906373), TOBN(0x858dbade, 0xeb999595),
+ TOBN(0x8cbb47b2, 0xa59e5c36), TOBN(0x660318b3, 0xdcf4e842),
+ TOBN(0xbd161ccd, 0x12ba4b7a), TOBN(0xf399daab, 0xf8c8282a),
+ TOBN(0x1587633a, 0xeeb2130d), TOBN(0xa465311a, 0xda38dd7d),
+ TOBN(0x5f75eec8, 0x64d3779b), TOBN(0x3c5d0476, 0xad64c171),
+ TOBN(0x87410371, 0x2a914428), TOBN(0x8096a891, 0x90e2fc29),
+ TOBN(0xd3d2ae9d, 0x23b3ebc2), TOBN(0x90bdd6db, 0xa580cfd6),
+ TOBN(0x52dbb7f3, 0xc5b01f6c), TOBN(0xe68eded4, 0xe102a2dc),
+ TOBN(0x17785b77, 0x99eb6df0), TOBN(0x26c3cc51, 0x7386b779),
+ TOBN(0x345ed988, 0x6417a48e), TOBN(0xe990b4e4, 0x07d6ef31),
+ TOBN(0x0f456b7e, 0x2586abba), TOBN(0x239ca6a5, 0x59c96e9a),
+ TOBN(0xe327459c, 0xe2eb4206), TOBN(0x3a4c3313, 0xa002b90a),
+ TOBN(0x2a114806, 0xf6a3f6fb), TOBN(0xad5cad2f, 0x85c251dd),
+ TOBN(0x92c1f613, 0xf5a784d3), TOBN(0xec7bfacf, 0x349766d5),
+ TOBN(0x04b3cd33, 0x3e23cb3b), TOBN(0x3979fe84, 0xc5a64b2d),
+ TOBN(0x192e2720, 0x7e589106), TOBN(0xa60c43d1, 0xa15b527f),
+ TOBN(0x2dae9082, 0xbe7cf3a6), TOBN(0xcc86ba92, 0xbc967274),
+ TOBN(0xf28a2ce8, 0xaea0a8a9), TOBN(0x404ca6d9, 0x6ee988b3),
+ TOBN(0xfd7e9c5d, 0x005921b8), TOBN(0xf56297f1, 0x44e79bf9),
+ TOBN(0xa163b460, 0x0d75ddc2), TOBN(0x30b23616, 0xa1f2be87),
+ TOBN(0x4b070d21, 0xbfe50e2b), TOBN(0x7ef8cfd0, 0xe1bfede1),
+ TOBN(0xadba0011, 0x2aac4ae0), TOBN(0x2a3e7d01, 0xb9ebd033),
+ TOBN(0x995277ec, 0xe38d9d1c), TOBN(0xb500249e, 0x9c5d2de3),
+ TOBN(0x8912b820, 0xf13ca8c9), TOBN(0xc8798114, 0x877793af),
+ TOBN(0x19e6125d, 0xec3f1dec), TOBN(0x07b1f040, 0x911178da),
+ TOBN(0xd93ededa, 0x904a6738), TOBN(0x55187a5a, 0x0bebedcd),
+ TOBN(0xf7d04722, 0xeb329d41), TOBN(0xf449099e, 0xf170b391),
+ TOBN(0xfd317a69, 0xca99f828), TOBN(0x50c3db2b, 0x34a4976d),
+ TOBN(0xe9ba7784, 0x3757b392), TOBN(0x326caefd, 0xaa3ca05a),
+ TOBN(0x78e5293b, 0xf1e593d4), TOBN(0x7842a937, 0x0d98fd13),
+ TOBN(0xe694bf96, 0x5f96b10d), TOBN(0x373a9df6, 0x06a8cd05),
+ TOBN(0x997d1e51, 0xe8f0c7fc), TOBN(0x1d019790, 0x63fd972e),
+ TOBN(0x0064d858, 0x5499fb32), TOBN(0x7b67bad9, 0x77a8aeb7),
+ TOBN(0x1d3eb977, 0x2d08eec5), TOBN(0x5fc047a6, 0xcbabae1d),
+ TOBN(0x0577d159, 0xe54a64bb), TOBN(0x8862201b, 0xc43497e4),
+ TOBN(0xad6b4e28, 0x2ce0608d), TOBN(0x8b687b7d, 0x0b167aac),
+ TOBN(0x6ed4d367, 0x8b2ecfa9), TOBN(0x24dfe62d, 0xa90c3c38),
+ TOBN(0xa1862e10, 0x3fe5c42b), TOBN(0x1ca73dca, 0xd5732a9f),
+ TOBN(0x35f038b7, 0x76bb87ad), TOBN(0x674976ab, 0xf242b81f),
+ TOBN(0x4f2bde7e, 0xb0fd90cd), TOBN(0x6efc172e, 0xa7fdf092),
+ TOBN(0x3806b69b, 0x92222f1f), TOBN(0x5a2459ca, 0x6cf7ae70),
+ TOBN(0x6789f69c, 0xa85217ee), TOBN(0x5f232b5e, 0xe3dc85ac),
+ TOBN(0x660e3ec5, 0x48e9e516), TOBN(0x124b4e47, 0x3197eb31),
+ TOBN(0x10a0cb13, 0xaafcca23), TOBN(0x7bd63ba4, 0x8213224f),
+ TOBN(0xaffad7cc, 0x290a7f4f), TOBN(0x6b409c9e, 0x0286b461),
+ TOBN(0x58ab809f, 0xffa407af), TOBN(0xc3122eed, 0xc68ac073),
+ TOBN(0x17bf9e50, 0x4ef24d7e), TOBN(0x5d929794, 0x3e2a5811),
+ TOBN(0x519bc867, 0x02902e01), TOBN(0x76bba5da, 0x39c8a851),
+ TOBN(0xe9f9669c, 0xda94951e), TOBN(0x4b6af58d, 0x66b8d418),
+ TOBN(0xfa321074, 0x17d426a4), TOBN(0xc78e66a9, 0x9dde6027),
+ TOBN(0x0516c083, 0x4a53b964), TOBN(0xfc659d38, 0xff602330),
+ TOBN(0x0ab55e5c, 0x58c5c897), TOBN(0x985099b2, 0x838bc5df),
+ TOBN(0x061d9efc, 0xc52fc238), TOBN(0x712b2728, 0x6ac1da3f),
+ TOBN(0xfb658149, 0x9283fe08), TOBN(0x4954ac94, 0xb8aaa2f7),
+ TOBN(0x85c0ada4, 0x7fb2e74f), TOBN(0xee8ba98e, 0xb89926b0),
+ TOBN(0xe4f9d37d, 0x23d1af5b), TOBN(0x14ccdbf9, 0xba9b015e),
+ TOBN(0xb674481b, 0x7bfe7178), TOBN(0x4e1debae, 0x65405868),
+ TOBN(0x061b2821, 0xc48c867d), TOBN(0x69c15b35, 0x513b30ea),
+ TOBN(0x3b4a1666, 0x36871088), TOBN(0xe5e29f5d, 0x1220b1ff),
+ TOBN(0x4b82bb35, 0x233d9f4d), TOBN(0x4e076333, 0x18cdc675)}
+ ,
+ {TOBN(0x0d53f5c7, 0xa3e6fced), TOBN(0xe8cbbdd5, 0xf45fbdeb),
+ TOBN(0xf85c01df, 0x13339a70), TOBN(0x0ff71880, 0x142ceb81),
+ TOBN(0x4c4e8774, 0xbd70437a), TOBN(0x5fb32891, 0xba0bda6a),
+ TOBN(0x1cdbebd2, 0xf18bd26e), TOBN(0x2f9526f1, 0x03a9d522),
+ TOBN(0x40ce3051, 0x92c4d684), TOBN(0x8b04d725, 0x7612efcd),
+ TOBN(0xb9dcda36, 0x6f9cae20), TOBN(0x0edc4d24, 0xf058856c),
+ TOBN(0x64f2e6bf, 0x85427900), TOBN(0x3de81295, 0xdc09dfea),
+ TOBN(0xd41b4487, 0x379bf26c), TOBN(0x50b62c6d, 0x6df135a9),
+ TOBN(0xd4f8e3b4, 0xc72dfe67), TOBN(0xc416b0f6, 0x90e19fdf),
+ TOBN(0x18b9098d, 0x4c13bd35), TOBN(0xac11118a, 0x15b8cb9e),
+ TOBN(0xf598a318, 0xf0062841), TOBN(0xbfe0602f, 0x89f356f4),
+ TOBN(0x7ae3637e, 0x30177a0c), TOBN(0x34097747, 0x61136537),
+ TOBN(0x0db2fb5e, 0xd005832a), TOBN(0x5f5efd3b, 0x91042e4f),
+ TOBN(0x8c4ffdc6, 0xed70f8ca), TOBN(0xe4645d0b, 0xb52da9cc),
+ TOBN(0x9596f58b, 0xc9001d1f), TOBN(0x52c8f0bc, 0x4e117205),
+ TOBN(0xfd4aa0d2, 0xe398a084), TOBN(0x815bfe3a, 0x104f49de),
+ TOBN(0x97e5443f, 0x23885e5f), TOBN(0xf72f8f99, 0xe8433aab),
+ TOBN(0xbd00b154, 0xe4d4e604), TOBN(0xd0b35e6a, 0xe5e173ff),
+ TOBN(0x57b2a048, 0x9164722d), TOBN(0x3e3c665b, 0x88761ec8),
+ TOBN(0x6bdd1397, 0x3da83832), TOBN(0x3c8b1a1e, 0x73dafe3b),
+ TOBN(0x4497ace6, 0x54317cac), TOBN(0xbe600ab9, 0x521771b3),
+ TOBN(0xb42e409e, 0xb0dfe8b8), TOBN(0x386a67d7, 0x3942310f),
+ TOBN(0x25548d8d, 0x4431cc28), TOBN(0xa7cff142, 0x985dc524),
+ TOBN(0x4d60f5a1, 0x93c4be32), TOBN(0x83ebd5c8, 0xd071c6e1),
+ TOBN(0xba3a80a7, 0xb1fd2b0b), TOBN(0x9b3ad396, 0x5bec33e8),
+ TOBN(0xb3868d61, 0x79743fb3), TOBN(0xcfd169fc, 0xfdb462fa),
+ TOBN(0xd3b499d7, 0x9ce0a6af), TOBN(0x55dc1cf1, 0xe42d3ff8),
+ TOBN(0x04fb9e6c, 0xc6c3e1b2), TOBN(0x47e6961d, 0x6f69a474),
+ TOBN(0x54eb3acc, 0xe548b37b), TOBN(0xb38e7542, 0x84d40549),
+ TOBN(0x8c3daa51, 0x7b341b4f), TOBN(0x2f6928ec, 0x690bf7fa),
+ TOBN(0x0496b323, 0x86ce6c41), TOBN(0x01be1c55, 0x10adadcd),
+ TOBN(0xc04e67e7, 0x4bb5faf9), TOBN(0x3cbaf678, 0xe15c9985),
+ TOBN(0x8cd12145, 0x50ca4247), TOBN(0xba1aa47a, 0xe7dd30aa),
+ TOBN(0x2f81ddf1, 0xe58fee24), TOBN(0x03452936, 0xeec9b0e8),
+ TOBN(0x8bdc3b81, 0x243aea96), TOBN(0x9a2919af, 0x15c3d0e5),
+ TOBN(0x9ea640ec, 0x10948361), TOBN(0x5ac86d5b, 0x6e0bcccf),
+ TOBN(0xf892d918, 0xc36cf440), TOBN(0xaed3e837, 0xc939719c),
+ TOBN(0xb07b08d2, 0xc0218b64), TOBN(0x6f1bcbba, 0xce9790dd),
+ TOBN(0x4a84d6ed, 0x60919b8e), TOBN(0xd8900791, 0x8ac1f9eb),
+ TOBN(0xf84941aa, 0x0dd5daef), TOBN(0xb22fe40a, 0x67fd62c5),
+ TOBN(0x97e15ba2, 0x157f2db3), TOBN(0xbda2fc8f, 0x8e28ca9c),
+ TOBN(0x5d050da4, 0x37b9f454), TOBN(0x3d57eb57, 0x2379d72e),
+ TOBN(0xe9b5eba2, 0xfb5ee997), TOBN(0x01648ca2, 0xe11538ca),
+ TOBN(0x32bb76f6, 0xf6327974), TOBN(0x338f14b8, 0xff3f4bb7),
+ TOBN(0x524d226a, 0xd7ab9a2d), TOBN(0x9c00090d, 0x7dfae958),
+ TOBN(0x0ba5f539, 0x8751d8c2), TOBN(0x8afcbcdd, 0x3ab8262d),
+ TOBN(0x57392729, 0xe99d043b), TOBN(0xef51263b, 0xaebc943a),
+ TOBN(0x9feace93, 0x20862935), TOBN(0x639efc03, 0xb06c817b),
+ TOBN(0x1fe054b3, 0x66b4be7a), TOBN(0x3f25a9de, 0x84a37a1e),
+ TOBN(0xf39ef1ad, 0x78d75cd9), TOBN(0xd7b58f49, 0x5062c1b5),
+ TOBN(0x6f74f9a9, 0xff563436), TOBN(0xf718ff29, 0xe8af51e7),
+ TOBN(0x5234d313, 0x15e97fec), TOBN(0xb6a8e2b1, 0x292f1c0a),
+ TOBN(0xa7f53aa8, 0x327720c1), TOBN(0x956ca322, 0xba092cc8),
+ TOBN(0x8f03d64a, 0x28746c4d), TOBN(0x51fe1782, 0x66d0d392),
+ TOBN(0xd19b34db, 0x3c832c80), TOBN(0x60dccc5c, 0x6da2e3b4),
+ TOBN(0x245dd62e, 0x0a104ccc), TOBN(0xa7ab1de1, 0x620b21fd),
+ TOBN(0xb293ae0b, 0x3893d123), TOBN(0xf7b75783, 0xb15ee71c),
+ TOBN(0x5aa3c614, 0x42a9468b), TOBN(0xd686123c, 0xdb15d744),
+ TOBN(0x8c616891, 0xa7ab4116), TOBN(0x6fcd72c8, 0xa4e6a459),
+ TOBN(0xac219110, 0x77e5fad7), TOBN(0xfb6a20e7, 0x704fa46b),
+ TOBN(0xe839be7d, 0x341d81dc), TOBN(0xcddb6889, 0x32148379),
+ TOBN(0xda6211a1, 0xf7026ead), TOBN(0xf3b2575f, 0xf4d1cc5e),
+ TOBN(0x40cfc8f6, 0xa7a73ae6), TOBN(0x83879a5e, 0x61d5b483),
+ TOBN(0xc5acb1ed, 0x41a50ebc), TOBN(0x59a60cc8, 0x3c07d8fa),
+ TOBN(0x1b73bdce, 0xb1876262), TOBN(0x2b0d79f0, 0x12af4ee9),
+ TOBN(0x8bcf3b0b, 0xd46e1d07), TOBN(0x17d6af9d, 0xe45d152f),
+ TOBN(0x73520461, 0x6d736451), TOBN(0x43cbbd97, 0x56b0bf5a),
+ TOBN(0xb0833a5b, 0xd5999b9d), TOBN(0x702614f0, 0xeb72e398),
+ TOBN(0x0aadf01a, 0x59c3e9f8), TOBN(0x40200e77, 0xce6b3d16),
+ TOBN(0xda22bdd3, 0xdeddafad), TOBN(0x76dedaf4, 0x310d72e1),
+ TOBN(0x49ef807c, 0x4bc2e88f), TOBN(0x6ba81291, 0x146dd5a5),
+ TOBN(0xa1a4077a, 0x7d8d59e9), TOBN(0x87b6a2e7, 0x802db349),
+ TOBN(0xd5679997, 0x1b4e598e), TOBN(0xf499ef1f, 0x06fe4b1d),
+ TOBN(0x3978d3ae, 0xfcb267c5), TOBN(0xb582b557, 0x235786d0),
+ TOBN(0x32b3b2ca, 0x1715cb07), TOBN(0x4c3de6a2, 0x8480241d),
+ TOBN(0x63b5ffed, 0xcb571ecd), TOBN(0xeaf53900, 0xed2fe9a9),
+ TOBN(0xdec98d4a, 0xc3b81990), TOBN(0x1cb83722, 0x9e0cc8fe),
+ TOBN(0xfe0b0491, 0xd2b427b9), TOBN(0x0f2386ac, 0xe983a66c),
+ TOBN(0x930c4d1e, 0xb3291213), TOBN(0xa2f82b2e, 0x59a62ae4),
+ TOBN(0x77233853, 0xf93e89e3), TOBN(0x7f8063ac, 0x11777c7f),
+ TOBN(0xff0eb567, 0x59ad2877), TOBN(0x6f454642, 0x9865c754),
+ TOBN(0xe6fe701a, 0x236e9a84), TOBN(0xc586ef16, 0x06e40fc3),
+ TOBN(0x3f62b6e0, 0x24bafad9), TOBN(0xc8b42bd2, 0x64da906a),
+ TOBN(0xc98e1eb4, 0xda3276a0), TOBN(0x30d0e5fc, 0x06cbf852),
+ TOBN(0x1b6b2ae1, 0xe8b4dfd4), TOBN(0xd754d5c7, 0x8301cbac),
+ TOBN(0x66097629, 0x112a39ac), TOBN(0xf86b5999, 0x93ba4ab9),
+ TOBN(0x26c9dea7, 0x99f9d581), TOBN(0x0473b1a8, 0xc2fafeaa),
+ TOBN(0x1469af55, 0x3b2505a5), TOBN(0x227d16d7, 0xd6a43323),
+ TOBN(0x3316f73c, 0xad3d97f9), TOBN(0x52bf3bb5, 0x1f137455),
+ TOBN(0x953eafeb, 0x09954e7c), TOBN(0xa721dfed, 0xdd732411),
+ TOBN(0xb4929821, 0x141d4579), TOBN(0x3411321c, 0xaa3bd435),
+ TOBN(0xafb355aa, 0x17fa6015), TOBN(0xb4e7ef4a, 0x18e42f0e),
+ TOBN(0x604ac97c, 0x59371000), TOBN(0xe1c48c70, 0x7f759c18),
+ TOBN(0x3f62ecc5, 0xa5db6b65), TOBN(0x0a78b173, 0x38a21495),
+ TOBN(0x6be1819d, 0xbcc8ad94), TOBN(0x70dc04f6, 0xd89c3400),
+ TOBN(0x462557b4, 0xa6b4840a), TOBN(0x544c6ade, 0x60bd21c0),
+ TOBN(0x6a00f24e, 0x907a544b), TOBN(0xa7520dcb, 0x313da210),
+ TOBN(0xfe939b75, 0x11e4994b), TOBN(0x918b6ba6, 0xbc275d70),
+ TOBN(0xd3e5e0fc, 0x644be892), TOBN(0x707a9816, 0xfdaf6c42),
+ TOBN(0x60145567, 0xf15c13fe), TOBN(0x4818ebaa, 0xe130a54a),
+ TOBN(0x28aad3ad, 0x58d2f767), TOBN(0xdc5267fd, 0xd7e7c773),
+ TOBN(0x4919cc88, 0xc3afcc98), TOBN(0xaa2e6ab0, 0x2db8cd4b),
+ TOBN(0xd46fec04, 0xd0c63eaa), TOBN(0xa1cb92c5, 0x19ffa832),
+ TOBN(0x678dd178, 0xe43a631f), TOBN(0xfb5ae1cd, 0x3dc788b3),
+ TOBN(0x68b4fb90, 0x6e77de04), TOBN(0x7992bcf0, 0xf06dbb97),
+ TOBN(0x896e6a13, 0xc417c01d), TOBN(0x8d96332c, 0xb956be01),
+ TOBN(0x902fc93a, 0x413aa2b9), TOBN(0x99a4d915, 0xfc98c8a5),
+ TOBN(0x52c29407, 0x565f1137), TOBN(0x4072690f, 0x21e4f281),
+ TOBN(0x36e607cf, 0x02ff6072), TOBN(0xa47d2ca9, 0x8ad98cdc),
+ TOBN(0xbf471d1e, 0xf5f56609), TOBN(0xbcf86623, 0xf264ada0),
+ TOBN(0xb70c0687, 0xaa9e5cb6), TOBN(0xc98124f2, 0x17401c6c),
+ TOBN(0x8189635f, 0xd4a61435), TOBN(0xd28fb8af, 0xa9d98ea6),
+ TOBN(0xb9a67c2a, 0x40c251f8), TOBN(0x88cd5d87, 0xa2da44be),
+ TOBN(0x437deb96, 0xe09b5423), TOBN(0x150467db, 0x64287dc1),
+ TOBN(0xe161debb, 0xcdabb839), TOBN(0xa79e9742, 0xf1839a3e),
+ TOBN(0xbb8dd3c2, 0x652d202b), TOBN(0x7b3e67f7, 0xe9f97d96),
+ TOBN(0x5aa5d78f, 0xb1cb6ac9), TOBN(0xffa13e8e, 0xca1d0d45),
+ TOBN(0x369295dd, 0x2ba5bf95), TOBN(0xd68bd1f8, 0x39aff05e),
+ TOBN(0xaf0d86f9, 0x26d783f2), TOBN(0x543a59b3, 0xfc3aafc1),
+ TOBN(0x3fcf81d2, 0x7b7da97c), TOBN(0xc990a056, 0xd25dee46),
+ TOBN(0x3e6775b8, 0x519cce2c), TOBN(0xfc9af71f, 0xae13d863),
+ TOBN(0x774a4a6f, 0x47c1605c), TOBN(0x46ba4245, 0x2fd205e8),
+ TOBN(0xa06feea4, 0xd3fd524d), TOBN(0x1e724641, 0x6de1acc2),
+ TOBN(0xf53816f1, 0x334e2b42), TOBN(0x49e5918e, 0x922f0024),
+ TOBN(0x439530b6, 0x65c7322d), TOBN(0xcf12cc01, 0xb3c1b3fb),
+ TOBN(0xc70b0186, 0x0172f685), TOBN(0xb915ee22, 0x1b58391d),
+ TOBN(0x9afdf03b, 0xa317db24), TOBN(0x87dec659, 0x17b8ffc4),
+ TOBN(0x7f46597b, 0xe4d3d050), TOBN(0x80a1c1ed, 0x006500e7),
+ TOBN(0x84902a96, 0x78bf030e), TOBN(0xfb5e9c9a, 0x50560148),
+ TOBN(0x6dae0a92, 0x63362426), TOBN(0xdcaeecf4, 0xa9e30c40),
+ TOBN(0xc0d887bb, 0x518d0c6b), TOBN(0x99181152, 0xcb985b9d),
+ TOBN(0xad186898, 0xef7bc381), TOBN(0x18168ffb, 0x9ee46201),
+ TOBN(0x9a04cdaa, 0x2502753c), TOBN(0xbb279e26, 0x51407c41),
+ TOBN(0xeacb03aa, 0xf23564e5), TOBN(0x18336582, 0x71e61016),
+ TOBN(0x8684b8c4, 0xeb809877), TOBN(0xb336e18d, 0xea0e672e),
+ TOBN(0xefb601f0, 0x34ee5867), TOBN(0x2733edbe, 0x1341cfd1),
+ TOBN(0xb15e809a, 0x26025c3c), TOBN(0xe6e981a6, 0x9350df88),
+ TOBN(0x92376237, 0x8502fd8e), TOBN(0x4791f216, 0x0c12be9b),
+ TOBN(0xb7256789, 0x25f02425), TOBN(0xec863194, 0x7a974443),
+ TOBN(0x7c0ce882, 0xfb41cc52), TOBN(0xc266ff7e, 0xf25c07f2),
+ TOBN(0x3d4da8c3, 0x017025f3), TOBN(0xefcf628c, 0xfb9579b4),
+ TOBN(0x5c4d0016, 0x1f3716ec), TOBN(0x9c27ebc4, 0x6801116e),
+ TOBN(0x5eba0ea1, 0x1da1767e), TOBN(0xfe151452, 0x47004c57),
+ TOBN(0x3ace6df6, 0x8c2373b7), TOBN(0x75c3dffe, 0x5dbc37ac),
+ TOBN(0x3dc32a73, 0xddc925fc), TOBN(0xb679c841, 0x2f65ee0b),
+ TOBN(0x715a3295, 0x451cbfeb), TOBN(0xd9889768, 0xf76e9a29),
+ TOBN(0xec20ce7f, 0xb28ad247), TOBN(0xe99146c4, 0x00894d79),
+ TOBN(0x71457d7c, 0x9f5e3ea7), TOBN(0x097b2662, 0x38030031),
+ TOBN(0xdb7f6ae6, 0xcf9f82a8), TOBN(0x319decb9, 0x438f473a),
+ TOBN(0xa63ab386, 0x283856c3), TOBN(0x13e3172f, 0xb06a361b),
+ TOBN(0x2959f8dc, 0x7d5a006c), TOBN(0x2dbc27c6, 0x75fba752),
+ TOBN(0xc1227ab2, 0x87c22c9e), TOBN(0x06f61f75, 0x71a268b2),
+ TOBN(0x1b6bb971, 0x04779ce2), TOBN(0xaca83812, 0x0aadcb1d),
+ TOBN(0x297ae0bc, 0xaeaab2d5), TOBN(0xa5c14ee7, 0x5bfb9f13),
+ TOBN(0xaa00c583, 0xf17a62c7), TOBN(0x39eb962c, 0x173759f6),
+ TOBN(0x1eeba1d4, 0x86c9a88f), TOBN(0x0ab6c37a, 0xdf016c5e),
+ TOBN(0xa2a147db, 0xa28a0749), TOBN(0x246c20d6, 0xee519165),
+ TOBN(0x5068d1b1, 0xd3810715), TOBN(0xb1e7018c, 0x748160b9),
+ TOBN(0x03f5b1fa, 0xf380ff62), TOBN(0xef7fb1dd, 0xf3cb2c1e),
+ TOBN(0xeab539a8, 0xfc91a7da), TOBN(0x83ddb707, 0xf3f9b561),
+ TOBN(0xc550e211, 0xfe7df7a4), TOBN(0xa7cd07f2, 0x063f6f40),
+ TOBN(0xb0de3635, 0x2976879c), TOBN(0xb5f83f85, 0xe55741da),
+ TOBN(0x4ea9d25e, 0xf3d8ac3d), TOBN(0x6fe2066f, 0x62819f02),
+ TOBN(0x4ab2b9c2, 0xcef4a564), TOBN(0x1e155d96, 0x5ffa2de3),
+ TOBN(0x0eb0a19b, 0xc3a72d00), TOBN(0x4037665b, 0x8513c31b),
+ TOBN(0x2fb2b6bf, 0x04c64637), TOBN(0x45c34d6e, 0x08cdc639),
+ TOBN(0x56f1e10f, 0xf01fd796), TOBN(0x4dfb8101, 0xfe3667b8),
+ TOBN(0xe0eda253, 0x9021d0c0), TOBN(0x7a94e9ff, 0x8a06c6ab),
+ TOBN(0x2d3bb0d9, 0xbb9aa882), TOBN(0xea20e4e5, 0xec05fd10),
+ TOBN(0xed7eeb5f, 0x1a1ca64e), TOBN(0x2fa6b43c, 0xc6327cbd),
+ TOBN(0xb577e3cf, 0x3aa91121), TOBN(0x8c6bd5ea, 0x3a34079b),
+ TOBN(0xd7e5ba39, 0x60e02fc0), TOBN(0xf16dd2c3, 0x90141bf8),
+ TOBN(0xb57276d9, 0x80101b98), TOBN(0x760883fd, 0xb82f0f66),
+ TOBN(0x89d7de75, 0x4bc3eff3), TOBN(0x03b60643, 0x5dc2ab40),
+ TOBN(0xcd6e53df, 0xe05beeac), TOBN(0xf2f1e862, 0xbc3325cd),
+ TOBN(0xdd0f7921, 0x774f03c3), TOBN(0x97ca7221, 0x4552cc1b),
+ TOBN(0x5a0d6afe, 0x1cd19f72), TOBN(0xa20915dc, 0xf183fbeb),
+ TOBN(0x9fda4b40, 0x832c403c), TOBN(0x32738edd, 0xbe425442),
+ TOBN(0x469a1df6, 0xb5eccf1a), TOBN(0x4b5aff42, 0x28bbe1f0),
+ TOBN(0x31359d7f, 0x570dfc93), TOBN(0xa18be235, 0xf0088628),
+ TOBN(0xa5b30fba, 0xb00ed3a9), TOBN(0x34c61374, 0x73cdf8be),
+ TOBN(0x2c5c5f46, 0xabc56797), TOBN(0x5cecf93d, 0xb82a8ae2),
+ TOBN(0x7d3dbe41, 0xa968fbf0), TOBN(0xd23d4583, 0x1a5c7f3d),
+ TOBN(0xf28f69a0, 0xc087a9c7), TOBN(0xc2d75471, 0x474471ca),
+ TOBN(0x36ec9f4a, 0x4eb732ec), TOBN(0x6c943bbd, 0xb1ca6bed),
+ TOBN(0xd64535e1, 0xf2457892), TOBN(0x8b84a8ea, 0xf7e2ac06),
+ TOBN(0xe0936cd3, 0x2499dd5f), TOBN(0x12053d7e, 0x0ed04e57),
+ TOBN(0x4bdd0076, 0xe4305d9d), TOBN(0x34a527b9, 0x1f67f0a2),
+ TOBN(0xe79a4af0, 0x9cec46ea), TOBN(0xb15347a1, 0x658b9bc7),
+ TOBN(0x6bd2796f, 0x35af2f75), TOBN(0xac957990, 0x4051c435),
+ TOBN(0x2669dda3, 0xc33a655d), TOBN(0x5d503c2e, 0x88514aa3),
+ TOBN(0xdfa11337, 0x3753dd41), TOBN(0x3f054673, 0x0b754f78),
+ TOBN(0xbf185677, 0x496125bd), TOBN(0xfb0023c8, 0x3775006c),
+ TOBN(0xfa0f072f, 0x3a037899), TOBN(0x4222b6eb, 0x0e4aea57),
+ TOBN(0x3dde5e76, 0x7866d25a), TOBN(0xb6eb04f8, 0x4837aa6f),
+ TOBN(0x5315591a, 0x2cf1cdb8), TOBN(0x6dfb4f41, 0x2d4e683c),
+ TOBN(0x7e923ea4, 0x48ee1f3a), TOBN(0x9604d9f7, 0x05a2afd5),
+ TOBN(0xbe1d4a33, 0x40ea4948), TOBN(0x5b45f1f4, 0xb44cbd2f),
+ TOBN(0x5faf8376, 0x4acc757e), TOBN(0xa7cf9ab8, 0x63d68ff7),
+ TOBN(0x8ad62f69, 0xdf0e404b), TOBN(0xd65f33c2, 0x12bdafdf),
+ TOBN(0xc365de15, 0xa377b14e), TOBN(0x6bf5463b, 0x8e39f60c),
+ TOBN(0x62030d2d, 0x2ce68148), TOBN(0xd95867ef, 0xe6f843a8),
+ TOBN(0xd39a0244, 0xef5ab017), TOBN(0x0bd2d8c1, 0x4ab55d12),
+ TOBN(0xc9503db3, 0x41639169), TOBN(0x2d4e25b0, 0xf7660c8a),
+ TOBN(0x760cb3b5, 0xe224c5d7), TOBN(0xfa3baf8c, 0x68616919),
+ TOBN(0x9fbca113, 0x8d142552), TOBN(0x1ab18bf1, 0x7669ebf5),
+ TOBN(0x55e6f53e, 0x9bdf25dd), TOBN(0x04cc0bf3, 0xcb6cd154),
+ TOBN(0x595bef49, 0x95e89080), TOBN(0xfe9459a8, 0x104a9ac1),
+ TOBN(0xad2d89ca, 0xcce9bb32), TOBN(0xddea65e1, 0xf7de8285),
+ TOBN(0x62ed8c35, 0xb351bd4b), TOBN(0x4150ff36, 0x0c0e19a7),
+ TOBN(0x86e3c801, 0x345f4e47), TOBN(0x3bf21f71, 0x203a266c),
+ TOBN(0x7ae110d4, 0x855b1f13), TOBN(0x5d6aaf6a, 0x07262517),
+ TOBN(0x1e0f12e1, 0x813d28f1), TOBN(0x6000e11d, 0x7ad7a523),
+ TOBN(0xc7d8deef, 0xc744a17b), TOBN(0x1e990b48, 0x14c05a00),
+ TOBN(0x68fddaee, 0x93e976d5), TOBN(0x696241d1, 0x46610d63),
+ TOBN(0xb204e7c3, 0x893dda88), TOBN(0x8bccfa65, 0x6a3a6946),
+ TOBN(0xb59425b4, 0xc5cd1411), TOBN(0x701b4042, 0xff3658b1),
+ TOBN(0xe3e56bca, 0x4784cf93), TOBN(0x27de5f15, 0x8fe68d60),
+ TOBN(0x4ab9cfce, 0xf8d53f19), TOBN(0xddb10311, 0xa40a730d),
+ TOBN(0x6fa73cd1, 0x4eee0a8a), TOBN(0xfd548748, 0x5249719d),
+ TOBN(0x49d66316, 0xa8123ef0), TOBN(0x73c32db4, 0xe7f95438),
+ TOBN(0x2e2ed209, 0x0d9e7854), TOBN(0xf98a9329, 0x9d9f0507),
+ TOBN(0xc5d33cf6, 0x0c6aa20a), TOBN(0x9a32ba14, 0x75279bb2),
+ TOBN(0x7e3202cb, 0x774a7307), TOBN(0x64ed4bc4, 0xe8c42dbd),
+ TOBN(0xc20f1a06, 0xd4caed0d), TOBN(0xb8021407, 0x171d22b3),
+ TOBN(0xd426ca04, 0xd13268d7), TOBN(0x92377007, 0x25f4d126),
+ TOBN(0x4204cbc3, 0x71f21a85), TOBN(0x18461b7a, 0xf82369ba),
+ TOBN(0xc0c07d31, 0x3fc858f9), TOBN(0x5deb5a50, 0xe2bab569),
+ TOBN(0xd5959d46, 0xd5eea89e), TOBN(0xfdff8424, 0x08437f4b),
+ TOBN(0xf21071e4, 0x3cfe254f), TOBN(0x72417696, 0x95468321),
+ TOBN(0x5d8288b9, 0x102cae3e), TOBN(0x2d143e3d, 0xf1965dff),
+ TOBN(0x00c9a376, 0xa078d847), TOBN(0x6fc0da31, 0x26028731),
+ TOBN(0xa2baeadf, 0xe45083a2), TOBN(0x66bc7218, 0x5e5b4bcd),
+ TOBN(0x2c826442, 0xd04b8e7f), TOBN(0xc19f5451, 0x6c4b586b),
+ TOBN(0x60182c49, 0x5b7eeed5), TOBN(0xd9954ecd, 0x7aa9dfa1),
+ TOBN(0xa403a8ec, 0xc73884ad), TOBN(0x7fb17de2, 0x9bb39041),
+ TOBN(0x694b64c5, 0xabb020e8), TOBN(0x3d18c184, 0x19c4eec7),
+ TOBN(0x9c4673ef, 0x1c4793e5), TOBN(0xc7b8aeb5, 0x056092e6),
+ TOBN(0x3aa1ca43, 0xf0f8c16b), TOBN(0x224ed5ec, 0xd679b2f6),
+ TOBN(0x0d56eeaf, 0x55a205c9), TOBN(0xbfe115ba, 0x4b8e028b),
+ TOBN(0x97e60849, 0x3927f4fe), TOBN(0xf91fbf94, 0x759aa7c5),
+ TOBN(0x985af769, 0x6be90a51), TOBN(0xc1277b78, 0x78ccb823),
+ TOBN(0x395b656e, 0xe7a75952), TOBN(0x00df7de0, 0x928da5f5),
+ TOBN(0x09c23175, 0x4ca4454f), TOBN(0x4ec971f4, 0x7aa2d3c1),
+ TOBN(0x45c3c507, 0xe75d9ccc), TOBN(0x63b7be8a, 0x3dc90306),
+ TOBN(0x37e09c66, 0x5db44bdc), TOBN(0x50d60da1, 0x6841c6a2),
+ TOBN(0x6f9b65ee, 0x08df1b12), TOBN(0x38734879, 0x7ff089df),
+ TOBN(0x9c331a66, 0x3fe8013d), TOBN(0x017f5de9, 0x5f42fcc8),
+ TOBN(0x43077866, 0xe8e57567), TOBN(0xc9f781ce, 0xf9fcdb18),
+ TOBN(0x38131dda, 0x9b12e174), TOBN(0x25d84aa3, 0x8a03752a),
+ TOBN(0x45e09e09, 0x4d0c0ce2), TOBN(0x1564008b, 0x92bebba5),
+ TOBN(0xf7e8ad31, 0xa87284c7), TOBN(0xb7c4b46c, 0x97e7bbaa),
+ TOBN(0x3e22a7b3, 0x97acf4ec), TOBN(0x0426c400, 0x5ea8b640),
+ TOBN(0x5e3295a6, 0x4e969285), TOBN(0x22aabc59, 0xa6a45670),
+ TOBN(0xb929714c, 0x5f5942bc), TOBN(0x9a6168bd, 0xfa3182ed),
+ TOBN(0x2216a665, 0x104152ba), TOBN(0x46908d03, 0xb6926368)}
+ ,
+ {TOBN(0xa9f5d874, 0x5a1251fb), TOBN(0x967747a8, 0xc72725c7),
+ TOBN(0x195c33e5, 0x31ffe89e), TOBN(0x609d210f, 0xe964935e),
+ TOBN(0xcafd6ca8, 0x2fe12227), TOBN(0xaf9b5b96, 0x0426469d),
+ TOBN(0x2e9ee04c, 0x5693183c), TOBN(0x1084a333, 0xc8146fef),
+ TOBN(0x96649933, 0xaed1d1f7), TOBN(0x566eaff3, 0x50563090),
+ TOBN(0x345057f0, 0xad2e39cf), TOBN(0x148ff65b, 0x1f832124),
+ TOBN(0x042e89d4, 0xcf94cf0d), TOBN(0x319bec84, 0x520c58b3),
+ TOBN(0x2a267626, 0x5361aa0d), TOBN(0xc86fa302, 0x8fbc87ad),
+ TOBN(0xfc83d2ab, 0x5c8b06d5), TOBN(0xb1a785a2, 0xfe4eac46),
+ TOBN(0xb99315bc, 0x846f7779), TOBN(0xcf31d816, 0xef9ea505),
+ TOBN(0x2391fe6a, 0x15d7dc85), TOBN(0x2f132b04, 0xb4016b33),
+ TOBN(0x29547fe3, 0x181cb4c7), TOBN(0xdb66d8a6, 0x650155a1),
+ TOBN(0x6b66d7e1, 0xadc1696f), TOBN(0x98ebe593, 0x0acd72d0),
+ TOBN(0x65f24550, 0xcc1b7435), TOBN(0xce231393, 0xb4b9a5ec),
+ TOBN(0x234a22d4, 0xdb067df9), TOBN(0x98dda095, 0xcaff9b00),
+ TOBN(0x1bbc75a0, 0x6100c9c1), TOBN(0x1560a9c8, 0x939cf695),
+ TOBN(0xcf006d3e, 0x99e0925f), TOBN(0x2dd74a96, 0x6322375a),
+ TOBN(0xc58b446a, 0xb56af5ba), TOBN(0x50292683, 0xe0b9b4f1),
+ TOBN(0xe2c34cb4, 0x1aeaffa3), TOBN(0x8b17203f, 0x9b9587c1),
+ TOBN(0x6d559207, 0xead1350c), TOBN(0x2b66a215, 0xfb7f9604),
+ TOBN(0x0850325e, 0xfe51bf74), TOBN(0x9c4f579e, 0x5e460094),
+ TOBN(0x5c87b92a, 0x76da2f25), TOBN(0x889de4e0, 0x6febef33),
+ TOBN(0x6900ec06, 0x646083ce), TOBN(0xbe2a0335, 0xbfe12773),
+ TOBN(0xadd1da35, 0xc5344110), TOBN(0x757568b7, 0xb802cd20),
+ TOBN(0x75559779, 0x00f7e6c8), TOBN(0x38e8b94f, 0x0facd2f0),
+ TOBN(0xfea1f3af, 0x03fde375), TOBN(0x5e11a1d8, 0x75881dfc),
+ TOBN(0xb3a6b02e, 0xc1e2f2ef), TOBN(0x193d2bbb, 0xc605a6c5),
+ TOBN(0x325ffeee, 0x339a0b2d), TOBN(0x27b6a724, 0x9e0c8846),
+ TOBN(0xe4050f1c, 0xf1c367ca), TOBN(0x9bc85a9b, 0xc90fbc7d),
+ TOBN(0xa373c4a2, 0xe1a11032), TOBN(0xb64232b7, 0xad0393a9),
+ TOBN(0xf5577eb0, 0x167dad29), TOBN(0x1604f301, 0x94b78ab2),
+ TOBN(0x0baa94af, 0xe829348b), TOBN(0x77fbd8dd, 0x41654342),
+ TOBN(0xdab50ea5, 0xb964e39a), TOBN(0xd4c29e3c, 0xd0d3c76e),
+ TOBN(0x80dae67c, 0x56d11964), TOBN(0x7307a8bf, 0xe5ffcc2f),
+ TOBN(0x65bbc1aa, 0x91708c3b), TOBN(0xa151e62c, 0x28bf0eeb),
+ TOBN(0x6cb53381, 0x6fa34db7), TOBN(0x5139e05c, 0xa29403a8),
+ TOBN(0x6ff651b4, 0x94a7cd2e), TOBN(0x5671ffd1, 0x0699336c),
+ TOBN(0x6f5fd2cc, 0x979a896a), TOBN(0x11e893a8, 0xd8148cef),
+ TOBN(0x988906a1, 0x65cf7b10), TOBN(0x81b67178, 0xc50d8485),
+ TOBN(0x7c0deb35, 0x8a35b3de), TOBN(0x423ac855, 0xc1d29799),
+ TOBN(0xaf580d87, 0xdac50b74), TOBN(0x28b2b89f, 0x5869734c),
+ TOBN(0x99a3b936, 0x874e28fb), TOBN(0xbb2c9190, 0x25f3f73a),
+ TOBN(0x199f6918, 0x84a9d5b7), TOBN(0x7ebe2325, 0x7e770374),
+ TOBN(0xf442e107, 0x0738efe2), TOBN(0xcf9f3f56, 0xcf9082d2),
+ TOBN(0x719f69e1, 0x09618708), TOBN(0xcc9e8364, 0xc183f9b1),
+ TOBN(0xec203a95, 0x366a21af), TOBN(0x6aec5d6d, 0x068b141f),
+ TOBN(0xee2df78a, 0x994f04e9), TOBN(0xb39ccae8, 0x271245b0),
+ TOBN(0xb875a4a9, 0x97e43f4f), TOBN(0x507dfe11, 0xdb2cea98),
+ TOBN(0x4fbf81cb, 0x489b03e9), TOBN(0xdb86ec5b, 0x6ec414fa),
+ TOBN(0xfad444f9, 0xf51b3ae5), TOBN(0xca7d33d6, 0x1914e3fe),
+ TOBN(0xa9c32f5c, 0x0ae6c4d0), TOBN(0xa9ca1d1e, 0x73969568),
+ TOBN(0x98043c31, 0x1aa7467e), TOBN(0xe832e75c, 0xe21b5ac6),
+ TOBN(0x314b7aea, 0x5232123d), TOBN(0x08307c8c, 0x65ae86db),
+ TOBN(0x06e7165c, 0xaa4668ed), TOBN(0xb170458b, 0xb4d3ec39),
+ TOBN(0x4d2e3ec6, 0xc19bb986), TOBN(0xc5f34846, 0xae0304ed),
+ TOBN(0x917695a0, 0x6c9f9722), TOBN(0x6c7f7317, 0x4cab1c0a),
+ TOBN(0x6295940e, 0x9d6d2e8b), TOBN(0xd318b8c1, 0x549f7c97),
+ TOBN(0x22453204, 0x97713885), TOBN(0x468d834b, 0xa8a440fe),
+ TOBN(0xd81fe5b2, 0xbfba796e), TOBN(0x152364db, 0x6d71f116),
+ TOBN(0xbb8c7c59, 0xb5b66e53), TOBN(0x0b12c61b, 0x2641a192),
+ TOBN(0x31f14802, 0xfcf0a7fd), TOBN(0x42fd0789, 0x5488b01e),
+ TOBN(0x71d78d6d, 0x9952b498), TOBN(0x8eb572d9, 0x07ac5201),
+ TOBN(0xe0a2a44c, 0x4d194a88), TOBN(0xd2b63fd9, 0xba017e66),
+ TOBN(0x78efc6c8, 0xf888aefc), TOBN(0xb76f6bda, 0x4a881a11),
+ TOBN(0x187f314b, 0xb46c2397), TOBN(0x004cf566, 0x5ded2819),
+ TOBN(0xa9ea5704, 0x38764d34), TOBN(0xbba45217, 0x78084709),
+ TOBN(0x06474571, 0x1171121e), TOBN(0xad7b7eb1, 0xe7c9b671),
+ TOBN(0xdacfbc40, 0x730f7507), TOBN(0x178cd8c6, 0xc7ad7bd1),
+ TOBN(0xbf0be101, 0xb2a67238), TOBN(0x3556d367, 0xaf9c14f2),
+ TOBN(0x104b7831, 0xa5662075), TOBN(0x58ca59bb, 0x79d9e60a),
+ TOBN(0x4bc45392, 0xa569a73b), TOBN(0x517a52e8, 0x5698f6c9),
+ TOBN(0x85643da5, 0xaeadd755), TOBN(0x1aed0cd5, 0x2a581b84),
+ TOBN(0xb9b4ff84, 0x80af1372), TOBN(0x244c3113, 0xf1ba5d1f),
+ TOBN(0x2a5dacbe, 0xf5f98d31), TOBN(0x2c3323e8, 0x4375bc2a),
+ TOBN(0x17a3ab4a, 0x5594b1dd), TOBN(0xa1928bfb, 0xceb4797e),
+ TOBN(0xe83af245, 0xe4886a19), TOBN(0x8979d546, 0x72b5a74a),
+ TOBN(0xa0f726bc, 0x19f9e967), TOBN(0xd9d03152, 0xe8fbbf4e),
+ TOBN(0xcfd6f51d, 0xb7707d40), TOBN(0x633084d9, 0x63f6e6e0),
+ TOBN(0xedcd9cdc, 0x55667eaf), TOBN(0x73b7f92b, 0x2e44d56f),
+ TOBN(0xfb2e39b6, 0x4e962b14), TOBN(0x7d408f6e, 0xf671fcbf),
+ TOBN(0xcc634ddc, 0x164a89bb), TOBN(0x74a42bb2, 0x3ef3bd05),
+ TOBN(0x1280dbb2, 0x428decbb), TOBN(0x6103f6bb, 0x402c8596),
+ TOBN(0xfa2bf581, 0x355a5752), TOBN(0x562f96a8, 0x00946674),
+ TOBN(0x4e4ca16d, 0x6da0223b), TOBN(0xfe47819f, 0x28d3aa25),
+ TOBN(0x9eea3075, 0xf8dfcf8a), TOBN(0xa284f0aa, 0x95669825),
+ TOBN(0xb3fca250, 0x867d3fd8), TOBN(0x20757b5f, 0x269d691e),
+ TOBN(0xf2c24020, 0x93b8a5de), TOBN(0xd3f93359, 0xebc06da6),
+ TOBN(0x1178293e, 0xb2739c33), TOBN(0xd2a3e770, 0xbcd686e5),
+ TOBN(0xa76f49f4, 0xcd941534), TOBN(0x0d37406b, 0xe3c71c0e),
+ TOBN(0x172d9397, 0x3b97f7e3), TOBN(0xec17e239, 0xbd7fd0de),
+ TOBN(0xe3290551, 0x6f496ba2), TOBN(0x6a693172, 0x36ad50e7),
+ TOBN(0xc4e539a2, 0x83e7eff5), TOBN(0x752737e7, 0x18e1b4cf),
+ TOBN(0xa2f7932c, 0x68af43ee), TOBN(0x5502468e, 0x703d00bd),
+ TOBN(0xe5dc978f, 0x2fb061f5), TOBN(0xc9a1904a, 0x28c815ad),
+ TOBN(0xd3af538d, 0x470c56a4), TOBN(0x159abc5f, 0x193d8ced),
+ TOBN(0x2a37245f, 0x20108ef3), TOBN(0xfa17081e, 0x223f7178),
+ TOBN(0x27b0fb2b, 0x10c8c0f5), TOBN(0x2102c3ea, 0x40650547),
+ TOBN(0x594564df, 0x8ac3bfa7), TOBN(0x98102033, 0x509dad96),
+ TOBN(0x6989643f, 0xf1d18a13), TOBN(0x35eebd91, 0xd7fc5af0),
+ TOBN(0x078d096a, 0xfaeaafd8), TOBN(0xb7a89341, 0xdef3de98),
+ TOBN(0x2a206e8d, 0xecf2a73a), TOBN(0x066a6397, 0x8e551994),
+ TOBN(0x3a6a088a, 0xb98d53a2), TOBN(0x0ce7c67c, 0x2d1124aa),
+ TOBN(0x48cec671, 0x759a113c), TOBN(0xe3b373d3, 0x4f6f67fa),
+ TOBN(0x5455d479, 0xfd36727b), TOBN(0xe5a428ee, 0xa13c0d81),
+ TOBN(0xb853dbc8, 0x1c86682b), TOBN(0xb78d2727, 0xb8d02b2a),
+ TOBN(0xaaf69bed, 0x8ebc329a), TOBN(0xdb6b40b3, 0x293b2148),
+ TOBN(0xe42ea77d, 0xb8c4961f), TOBN(0xb1a12f7c, 0x20e5e0ab),
+ TOBN(0xa0ec5274, 0x79e8b05e), TOBN(0x68027391, 0xfab60a80),
+ TOBN(0x6bfeea5f, 0x16b1bd5e), TOBN(0xf957e420, 0x4de30ad3),
+ TOBN(0xcbaf664e, 0x6a353b9e), TOBN(0x5c873312, 0x26d14feb),
+ TOBN(0x4e87f98c, 0xb65f57cb), TOBN(0xdb60a621, 0x5e0cdd41),
+ TOBN(0x67c16865, 0xa6881440), TOBN(0x1093ef1a, 0x46ab52aa),
+ TOBN(0xc095afb5, 0x3f4ece64), TOBN(0x6a6bb02e, 0x7604551a),
+ TOBN(0x55d44b4e, 0x0b26b8cd), TOBN(0xe5f9a999, 0xf971268a),
+ TOBN(0xc08ec425, 0x11a7de84), TOBN(0x83568095, 0xfda469dd),
+ TOBN(0x737bfba1, 0x6c6c90a2), TOBN(0x1cb9c4a0, 0xbe229831),
+ TOBN(0x93bccbba, 0xbb2eec64), TOBN(0xa0c23b64, 0xda03adbe),
+ TOBN(0x5f7aa00a, 0xe0e86ac4), TOBN(0x470b941e, 0xfc1401e6),
+ TOBN(0x5ad8d679, 0x9df43574), TOBN(0x4ccfb8a9, 0x0f65d810),
+ TOBN(0x1bce80e3, 0xaa7fbd81), TOBN(0x273291ad, 0x9508d20a),
+ TOBN(0xf5c4b46b, 0x42a92806), TOBN(0x810684ec, 0xa86ab44a),
+ TOBN(0x4591640b, 0xca0bc9f8), TOBN(0xb5efcdfc, 0x5c4b6054),
+ TOBN(0x16fc8907, 0x6e9edd12), TOBN(0xe29d0b50, 0xd4d792f9),
+ TOBN(0xa45fd01c, 0x9b03116d), TOBN(0x85035235, 0xc81765a4),
+ TOBN(0x1fe2a9b2, 0xb4b4b67c), TOBN(0xc1d10df0, 0xe8020604),
+ TOBN(0x9d64abfc, 0xbc8058d8), TOBN(0x8943b9b2, 0x712a0fbb),
+ TOBN(0x90eed914, 0x3b3def04), TOBN(0x85ab3aa2, 0x4ce775ff),
+ TOBN(0x605fd4ca, 0x7bbc9040), TOBN(0x8b34a564, 0xe2c75dfb),
+ TOBN(0x41ffc94a, 0x10358560), TOBN(0x2d8a5072, 0x9e5c28aa),
+ TOBN(0xe915a0fc, 0x4cc7eb15), TOBN(0xe9efab05, 0x8f6d0f5d),
+ TOBN(0xdbab47a9, 0xd19e9b91), TOBN(0x8cfed745, 0x0276154c),
+ TOBN(0x154357ae, 0x2cfede0d), TOBN(0x520630df, 0x19f5a4ef),
+ TOBN(0x25759f7c, 0xe382360f), TOBN(0xb6db05c9, 0x88bf5857),
+ TOBN(0x2917d61d, 0x6c58d46c), TOBN(0x14f8e491, 0xfd20cb7a),
+ TOBN(0xb68a727a, 0x11c20340), TOBN(0x0386f86f, 0xaf7ccbb6),
+ TOBN(0x5c8bc6cc, 0xfee09a20), TOBN(0x7d76ff4a, 0xbb7eea35),
+ TOBN(0xa7bdebe7, 0xdb15be7a), TOBN(0x67a08054, 0xd89f0302),
+ TOBN(0x56bf0ea9, 0xc1193364), TOBN(0xc8244467, 0x62837ebe),
+ TOBN(0x32bd8e8b, 0x20d841b8), TOBN(0x127a0548, 0xdbb8a54f),
+ TOBN(0x83dd4ca6, 0x63b20236), TOBN(0x87714718, 0x203491fa),
+ TOBN(0x4dabcaaa, 0xaa8a5288), TOBN(0x91cc0c8a, 0xaf23a1c9),
+ TOBN(0x34c72c6a, 0x3f220e0c), TOBN(0xbcc20bdf, 0x1232144a),
+ TOBN(0x6e2f42da, 0xa20ede1b), TOBN(0xc441f00c, 0x74a00515),
+ TOBN(0xbf46a5b6, 0x734b8c4b), TOBN(0x57409503, 0x7b56c9a4),
+ TOBN(0x9f735261, 0xe4585d45), TOBN(0x9231faed, 0x6734e642),
+ TOBN(0x1158a176, 0xbe70ee6c), TOBN(0x35f1068d, 0x7c3501bf),
+ TOBN(0x6beef900, 0xa2d26115), TOBN(0x649406f2, 0xef0afee3),
+ TOBN(0x3f43a60a, 0xbc2420a1), TOBN(0x509002a7, 0xd5aee4ac),
+ TOBN(0xb46836a5, 0x3ff3571b), TOBN(0x24f98b78, 0x837927c1),
+ TOBN(0x6254256a, 0x4533c716), TOBN(0xf27abb0b, 0xd07ee196),
+ TOBN(0xd7cf64fc, 0x5c6d5bfd), TOBN(0x6915c751, 0xf0cd7a77),
+ TOBN(0xd9f59012, 0x8798f534), TOBN(0x772b0da8, 0xf81d8b5f),
+ TOBN(0x1244260c, 0x2e03fa69), TOBN(0x36cf0e3a, 0x3be1a374),
+ TOBN(0x6e7c1633, 0xef06b960), TOBN(0xa71a4c55, 0x671f90f6),
+ TOBN(0x7a941251, 0x33c673db), TOBN(0xc0bea510, 0x73e8c131),
+ TOBN(0x61a8a699, 0xd4f6c734), TOBN(0x25e78c88, 0x341ed001),
+ TOBN(0x5c18acf8, 0x8e2f7d90), TOBN(0xfdbf33d7, 0x77be32cd),
+ TOBN(0x0a085cd7, 0xd2eb5ee9), TOBN(0x2d702cfb, 0xb3201115),
+ TOBN(0xb6e0ebdb, 0x85c88ce8), TOBN(0x23a3ce3c, 0x1e01d617),
+ TOBN(0x3041618e, 0x567333ac), TOBN(0x9dd0fd8f, 0x157edb6b),
+ TOBN(0x27f74702, 0xb57872b8), TOBN(0x2ef26b4f, 0x657d5fe1),
+ TOBN(0x95426f0a, 0x57cf3d40), TOBN(0x847e2ad1, 0x65a6067a),
+ TOBN(0xd474d9a0, 0x09996a74), TOBN(0x16a56acd, 0x2a26115c),
+ TOBN(0x02a615c3, 0xd16f4d43), TOBN(0xcc3fc965, 0xaadb85b7),
+ TOBN(0x386bda73, 0xce07d1b0), TOBN(0xd82910c2, 0x58ad4178),
+ TOBN(0x124f82cf, 0xcd2617f4), TOBN(0xcc2f5e8d, 0xef691770),
+ TOBN(0x82702550, 0xb8c30ccc), TOBN(0x7b856aea, 0x1a8e575a),
+ TOBN(0xbb822fef, 0xb1ab9459), TOBN(0x085928bc, 0xec24e38e),
+ TOBN(0x5d0402ec, 0xba8f4b4d), TOBN(0xc07cd4ba, 0x00b4d58b),
+ TOBN(0x5d8dffd5, 0x29227e7a), TOBN(0x61d44d0c, 0x31bf386f),
+ TOBN(0xe486dc2b, 0x135e6f4d), TOBN(0x680962eb, 0xe79410ef),
+ TOBN(0xa61bd343, 0xf10088b5), TOBN(0x6aa76076, 0xe2e28686),
+ TOBN(0x80463d11, 0x8fb98871), TOBN(0xcb26f5c3, 0xbbc76aff),
+ TOBN(0xd4ab8edd, 0xfbe03614), TOBN(0xc8eb579b, 0xc0cf2dee),
+ TOBN(0xcc004c15, 0xc93bae41), TOBN(0x46fbae5d, 0x3aeca3b2),
+ TOBN(0x671235cf, 0x0f1e9ab1), TOBN(0xadfba934, 0x9ec285c1),
+ TOBN(0x88ded013, 0xf216c980), TOBN(0xc8ac4fb8, 0xf79e0bc1),
+ TOBN(0xa29b89c6, 0xfb97a237), TOBN(0xb697b780, 0x9922d8e7),
+ TOBN(0x3142c639, 0xddb945b5), TOBN(0x447b06c7, 0xe094c3a9),
+ TOBN(0xcdcb3642, 0x72266c90), TOBN(0x633aad08, 0xa9385046),
+ TOBN(0xa36c936b, 0xb57c6477), TOBN(0x871f8b64, 0xe94dbcc6),
+ TOBN(0x28d0fb62, 0xa591a67b), TOBN(0x9d40e081, 0xc1d926f5),
+ TOBN(0x3111eaf6, 0xf2d84b5a), TOBN(0x228993f9, 0xa565b644),
+ TOBN(0x0ccbf592, 0x2c83188b), TOBN(0xf87b30ab, 0x3df3e197),
+ TOBN(0xb8658b31, 0x7642bca8), TOBN(0x1a032d7f, 0x52800f17),
+ TOBN(0x051dcae5, 0x79bf9445), TOBN(0xeba6b8ee, 0x54a2e253),
+ TOBN(0x5c8b9cad, 0xd4485692), TOBN(0x84bda40e, 0x8986e9be),
+ TOBN(0xd16d16a4, 0x2f0db448), TOBN(0x8ec80050, 0xa14d4188),
+ TOBN(0xb2b26107, 0x98fa7aaa), TOBN(0x41209ee4, 0xf073aa4e),
+ TOBN(0xf1570359, 0xf2d6b19b), TOBN(0xcbe6868c, 0xfc577caf),
+ TOBN(0x186c4bdc, 0x32c04dd3), TOBN(0xa6c35fae, 0xcfeee397),
+ TOBN(0xb4a1b312, 0xf086c0cf), TOBN(0xe0a5ccc6, 0xd9461fe2),
+ TOBN(0xc32278aa, 0x1536189f), TOBN(0x1126c55f, 0xba6df571),
+ TOBN(0x0f71a602, 0xb194560e), TOBN(0x8b2d7405, 0x324bd6e1),
+ TOBN(0x8481939e, 0x3738be71), TOBN(0xb5090b1a, 0x1a4d97a9),
+ TOBN(0x116c65a3, 0xf05ba915), TOBN(0x21863ad3, 0xaae448aa),
+ TOBN(0xd24e2679, 0xa7aae5d3), TOBN(0x7076013d, 0x0de5c1c4),
+ TOBN(0x2d50f8ba, 0xbb05b629), TOBN(0x73c1abe2, 0x6e66efbb),
+ TOBN(0xefd4b422, 0xf2488af7), TOBN(0xe4105d02, 0x663ba575),
+ TOBN(0x7eb60a8b, 0x53a69457), TOBN(0x62210008, 0xc945973b),
+ TOBN(0xfb255478, 0x77a50ec6), TOBN(0xbf0392f7, 0x0a37a72c),
+ TOBN(0xa0a7a19c, 0x4be18e7a), TOBN(0x90d8ea16, 0x25b1e0af),
+ TOBN(0x7582a293, 0xef953f57), TOBN(0x90a64d05, 0xbdc5465a),
+ TOBN(0xca79c497, 0xe2510717), TOBN(0x560dbb7c, 0x18cb641f),
+ TOBN(0x1d8e3286, 0x4b66abfb), TOBN(0xd26f52e5, 0x59030900),
+ TOBN(0x1ee3f643, 0x5584941a), TOBN(0x6d3b3730, 0x569f5958),
+ TOBN(0x9ff2a62f, 0x4789dba5), TOBN(0x91fcb815, 0x72b5c9b7),
+ TOBN(0xf446cb7d, 0x6c8f9a0e), TOBN(0x48f625c1, 0x39b7ecb5),
+ TOBN(0xbabae801, 0x1c6219b8), TOBN(0xe7a562d9, 0x28ac2f23),
+ TOBN(0xe1b48732, 0x26e20588), TOBN(0x06ee1cad, 0x775af051),
+ TOBN(0xda29ae43, 0xfaff79f7), TOBN(0xc141a412, 0x652ee9e0),
+ TOBN(0x1e127f6f, 0x195f4bd0), TOBN(0x29c6ab4f, 0x072f34f8),
+ TOBN(0x7b7c1477, 0x30448112), TOBN(0x82b51af1, 0xe4a38656),
+ TOBN(0x2bf2028a, 0x2f315010), TOBN(0xc9a4a01f, 0x6ea88cd4),
+ TOBN(0xf63e95d8, 0x257e5818), TOBN(0xdd8efa10, 0xb4519b16),
+ TOBN(0xed8973e0, 0x0da910bf), TOBN(0xed49d077, 0x5c0fe4a9),
+ TOBN(0xac3aac5e, 0xb7caee1e), TOBN(0x1033898d, 0xa7f4da57),
+ TOBN(0x42145c0e, 0x5c6669b9), TOBN(0x42daa688, 0xc1aa2aa0),
+ TOBN(0x629cc15c, 0x1a1d885a), TOBN(0x25572ec0, 0xf4b76817),
+ TOBN(0x8312e435, 0x9c8f8f28), TOBN(0x8107f8cd, 0x81965490),
+ TOBN(0x516ff3a3, 0x6fa6110c), TOBN(0x74fb1eb1, 0xfb93561f),
+ TOBN(0x6c0c9047, 0x8457522b), TOBN(0xcfd32104, 0x6bb8bdc6),
+ TOBN(0x2d6884a2, 0xcc80ad57), TOBN(0x7c27fc35, 0x86a9b637),
+ TOBN(0x3461baed, 0xadf4e8cd), TOBN(0x1d56251a, 0x617242f0),
+ TOBN(0x0b80d209, 0xc955bef4), TOBN(0xdf02cad2, 0x06adb047),
+ TOBN(0xf0d7cb91, 0x5ec74fee), TOBN(0xd2503375, 0x1111ba44),
+ TOBN(0x9671755e, 0xdf53cb36), TOBN(0x54dcb612, 0x3368551b),
+ TOBN(0x66d69aac, 0xc8a025a4), TOBN(0x6be946c6, 0xe77ef445),
+ TOBN(0x719946d1, 0xa995e094), TOBN(0x65e848f6, 0xe51e04d8),
+ TOBN(0xe62f3300, 0x6a1e3113), TOBN(0x1541c7c1, 0x501de503),
+ TOBN(0x4daac9fa, 0xf4acfade), TOBN(0x0e585897, 0x44cd0b71),
+ TOBN(0x544fd869, 0x0a51cd77), TOBN(0x60fc20ed, 0x0031016d),
+ TOBN(0x58b404ec, 0xa4276867), TOBN(0x46f6c3cc, 0x34f34993),
+ TOBN(0x477ca007, 0xc636e5bd), TOBN(0x8018f5e5, 0x7c458b47),
+ TOBN(0xa1202270, 0xe47b668f), TOBN(0xcef48ccd, 0xee14f203),
+ TOBN(0x23f98bae, 0x62ff9b4d), TOBN(0x55acc035, 0xc589eddd),
+ TOBN(0x3fe712af, 0x64db4444), TOBN(0x19e9d634, 0xbecdd480),
+ TOBN(0xe08bc047, 0xa930978a), TOBN(0x2dbf24ec, 0xa1280733),
+ TOBN(0x3c0ae38c, 0x2cd706b2), TOBN(0x5b012a5b, 0x359017b9),
+ TOBN(0x3943c38c, 0x72e0f5ae), TOBN(0x786167ea, 0x57176fa3),
+ TOBN(0xe5f9897d, 0x594881dc), TOBN(0x6b5efad8, 0xcfb820c1),
+ TOBN(0xb2179093, 0xd55018de), TOBN(0x39ad7d32, 0x0bac56ce),
+ TOBN(0xb55122e0, 0x2cfc0e81), TOBN(0x117c4661, 0xf6d89daa),
+ TOBN(0x362d01e1, 0xcb64fa09), TOBN(0x6a309b4e, 0x3e9c4ddd),
+ TOBN(0xfa979fb7, 0xabea49b1), TOBN(0xb4b1d27d, 0x10e2c6c5),
+ TOBN(0xbd61c2c4, 0x23afde7a), TOBN(0xeb6614f8, 0x9786d358),
+ TOBN(0x4a5d816b, 0x7f6f7459), TOBN(0xe431a44f, 0x09360e7b),
+ TOBN(0x8c27a032, 0xc309914c), TOBN(0xcea5d68a, 0xcaede3d8),
+ TOBN(0x3668f665, 0x3a0a3f95), TOBN(0x89369416, 0x7ceba27b),
+ TOBN(0x89981fad, 0xe4728fe9), TOBN(0x7102c8a0, 0x8a093562),
+ TOBN(0xbb80310e, 0x235d21c8), TOBN(0x505e55d1, 0xbefb7f7b),
+ TOBN(0xa0a90811, 0x12958a67), TOBN(0xd67e106a, 0x4d851fef),
+ TOBN(0xb84011a9, 0x431dd80e), TOBN(0xeb7c7cca, 0x73306cd9),
+ TOBN(0x20fadd29, 0xd1b3b730), TOBN(0x83858b5b, 0xfe37b3d3),
+ TOBN(0xbf4cd193, 0xb6251d5c), TOBN(0x1cca1fd3, 0x1352d952),
+ TOBN(0xc66157a4, 0x90fbc051), TOBN(0x7990a638, 0x89b98636)}
+ ,
+ {TOBN(0xe5aa692a, 0x87dec0e1), TOBN(0x010ded8d, 0xf7b39d00),
+ TOBN(0x7b1b80c8, 0x54cfa0b5), TOBN(0x66beb876, 0xa0f8ea28),
+ TOBN(0x50d7f531, 0x3476cd0e), TOBN(0xa63d0e65, 0xb08d3949),
+ TOBN(0x1a09eea9, 0x53479fc6), TOBN(0x82ae9891, 0xf499e742),
+ TOBN(0xab58b910, 0x5ca7d866), TOBN(0x582967e2, 0x3adb3b34),
+ TOBN(0x89ae4447, 0xcceac0bc), TOBN(0x919c667c, 0x7bf56af5),
+ TOBN(0x9aec17b1, 0x60f5dcd7), TOBN(0xec697b9f, 0xddcaadbc),
+ TOBN(0x0b98f341, 0x463467f5), TOBN(0xb187f1f7, 0xa967132f),
+ TOBN(0x90fe7a1d, 0x214aeb18), TOBN(0x1506af3c, 0x741432f7),
+ TOBN(0xbb5565f9, 0xe591a0c4), TOBN(0x10d41a77, 0xb44f1bc3),
+ TOBN(0xa09d65e4, 0xa84bde96), TOBN(0x42f060d8, 0xf20a6a1c),
+ TOBN(0x652a3bfd, 0xf27f9ce7), TOBN(0xb6bdb65c, 0x3b3d739f),
+ TOBN(0xeb5ddcb6, 0xec7fae9f), TOBN(0x995f2714, 0xefb66e5a),
+ TOBN(0xdee95d8e, 0x69445d52), TOBN(0x1b6c2d46, 0x09e27620),
+ TOBN(0x32621c31, 0x8129d716), TOBN(0xb03909f1, 0x0958c1aa),
+ TOBN(0x8c468ef9, 0x1af4af63), TOBN(0x162c429f, 0xfba5cdf6),
+ TOBN(0x2f682343, 0x753b9371), TOBN(0x29cab45a, 0x5f1f9cd7),
+ TOBN(0x571623ab, 0xb245db96), TOBN(0xc507db09, 0x3fd79999),
+ TOBN(0x4e2ef652, 0xaf036c32), TOBN(0x86f0cc78, 0x05018e5c),
+ TOBN(0xc10a73d4, 0xab8be350), TOBN(0x6519b397, 0x7e826327),
+ TOBN(0xe8cb5eef, 0x9c053df7), TOBN(0x8de25b37, 0xb300ea6f),
+ TOBN(0xdb03fa92, 0xc849cffb), TOBN(0x242e43a7, 0xe84169bb),
+ TOBN(0xe4fa51f4, 0xdd6f958e), TOBN(0x6925a77f, 0xf4445a8d),
+ TOBN(0xe6e72a50, 0xe90d8949), TOBN(0xc66648e3, 0x2b1f6390),
+ TOBN(0xb2ab1957, 0x173e460c), TOBN(0x1bbbce75, 0x30704590),
+ TOBN(0xc0a90dbd, 0xdb1c7162), TOBN(0x505e399e, 0x15cdd65d),
+ TOBN(0x68434dcb, 0x57797ab7), TOBN(0x60ad35ba, 0x6a2ca8e8),
+ TOBN(0x4bfdb1e0, 0xde3336c1), TOBN(0xbbef99eb, 0xd8b39015),
+ TOBN(0x6c3b96f3, 0x1711ebec), TOBN(0x2da40f1f, 0xce98fdc4),
+ TOBN(0xb99774d3, 0x57b4411f), TOBN(0x87c8bdf4, 0x15b65bb6),
+ TOBN(0xda3a89e3, 0xc2eef12d), TOBN(0xde95bb9b, 0x3c7471f3),
+ TOBN(0x600f225b, 0xd812c594), TOBN(0x54907c5d, 0x2b75a56b),
+ TOBN(0xa93cc5f0, 0x8db60e35), TOBN(0x743e3cd6, 0xfa833319),
+ TOBN(0x7dad5c41, 0xf81683c9), TOBN(0x70c1e7d9, 0x9c34107e),
+ TOBN(0x0edc4a39, 0xa6be0907), TOBN(0x36d47035, 0x86d0b7d3),
+ TOBN(0x8c76da03, 0x272bfa60), TOBN(0x0b4a07ea, 0x0f08a414),
+ TOBN(0x699e4d29, 0x45c1dd53), TOBN(0xcadc5898, 0x231debb5),
+ TOBN(0xdf49fcc7, 0xa77f00e0), TOBN(0x93057bbf, 0xa73e5a0e),
+ TOBN(0x2f8b7ecd, 0x027a4cd1), TOBN(0x114734b3, 0xc614011a),
+ TOBN(0xe7a01db7, 0x67677c68), TOBN(0x89d9be5e, 0x7e273f4f),
+ TOBN(0xd225cb2e, 0x089808ef), TOBN(0xf1f7a27d, 0xd59e4107),
+ TOBN(0x53afc761, 0x8211b9c9), TOBN(0x0361bc67, 0xe6819159),
+ TOBN(0x2a865d0b, 0x7f071426), TOBN(0x6a3c1810, 0xe7072567),
+ TOBN(0x3e3bca1e, 0x0d6bcabd), TOBN(0xa1b02bc1, 0x408591bc),
+ TOBN(0xe0deee59, 0x31fba239), TOBN(0xf47424d3, 0x98bd91d1),
+ TOBN(0x0f8886f4, 0x071a3c1d), TOBN(0x3f7d41e8, 0xa819233b),
+ TOBN(0x708623c2, 0xcf6eb998), TOBN(0x86bb49af, 0x609a287f),
+ TOBN(0x942bb249, 0x63c90762), TOBN(0x0ef6eea5, 0x55a9654b),
+ TOBN(0x5f6d2d72, 0x36f5defe), TOBN(0xfa9922dc, 0x56f99176),
+ TOBN(0x6c8c5ece, 0xf78ce0c7), TOBN(0x7b44589d, 0xbe09b55e),
+ TOBN(0xe11b3bca, 0x9ea83770), TOBN(0xd7fa2c7f, 0x2ab71547),
+ TOBN(0x2a3dd6fa, 0x2a1ddcc0), TOBN(0x09acb430, 0x5a7b7707),
+ TOBN(0x4add4a2e, 0x649d4e57), TOBN(0xcd53a2b0, 0x1917526e),
+ TOBN(0xc5262330, 0x20b44ac4), TOBN(0x4028746a, 0xbaa2c31d),
+ TOBN(0x51318390, 0x64291d4c), TOBN(0xbf48f151, 0xee5ad909),
+ TOBN(0xcce57f59, 0x7b185681), TOBN(0x7c3ac1b0, 0x4854d442),
+ TOBN(0x65587dc3, 0xc093c171), TOBN(0xae7acb24, 0x24f42b65),
+ TOBN(0x5a338adb, 0x955996cb), TOBN(0xc8e65675, 0x6051f91b),
+ TOBN(0x66711fba, 0x28b8d0b1), TOBN(0x15d74137, 0xb6c10a90),
+ TOBN(0x70cdd7eb, 0x3a232a80), TOBN(0xc9e2f07f, 0x6191ed24),
+ TOBN(0xa80d1db6, 0xf79588c0), TOBN(0xfa52fc69, 0xb55768cc),
+ TOBN(0x0b4df1ae, 0x7f54438a), TOBN(0x0cadd1a7, 0xf9b46a4f),
+ TOBN(0xb40ea6b3, 0x1803dd6f), TOBN(0x488e4fa5, 0x55eaae35),
+ TOBN(0x9f047d55, 0x382e4e16), TOBN(0xc9b5b7e0, 0x2f6e0c98),
+ TOBN(0x6b1bd2d3, 0x95762649), TOBN(0xa9604ee7, 0xc7aea3f6),
+ TOBN(0x3646ff27, 0x6dc6f896), TOBN(0x9bf0e7f5, 0x2860bad1),
+ TOBN(0x2d92c821, 0x7cb44b92), TOBN(0xa2f5ce63, 0xaea9c182),
+ TOBN(0xd0a2afb1, 0x9154a5fd), TOBN(0x482e474c, 0x95801da6),
+ TOBN(0xc19972d0, 0xb611c24b), TOBN(0x1d468e65, 0x60a8f351),
+ TOBN(0xeb758069, 0x7bcf6421), TOBN(0xec9dd0ee, 0x88fbc491),
+ TOBN(0x5b59d2bf, 0x956c2e32), TOBN(0x73dc6864, 0xdcddf94e),
+ TOBN(0xfd5e2321, 0xbcee7665), TOBN(0xa7b4f8ef, 0x5e9a06c4),
+ TOBN(0xfba918dd, 0x7280f855), TOBN(0xbbaac260, 0x8baec688),
+ TOBN(0xa3b3f00f, 0x33400f42), TOBN(0x3d2dba29, 0x66f2e6e4),
+ TOBN(0xb6f71a94, 0x98509375), TOBN(0x8f33031f, 0xcea423cc),
+ TOBN(0x009b8dd0, 0x4807e6fb), TOBN(0x5163cfe5, 0x5cdb954c),
+ TOBN(0x03cc8f17, 0xcf41c6e8), TOBN(0xf1f03c2a, 0x037b925c),
+ TOBN(0xc39c19cc, 0x66d2427c), TOBN(0x823d24ba, 0x7b6c18e4),
+ TOBN(0x32ef9013, 0x901f0b4f), TOBN(0x684360f1, 0xf8941c2e),
+ TOBN(0x0ebaff52, 0x2c28092e), TOBN(0x7891e4e3, 0x256c932f),
+ TOBN(0x51264319, 0xac445e3d), TOBN(0x553432e7, 0x8ea74381),
+ TOBN(0xe6eeaa69, 0x67e9c50a), TOBN(0x27ced284, 0x62e628c7),
+ TOBN(0x3f96d375, 0x7a4afa57), TOBN(0xde0a14c3, 0xe484c150),
+ TOBN(0x364a24eb, 0x38bd9923), TOBN(0x1df18da0, 0xe5177422),
+ TOBN(0x174e8f82, 0xd8d38a9b), TOBN(0x2e97c600, 0xe7de1391),
+ TOBN(0xc5709850, 0xa1c175dd), TOBN(0x969041a0, 0x32ae5035),
+ TOBN(0xcbfd533b, 0x76a2086b), TOBN(0xd6bba71b, 0xd7c2e8fe),
+ TOBN(0xb2d58ee6, 0x099dfb67), TOBN(0x3a8b342d, 0x064a85d9),
+ TOBN(0x3bc07649, 0x522f9be3), TOBN(0x690c075b, 0xdf1f49a8),
+ TOBN(0x80e1aee8, 0x3854ec42), TOBN(0x2a7dbf44, 0x17689dc7),
+ TOBN(0xc004fc0e, 0x3faf4078), TOBN(0xb2f02e9e, 0xdf11862c),
+ TOBN(0xf10a5e0f, 0xa0a1b7b3), TOBN(0x30aca623, 0x8936ec80),
+ TOBN(0xf83cbf05, 0x02f40d9a), TOBN(0x4681c468, 0x2c318a4d),
+ TOBN(0x98575618, 0x0e9c2674), TOBN(0xbe79d046, 0x1847092e),
+ TOBN(0xaf1e480a, 0x78bd01e0), TOBN(0x6dd359e4, 0x72a51db9),
+ TOBN(0x62ce3821, 0xe3afbab6), TOBN(0xc5cee5b6, 0x17733199),
+ TOBN(0xe08b30d4, 0x6ffd9fbb), TOBN(0x6e5bc699, 0x36c610b7),
+ TOBN(0xf343cff2, 0x9ce262cf), TOBN(0xca2e4e35, 0x68b914c1),
+ TOBN(0x011d64c0, 0x16de36c5), TOBN(0xe0b10fdd, 0x42e2b829),
+ TOBN(0x78942981, 0x6685aaf8), TOBN(0xe7511708, 0x230ede97),
+ TOBN(0x671ed8fc, 0x3b922bf8), TOBN(0xe4d8c0a0, 0x4c29b133),
+ TOBN(0x87eb1239, 0x3b6e99c4), TOBN(0xaff3974c, 0x8793beba),
+ TOBN(0x03749405, 0x2c18df9b), TOBN(0xc5c3a293, 0x91007139),
+ TOBN(0x6a77234f, 0xe37a0b95), TOBN(0x02c29a21, 0xb661c96b),
+ TOBN(0xc3aaf1d6, 0x141ecf61), TOBN(0x9195509e, 0x3bb22f53),
+ TOBN(0x29597404, 0x22d51357), TOBN(0x1b083822, 0x537bed60),
+ TOBN(0xcd7d6e35, 0xe07289f0), TOBN(0x1f94c48c, 0x6dd86eff),
+ TOBN(0xc8bb1f82, 0xeb0f9cfa), TOBN(0x9ee0b7e6, 0x1b2eb97d),
+ TOBN(0x5a52fe2e, 0x34d74e31), TOBN(0xa352c310, 0x3bf79ab6),
+ TOBN(0x97ff6c5a, 0xabfeeb8f), TOBN(0xbfbe8fef, 0xf5c97305),
+ TOBN(0xd6081ce6, 0xa7904608), TOBN(0x1f812f3a, 0xc4fca249),
+ TOBN(0x9b24bc9a, 0xb9e5e200), TOBN(0x91022c67, 0x38012ee8),
+ TOBN(0xe83d9c5d, 0x30a713a1), TOBN(0x4876e3f0, 0x84ef0f93),
+ TOBN(0xc9777029, 0xc1fbf928), TOBN(0xef7a6bb3, 0xbce7d2a4),
+ TOBN(0xb8067228, 0xdfa2a659), TOBN(0xd5cd3398, 0xd877a48f),
+ TOBN(0xbea4fd8f, 0x025d0f3f), TOBN(0xd67d2e35, 0x2eae7c2b),
+ TOBN(0x184de7d7, 0xcc5f4394), TOBN(0xb5551b5c, 0x4536e142),
+ TOBN(0x2e89b212, 0xd34aa60a), TOBN(0x14a96fea, 0xf50051d5),
+ TOBN(0x4e21ef74, 0x0d12bb0b), TOBN(0xc522f020, 0x60b9677e),
+ TOBN(0x8b12e467, 0x2df7731d), TOBN(0x39f80382, 0x7b326d31),
+ TOBN(0xdfb8630c, 0x39024a94), TOBN(0xaacb96a8, 0x97319452),
+ TOBN(0xd68a3961, 0xeda3867c), TOBN(0x0c58e2b0, 0x77c4ffca),
+ TOBN(0x3d545d63, 0x4da919fa), TOBN(0xef79b69a, 0xf15e2289),
+ TOBN(0x54bc3d3d, 0x808bab10), TOBN(0xc8ab3007, 0x45f82c37),
+ TOBN(0xc12738b6, 0x7c4a658a), TOBN(0xb3c47639, 0x40e72182),
+ TOBN(0x3b77be46, 0x8798e44f), TOBN(0xdc047df2, 0x17a7f85f),
+ TOBN(0x2439d4c5, 0x5e59d92d), TOBN(0xcedca475, 0xe8e64d8d),
+ TOBN(0xa724cd0d, 0x87ca9b16), TOBN(0x35e4fd59, 0xa5540dfe),
+ TOBN(0xf8c1ff18, 0xe4bcf6b1), TOBN(0x856d6285, 0x295018fa),
+ TOBN(0x433f665c, 0x3263c949), TOBN(0xa6a76dd6, 0xa1f21409),
+ TOBN(0x17d32334, 0xcc7b4f79), TOBN(0xa1d03122, 0x06720e4a),
+ TOBN(0xadb6661d, 0x81d9bed5), TOBN(0xf0d6fb02, 0x11db15d1),
+ TOBN(0x7fd11ad5, 0x1fb747d2), TOBN(0xab50f959, 0x3033762b),
+ TOBN(0x2a7e711b, 0xfbefaf5a), TOBN(0xc7393278, 0x3fef2bbf),
+ TOBN(0xe29fa244, 0x0df6f9be), TOBN(0x9092757b, 0x71efd215),
+ TOBN(0xee60e311, 0x4f3d6fd9), TOBN(0x338542d4, 0x0acfb78b),
+ TOBN(0x44a23f08, 0x38961a0f), TOBN(0x1426eade, 0x986987ca),
+ TOBN(0x36e6ee2e, 0x4a863cc6), TOBN(0x48059420, 0x628b8b79),
+ TOBN(0x30303ad8, 0x7396e1de), TOBN(0x5c8bdc48, 0x38c5aad1),
+ TOBN(0x3e40e11f, 0x5c8f5066), TOBN(0xabd6e768, 0x8d246bbd),
+ TOBN(0x68aa40bb, 0x23330a01), TOBN(0xd23f5ee4, 0xc34eafa0),
+ TOBN(0x3bbee315, 0x5de02c21), TOBN(0x18dd4397, 0xd1d8dd06),
+ TOBN(0x3ba1939a, 0x122d7b44), TOBN(0xe6d3b40a, 0xa33870d6),
+ TOBN(0x8e620f70, 0x1c4fe3f8), TOBN(0xf6bba1a5, 0xd3a50cbf),
+ TOBN(0x4a78bde5, 0xcfc0aee0), TOBN(0x847edc46, 0xc08c50bd),
+ TOBN(0xbaa2439c, 0xad63c9b2), TOBN(0xceb4a728, 0x10fc2acb),
+ TOBN(0xa419e40e, 0x26da033d), TOBN(0x6cc3889d, 0x03e02683),
+ TOBN(0x1cd28559, 0xfdccf725), TOBN(0x0fd7e0f1, 0x8d13d208),
+ TOBN(0x01b9733b, 0x1f0df9d4), TOBN(0x8cc2c5f3, 0xa2b5e4f3),
+ TOBN(0x43053bfa, 0x3a304fd4), TOBN(0x8e87665c, 0x0a9f1aa7),
+ TOBN(0x087f29ec, 0xd73dc965), TOBN(0x15ace455, 0x3e9023db),
+ TOBN(0x2370e309, 0x2bce28b4), TOBN(0xf9723442, 0xb6b1e84a),
+ TOBN(0xbeee662e, 0xb72d9f26), TOBN(0xb19396de, 0xf0e47109),
+ TOBN(0x85b1fa73, 0xe13289d0), TOBN(0x436cf77e, 0x54e58e32),
+ TOBN(0x0ec833b3, 0xe990ef77), TOBN(0x7373e3ed, 0x1b11fc25),
+ TOBN(0xbe0eda87, 0x0fc332ce), TOBN(0xced04970, 0x8d7ea856),
+ TOBN(0xf85ff785, 0x7e977ca0), TOBN(0xb66ee8da, 0xdfdd5d2b),
+ TOBN(0xf5e37950, 0x905af461), TOBN(0x587b9090, 0x966d487c),
+ TOBN(0x6a198a1b, 0x32ba0127), TOBN(0xa7720e07, 0x141615ac),
+ TOBN(0xa23f3499, 0x996ef2f2), TOBN(0xef5f64b4, 0x470bcb3d),
+ TOBN(0xa526a962, 0x92b8c559), TOBN(0x0c14aac0, 0x69740a0f),
+ TOBN(0x0d41a9e3, 0xa6bdc0a5), TOBN(0x97d52106, 0x9c48aef4),
+ TOBN(0xcf16bd30, 0x3e7c253b), TOBN(0xcc834b1a, 0x47fdedc1),
+ TOBN(0x7362c6e5, 0x373aab2e), TOBN(0x264ed85e, 0xc5f590ff),
+ TOBN(0x7a46d9c0, 0x66d41870), TOBN(0xa50c20b1, 0x4787ba09),
+ TOBN(0x185e7e51, 0xe3d44635), TOBN(0xb3b3e080, 0x31e2d8dc),
+ TOBN(0xbed1e558, 0xa179e9d9), TOBN(0x2daa3f79, 0x74a76781),
+ TOBN(0x4372baf2, 0x3a40864f), TOBN(0x46900c54, 0x4fe75cb5),
+ TOBN(0xb95f171e, 0xf76765d0), TOBN(0x4ad726d2, 0x95c87502),
+ TOBN(0x2ec769da, 0x4d7c99bd), TOBN(0x5e2ddd19, 0xc36cdfa8),
+ TOBN(0xc22117fc, 0xa93e6dea), TOBN(0xe8a2583b, 0x93771123),
+ TOBN(0xbe2f6089, 0xfa08a3a2), TOBN(0x4809d5ed, 0x8f0e1112),
+ TOBN(0x3b414aa3, 0xda7a095e), TOBN(0x9049acf1, 0x26f5aadd),
+ TOBN(0x78d46a4d, 0x6be8b84a), TOBN(0xd66b1963, 0xb732b9b3),
+ TOBN(0x5c2ac2a0, 0xde6e9555), TOBN(0xcf52d098, 0xb5bd8770),
+ TOBN(0x15a15fa6, 0x0fd28921), TOBN(0x56ccb81e, 0x8b27536d),
+ TOBN(0x0f0d8ab8, 0x9f4ccbb8), TOBN(0xed5f44d2, 0xdb221729),
+ TOBN(0x43141988, 0x00bed10c), TOBN(0xc94348a4, 0x1d735b8b),
+ TOBN(0x79f3e9c4, 0x29ef8479), TOBN(0x4c13a4e3, 0x614c693f),
+ TOBN(0x32c9af56, 0x8e143a14), TOBN(0xbc517799, 0xe29ac5c4),
+ TOBN(0x05e17992, 0x2774856f), TOBN(0x6e52fb05, 0x6c1bf55f),
+ TOBN(0xaeda4225, 0xe4f19e16), TOBN(0x70f4728a, 0xaf5ccb26),
+ TOBN(0x5d2118d1, 0xb2947f22), TOBN(0xc827ea16, 0x281d6fb9),
+ TOBN(0x8412328d, 0x8cf0eabd), TOBN(0x45ee9fb2, 0x03ef9dcf),
+ TOBN(0x8e700421, 0xbb937d63), TOBN(0xdf8ff2d5, 0xcc4b37a6),
+ TOBN(0xa4c0d5b2, 0x5ced7b68), TOBN(0x6537c1ef, 0xc7308f59),
+ TOBN(0x25ce6a26, 0x3b37f8e8), TOBN(0x170e9a9b, 0xdeebc6ce),
+ TOBN(0xdd037952, 0x8728d72c), TOBN(0x445b0e55, 0x850154bc),
+ TOBN(0x4b7d0e06, 0x83a7337b), TOBN(0x1e3416d4, 0xffecf249),
+ TOBN(0x24840eff, 0x66a2b71f), TOBN(0xd0d9a50a, 0xb37cc26d),
+ TOBN(0xe2198150, 0x6fe28ef7), TOBN(0x3cc5ef16, 0x23324c7f),
+ TOBN(0x220f3455, 0x769b5263), TOBN(0xe2ade2f1, 0xa10bf475),
+ TOBN(0x28cd20fa, 0x458d3671), TOBN(0x1549722c, 0x2dc4847b),
+ TOBN(0x6dd01e55, 0x591941e3), TOBN(0x0e6fbcea, 0x27128ccb),
+ TOBN(0xae1a1e6b, 0x3bef0262), TOBN(0xfa8c472c, 0x8f54e103),
+ TOBN(0x7539c0a8, 0x72c052ec), TOBN(0xd7b27369, 0x5a3490e9),
+ TOBN(0x143fe1f1, 0x71684349), TOBN(0x36b4722e, 0x32e19b97),
+ TOBN(0xdc059227, 0x90980aff), TOBN(0x175c9c88, 0x9e13d674),
+ TOBN(0xa7de5b22, 0x6e6bfdb1), TOBN(0x5ea5b7b2, 0xbedb4b46),
+ TOBN(0xd5570191, 0xd34a6e44), TOBN(0xfcf60d2e, 0xa24ff7e6),
+ TOBN(0x614a392d, 0x677819e1), TOBN(0x7be74c7e, 0xaa5a29e8),
+ TOBN(0xab50fece, 0x63c85f3f), TOBN(0xaca2e2a9, 0x46cab337),
+ TOBN(0x7f700388, 0x122a6fe3), TOBN(0xdb69f703, 0x882a04a8),
+ TOBN(0x9a77935d, 0xcf7aed57), TOBN(0xdf16207c, 0x8d91c86f),
+ TOBN(0x2fca49ab, 0x63ed9998), TOBN(0xa3125c44, 0xa77ddf96),
+ TOBN(0x05dd8a86, 0x24344072), TOBN(0xa023dda2, 0xfec3fb56),
+ TOBN(0x421b41fc, 0x0c743032), TOBN(0x4f2120c1, 0x5e438639),
+ TOBN(0xfb7cae51, 0xc83c1b07), TOBN(0xb2370caa, 0xcac2171a),
+ TOBN(0x2eb2d962, 0x6cc820fb), TOBN(0x59feee5c, 0xb85a44bf),
+ TOBN(0x94620fca, 0x5b6598f0), TOBN(0x6b922cae, 0x7e314051),
+ TOBN(0xff8745ad, 0x106bed4e), TOBN(0x546e71f5, 0xdfa1e9ab),
+ TOBN(0x935c1e48, 0x1ec29487), TOBN(0x9509216c, 0x4d936530),
+ TOBN(0xc7ca3067, 0x85c9a2db), TOBN(0xd6ae5152, 0x6be8606f),
+ TOBN(0x09dbcae6, 0xe14c651d), TOBN(0xc9536e23, 0x9bc32f96),
+ TOBN(0xa90535a9, 0x34521b03), TOBN(0xf39c526c, 0x878756ff),
+ TOBN(0x383172ec, 0x8aedf03c), TOBN(0x20a8075e, 0xefe0c034),
+ TOBN(0xf22f9c62, 0x64026422), TOBN(0x8dd10780, 0x24b9d076),
+ TOBN(0x944c742a, 0x3bef2950), TOBN(0x55b9502e, 0x88a2b00b),
+ TOBN(0xa59e14b4, 0x86a09817), TOBN(0xa39dd3ac, 0x47bb4071),
+ TOBN(0x55137f66, 0x3be0592f), TOBN(0x07fcafd4, 0xc9e63f5b),
+ TOBN(0x963652ee, 0x346eb226), TOBN(0x7dfab085, 0xec2facb7),
+ TOBN(0x273bf2b8, 0x691add26), TOBN(0x30d74540, 0xf2b46c44),
+ TOBN(0x05e8e73e, 0xf2c2d065), TOBN(0xff9b8a00, 0xd42eeac9),
+ TOBN(0x2fcbd205, 0x97209d22), TOBN(0xeb740ffa, 0xde14ea2c),
+ TOBN(0xc71ff913, 0xa8aef518), TOBN(0x7bfc74bb, 0xfff4cfa2),
+ TOBN(0x1716680c, 0xb6b36048), TOBN(0x121b2cce, 0x9ef79af1),
+ TOBN(0xbff3c836, 0xa01eb3d3), TOBN(0x50eb1c6a, 0x5f79077b),
+ TOBN(0xa48c32d6, 0xa004bbcf), TOBN(0x47a59316, 0x7d64f61d),
+ TOBN(0x6068147f, 0x93102016), TOBN(0x12c5f654, 0x94d12576),
+ TOBN(0xefb071a7, 0xc9bc6b91), TOBN(0x7c2da0c5, 0x6e23ea95),
+ TOBN(0xf4fd45b6, 0xd4a1dd5d), TOBN(0x3e7ad9b6, 0x9122b13c),
+ TOBN(0x342ca118, 0xe6f57a48), TOBN(0x1c2e94a7, 0x06f8288f),
+ TOBN(0x99e68f07, 0x5a97d231), TOBN(0x7c80de97, 0x4d838758),
+ TOBN(0xbce0f5d0, 0x05872727), TOBN(0xbe5d95c2, 0x19c4d016),
+ TOBN(0x921d5cb1, 0x9c2492ee), TOBN(0x42192dc1, 0x404d6fb3),
+ TOBN(0x4c84dcd1, 0x32f988d3), TOBN(0xde26d61f, 0xa17b8e85),
+ TOBN(0xc466dcb6, 0x137c7408), TOBN(0x9a38d7b6, 0x36a266da),
+ TOBN(0x7ef5cb06, 0x83bebf1b), TOBN(0xe5cdcbbf, 0x0fd014e3),
+ TOBN(0x30aa376d, 0xf65965a0), TOBN(0x60fe88c2, 0xebb3e95e),
+ TOBN(0x33fd0b61, 0x66ee6f20), TOBN(0x8827dcdb, 0x3f41f0a0),
+ TOBN(0xbf8a9d24, 0x0c56c690), TOBN(0x40265dad, 0xddb7641d),
+ TOBN(0x522b05bf, 0x3a6b662b), TOBN(0x466d1dfe, 0xb1478c9b),
+ TOBN(0xaa616962, 0x1484469b), TOBN(0x0db60549, 0x02df8f9f),
+ TOBN(0xc37bca02, 0x3cb8bf51), TOBN(0x5effe346, 0x21371ce8),
+ TOBN(0xe8f65264, 0xff112c32), TOBN(0x8a9c736d, 0x7b971fb2),
+ TOBN(0xa4f19470, 0x7b75080d), TOBN(0xfc3f2c5a, 0x8839c59b),
+ TOBN(0x1d6c777e, 0x5aeb49c2), TOBN(0xf3db034d, 0xda1addfe),
+ TOBN(0xd76fee5a, 0x5535affc), TOBN(0x0853ac70, 0xb92251fd),
+ TOBN(0x37e3d594, 0x8b2a29d5), TOBN(0x28f1f457, 0x4de00ddb),
+ TOBN(0x8083c1b5, 0xf42c328b), TOBN(0xd8ef1d8f, 0xe493c73b),
+ TOBN(0x96fb6260, 0x41dc61bd), TOBN(0xf74e8a9d, 0x27ee2f8a),
+ TOBN(0x7c605a80, 0x2c946a5d), TOBN(0xeed48d65, 0x3839ccfd),
+ TOBN(0x9894344f, 0x3a29467a), TOBN(0xde81e949, 0xc51eba6d),
+ TOBN(0xdaea066b, 0xa5e5c2f2), TOBN(0x3fc8a614, 0x08c8c7b3),
+ TOBN(0x7adff88f, 0x06d0de9f), TOBN(0xbbc11cf5, 0x3b75ce0a),
+ TOBN(0x9fbb7acc, 0xfbbc87d5), TOBN(0xa1458e26, 0x7badfde2)}
+ ,
+ {TOBN(0x1cb43668, 0xe039c256), TOBN(0x5f26fb8b, 0x7c17fd5d),
+ TOBN(0xeee426af, 0x79aa062b), TOBN(0x072002d0, 0xd78fbf04),
+ TOBN(0x4c9ca237, 0xe84fb7e3), TOBN(0xb401d8a1, 0x0c82133d),
+ TOBN(0xaaa52592, 0x6d7e4181), TOBN(0xe9430833, 0x73dbb152),
+ TOBN(0xf92dda31, 0xbe24319a), TOBN(0x03f7d28b, 0xe095a8e7),
+ TOBN(0xa52fe840, 0x98782185), TOBN(0x276ddafe, 0x29c24dbc),
+ TOBN(0x80cd5496, 0x1d7a64eb), TOBN(0xe4360889, 0x7f1dbe42),
+ TOBN(0x2f81a877, 0x8438d2d5), TOBN(0x7e4d52a8, 0x85169036),
+ TOBN(0x19e3d5b1, 0x1d59715d), TOBN(0xc7eaa762, 0xd788983e),
+ TOBN(0xe5a730b0, 0xabf1f248), TOBN(0xfbab8084, 0xfae3fd83),
+ TOBN(0x65e50d21, 0x53765b2f), TOBN(0xbdd4e083, 0xfa127f3d),
+ TOBN(0x9cf3c074, 0x397b1b10), TOBN(0x59f8090c, 0xb1b59fd3),
+ TOBN(0x7b15fd9d, 0x615faa8f), TOBN(0x8fa1eb40, 0x968554ed),
+ TOBN(0x7bb4447e, 0x7aa44882), TOBN(0x2bb2d0d1, 0x029fff32),
+ TOBN(0x075e2a64, 0x6caa6d2f), TOBN(0x8eb879de, 0x22e7351b),
+ TOBN(0xbcd5624e, 0x9a506c62), TOBN(0x218eaef0, 0xa87e24dc),
+ TOBN(0x37e56847, 0x44ddfa35), TOBN(0x9ccfc5c5, 0xdab3f747),
+ TOBN(0x9ac1df3f, 0x1ee96cf4), TOBN(0x0c0571a1, 0x3b480b8f),
+ TOBN(0x2fbeb3d5, 0x4b3a7b3c), TOBN(0x35c03669, 0x5dcdbb99),
+ TOBN(0x52a0f5dc, 0xb2415b3a), TOBN(0xd57759b4, 0x4413ed9a),
+ TOBN(0x1fe647d8, 0x3d30a2c5), TOBN(0x0857f77e, 0xf78a81dc),
+ TOBN(0x11d5a334, 0x131a4a9b), TOBN(0xc0a94af9, 0x29d393f5),
+ TOBN(0xbc3a5c0b, 0xdaa6ec1a), TOBN(0xba9fe493, 0x88d2d7ed),
+ TOBN(0xbb4335b4, 0xbb614797), TOBN(0x991c4d68, 0x72f83533),
+ TOBN(0x53258c28, 0xd2f01cb3), TOBN(0x93d6eaa3, 0xd75db0b1),
+ TOBN(0x419a2b0d, 0xe87d0db4), TOBN(0xa1e48f03, 0xd8fe8493),
+ TOBN(0xf747faf6, 0xc508b23a), TOBN(0xf137571a, 0x35d53549),
+ TOBN(0x9f5e58e2, 0xfcf9b838), TOBN(0xc7186cee, 0xa7fd3cf5),
+ TOBN(0x77b868ce, 0xe978a1d3), TOBN(0xe3a68b33, 0x7ab92d04),
+ TOBN(0x51029794, 0x87a5b862), TOBN(0x5f0606c3, 0x3a61d41d),
+ TOBN(0x2814be27, 0x6f9326f1), TOBN(0x2f521c14, 0xc6fe3c2e),
+ TOBN(0x17464d7d, 0xacdf7351), TOBN(0x10f5f9d3, 0x777f7e44),
+ TOBN(0xce8e616b, 0x269fb37d), TOBN(0xaaf73804, 0x7de62de5),
+ TOBN(0xaba11175, 0x4fdd4153), TOBN(0x515759ba, 0x3770b49b),
+ TOBN(0x8b09ebf8, 0xaa423a61), TOBN(0x592245a1, 0xcd41fb92),
+ TOBN(0x1cba8ec1, 0x9b4c8936), TOBN(0xa87e91e3, 0xaf36710e),
+ TOBN(0x1fd84ce4, 0x3d34a2e3), TOBN(0xee3759ce, 0xb43b5d61),
+ TOBN(0x895bc78c, 0x619186c7), TOBN(0xf19c3809, 0xcbb9725a),
+ TOBN(0xc0be21aa, 0xde744b1f), TOBN(0xa7d222b0, 0x60f8056b),
+ TOBN(0x74be6157, 0xb23efe11), TOBN(0x6fab2b4f, 0x0cd68253),
+ TOBN(0xad33ea5f, 0x4bf1d725), TOBN(0x9c1d8ee2, 0x4f6c950f),
+ TOBN(0x544ee78a, 0xa377af06), TOBN(0x54f489bb, 0x94a113e1),
+ TOBN(0x8f11d634, 0x992fb7e8), TOBN(0x0169a7aa, 0xa2a44347),
+ TOBN(0x1d49d4af, 0x95020e00), TOBN(0x95945722, 0xe08e120b),
+ TOBN(0xb6e33878, 0xa4d32282), TOBN(0xe36e029d, 0x48020ae7),
+ TOBN(0xe05847fb, 0x37a9b750), TOBN(0xf876812c, 0xb29e3819),
+ TOBN(0x84ad138e, 0xd23a17f0), TOBN(0x6d7b4480, 0xf0b3950e),
+ TOBN(0xdfa8aef4, 0x2fd67ae0), TOBN(0x8d3eea24, 0x52333af6),
+ TOBN(0x0d052075, 0xb15d5acc), TOBN(0xc6d9c79f, 0xbd815bc4),
+ TOBN(0x8dcafd88, 0xdfa36cf2), TOBN(0x908ccbe2, 0x38aa9070),
+ TOBN(0x638722c4, 0xba35afce), TOBN(0x5a3da8b0, 0xfd6abf0b),
+ TOBN(0x2dce252c, 0xc9c335c1), TOBN(0x84e7f0de, 0x65aa799b),
+ TOBN(0x2101a522, 0xb99a72cb), TOBN(0x06de6e67, 0x87618016),
+ TOBN(0x5ff8c7cd, 0xe6f3653e), TOBN(0x0a821ab5, 0xc7a6754a),
+ TOBN(0x7e3fa52b, 0x7cb0b5a2), TOBN(0xa7fb121c, 0xc9048790),
+ TOBN(0x1a725020, 0x06ce053a), TOBN(0xb490a31f, 0x04e929b0),
+ TOBN(0xe17be47d, 0x62dd61ad), TOBN(0x781a961c, 0x6be01371),
+ TOBN(0x1063bfd3, 0xdae3cbba), TOBN(0x35647406, 0x7f73c9ba),
+ TOBN(0xf50e957b, 0x2736a129), TOBN(0xa6313702, 0xed13f256),
+ TOBN(0x9436ee65, 0x3a19fcc5), TOBN(0xcf2bdb29, 0xe7a4c8b6),
+ TOBN(0xb06b1244, 0xc5f95cd8), TOBN(0xda8c8af0, 0xf4ab95f4),
+ TOBN(0x1bae59c2, 0xb9e5836d), TOBN(0x07d51e7e, 0x3acffffc),
+ TOBN(0x01e15e6a, 0xc2ccbcda), TOBN(0x3bc1923f, 0x8528c3e0),
+ TOBN(0x43324577, 0xa49fead4), TOBN(0x61a1b884, 0x2aa7a711),
+ TOBN(0xf9a86e08, 0x700230ef), TOBN(0x0af585a1, 0xbd19adf8),
+ TOBN(0x7645f361, 0xf55ad8f2), TOBN(0x6e676223, 0x46c3614c),
+ TOBN(0x23cb257c, 0x4e774d3f), TOBN(0x82a38513, 0xac102d1b),
+ TOBN(0x9bcddd88, 0x7b126aa5), TOBN(0xe716998b, 0xeefd3ee4),
+ TOBN(0x4239d571, 0xfb167583), TOBN(0xdd011c78, 0xd16c8f8a),
+ TOBN(0x271c2895, 0x69a27519), TOBN(0x9ce0a3b7, 0xd2d64b6a),
+ TOBN(0x8c977289, 0xd5ec6738), TOBN(0xa3b49f9a, 0x8840ef6b),
+ TOBN(0x808c14c9, 0x9a453419), TOBN(0x5c00295b, 0x0cf0a2d5),
+ TOBN(0x524414fb, 0x1d4bcc76), TOBN(0xb07691d2, 0x459a88f1),
+ TOBN(0x77f43263, 0xf70d110f), TOBN(0x64ada5e0, 0xb7abf9f3),
+ TOBN(0xafd0f94e, 0x5b544cf5), TOBN(0xb4a13a15, 0xfd2713fe),
+ TOBN(0xb99b7d6e, 0x250c74f4), TOBN(0x097f2f73, 0x20324e45),
+ TOBN(0x994b37d8, 0xaffa8208), TOBN(0xc3c31b0b, 0xdc29aafc),
+ TOBN(0x3da74651, 0x7a3a607f), TOBN(0xd8e1b8c1, 0xfe6955d6),
+ TOBN(0x716e1815, 0xc8418682), TOBN(0x541d487f, 0x7dc91d97),
+ TOBN(0x48a04669, 0xc6996982), TOBN(0xf39cab15, 0x83a6502e),
+ TOBN(0x025801a0, 0xe68db055), TOBN(0xf3569758, 0xba3338d5),
+ TOBN(0xb0c8c0aa, 0xee2afa84), TOBN(0x4f6985d3, 0xfb6562d1),
+ TOBN(0x351f1f15, 0x132ed17a), TOBN(0x510ed0b4, 0xc04365fe),
+ TOBN(0xa3f98138, 0xe5b1f066), TOBN(0xbc9d95d6, 0x32df03dc),
+ TOBN(0xa83ccf6e, 0x19abd09e), TOBN(0x0b4097c1, 0x4ff17edb),
+ TOBN(0x58a5c478, 0xd64a06ce), TOBN(0x2ddcc3fd, 0x544a58fd),
+ TOBN(0xd449503d, 0x9e8153b8), TOBN(0x3324fd02, 0x7774179b),
+ TOBN(0xaf5d47c8, 0xdbd9120c), TOBN(0xeb860162, 0x34fa94db),
+ TOBN(0x5817bdd1, 0x972f07f4), TOBN(0xe5579e2e, 0xd27bbceb),
+ TOBN(0x86847a1f, 0x5f11e5a6), TOBN(0xb39ed255, 0x7c3cf048),
+ TOBN(0xe1076417, 0xa2f62e55), TOBN(0x6b9ab38f, 0x1bcf82a2),
+ TOBN(0x4bb7c319, 0x7aeb29f9), TOBN(0xf6d17da3, 0x17227a46),
+ TOBN(0xab53ddbd, 0x0f968c00), TOBN(0xa03da7ec, 0x000c880b),
+ TOBN(0x7b239624, 0x6a9ad24d), TOBN(0x612c0401, 0x01ec60d0),
+ TOBN(0x70d10493, 0x109f5df1), TOBN(0xfbda4030, 0x80af7550),
+ TOBN(0x30b93f95, 0xc6b9a9b3), TOBN(0x0c74ec71, 0x007d9418),
+ TOBN(0x94175564, 0x6edb951f), TOBN(0x5f4a9d78, 0x7f22c282),
+ TOBN(0xb7870895, 0xb38d1196), TOBN(0xbc593df3, 0xa228ce7c),
+ TOBN(0xc78c5bd4, 0x6af3641a), TOBN(0x7802200b, 0x3d9b3dcc),
+ TOBN(0x0dc73f32, 0x8be33304), TOBN(0x847ed87d, 0x61ffb79a),
+ TOBN(0xf85c974e, 0x6d671192), TOBN(0x1e14100a, 0xde16f60f),
+ TOBN(0x45cb0d5a, 0x95c38797), TOBN(0x18923bba, 0x9b022da4),
+ TOBN(0xef2be899, 0xbbe7e86e), TOBN(0x4a1510ee, 0x216067bf),
+ TOBN(0xd98c8154, 0x84d5ce3e), TOBN(0x1af777f0, 0xf92a2b90),
+ TOBN(0x9fbcb400, 0x4ef65724), TOBN(0x3e04a4c9, 0x3c0ca6fe),
+ TOBN(0xfb3e2cb5, 0x55002994), TOBN(0x1f3a93c5, 0x5363ecab),
+ TOBN(0x1fe00efe, 0x3923555b), TOBN(0x744bedd9, 0x1e1751ea),
+ TOBN(0x3fb2db59, 0x6ab69357), TOBN(0x8dbd7365, 0xf5e6618b),
+ TOBN(0x99d53099, 0xdf1ea40e), TOBN(0xb3f24a0b, 0x57d61e64),
+ TOBN(0xd088a198, 0x596eb812), TOBN(0x22c8361b, 0x5762940b),
+ TOBN(0x66f01f97, 0xf9c0d95c), TOBN(0x88461172, 0x8e43cdae),
+ TOBN(0x11599a7f, 0xb72b15c3), TOBN(0x135a7536, 0x420d95cc),
+ TOBN(0x2dcdf0f7, 0x5f7ae2f6), TOBN(0x15fc6e1d, 0xd7fa6da2),
+ TOBN(0x81ca829a, 0xd1d441b6), TOBN(0x84c10cf8, 0x04a106b6),
+ TOBN(0xa9b26c95, 0xa73fbbd0), TOBN(0x7f24e0cb, 0x4d8f6ee8),
+ TOBN(0x48b45937, 0x1e25a043), TOBN(0xf8a74fca, 0x036f3dfe),
+ TOBN(0x1ed46585, 0xc9f84296), TOBN(0x7fbaa8fb, 0x3bc278b0),
+ TOBN(0xa8e96cd4, 0x6c4fcbd0), TOBN(0x940a1202, 0x73b60a5f),
+ TOBN(0x34aae120, 0x55a4aec8), TOBN(0x550e9a74, 0xdbd742f0),
+ TOBN(0x794456d7, 0x228c68ab), TOBN(0x492f8868, 0xa4e25ec6),
+ TOBN(0x682915ad, 0xb2d8f398), TOBN(0xf13b51cc, 0x5b84c953),
+ TOBN(0xcda90ab8, 0x5bb917d6), TOBN(0x4b615560, 0x4ea3dee1),
+ TOBN(0x578b4e85, 0x0a52c1c8), TOBN(0xeab1a695, 0x20b75fc4),
+ TOBN(0x60c14f3c, 0xaa0bb3c6), TOBN(0x220f448a, 0xb8216094),
+ TOBN(0x4fe7ee31, 0xb0e63d34), TOBN(0xf4600572, 0xa9e54fab),
+ TOBN(0xc0493334, 0xd5e7b5a4), TOBN(0x8589fb92, 0x06d54831),
+ TOBN(0xaa70f5cc, 0x6583553a), TOBN(0x0879094a, 0xe25649e5),
+ TOBN(0xcc904507, 0x10044652), TOBN(0xebb0696d, 0x02541c4f),
+ TOBN(0x5a171fde, 0xb9718710), TOBN(0x38f1bed8, 0xf374a9f5),
+ TOBN(0xc8c582e1, 0xba39bdc1), TOBN(0xfc457b0a, 0x908cc0ce),
+ TOBN(0x9a187fd4, 0x883841e2), TOBN(0x8ec25b39, 0x38725381),
+ TOBN(0x2553ed05, 0x96f84395), TOBN(0x095c7661, 0x6f6c6897),
+ TOBN(0x917ac85c, 0x4bdc5610), TOBN(0xb2885fe4, 0x179eb301),
+ TOBN(0x5fc65547, 0x8b78bdcc), TOBN(0x4a9fc893, 0xe59e4699),
+ TOBN(0xbb7ff0cd, 0x3ce299af), TOBN(0x195be9b3, 0xadf38b20),
+ TOBN(0x6a929c87, 0xd38ddb8f), TOBN(0x55fcc99c, 0xb21a51b9),
+ TOBN(0x2b695b4c, 0x721a4593), TOBN(0xed1e9a15, 0x768eaac2),
+ TOBN(0xfb63d71c, 0x7489f914), TOBN(0xf98ba31c, 0x78118910),
+ TOBN(0x80291373, 0x9b128eb4), TOBN(0x7801214e, 0xd448af4a),
+ TOBN(0xdbd2e22b, 0x55418dd3), TOBN(0xeffb3c0d, 0xd3998242),
+ TOBN(0xdfa6077c, 0xc7bf3827), TOBN(0xf2165bcb, 0x47f8238f),
+ TOBN(0xfe37cf68, 0x8564d554), TOBN(0xe5f825c4, 0x0a81fb98),
+ TOBN(0x43cc4f67, 0xffed4d6f), TOBN(0xbc609578, 0xb50a34b0),
+ TOBN(0x8aa8fcf9, 0x5041faf1), TOBN(0x5659f053, 0x651773b6),
+ TOBN(0xe87582c3, 0x6044d63b), TOBN(0xa6089409, 0x0cdb0ca0),
+ TOBN(0x8c993e0f, 0xbfb2bcf6), TOBN(0xfc64a719, 0x45985cfc),
+ TOBN(0x15c4da80, 0x83dbedba), TOBN(0x804ae112, 0x2be67df7),
+ TOBN(0xda4c9658, 0xa23defde), TOBN(0x12002ddd, 0x5156e0d3),
+ TOBN(0xe68eae89, 0x5dd21b96), TOBN(0x8b99f28b, 0xcf44624d),
+ TOBN(0x0ae00808, 0x1ec8897a), TOBN(0xdd0a9303, 0x6712f76e),
+ TOBN(0x96237522, 0x4e233de4), TOBN(0x192445b1, 0x2b36a8a5),
+ TOBN(0xabf9ff74, 0x023993d9), TOBN(0x21f37bf4, 0x2aad4a8f),
+ TOBN(0x340a4349, 0xf8bd2bbd), TOBN(0x1d902cd9, 0x4868195d),
+ TOBN(0x3d27bbf1, 0xe5fdb6f1), TOBN(0x7a5ab088, 0x124f9f1c),
+ TOBN(0xc466ab06, 0xf7a09e03), TOBN(0x2f8a1977, 0x31f2c123),
+ TOBN(0xda355dc7, 0x041b6657), TOBN(0xcb840d12, 0x8ece2a7c),
+ TOBN(0xb600ad9f, 0x7db32675), TOBN(0x78fea133, 0x07a06f1b),
+ TOBN(0x5d032269, 0xb31f6094), TOBN(0x07753ef5, 0x83ec37aa),
+ TOBN(0x03485aed, 0x9c0bea78), TOBN(0x41bb3989, 0xbc3f4524),
+ TOBN(0x09403761, 0x697f726d), TOBN(0x6109beb3, 0xdf394820),
+ TOBN(0x804111ea, 0x3b6d1145), TOBN(0xb6271ea9, 0xa8582654),
+ TOBN(0x619615e6, 0x24e66562), TOBN(0xa2554945, 0xd7b6ad9c),
+ TOBN(0xd9c4985e, 0x99bfe35f), TOBN(0x9770ccc0, 0x7b51cdf6),
+ TOBN(0x7c327013, 0x92881832), TOBN(0x8777d45f, 0x286b26d1),
+ TOBN(0x9bbeda22, 0xd847999d), TOBN(0x03aa33b6, 0xc3525d32),
+ TOBN(0x4b7b96d4, 0x28a959a1), TOBN(0xbb3786e5, 0x31e5d234),
+ TOBN(0xaeb5d3ce, 0x6961f247), TOBN(0x20aa85af, 0x02f93d3f),
+ TOBN(0x9cd1ad3d, 0xd7a7ae4f), TOBN(0xbf6688f0, 0x781adaa8),
+ TOBN(0xb1b40e86, 0x7469cead), TOBN(0x1904c524, 0x309fca48),
+ TOBN(0x9b7312af, 0x4b54bbc7), TOBN(0xbe24bf8f, 0x593affa2),
+ TOBN(0xbe5e0790, 0xbd98764b), TOBN(0xa0f45f17, 0xa26e299e),
+ TOBN(0x4af0d2c2, 0x6b8fe4c7), TOBN(0xef170db1, 0x8ae8a3e6),
+ TOBN(0x0e8d61a0, 0x29e0ccc1), TOBN(0xcd53e87e, 0x60ad36ca),
+ TOBN(0x328c6623, 0xc8173822), TOBN(0x7ee1767d, 0xa496be55),
+ TOBN(0x89f13259, 0x648945af), TOBN(0x9e45a5fd, 0x25c8009c),
+ TOBN(0xaf2febd9, 0x1f61ab8c), TOBN(0x43f6bc86, 0x8a275385),
+ TOBN(0x87792348, 0xf2142e79), TOBN(0x17d89259, 0xc6e6238a),
+ TOBN(0x7536d2f6, 0x4a839d9b), TOBN(0x1f428fce, 0x76a1fbdc),
+ TOBN(0x1c109601, 0x0db06dfe), TOBN(0xbfc16bc1, 0x50a3a3cc),
+ TOBN(0xf9cbd9ec, 0x9b30f41b), TOBN(0x5b5da0d6, 0x00138cce),
+ TOBN(0xec1d0a48, 0x56ef96a7), TOBN(0xb47eb848, 0x982bf842),
+ TOBN(0x66deae32, 0xec3f700d), TOBN(0x4e43c42c, 0xaa1181e0),
+ TOBN(0xa1d72a31, 0xd1a4aa2a), TOBN(0x440d4668, 0xc004f3ce),
+ TOBN(0x0d6a2d3b, 0x45fe8a7a), TOBN(0x820e52e2, 0xfb128365),
+ TOBN(0x29ac5fcf, 0x25e51b09), TOBN(0x180cd2bf, 0x2023d159),
+ TOBN(0xa9892171, 0xa1ebf90e), TOBN(0xf97c4c87, 0x7c132181),
+ TOBN(0x9f1dc724, 0xc03dbb7e), TOBN(0xae043765, 0x018cbbe4),
+ TOBN(0xfb0b2a36, 0x0767d153), TOBN(0xa8e2f4d6, 0x249cbaeb),
+ TOBN(0x172a5247, 0xd95ea168), TOBN(0x1758fada, 0x2970764a),
+ TOBN(0xac803a51, 0x1d978169), TOBN(0x299cfe2e, 0xde77e01b),
+ TOBN(0x652a1e17, 0xb0a98927), TOBN(0x2e26e1d1, 0x20014495),
+ TOBN(0x7ae0af9f, 0x7175b56a), TOBN(0xc2e22a80, 0xd64b9f95),
+ TOBN(0x4d0ff9fb, 0xd90a060a), TOBN(0x496a27db, 0xbaf38085),
+ TOBN(0x32305401, 0xda776bcf), TOBN(0xb8cdcef6, 0x725f209e),
+ TOBN(0x61ba0f37, 0x436a0bba), TOBN(0x263fa108, 0x76860049),
+ TOBN(0x92beb98e, 0xda3542cf), TOBN(0xa2d4d14a, 0xd5849538),
+ TOBN(0x989b9d68, 0x12e9a1bc), TOBN(0x61d9075c, 0x5f6e3268),
+ TOBN(0x352c6aa9, 0x99ace638), TOBN(0xde4e4a55, 0x920f43ff),
+ TOBN(0xe5e4144a, 0xd673c017), TOBN(0x667417ae, 0x6f6e05ea),
+ TOBN(0x613416ae, 0xdcd1bd56), TOBN(0x5eb36201, 0x86693711),
+ TOBN(0x2d7bc504, 0x3a1aa914), TOBN(0x175a1299, 0x76dc5975),
+ TOBN(0xe900e0f2, 0x3fc8125c), TOBN(0x569ef68c, 0x11198875),
+ TOBN(0x9012db63, 0x63a113b4), TOBN(0xe3bd3f56, 0x98835766),
+ TOBN(0xa5c94a52, 0x76412dea), TOBN(0xad9e2a09, 0xaa735e5c),
+ TOBN(0x405a984c, 0x508b65e9), TOBN(0xbde4a1d1, 0x6df1a0d1),
+ TOBN(0x1a9433a1, 0xdfba80da), TOBN(0xe9192ff9, 0x9440ad2e),
+ TOBN(0x9f649696, 0x5099fe92), TOBN(0x25ddb65c, 0x0b27a54a),
+ TOBN(0x178279dd, 0xc590da61), TOBN(0x5479a999, 0xfbde681a),
+ TOBN(0xd0e84e05, 0x013fe162), TOBN(0xbe11dc92, 0x632d471b),
+ TOBN(0xdf0b0c45, 0xfc0e089f), TOBN(0x04fb15b0, 0x4c144025),
+ TOBN(0xa61d5fc2, 0x13c99927), TOBN(0xa033e9e0, 0x3de2eb35),
+ TOBN(0xf8185d5c, 0xb8dacbb4), TOBN(0x9a88e265, 0x8644549d),
+ TOBN(0xf717af62, 0x54671ff6), TOBN(0x4bd4241b, 0x5fa58603),
+ TOBN(0x06fba40b, 0xe67773c0), TOBN(0xc1d933d2, 0x6a2847e9),
+ TOBN(0xf4f5acf3, 0x689e2c70), TOBN(0x92aab0e7, 0x46bafd31),
+ TOBN(0x798d76aa, 0x3473f6e5), TOBN(0xcc6641db, 0x93141934),
+ TOBN(0xcae27757, 0xd31e535e), TOBN(0x04cc43b6, 0x87c2ee11),
+ TOBN(0x8d1f9675, 0x2e029ffa), TOBN(0xc2150672, 0xe4cc7a2c),
+ TOBN(0x3b03c1e0, 0x8d68b013), TOBN(0xa9d6816f, 0xedf298f3),
+ TOBN(0x1bfbb529, 0xa2804464), TOBN(0x95a52fae, 0x5db22125),
+ TOBN(0x55b32160, 0x0e1cb64e), TOBN(0x004828f6, 0x7e7fc9fe),
+ TOBN(0x13394b82, 0x1bb0fb93), TOBN(0xb6293a2d, 0x35f1a920),
+ TOBN(0xde35ef21, 0xd145d2d9), TOBN(0xbe6225b3, 0xbb8fa603),
+ TOBN(0x00fc8f6b, 0x32cf252d), TOBN(0xa28e52e6, 0x117cf8c2),
+ TOBN(0x9d1dc89b, 0x4c371e6d), TOBN(0xcebe0675, 0x36ef0f28),
+ TOBN(0x5de05d09, 0xa4292f81), TOBN(0xa8303593, 0x353e3083),
+ TOBN(0xa1715b0a, 0x7e37a9bb), TOBN(0x8c56f61e, 0x2b8faec3),
+ TOBN(0x52507431, 0x33c9b102), TOBN(0x0130cefc, 0xa44431f0),
+ TOBN(0x56039fa0, 0xbd865cfb), TOBN(0x4b03e578, 0xbc5f1dd7),
+ TOBN(0x40edf2e4, 0xbabe7224), TOBN(0xc752496d, 0x3a1988f6),
+ TOBN(0xd1572d3b, 0x564beb6b), TOBN(0x0db1d110, 0x39a1c608),
+ TOBN(0x568d1934, 0x16f60126), TOBN(0x05ae9668, 0xf354af33),
+ TOBN(0x19de6d37, 0xc92544f2), TOBN(0xcc084353, 0xa35837d5),
+ TOBN(0xcbb6869c, 0x1a514ece), TOBN(0xb633e728, 0x2e1d1066),
+ TOBN(0xf15dd69f, 0x936c581c), TOBN(0x96e7b8ce, 0x7439c4f9),
+ TOBN(0x5e676f48, 0x2e448a5b), TOBN(0xb2ca7d5b, 0xfd916bbb),
+ TOBN(0xd55a2541, 0xf5024025), TOBN(0x47bc5769, 0xe4c2d937),
+ TOBN(0x7d31b92a, 0x0362189f), TOBN(0x83f3086e, 0xef7816f9),
+ TOBN(0xf9f46d94, 0xb587579a), TOBN(0xec2d22d8, 0x30e76c5f),
+ TOBN(0x27d57461, 0xb000ffcf), TOBN(0xbb7e65f9, 0x364ffc2c),
+ TOBN(0x7c7c9477, 0x6652a220), TOBN(0x61618f89, 0xd696c981),
+ TOBN(0x5021701d, 0x89effff3), TOBN(0xf2c8ff8e, 0x7c314163),
+ TOBN(0x2da413ad, 0x8efb4d3e), TOBN(0x937b5adf, 0xce176d95),
+ TOBN(0x22867d34, 0x2a67d51c), TOBN(0x262b9b10, 0x18eb3ac9),
+ TOBN(0x4e314fe4, 0xc43ff28b), TOBN(0x76476627, 0x6a664e7a),
+ TOBN(0x3e90e40b, 0xb7a565c2), TOBN(0x8588993a, 0xc1acf831),
+ TOBN(0xd7b501d6, 0x8f938829), TOBN(0x996627ee, 0x3edd7d4c),
+ TOBN(0x37d44a62, 0x90cd34c7), TOBN(0xa8327499, 0xf3833e8d),
+ TOBN(0x2e18917d, 0x4bf50353), TOBN(0x85dd726b, 0x556765fb),
+ TOBN(0x54fe65d6, 0x93d5ab66), TOBN(0x3ddbaced, 0x915c25fe),
+ TOBN(0xa799d9a4, 0x12f22e85), TOBN(0xe2a24867, 0x6d06f6bc),
+ TOBN(0xf4f1ee56, 0x43ca1637), TOBN(0xfda2828b, 0x61ece30a),
+ TOBN(0x758c1a3e, 0xa2dee7a6), TOBN(0xdcde2f3c, 0x734b2284),
+ TOBN(0xaba445d2, 0x4eaba6ad), TOBN(0x35aaf668, 0x76cee0a7),
+ TOBN(0x7e0b04a9, 0xe5aa049a), TOBN(0xe74083ad, 0x91103e84),
+ TOBN(0xbeb183ce, 0x40afecc3), TOBN(0x6b89de9f, 0xea043f7a)}
+ ,
+ {TOBN(0x0e299d23, 0xfe67ba66), TOBN(0x91450760, 0x93cf2f34),
+ TOBN(0xf45b5ea9, 0x97fcf913), TOBN(0x5be00843, 0x8bd7ddda),
+ TOBN(0x358c3e05, 0xd53ff04d), TOBN(0xbf7ccdc3, 0x5de91ef7),
+ TOBN(0xad684dbf, 0xb69ec1a0), TOBN(0x367e7cf2, 0x801fd997),
+ TOBN(0x0ca1f3b7, 0xb0dc8595), TOBN(0x27de4608, 0x9f1d9f2e),
+ TOBN(0x1af3bf39, 0xbadd82a7), TOBN(0x79356a79, 0x65862448),
+ TOBN(0xc0602345, 0xf5f9a052), TOBN(0x1a8b0f89, 0x139a42f9),
+ TOBN(0xb53eee42, 0x844d40fc), TOBN(0x93b0bfe5, 0x4e5b6368),
+ TOBN(0x5434dd02, 0xc024789c), TOBN(0x90dca9ea, 0x41b57bfc),
+ TOBN(0x8aa898e2, 0x243398df), TOBN(0xf607c834, 0x894a94bb),
+ TOBN(0xbb07be97, 0xc2c99b76), TOBN(0x6576ba67, 0x18c29302),
+ TOBN(0x3d79efcc, 0xe703a88c), TOBN(0xf259ced7, 0xb6a0d106),
+ TOBN(0x0f893a5d, 0xc8de610b), TOBN(0xe8c515fb, 0x67e223ce),
+ TOBN(0x7774bfa6, 0x4ead6dc5), TOBN(0x89d20f95, 0x925c728f),
+ TOBN(0x7a1e0966, 0x098583ce), TOBN(0xa2eedb94, 0x93f2a7d7),
+ TOBN(0x1b282097, 0x4c304d4a), TOBN(0x0842e3da, 0xc077282d),
+ TOBN(0xe4d972a3, 0x3b9e2d7b), TOBN(0x7cc60b27, 0xc48218ff),
+ TOBN(0x8fc70838, 0x84149d91), TOBN(0x5c04346f, 0x2f461ecc),
+ TOBN(0xebe9fdf2, 0x614650a9), TOBN(0x5e35b537, 0xc1f666ac),
+ TOBN(0x645613d1, 0x88babc83), TOBN(0x88cace3a, 0xc5e1c93e),
+ TOBN(0x209ca375, 0x3de92e23), TOBN(0xccb03cc8, 0x5fbbb6e3),
+ TOBN(0xccb90f03, 0xd7b1487e), TOBN(0xfa9c2a38, 0xc710941f),
+ TOBN(0x756c3823, 0x6724ceed), TOBN(0x3a902258, 0x192d0323),
+ TOBN(0xb150e519, 0xea5e038e), TOBN(0xdcba2865, 0xc7427591),
+ TOBN(0xe549237f, 0x78890732), TOBN(0xc443bef9, 0x53fcb4d9),
+ TOBN(0x9884d8a6, 0xeb3480d6), TOBN(0x8a35b6a1, 0x3048b186),
+ TOBN(0xb4e44716, 0x65e9a90a), TOBN(0x45bf380d, 0x653006c0),
+ TOBN(0x8f3f820d, 0x4fe9ae3b), TOBN(0x244a35a0, 0x979a3b71),
+ TOBN(0xa1010e9d, 0x74cd06ff), TOBN(0x9c17c7df, 0xaca3eeac),
+ TOBN(0x74c86cd3, 0x8063aa2b), TOBN(0x8595c4b3, 0x734614ff),
+ TOBN(0xa3de00ca, 0x990f62cc), TOBN(0xd9bed213, 0xca0c3be5),
+ TOBN(0x7886078a, 0xdf8ce9f5), TOBN(0xddb27ce3, 0x5cd44444),
+ TOBN(0xed374a66, 0x58926ddd), TOBN(0x138b2d49, 0x908015b8),
+ TOBN(0x886c6579, 0xde1f7ab8), TOBN(0x888b9aa0, 0xc3020b7a),
+ TOBN(0xd3ec034e, 0x3a96e355), TOBN(0xba65b0b8, 0xf30fbe9a),
+ TOBN(0x064c8e50, 0xff21367a), TOBN(0x1f508ea4, 0x0b04b46e),
+ TOBN(0x98561a49, 0x747c866c), TOBN(0xbbb1e5fe, 0x0518a062),
+ TOBN(0x20ff4e8b, 0xecdc3608), TOBN(0x7f55cded, 0x20184027),
+ TOBN(0x8d73ec95, 0xf38c85f0), TOBN(0x5b589fdf, 0x8bc3b8c3),
+ TOBN(0xbe95dd98, 0x0f12b66f), TOBN(0xf5bd1a09, 0x0e338e01),
+ TOBN(0x65163ae5, 0x5e915918), TOBN(0x6158d6d9, 0x86f8a46b),
+ TOBN(0x8466b538, 0xeeebf99c), TOBN(0xca8761f6, 0xbca477ef),
+ TOBN(0xaf3449c2, 0x9ebbc601), TOBN(0xef3b0f41, 0xe0c3ae2f),
+ TOBN(0xaa6c577d, 0x5de63752), TOBN(0xe9166601, 0x64682a51),
+ TOBN(0x5a3097be, 0xfc15aa1e), TOBN(0x40d12548, 0xb54b0745),
+ TOBN(0x5bad4706, 0x519a5f12), TOBN(0xed03f717, 0xa439dee6),
+ TOBN(0x0794bb6c, 0x4a02c499), TOBN(0xf725083d, 0xcffe71d2),
+ TOBN(0x2cad7519, 0x0f3adcaf), TOBN(0x7f68ea1c, 0x43729310),
+ TOBN(0xe747c8c7, 0xb7ffd977), TOBN(0xec104c35, 0x80761a22),
+ TOBN(0x8395ebaf, 0x5a3ffb83), TOBN(0xfb3261f4, 0xe4b63db7),
+ TOBN(0x53544960, 0xd883e544), TOBN(0x13520d70, 0x8cc2eeb8),
+ TOBN(0x08f6337b, 0xd3d65f99), TOBN(0x83997db2, 0x781cf95b),
+ TOBN(0xce6ff106, 0x0dbd2c01), TOBN(0x4f8eea6b, 0x1f9ce934),
+ TOBN(0x546f7c4b, 0x0e993921), TOBN(0x6236a324, 0x5e753fc7),
+ TOBN(0x65a41f84, 0xa16022e9), TOBN(0x0c18d878, 0x43d1dbb2),
+ TOBN(0x73c55640, 0x2d4cef9c), TOBN(0xa0428108, 0x70444c74),
+ TOBN(0x68e4f15e, 0x9afdfb3c), TOBN(0x49a56143, 0x5bdfb6df),
+ TOBN(0xa9bc1bd4, 0x5f823d97), TOBN(0xbceb5970, 0xea111c2a),
+ TOBN(0x366b455f, 0xb269bbc4), TOBN(0x7cd85e1e, 0xe9bc5d62),
+ TOBN(0xc743c41c, 0x4f18b086), TOBN(0xa4b40990, 0x95294fb9),
+ TOBN(0x9c7c581d, 0x26ee8382), TOBN(0xcf17dcc5, 0x359d638e),
+ TOBN(0xee8273ab, 0xb728ae3d), TOBN(0x1d112926, 0xf821f047),
+ TOBN(0x11498477, 0x50491a74), TOBN(0x687fa761, 0xfde0dfb9),
+ TOBN(0x2c258022, 0x7ea435ab), TOBN(0x6b8bdb94, 0x91ce7e3f),
+ TOBN(0x4c5b5dc9, 0x3bf834aa), TOBN(0x04371819, 0x4f6c7e4b),
+ TOBN(0xc284e00a, 0x3736bcad), TOBN(0x0d881118, 0x21ae8f8d),
+ TOBN(0xf9cf0f82, 0xf48c8e33), TOBN(0xa11fd075, 0xa1bf40db),
+ TOBN(0xdceab0de, 0xdc2733e5), TOBN(0xc560a8b5, 0x8e986bd7),
+ TOBN(0x48dd1fe2, 0x3929d097), TOBN(0x3885b290, 0x92f188f1),
+ TOBN(0x0f2ae613, 0xda6fcdac), TOBN(0x9054303e, 0xb662a46c),
+ TOBN(0xb6871e44, 0x0738042a), TOBN(0x98e6a977, 0xbdaf6449),
+ TOBN(0xd8bc0650, 0xd1c9df1b), TOBN(0xef3d6451, 0x36e098f9),
+ TOBN(0x03fbae82, 0xb6d72d28), TOBN(0x77ca9db1, 0xf5d84080),
+ TOBN(0x8a112cff, 0xa58efc1c), TOBN(0x518d761c, 0xc564cb4a),
+ TOBN(0x69b5740e, 0xf0d1b5ce), TOBN(0x717039cc, 0xe9eb1785),
+ TOBN(0x3fe29f90, 0x22f53382), TOBN(0x8e54ba56, 0x6bc7c95c),
+ TOBN(0x9c806d8a, 0xf7f91d0f), TOBN(0x3b61b0f1, 0xa82a5728),
+ TOBN(0x4640032d, 0x94d76754), TOBN(0x273eb5de, 0x47d834c6),
+ TOBN(0x2988abf7, 0x7b4e4d53), TOBN(0xb7ce66bf, 0xde401777),
+ TOBN(0x9fba6b32, 0x715071b3), TOBN(0x82413c24, 0xad3a1a98),
+ TOBN(0x5b7fc8c4, 0xe0e8ad93), TOBN(0xb5679aee, 0x5fab868d),
+ TOBN(0xb1f9d2fa, 0x2b3946f3), TOBN(0x458897dc, 0x5685b50a),
+ TOBN(0x1e98c930, 0x89d0caf3), TOBN(0x39564c5f, 0x78642e92),
+ TOBN(0x1b77729a, 0x0dbdaf18), TOBN(0xf9170722, 0x579e82e6),
+ TOBN(0x680c0317, 0xe4515fa5), TOBN(0xf85cff84, 0xfb0c790f),
+ TOBN(0xc7a82aab, 0x6d2e0765), TOBN(0x7446bca9, 0x35c82b32),
+ TOBN(0x5de607aa, 0x6d63184f), TOBN(0x7c1a46a8, 0x262803a6),
+ TOBN(0xd218313d, 0xaebe8035), TOBN(0x92113ffd, 0xc73c51f8),
+ TOBN(0x4b38e083, 0x12e7e46c), TOBN(0x69d0a37a, 0x56126bd5),
+ TOBN(0xfb3f324b, 0x73c07e04), TOBN(0xa0c22f67, 0x8fda7267),
+ TOBN(0x8f2c0051, 0x4d2c7d8f), TOBN(0xbc45ced3, 0xcbe2cae5),
+ TOBN(0xe1c6cf07, 0xa8f0f277), TOBN(0xbc392312, 0x1eb99a98),
+ TOBN(0x75537b7e, 0x3cc8ac85), TOBN(0x8d725f57, 0xdd02753b),
+ TOBN(0xfd05ff64, 0xb737df2f), TOBN(0x55fe8712, 0xf6d2531d),
+ TOBN(0x57ce04a9, 0x6ab6b01c), TOBN(0x69a02a89, 0x7cd93724),
+ TOBN(0x4f82ac35, 0xcf86699b), TOBN(0x8242d3ad, 0x9cb4b232),
+ TOBN(0x713d0f65, 0xd62105e5), TOBN(0xbb222bfa, 0x2d29be61),
+ TOBN(0xf2f9a79e, 0x6cfbef09), TOBN(0xfc24d8d3, 0xd5d6782f),
+ TOBN(0x5db77085, 0xd4129967), TOBN(0xdb81c3cc, 0xdc3c2a43),
+ TOBN(0x9d655fc0, 0x05d8d9a3), TOBN(0x3f5d057a, 0x54298026),
+ TOBN(0x1157f56d, 0x88c54694), TOBN(0xb26baba5, 0x9b09573e),
+ TOBN(0x2cab03b0, 0x22adffd1), TOBN(0x60a412c8, 0xdd69f383),
+ TOBN(0xed76e98b, 0x54b25039), TOBN(0xd4ee67d3, 0x687e714d),
+ TOBN(0x87739648, 0x7b00b594), TOBN(0xce419775, 0xc9ef709b),
+ TOBN(0x40f76f85, 0x1c203a40), TOBN(0x30d352d6, 0xeafd8f91),
+ TOBN(0xaf196d3d, 0x95578dd2), TOBN(0xea4bb3d7, 0x77cc3f3d),
+ TOBN(0x42a5bd03, 0xb98e782b), TOBN(0xac958c40, 0x0624920d),
+ TOBN(0xb838134c, 0xfc56fcc8), TOBN(0x86ec4ccf, 0x89572e5e),
+ TOBN(0x69c43526, 0x9be47be0), TOBN(0x323b7dd8, 0xcb28fea1),
+ TOBN(0xfa5538ba, 0x3a6c67e5), TOBN(0xef921d70, 0x1d378e46),
+ TOBN(0xf92961fc, 0x3c4b880e), TOBN(0x3f6f914e, 0x98940a67),
+ TOBN(0xa990eb0a, 0xfef0ff39), TOBN(0xa6c2920f, 0xf0eeff9c),
+ TOBN(0xca804166, 0x51b8d9a3), TOBN(0x42531bc9, 0x0ffb0db1),
+ TOBN(0x72ce4718, 0xaa82e7ce), TOBN(0x6e199913, 0xdf574741),
+ TOBN(0xd5f1b13d, 0xd5d36946), TOBN(0x8255dc65, 0xf68f0194),
+ TOBN(0xdc9df4cd, 0x8710d230), TOBN(0x3453c20f, 0x138c1988),
+ TOBN(0x9af98dc0, 0x89a6ef01), TOBN(0x4dbcc3f0, 0x9857df85),
+ TOBN(0x34805601, 0x5c1ad924), TOBN(0x40448da5, 0xd0493046),
+ TOBN(0xf629926d, 0x4ee343e2), TOBN(0x6343f1bd, 0x90e8a301),
+ TOBN(0xefc93491, 0x40815b3f), TOBN(0xf882a423, 0xde8f66fb),
+ TOBN(0x3a12d5f4, 0xe7db9f57), TOBN(0x7dfba38a, 0x3c384c27),
+ TOBN(0x7a904bfd, 0x6fc660b1), TOBN(0xeb6c5db3, 0x2773b21c),
+ TOBN(0xc350ee66, 0x1cdfe049), TOBN(0x9baac0ce, 0x44540f29),
+ TOBN(0xbc57b6ab, 0xa5ec6aad), TOBN(0x167ce8c3, 0x0a7c1baa),
+ TOBN(0xb23a03a5, 0x53fb2b56), TOBN(0x6ce141e7, 0x4e057f78),
+ TOBN(0x796525c3, 0x89e490d9), TOBN(0x0bc95725, 0xa31a7e75),
+ TOBN(0x1ec56791, 0x1220fd06), TOBN(0x716e3a3c, 0x408b0bd6),
+ TOBN(0x31cd6bf7, 0xe8ebeba9), TOBN(0xa7326ca6, 0xbee6b670),
+ TOBN(0x3d9f851c, 0xcd090c43), TOBN(0x561e8f13, 0xf12c3988),
+ TOBN(0x50490b6a, 0x904b7be4), TOBN(0x61690ce1, 0x0410737b),
+ TOBN(0x299e9a37, 0x0f009052), TOBN(0x258758f0, 0xf026092e),
+ TOBN(0x9fa255f3, 0xfdfcdc0f), TOBN(0xdbc9fb1f, 0xc0e1bcd2),
+ TOBN(0x35f9dd6e, 0x24651840), TOBN(0xdca45a84, 0xa5c59abc),
+ TOBN(0x103d396f, 0xecca4938), TOBN(0x4532da0a, 0xb97b3f29),
+ TOBN(0xc4135ea5, 0x1999a6bf), TOBN(0x3aa9505a, 0x5e6bf2ee),
+ TOBN(0xf77cef06, 0x3f5be093), TOBN(0x97d1a0f8, 0xa943152e),
+ TOBN(0x2cb0ebba, 0x2e1c21dd), TOBN(0xf41b29fc, 0x2c6797c4),
+ TOBN(0xc6e17321, 0xb300101f), TOBN(0x4422b0e9, 0xd0d79a89),
+ TOBN(0x49e4901c, 0x92f1bfc4), TOBN(0x06ab1f8f, 0xe1e10ed9),
+ TOBN(0x84d35577, 0xdb2926b8), TOBN(0xca349d39, 0x356e8ec2),
+ TOBN(0x70b63d32, 0x343bf1a9), TOBN(0x8fd3bd28, 0x37d1a6b1),
+ TOBN(0x0454879c, 0x316865b4), TOBN(0xee959ff6, 0xc458efa2),
+ TOBN(0x0461dcf8, 0x9706dc3f), TOBN(0x737db0e2, 0x164e4b2e),
+ TOBN(0x09262680, 0x2f8843c8), TOBN(0x54498bbc, 0x7745e6f6),
+ TOBN(0x359473fa, 0xa29e24af), TOBN(0xfcc3c454, 0x70aa87a1),
+ TOBN(0xfd2c4bf5, 0x00573ace), TOBN(0xb65b514e, 0x28dd1965),
+ TOBN(0xe46ae7cf, 0x2193e393), TOBN(0x60e9a4e1, 0xf5444d97),
+ TOBN(0xe7594e96, 0x00ff38ed), TOBN(0x43d84d2f, 0x0a0e0f02),
+ TOBN(0x8b6db141, 0xee398a21), TOBN(0xb88a56ae, 0xe3bcc5be),
+ TOBN(0x0a1aa52f, 0x373460ea), TOBN(0x20da1a56, 0x160bb19b),
+ TOBN(0xfb54999d, 0x65bf0384), TOBN(0x71a14d24, 0x5d5a180e),
+ TOBN(0xbc44db7b, 0x21737b04), TOBN(0xd84fcb18, 0x01dd8e92),
+ TOBN(0x80de937b, 0xfa44b479), TOBN(0x53505499, 0x5c98fd4f),
+ TOBN(0x1edb12ab, 0x28f08727), TOBN(0x4c58b582, 0xa5f3ef53),
+ TOBN(0xbfb236d8, 0x8327f246), TOBN(0xc3a3bfaa, 0x4d7df320),
+ TOBN(0xecd96c59, 0xb96024f2), TOBN(0xfc293a53, 0x7f4e0433),
+ TOBN(0x5341352b, 0x5acf6e10), TOBN(0xc50343fd, 0xafe652c3),
+ TOBN(0x4af3792d, 0x18577a7f), TOBN(0xe1a4c617, 0xaf16823d),
+ TOBN(0x9b26d0cd, 0x33425d0a), TOBN(0x306399ed, 0x9b7bc47f),
+ TOBN(0x2a792f33, 0x706bb20b), TOBN(0x31219614, 0x98111055),
+ TOBN(0x864ec064, 0x87f5d28b), TOBN(0x11392d91, 0x962277fd),
+ TOBN(0xb5aa7942, 0xbb6aed5f), TOBN(0x080094dc, 0x47e799d9),
+ TOBN(0x4afa588c, 0x208ba19b), TOBN(0xd3e7570f, 0x8512f284),
+ TOBN(0xcbae64e6, 0x02f5799a), TOBN(0xdeebe7ef, 0x514b9492),
+ TOBN(0x30300f98, 0xe5c298ff), TOBN(0x17f561be, 0x3678361f),
+ TOBN(0xf52ff312, 0x98cb9a16), TOBN(0x6233c3bc, 0x5562d490),
+ TOBN(0x7bfa15a1, 0x92e3a2cb), TOBN(0x961bcfd1, 0xe6365119),
+ TOBN(0x3bdd29bf, 0x2c8c53b1), TOBN(0x739704df, 0x822844ba),
+ TOBN(0x7dacfb58, 0x7e7b754b), TOBN(0x23360791, 0xa806c9b9),
+ TOBN(0xe7eb88c9, 0x23504452), TOBN(0x2983e996, 0x852c1783),
+ TOBN(0xdd4ae529, 0x958d881d), TOBN(0x026bae03, 0x262c7b3c),
+ TOBN(0x3a6f9193, 0x960b52d1), TOBN(0xd0980f90, 0x92696cfb),
+ TOBN(0x4c1f428c, 0xd5f30851), TOBN(0x94dfed27, 0x2a4f6630),
+ TOBN(0x4df53772, 0xfc5d48a4), TOBN(0xdd2d5a2f, 0x933260ce),
+ TOBN(0x574115bd, 0xd44cc7a5), TOBN(0x4ba6b20d, 0xbd12533a),
+ TOBN(0x30e93cb8, 0x243057c9), TOBN(0x794c486a, 0x14de320e),
+ TOBN(0xe925d4ce, 0xf21496e4), TOBN(0xf951d198, 0xec696331),
+ TOBN(0x9810e2de, 0x3e8d812f), TOBN(0xd0a47259, 0x389294ab),
+ TOBN(0x513ba2b5, 0x0e3bab66), TOBN(0x462caff5, 0xabad306f),
+ TOBN(0xe2dc6d59, 0xaf04c49e), TOBN(0x1aeb8750, 0xe0b84b0b),
+ TOBN(0xc034f12f, 0x2f7d0ca2), TOBN(0x6d2e8128, 0xe06acf2f),
+ TOBN(0x801f4f83, 0x21facc2f), TOBN(0xa1170c03, 0xf40ef607),
+ TOBN(0xfe0a1d4f, 0x7805a99c), TOBN(0xbde56a36, 0xcc26aba5),
+ TOBN(0x5b1629d0, 0x35531f40), TOBN(0xac212c2b, 0x9afa6108),
+ TOBN(0x30a06bf3, 0x15697be5), TOBN(0x6f0545dc, 0x2c63c7c1),
+ TOBN(0x5d8cb842, 0x7ccdadaf), TOBN(0xd52e379b, 0xac7015bb),
+ TOBN(0xc4f56147, 0xf462c23e), TOBN(0xd44a4298, 0x46bc24b0),
+ TOBN(0xbc73d23a, 0xe2856d4f), TOBN(0x61cedd8c, 0x0832bcdf),
+ TOBN(0x60953556, 0x99f241d7), TOBN(0xee4adbd7, 0x001a349d),
+ TOBN(0x0b35bf6a, 0xaa89e491), TOBN(0x7f0076f4, 0x136f7546),
+ TOBN(0xd19a18ba, 0x9264da3d), TOBN(0x6eb2d2cd, 0x62a7a28b),
+ TOBN(0xcdba941f, 0x8761c971), TOBN(0x1550518b, 0xa3be4a5d),
+ TOBN(0xd0e8e2f0, 0x57d0b70c), TOBN(0xeea8612e, 0xcd133ba3),
+ TOBN(0x814670f0, 0x44416aec), TOBN(0x424db6c3, 0x30775061),
+ TOBN(0xd96039d1, 0x16213fd1), TOBN(0xc61e7fa5, 0x18a3478f),
+ TOBN(0xa805bdcc, 0xcb0c5021), TOBN(0xbdd6f3a8, 0x0cc616dd),
+ TOBN(0x06009667, 0x5d97f7e2), TOBN(0x31db0fc1, 0xaf0bf4b6),
+ TOBN(0x23680ed4, 0x5491627a), TOBN(0xb99a3c66, 0x7d741fb1),
+ TOBN(0xe9bb5f55, 0x36b1ff92), TOBN(0x29738577, 0x512b388d),
+ TOBN(0xdb8a2ce7, 0x50fcf263), TOBN(0x385346d4, 0x6c4f7b47),
+ TOBN(0xbe86c5ef, 0x31631f9e), TOBN(0xbf91da21, 0x03a57a29),
+ TOBN(0xc3b1f796, 0x7b23f821), TOBN(0x0f7d00d2, 0x770db354),
+ TOBN(0x8ffc6c3b, 0xd8fe79da), TOBN(0xcc5e8c40, 0xd525c996),
+ TOBN(0x4640991d, 0xcfff632a), TOBN(0x64d97e8c, 0x67112528),
+ TOBN(0xc232d973, 0x02f1cd1e), TOBN(0xce87eacb, 0x1dd212a4),
+ TOBN(0x6e4c8c73, 0xe69802f7), TOBN(0x12ef0290, 0x1fffddbd),
+ TOBN(0x941ec74e, 0x1bcea6e2), TOBN(0xd0b54024, 0x3cb92cbb),
+ TOBN(0x809fb9d4, 0x7e8f9d05), TOBN(0x3bf16159, 0xf2992aae),
+ TOBN(0xad40f279, 0xf8a7a838), TOBN(0x11aea631, 0x05615660),
+ TOBN(0xbf52e6f1, 0xa01f6fa1), TOBN(0xef046995, 0x3dc2aec9),
+ TOBN(0x785dbec9, 0xd8080711), TOBN(0xe1aec60a, 0x9fdedf76),
+ TOBN(0xece797b5, 0xfa21c126), TOBN(0xc66e898f, 0x05e52732),
+ TOBN(0x39bb69c4, 0x08811fdb), TOBN(0x8bfe1ef8, 0x2fc7f082),
+ TOBN(0xc8e7a393, 0x174f4138), TOBN(0xfba8ad1d, 0xd58d1f98),
+ TOBN(0xbc21d0ce, 0xbfd2fd5b), TOBN(0x0b839a82, 0x6ee60d61),
+ TOBN(0xaacf7658, 0xafd22253), TOBN(0xb526bed8, 0xaae396b3),
+ TOBN(0xccc1bbc2, 0x38564464), TOBN(0x9e3ff947, 0x8c45bc73),
+ TOBN(0xcde9bca3, 0x58188a78), TOBN(0x138b8ee0, 0xd73bf8f7),
+ TOBN(0x5c7e234c, 0x4123c489), TOBN(0x66e69368, 0xfa643297),
+ TOBN(0x0629eeee, 0x39a15fa3), TOBN(0x95fab881, 0xa9e2a927),
+ TOBN(0xb2497007, 0xeafbb1e1), TOBN(0xd75c9ce6, 0xe75b7a93),
+ TOBN(0x3558352d, 0xefb68d78), TOBN(0xa2f26699, 0x223f6396),
+ TOBN(0xeb911ecf, 0xe469b17a), TOBN(0x62545779, 0xe72d3ec2),
+ TOBN(0x8ea47de7, 0x82cb113f), TOBN(0xebe4b086, 0x4e1fa98d),
+ TOBN(0xec2d5ed7, 0x8cdfedb1), TOBN(0xa535c077, 0xfe211a74),
+ TOBN(0x9678109b, 0x11d244c5), TOBN(0xf17c8bfb, 0xbe299a76),
+ TOBN(0xb651412e, 0xfb11fbc4), TOBN(0xea0b5482, 0x94ab3f65),
+ TOBN(0xd8dffd95, 0x0cf78243), TOBN(0x2e719e57, 0xce0361d4),
+ TOBN(0x9007f085, 0x304ddc5b), TOBN(0x095e8c6d, 0x4daba2ea),
+ TOBN(0x5a33cdb4, 0x3f9d28a9), TOBN(0x85b95cd8, 0xe2283003),
+ TOBN(0xbcd6c819, 0xb9744733), TOBN(0x29c5f538, 0xfc7f5783),
+ TOBN(0x6c49b2fa, 0xd59038e4), TOBN(0x68349cc1, 0x3bbe1018),
+ TOBN(0xcc490c1d, 0x21830ee5), TOBN(0x36f9c4ee, 0xe9bfa297),
+ TOBN(0x58fd7294, 0x48de1a94), TOBN(0xaadb13a8, 0x4e8f2cdc),
+ TOBN(0x515eaaa0, 0x81313dba), TOBN(0xc76bb468, 0xc2152dd8),
+ TOBN(0x357f8d75, 0xa653dbf8), TOBN(0xe4d8c4d1, 0xb14ac143),
+ TOBN(0xbdb8e675, 0xb055cb40), TOBN(0x898f8e7b, 0x977b5167),
+ TOBN(0xecc65651, 0xb82fb863), TOBN(0x56544814, 0x6d88f01f),
+ TOBN(0xb0928e95, 0x263a75a9), TOBN(0xcfb6836f, 0x1a22fcda),
+ TOBN(0x651d14db, 0x3f3bd37c), TOBN(0x1d3837fb, 0xb6ad4664),
+ TOBN(0x7c5fb538, 0xff4f94ab), TOBN(0x7243c712, 0x6d7fb8f2),
+ TOBN(0xef13d60c, 0xa85c5287), TOBN(0x18cfb7c7, 0x4bb8dd1b),
+ TOBN(0x82f9bfe6, 0x72908219), TOBN(0x35c4592b, 0x9d5144ab),
+ TOBN(0x52734f37, 0x9cf4b42f), TOBN(0x6bac55e7, 0x8c60ddc4),
+ TOBN(0xb5cd811e, 0x94dea0f6), TOBN(0x259ecae4, 0xe18cc1a3),
+ TOBN(0x6a0e836e, 0x15e660f8), TOBN(0x6c639ea6, 0x0e02bff2),
+ TOBN(0x8721b8cb, 0x7e1026fd), TOBN(0x9e73b50b, 0x63261942),
+ TOBN(0xb8c70974, 0x77f01da3), TOBN(0x1839e6a6, 0x8268f57f),
+ TOBN(0x571b9415, 0x5150b805), TOBN(0x1892389e, 0xf92c7097),
+ TOBN(0x8d69c18e, 0x4a084b95), TOBN(0x7014c512, 0xbe5b495c),
+ TOBN(0x4780db36, 0x1b07523c), TOBN(0x2f6219ce, 0x2c1c64fa),
+ TOBN(0xc38b81b0, 0x602c105a), TOBN(0xab4f4f20, 0x5dc8e360),
+ TOBN(0x20d3c982, 0xcf7d62d2), TOBN(0x1f36e29d, 0x23ba8150),
+ TOBN(0x48ae0bf0, 0x92763f9e), TOBN(0x7a527e6b, 0x1d3a7007),
+ TOBN(0xb4a89097, 0x581a85e3), TOBN(0x1f1a520f, 0xdc158be5),
+ TOBN(0xf98db37d, 0x167d726e), TOBN(0x8802786e, 0x1113e862)}
+ ,
+ {TOBN(0xefb2149e, 0x36f09ab0), TOBN(0x03f163ca, 0x4a10bb5b),
+ TOBN(0xd0297045, 0x06e20998), TOBN(0x56f0af00, 0x1b5a3bab),
+ TOBN(0x7af4cfec, 0x70880e0d), TOBN(0x7332a66f, 0xbe3d913f),
+ TOBN(0x32e6c84a, 0x7eceb4bd), TOBN(0xedc4a79a, 0x9c228f55),
+ TOBN(0xc37c7dd0, 0xc55c4496), TOBN(0xa6a96357, 0x25bbabd2),
+ TOBN(0x5b7e63f2, 0xadd7f363), TOBN(0x9dce3782, 0x2e73f1df),
+ TOBN(0xe1e5a16a, 0xb2b91f71), TOBN(0xe4489823, 0x5ba0163c),
+ TOBN(0xf2759c32, 0xf6e515ad), TOBN(0xa5e2f1f8, 0x8615eecf),
+ TOBN(0x74519be7, 0xabded551), TOBN(0x03d358b8, 0xc8b74410),
+ TOBN(0x4d00b10b, 0x0e10d9a9), TOBN(0x6392b0b1, 0x28da52b7),
+ TOBN(0x6744a298, 0x0b75c904), TOBN(0xc305b0ae, 0xa8f7f96c),
+ TOBN(0x042e421d, 0x182cf932), TOBN(0xf6fc5d50, 0x9e4636ca),
+ TOBN(0x795847c9, 0xd64cc78c), TOBN(0x6c50621b, 0x9b6cb27b),
+ TOBN(0x07099bf8, 0xdf8022ab), TOBN(0x48f862eb, 0xc04eda1d),
+ TOBN(0xd12732ed, 0xe1603c16), TOBN(0x19a80e0f, 0x5c9a9450),
+ TOBN(0xe2257f54, 0xb429b4fc), TOBN(0x66d3b2c6, 0x45460515),
+ TOBN(0x6ca4f87e, 0x822e37be), TOBN(0x73f237b4, 0x253bda4e),
+ TOBN(0xf747f3a2, 0x41190aeb), TOBN(0xf06fa36f, 0x804cf284),
+ TOBN(0x0a6bbb6e, 0xfc621c12), TOBN(0x5d624b64, 0x40b80ec6),
+ TOBN(0x4b072425, 0x7ba556f3), TOBN(0x7fa0c354, 0x3e2d20a8),
+ TOBN(0xe921fa31, 0xe3229d41), TOBN(0xa929c652, 0x94531bd4),
+ TOBN(0x84156027, 0xa6d38209), TOBN(0xf3d69f73, 0x6bdb97bd),
+ TOBN(0x8906d19a, 0x16833631), TOBN(0x68a34c2e, 0x03d51be3),
+ TOBN(0xcb59583b, 0x0e511cd8), TOBN(0x99ce6bfd, 0xfdc132a8),
+ TOBN(0x3facdaaa, 0xffcdb463), TOBN(0x658bbc1a, 0x34a38b08),
+ TOBN(0x12a801f8, 0xf1a9078d), TOBN(0x1567bcf9, 0x6ab855de),
+ TOBN(0xe08498e0, 0x3572359b), TOBN(0xcf0353e5, 0x8659e68b),
+ TOBN(0xbb86e9c8, 0x7d23807c), TOBN(0xbc08728d, 0x2198e8a2),
+ TOBN(0x8de2b7bc, 0x453cadd6), TOBN(0x203900a7, 0xbc0bc1f8),
+ TOBN(0xbcd86e47, 0xa6abd3af), TOBN(0x911cac12, 0x8502effb),
+ TOBN(0x2d550242, 0xec965469), TOBN(0x0e9f7692, 0x29e0017e),
+ TOBN(0x633f078f, 0x65979885), TOBN(0xfb87d449, 0x4cf751ef),
+ TOBN(0xe1790e4b, 0xfc25419a), TOBN(0x36467203, 0x4bff3cfd),
+ TOBN(0xc8db6386, 0x25b6e83f), TOBN(0x6cc69f23, 0x6cad6fd2),
+ TOBN(0x0219e45a, 0x6bc68bb9), TOBN(0xe43d79b6, 0x297f7334),
+ TOBN(0x7d445368, 0x465dc97c), TOBN(0x4b9eea32, 0x2a0b949a),
+ TOBN(0x1b96c6ba, 0x6102d021), TOBN(0xeaafac78, 0x2f4461ea),
+ TOBN(0xd4b85c41, 0xc49f19a8), TOBN(0x275c28e4, 0xcf538875),
+ TOBN(0x35451a9d, 0xdd2e54e0), TOBN(0x6991adb5, 0x0605618b),
+ TOBN(0x5b8b4bcd, 0x7b36cd24), TOBN(0x372a4f8c, 0x56f37216),
+ TOBN(0xc890bd73, 0xa6a5da60), TOBN(0x6f083da0, 0xdc4c9ff0),
+ TOBN(0xf4e14d94, 0xf0536e57), TOBN(0xf9ee1eda, 0xaaec8243),
+ TOBN(0x571241ec, 0x8bdcf8e7), TOBN(0xa5db8271, 0x0b041e26),
+ TOBN(0x9a0b9a99, 0xe3fff040), TOBN(0xcaaf21dd, 0x7c271202),
+ TOBN(0xb4e2b2e1, 0x4f0dd2e8), TOBN(0xe77e7c4f, 0x0a377ac7),
+ TOBN(0x69202c3f, 0x0d7a2198), TOBN(0xf759b7ff, 0x28200eb8),
+ TOBN(0xc87526ed, 0xdcfe314e), TOBN(0xeb84c524, 0x53d5cf99),
+ TOBN(0xb1b52ace, 0x515138b6), TOBN(0x5aa7ff8c, 0x23fca3f4),
+ TOBN(0xff0b13c3, 0xb9791a26), TOBN(0x960022da, 0xcdd58b16),
+ TOBN(0xdbd55c92, 0x57aad2de), TOBN(0x3baaaaa3, 0xf30fe619),
+ TOBN(0x9a4b2346, 0x0d881efd), TOBN(0x506416c0, 0x46325e2a),
+ TOBN(0x91381e76, 0x035c18d4), TOBN(0xb3bb68be, 0xf27817b0),
+ TOBN(0x15bfb8bf, 0x5116f937), TOBN(0x7c64a586, 0xc1268943),
+ TOBN(0x71e25cc3, 0x8419a2c8), TOBN(0x9fd6b0c4, 0x8335f463),
+ TOBN(0x4bf0ba3c, 0xe8ee0e0e), TOBN(0x6f6fba60, 0x298c21fa),
+ TOBN(0x57d57b39, 0xae66bee0), TOBN(0x292d5130, 0x22672544),
+ TOBN(0xf451105d, 0xbab093b3), TOBN(0x012f59b9, 0x02839986),
+ TOBN(0x8a915802, 0x3474a89c), TOBN(0x048c919c, 0x2de03e97),
+ TOBN(0xc476a2b5, 0x91071cd5), TOBN(0x791ed89a, 0x034970a5),
+ TOBN(0x89bd9042, 0xe1b7994b), TOBN(0x8eaf5179, 0xa1057ffd),
+ TOBN(0x6066e2a2, 0xd551ee10), TOBN(0x87a8f1d8, 0x727e09a6),
+ TOBN(0x00d08bab, 0x2c01148d), TOBN(0x6da8e4f1, 0x424f33fe),
+ TOBN(0x466d17f0, 0xcf9a4e71), TOBN(0xff502010, 0x3bf5cb19),
+ TOBN(0xdccf97d8, 0xd062ecc0), TOBN(0x80c0d9af, 0x81d80ac4),
+ TOBN(0xe87771d8, 0x033f2876), TOBN(0xb0186ec6, 0x7d5cc3db),
+ TOBN(0x58e8bb80, 0x3bc9bc1d), TOBN(0x4d1395cc, 0x6f6ef60e),
+ TOBN(0xa73c62d6, 0x186244a0), TOBN(0x918e5f23, 0x110a5b53),
+ TOBN(0xed4878ca, 0x741b7eab), TOBN(0x3038d71a, 0xdbe03e51),
+ TOBN(0x840204b7, 0xa93c3246), TOBN(0x21ab6069, 0xa0b9b4cd),
+ TOBN(0xf5fa6e2b, 0xb1d64218), TOBN(0x1de6ad0e, 0xf3d56191),
+ TOBN(0x570aaa88, 0xff1929c7), TOBN(0xc6df4c6b, 0x640e87b5),
+ TOBN(0xde8a74f2, 0xc65f0ccc), TOBN(0x8b972fd5, 0xe6f6cc01),
+ TOBN(0x3fff36b6, 0x0b846531), TOBN(0xba7e45e6, 0x10a5e475),
+ TOBN(0x84a1d10e, 0x4145b6c5), TOBN(0xf1f7f91a, 0x5e046d9d),
+ TOBN(0x0317a692, 0x44de90d7), TOBN(0x951a1d4a, 0xf199c15e),
+ TOBN(0x91f78046, 0xc9d73deb), TOBN(0x74c82828, 0xfab8224f),
+ TOBN(0xaa6778fc, 0xe7560b90), TOBN(0xb4073e61, 0xa7e824ce),
+ TOBN(0xff0d693c, 0xd642eba8), TOBN(0x7ce2e57a, 0x5dccef38),
+ TOBN(0x89c2c789, 0x1df1ad46), TOBN(0x83a06922, 0x098346fd),
+ TOBN(0x2d715d72, 0xda2fc177), TOBN(0x7b6dd71d, 0x85b6cf1d),
+ TOBN(0xc60a6d0a, 0x73fa9cb0), TOBN(0xedd3992e, 0x328bf5a9),
+ TOBN(0xc380ddd0, 0x832c8c82), TOBN(0xd182d410, 0xa2a0bf50),
+ TOBN(0x7d9d7438, 0xd9a528db), TOBN(0xe8b1a0e9, 0xcaf53994),
+ TOBN(0xddd6e5fe, 0x0e19987c), TOBN(0xacb8df03, 0x190b059d),
+ TOBN(0x53703a32, 0x8300129f), TOBN(0x1f637662, 0x68c43bfd),
+ TOBN(0xbcbd1913, 0x00e54051), TOBN(0x812fcc62, 0x7bf5a8c5),
+ TOBN(0x3f969d5f, 0x29fb85da), TOBN(0x72f4e00a, 0x694759e8),
+ TOBN(0x426b6e52, 0x790726b7), TOBN(0x617bbc87, 0x3bdbb209),
+ TOBN(0x511f8bb9, 0x97aee317), TOBN(0x812a4096, 0xe81536a8),
+ TOBN(0x137dfe59, 0x3ac09b9b), TOBN(0x0682238f, 0xba8c9a7a),
+ TOBN(0x7072ead6, 0xaeccb4bd), TOBN(0x6a34e9aa, 0x692ba633),
+ TOBN(0xc82eaec2, 0x6fff9d33), TOBN(0xfb753512, 0x1d4d2b62),
+ TOBN(0x1a0445ff, 0x1d7aadab), TOBN(0x65d38260, 0xd5f6a67c),
+ TOBN(0x6e62fb08, 0x91cfb26f), TOBN(0xef1e0fa5, 0x5c7d91d6),
+ TOBN(0x47e7c7ba, 0x33db72cd), TOBN(0x017cbc09, 0xfa7c74b2),
+ TOBN(0x3c931590, 0xf50a503c), TOBN(0xcac54f60, 0x616baa42),
+ TOBN(0x9b6cd380, 0xb2369f0f), TOBN(0x97d3a70d, 0x23c76151),
+ TOBN(0x5f9dd6fc, 0x9862a9c6), TOBN(0x044c4ab2, 0x12312f51),
+ TOBN(0x035ea0fd, 0x834a2ddc), TOBN(0x49e6b862, 0xcc7b826d),
+ TOBN(0xb03d6883, 0x62fce490), TOBN(0x62f2497a, 0xb37e36e9),
+ TOBN(0x04b005b6, 0xc6458293), TOBN(0x36bb5276, 0xe8d10af7),
+ TOBN(0xacf2dc13, 0x8ee617b8), TOBN(0x470d2d35, 0xb004b3d4),
+ TOBN(0x06790832, 0xfeeb1b77), TOBN(0x2bb75c39, 0x85657f9c),
+ TOBN(0xd70bd4ed, 0xc0f60004), TOBN(0xfe797ecc, 0x219b018b),
+ TOBN(0x9b5bec2a, 0x753aebcc), TOBN(0xdaf9f3dc, 0xc939eca5),
+ TOBN(0xd6bc6833, 0xd095ad09), TOBN(0x98abdd51, 0xdaa4d2fc),
+ TOBN(0xd9840a31, 0x8d168be5), TOBN(0xcf7c10e0, 0x2325a23c),
+ TOBN(0xa5c02aa0, 0x7e6ecfaf), TOBN(0x2462e7e6, 0xb5bfdf18),
+ TOBN(0xab2d8a8b, 0xa0cc3f12), TOBN(0x68dd485d, 0xbc672a29),
+ TOBN(0x72039752, 0x596f2cd3), TOBN(0x5d3eea67, 0xa0cf3d8d),
+ TOBN(0x810a1a81, 0xe6602671), TOBN(0x8f144a40, 0x14026c0c),
+ TOBN(0xbc753a6d, 0x76b50f85), TOBN(0xc4dc21e8, 0x645cd4a4),
+ TOBN(0xc5262dea, 0x521d0378), TOBN(0x802b8e0e, 0x05011c6f),
+ TOBN(0x1ba19cbb, 0x0b4c19ea), TOBN(0x21db64b5, 0xebf0aaec),
+ TOBN(0x1f394ee9, 0x70342f9d), TOBN(0x93a10aee, 0x1bc44a14),
+ TOBN(0xa7eed31b, 0x3efd0baa), TOBN(0x6e7c824e, 0x1d154e65),
+ TOBN(0xee23fa81, 0x9966e7ee), TOBN(0x64ec4aa8, 0x05b7920d),
+ TOBN(0x2d44462d, 0x2d90aad4), TOBN(0xf44dd195, 0xdf277ad5),
+ TOBN(0x8d6471f1, 0xbb46b6a1), TOBN(0x1e65d313, 0xfd885090),
+ TOBN(0x33a800f5, 0x13a977b4), TOBN(0xaca9d721, 0x0797e1ef),
+ TOBN(0x9a5a85a0, 0xfcff6a17), TOBN(0x9970a3f3, 0x1eca7cee),
+ TOBN(0xbb9f0d6b, 0xc9504be3), TOBN(0xe0c504be, 0xadd24ee2),
+ TOBN(0x7e09d956, 0x77fcc2f4), TOBN(0xef1a5227, 0x65bb5fc4),
+ TOBN(0x145d4fb1, 0x8b9286aa), TOBN(0x66fd0c5d, 0x6649028b),
+ TOBN(0x98857ceb, 0x1bf4581c), TOBN(0xe635e186, 0xaca7b166),
+ TOBN(0x278ddd22, 0x659722ac), TOBN(0xa0903c4c, 0x1db68007),
+ TOBN(0x366e4589, 0x48f21402), TOBN(0x31b49c14, 0xb96abda2),
+ TOBN(0x329c4b09, 0xe0403190), TOBN(0x97197ca3, 0xd29f43fe),
+ TOBN(0x8073dd1e, 0x274983d8), TOBN(0xda1a3bde, 0x55717c8f),
+ TOBN(0xfd3d4da2, 0x0361f9d1), TOBN(0x1332d081, 0x4c7de1ce),
+ TOBN(0x9b7ef7a3, 0xaa6d0e10), TOBN(0x17db2e73, 0xf54f1c4a),
+ TOBN(0xaf3dffae, 0x4cd35567), TOBN(0xaaa2f406, 0xe56f4e71),
+ TOBN(0x8966759e, 0x7ace3fc7), TOBN(0x9594eacf, 0x45a8d8c6),
+ TOBN(0x8de3bd8b, 0x91834e0e), TOBN(0xafe4ca53, 0x548c0421),
+ TOBN(0xfdd7e856, 0xe6ee81c6), TOBN(0x8f671beb, 0x6b891a3a),
+ TOBN(0xf7a58f2b, 0xfae63829), TOBN(0x9ab186fb, 0x9c11ac9f),
+ TOBN(0x8d6eb369, 0x10b5be76), TOBN(0x046b7739, 0xfb040bcd),
+ TOBN(0xccb4529f, 0xcb73de88), TOBN(0x1df0fefc, 0xcf26be03),
+ TOBN(0xad7757a6, 0xbcfcd027), TOBN(0xa8786c75, 0xbb3165ca),
+ TOBN(0xe9db1e34, 0x7e99a4d9), TOBN(0x99ee86df, 0xb06c504b),
+ TOBN(0x5b7c2ddd, 0xc15c9f0a), TOBN(0xdf87a734, 0x4295989e),
+ TOBN(0x59ece47c, 0x03d08fda), TOBN(0xb074d3dd, 0xad5fc702),
+ TOBN(0x20407903, 0x51a03776), TOBN(0x2bb1f77b, 0x2a608007),
+ TOBN(0x25c58f4f, 0xe1153185), TOBN(0xe6df62f6, 0x766e6447),
+ TOBN(0xefb3d1be, 0xed51275a), TOBN(0x5de47dc7, 0x2f0f483f),
+ TOBN(0x7932d98e, 0x97c2bedf), TOBN(0xd5c11927, 0x0219f8a1),
+ TOBN(0x9d751200, 0xa73a294e), TOBN(0x5f88434a, 0x9dc20172),
+ TOBN(0xd28d9fd3, 0xa26f506a), TOBN(0xa890cd31, 0x9d1dcd48),
+ TOBN(0x0aebaec1, 0x70f4d3b4), TOBN(0xfd1a1369, 0x0ffc8d00),
+ TOBN(0xb9d9c240, 0x57d57838), TOBN(0x45929d26, 0x68bac361),
+ TOBN(0x5a2cd060, 0x25b15ca6), TOBN(0x4b3c83e1, 0x6e474446),
+ TOBN(0x1aac7578, 0xee1e5134), TOBN(0xa418f5d6, 0xc91e2f41),
+ TOBN(0x6936fc8a, 0x213ed68b), TOBN(0x860ae7ed, 0x510a5224),
+ TOBN(0x63660335, 0xdef09b53), TOBN(0x641b2897, 0xcd79c98d),
+ TOBN(0x29bd38e1, 0x01110f35), TOBN(0x79c26f42, 0x648b1937),
+ TOBN(0x64dae519, 0x9d9164f4), TOBN(0xd85a2310, 0x0265c273),
+ TOBN(0x7173dd5d, 0x4b07e2b1), TOBN(0xd144c4cb, 0x8d9ea221),
+ TOBN(0xe8b04ea4, 0x1105ab14), TOBN(0x92dda542, 0xfe80d8f1),
+ TOBN(0xe9982fa8, 0xcf03dce6), TOBN(0x8b5ea965, 0x1a22cffc),
+ TOBN(0xf7f4ea7f, 0x3fad88c4), TOBN(0x62db773e, 0x6a5ba95c),
+ TOBN(0xd20f02fb, 0x93f24567), TOBN(0xfd46c69a, 0x315257ca),
+ TOBN(0x0ac74cc7, 0x8bcab987), TOBN(0x46f31c01, 0x5ceca2f5),
+ TOBN(0x40aedb59, 0x888b219e), TOBN(0xe50ecc37, 0xe1fccd02),
+ TOBN(0x1bcd9dad, 0x911f816c), TOBN(0x583cc1ec, 0x8db9b00c),
+ TOBN(0xf3cd2e66, 0xa483bf11), TOBN(0xfa08a6f5, 0xb1b2c169),
+ TOBN(0xf375e245, 0x4be9fa28), TOBN(0x99a7ffec, 0x5b6d011f),
+ TOBN(0x6a3ebddb, 0xc4ae62da), TOBN(0x6cea00ae, 0x374aef5d),
+ TOBN(0xab5fb98d, 0x9d4d05bc), TOBN(0x7cba1423, 0xd560f252),
+ TOBN(0x49b2cc21, 0x208490de), TOBN(0x1ca66ec3, 0xbcfb2879),
+ TOBN(0x7f1166b7, 0x1b6fb16f), TOBN(0xfff63e08, 0x65fe5db3),
+ TOBN(0xb8345abe, 0x8b2610be), TOBN(0xb732ed80, 0x39de3df4),
+ TOBN(0x0e24ed50, 0x211c32b4), TOBN(0xd10d8a69, 0x848ff27d),
+ TOBN(0xc1074398, 0xed4de248), TOBN(0xd7cedace, 0x10488927),
+ TOBN(0xa4aa6bf8, 0x85673e13), TOBN(0xb46bae91, 0x6daf30af),
+ TOBN(0x07088472, 0xfcef7ad8), TOBN(0x61151608, 0xd4b35e97),
+ TOBN(0xbcfe8f26, 0xdde29986), TOBN(0xeb84c4c7, 0xd5a34c79),
+ TOBN(0xc1eec55c, 0x164e1214), TOBN(0x891be86d, 0xa147bb03),
+ TOBN(0x9fab4d10, 0x0ba96835), TOBN(0xbf01e9b8, 0xa5c1ae9f),
+ TOBN(0x6b4de139, 0xb186ebc0), TOBN(0xd5c74c26, 0x85b91bca),
+ TOBN(0x5086a99c, 0xc2d93854), TOBN(0xeed62a7b, 0xa7a9dfbc),
+ TOBN(0x8778ed6f, 0x76b7618a), TOBN(0xbff750a5, 0x03b66062),
+ TOBN(0x4cb7be22, 0xb65186db), TOBN(0x369dfbf0, 0xcc3a6d13),
+ TOBN(0xc7dab26c, 0x7191a321), TOBN(0x9edac3f9, 0x40ed718e),
+ TOBN(0xbc142b36, 0xd0cfd183), TOBN(0xc8af82f6, 0x7c991693),
+ TOBN(0xb3d1e4d8, 0x97ce0b2a), TOBN(0xe6d7c87f, 0xc3a55cdf),
+ TOBN(0x35846b95, 0x68b81afe), TOBN(0x018d12af, 0xd3c239d8),
+ TOBN(0x2b2c6208, 0x01206e15), TOBN(0xe0e42453, 0xa3b882c6),
+ TOBN(0x854470a3, 0xa50162d5), TOBN(0x08157478, 0x7017a62a),
+ TOBN(0x18bd3fb4, 0x820357c7), TOBN(0x992039ae, 0x6f1458ad),
+ TOBN(0x9a1df3c5, 0x25b44aa1), TOBN(0x2d780357, 0xed3d5281),
+ TOBN(0x58cf7e4d, 0xc77ad4d4), TOBN(0xd49a7998, 0xf9df4fc4),
+ TOBN(0x4465a8b5, 0x1d71205e), TOBN(0xa0ee0ea6, 0x649254aa),
+ TOBN(0x4b5eeecf, 0xab7bd771), TOBN(0x6c873073, 0x35c262b9),
+ TOBN(0xdc5bd648, 0x3c9d61e7), TOBN(0x233d6d54, 0x321460d2),
+ TOBN(0xd20c5626, 0xfc195bcc), TOBN(0x25445958, 0x04d78b63),
+ TOBN(0xe03fcb3d, 0x17ec8ef3), TOBN(0x54b690d1, 0x46b8f781),
+ TOBN(0x82fa2c8a, 0x21230646), TOBN(0xf51aabb9, 0x084f418c),
+ TOBN(0xff4fbec1, 0x1a30ba43), TOBN(0x6a5acf73, 0x743c9df7),
+ TOBN(0x1da2b357, 0xd635b4d5), TOBN(0xc3de68dd, 0xecd5c1da),
+ TOBN(0xa689080b, 0xd61af0dd), TOBN(0xdea5938a, 0xd665bf99),
+ TOBN(0x0231d71a, 0xfe637294), TOBN(0x01968aa6, 0xa5a81cd8),
+ TOBN(0x11252d50, 0x048e63b5), TOBN(0xc446bc52, 0x6ca007e9),
+ TOBN(0xef8c50a6, 0x96d6134b), TOBN(0x9361fbf5, 0x9e09a05c),
+ TOBN(0xf17f85a6, 0xdca3291a), TOBN(0xb178d548, 0xff251a21),
+ TOBN(0x87f6374b, 0xa4df3915), TOBN(0x566ce1bf, 0x2fd5d608),
+ TOBN(0x425cba4d, 0x7de35102), TOBN(0x6b745f8f, 0x58c5d5e2),
+ TOBN(0x88402af6, 0x63122edf), TOBN(0x3190f9ed, 0x3b989a89),
+ TOBN(0x4ad3d387, 0xebba3156), TOBN(0xef385ad9, 0xc7c469a5),
+ TOBN(0xb08281de, 0x3f642c29), TOBN(0x20be0888, 0x910ffb88),
+ TOBN(0xf353dd4a, 0xd5292546), TOBN(0x3f1627de, 0x8377a262),
+ TOBN(0xa5faa013, 0xeefcd638), TOBN(0x8f3bf626, 0x74cc77c3),
+ TOBN(0x32618f65, 0xa348f55e), TOBN(0x5787c0dc, 0x9fefeb9e),
+ TOBN(0xf1673aa2, 0xd9a23e44), TOBN(0x88dfa993, 0x4e10690d),
+ TOBN(0x1ced1b36, 0x2bf91108), TOBN(0x9193ceca, 0x3af48649),
+ TOBN(0xfb34327d, 0x2d738fc5), TOBN(0x6697b037, 0x975fee6c),
+ TOBN(0x2f485da0, 0xc04079a5), TOBN(0x2cdf5735, 0x2feaa1ac),
+ TOBN(0x76944420, 0xbd55659e), TOBN(0x7973e32b, 0x4376090c),
+ TOBN(0x86bb4fe1, 0x163b591a), TOBN(0x10441aed, 0xc196f0ca),
+ TOBN(0x3b431f4a, 0x045ad915), TOBN(0x6c11b437, 0xa4afacb1),
+ TOBN(0x30b0c7db, 0x71fdbbd8), TOBN(0xb642931f, 0xeda65acd),
+ TOBN(0x4baae6e8, 0x9c92b235), TOBN(0xa73bbd0e, 0x6b3993a1),
+ TOBN(0xd06d60ec, 0x693dd031), TOBN(0x03cab91b, 0x7156881c),
+ TOBN(0xd615862f, 0x1db3574b), TOBN(0x485b0185, 0x64bb061a),
+ TOBN(0x27434988, 0xa0181e06), TOBN(0x2cd61ad4, 0xc1c0c757),
+ TOBN(0x3effed5a, 0x2ff9f403), TOBN(0x8dc98d8b, 0x62239029),
+ TOBN(0x2206021e, 0x1f17b70d), TOBN(0xafbec0ca, 0xbf510015),
+ TOBN(0x9fed7164, 0x80130dfa), TOBN(0x306dc2b5, 0x8a02dcf5),
+ TOBN(0x48f06620, 0xfeb10fc0), TOBN(0x78d1e1d5, 0x5a57cf51),
+ TOBN(0xadef8c5a, 0x192ef710), TOBN(0x88afbd4b, 0x3b7431f9),
+ TOBN(0x7e1f7407, 0x64250c9e), TOBN(0x6e31318d, 0xb58bec07),
+ TOBN(0xfd4fc4b8, 0x24f89b4e), TOBN(0x65a5dd88, 0x48c36a2a),
+ TOBN(0x4f1eccff, 0xf024baa7), TOBN(0x22a21cf2, 0xcba94650),
+ TOBN(0x95d29dee, 0x42a554f7), TOBN(0x828983a5, 0x002ec4ba),
+ TOBN(0x8112a1f7, 0x8badb73d), TOBN(0x79ea8897, 0xa27c1839),
+ TOBN(0x8969a5a7, 0xd065fd83), TOBN(0xf49af791, 0xb262a0bc),
+ TOBN(0xfcdea8b6, 0xaf2b5127), TOBN(0x10e913e1, 0x564c2dbc),
+ TOBN(0x51239d14, 0xbc21ef51), TOBN(0xe51c3ceb, 0x4ce57292),
+ TOBN(0x795ff068, 0x47bbcc3b), TOBN(0x86b46e1e, 0xbd7e11e6),
+ TOBN(0x0ea6ba23, 0x80041ef4), TOBN(0xd72fe505, 0x6262342e),
+ TOBN(0x8abc6dfd, 0x31d294d4), TOBN(0xbbe017a2, 0x1278c2c9),
+ TOBN(0xb1fcfa09, 0xb389328a), TOBN(0x322fbc62, 0xd01771b5),
+ TOBN(0x04c0d063, 0x60b045bf), TOBN(0xdb652edc, 0x10e52d01),
+ TOBN(0x50ef932c, 0x03ec6627), TOBN(0xde1b3b2d, 0xc1ee50e3),
+ TOBN(0x5ab7bdc5, 0xdc37a90d), TOBN(0xfea67213, 0x31e33a96),
+ TOBN(0x6482b5cb, 0x4f2999aa), TOBN(0x38476cc6, 0xb8cbf0dd),
+ TOBN(0x93ebfacb, 0x173405bb), TOBN(0x15cdafe7, 0xe52369ec),
+ TOBN(0xd42d5ba4, 0xd935b7db), TOBN(0x648b6004, 0x1c99a4cd),
+ TOBN(0x785101bd, 0xa3b5545b), TOBN(0x4bf2c38a, 0x9dd67faf),
+ TOBN(0xb1aadc63, 0x4442449c), TOBN(0xe0e9921a, 0x33ad4fb8),
+ TOBN(0x5c552313, 0xaa686d82), TOBN(0xdee635fa, 0x465d866c),
+ TOBN(0xbc3c224a, 0x18ee6e8a), TOBN(0xeed748a6, 0xed42e02f),
+ TOBN(0xe70f930a, 0xd474cd08), TOBN(0x774ea6ec, 0xfff24adf),
+ TOBN(0x03e2de1c, 0xf3480d4a), TOBN(0xf0d8edc7, 0xbc8acf1a),
+ TOBN(0xf23e3303, 0x68295a9c), TOBN(0xfadd5f68, 0xc546a97d),
+ TOBN(0x895597ad, 0x96f8acb1), TOBN(0xbddd49d5, 0x671bdae2),
+ TOBN(0x16fcd528, 0x21dd43f4), TOBN(0xa5a45412, 0x6619141a)}
+ ,
+ {TOBN(0x8ce9b6bf, 0xc360e25a), TOBN(0xe6425195, 0x075a1a78),
+ TOBN(0x9dc756a8, 0x481732f4), TOBN(0x83c0440f, 0x5432b57a),
+ TOBN(0xc670b3f1, 0xd720281f), TOBN(0x2205910e, 0xd135e051),
+ TOBN(0xded14b0e, 0xdb052be7), TOBN(0x697b3d27, 0xc568ea39),
+ TOBN(0x2e599b9a, 0xfb3ff9ed), TOBN(0x28c2e0ab, 0x17f6515c),
+ TOBN(0x1cbee4fd, 0x474da449), TOBN(0x071279a4, 0x4f364452),
+ TOBN(0x97abff66, 0x01fbe855), TOBN(0x3ee394e8, 0x5fda51c4),
+ TOBN(0x190385f6, 0x67597c0b), TOBN(0x6e9fccc6, 0xa27ee34b),
+ TOBN(0x0b89de93, 0x14092ebb), TOBN(0xf17256bd, 0x428e240c),
+ TOBN(0xcf89a7f3, 0x93d2f064), TOBN(0x4f57841e, 0xe1ed3b14),
+ TOBN(0x4ee14405, 0xe708d855), TOBN(0x856aae72, 0x03f1c3d0),
+ TOBN(0xc8e5424f, 0xbdd7eed5), TOBN(0x3333e4ef, 0x73ab4270),
+ TOBN(0x3bc77ade, 0xdda492f8), TOBN(0xc11a3aea, 0x78297205),
+ TOBN(0x5e89a3e7, 0x34931b4c), TOBN(0x17512e2e, 0x9f5694bb),
+ TOBN(0x5dc349f3, 0x177bf8b6), TOBN(0x232ea4ba, 0x08c7ff3e),
+ TOBN(0x9c4f9d16, 0xf511145d), TOBN(0xccf109a3, 0x33b379c3),
+ TOBN(0xe75e7a88, 0xa1f25897), TOBN(0x7ac6961f, 0xa1b5d4d8),
+ TOBN(0xe3e10773, 0x08f3ed5c), TOBN(0x208a54ec, 0x0a892dfb),
+ TOBN(0xbe826e19, 0x78660710), TOBN(0x0cf70a97, 0x237df2c8),
+ TOBN(0x418a7340, 0xed704da5), TOBN(0xa3eeb9a9, 0x08ca33fd),
+ TOBN(0x49d96233, 0x169bca96), TOBN(0x04d286d4, 0x2da6aafb),
+ TOBN(0xc09606ec, 0xa0c2fa94), TOBN(0x8869d0d5, 0x23ff0fb3),
+ TOBN(0xa99937e5, 0xd0150d65), TOBN(0xa92e2503, 0x240c14c9),
+ TOBN(0x656bf945, 0x108e2d49), TOBN(0x152a733a, 0xa2f59e2b),
+ TOBN(0xb4323d58, 0x8434a920), TOBN(0xc0af8e93, 0x622103c5),
+ TOBN(0x667518ef, 0x938dbf9a), TOBN(0xa1843073, 0x83a9cdf2),
+ TOBN(0x350a94aa, 0x5447ab80), TOBN(0xe5e5a325, 0xc75a3d61),
+ TOBN(0x74ba507f, 0x68411a9e), TOBN(0x10581fc1, 0x594f70c5),
+ TOBN(0x60e28570, 0x80eb24a9), TOBN(0x7bedfb4d, 0x488e0cfd),
+ TOBN(0x721ebbd7, 0xc259cdb8), TOBN(0x0b0da855, 0xbc6390a9),
+ TOBN(0x2b4d04db, 0xde314c70), TOBN(0xcdbf1fbc, 0x6c32e846),
+ TOBN(0x33833eab, 0xb162fc9e), TOBN(0x9939b48b, 0xb0dd3ab7),
+ TOBN(0x5aaa98a7, 0xcb0c9c8c), TOBN(0x75105f30, 0x81c4375c),
+ TOBN(0xceee5057, 0x5ef1c90f), TOBN(0xb31e065f, 0xc23a17bf),
+ TOBN(0x5364d275, 0xd4b6d45a), TOBN(0xd363f3ad, 0x62ec8996),
+ TOBN(0xb5d21239, 0x4391c65b), TOBN(0x84564765, 0xebb41b47),
+ TOBN(0x20d18ecc, 0x37107c78), TOBN(0xacff3b6b, 0x570c2a66),
+ TOBN(0x22f975d9, 0x9bd0d845), TOBN(0xef0a0c46, 0xba178fa0),
+ TOBN(0x1a419651, 0x76b6028e), TOBN(0xc49ec674, 0x248612d4),
+ TOBN(0x5b6ac4f2, 0x7338af55), TOBN(0x06145e62, 0x7bee5a36),
+ TOBN(0x33e95d07, 0xe75746b5), TOBN(0x1c1e1f6d, 0xc40c78be),
+ TOBN(0x967833ef, 0x222ff8e2), TOBN(0x4bedcf6a, 0xb49180ad),
+ TOBN(0x6b37e9c1, 0x3d7a4c8a), TOBN(0x2748887c, 0x6ddfe760),
+ TOBN(0xf7055123, 0xaa3a5bbc), TOBN(0x954ff225, 0x7bbb8e74),
+ TOBN(0xc42b8ab1, 0x97c3dfb9), TOBN(0x55a549b0, 0xcf168154),
+ TOBN(0xad6748e7, 0xc1b50692), TOBN(0x2775780f, 0x6fc5cbcb),
+ TOBN(0x4eab80b8, 0xe1c9d7c8), TOBN(0x8c69dae1, 0x3fdbcd56),
+ TOBN(0x47e6b4fb, 0x9969eace), TOBN(0x002f1085, 0xa705cb5a),
+ TOBN(0x4e23ca44, 0x6d3fea55), TOBN(0xb4ae9c86, 0xf4810568),
+ TOBN(0x47bfb91b, 0x2a62f27d), TOBN(0x60deb4c9, 0xd9bac28c),
+ TOBN(0xa892d894, 0x7de6c34c), TOBN(0x4ee68259, 0x4494587d),
+ TOBN(0x914ee14e, 0x1a3f8a5b), TOBN(0xbb113eaa, 0x28700385),
+ TOBN(0x81ca03b9, 0x2115b4c9), TOBN(0x7c163d38, 0x8908cad1),
+ TOBN(0xc912a118, 0xaa18179a), TOBN(0xe09ed750, 0x886e3081),
+ TOBN(0xa676e3fa, 0x26f516ca), TOBN(0x753cacf7, 0x8e732f91),
+ TOBN(0x51592aea, 0x833da8b4), TOBN(0xc626f42f, 0x4cbea8aa),
+ TOBN(0xef9dc899, 0xa7b56eaf), TOBN(0x00c0e52c, 0x34ef7316),
+ TOBN(0x5b1e4e24, 0xfe818a86), TOBN(0x9d31e20d, 0xc538be47),
+ TOBN(0x22eb932d, 0x3ed68974), TOBN(0xe44bbc08, 0x7c4e87c4),
+ TOBN(0x4121086e, 0x0dde9aef), TOBN(0x8e6b9cff, 0x134f4345),
+ TOBN(0x96892c1f, 0x711b0eb9), TOBN(0xb905f2c8, 0x780ab954),
+ TOBN(0xace26309, 0xa20792db), TOBN(0xec8ac9b3, 0x0684e126),
+ TOBN(0x486ad8b6, 0xb40a2447), TOBN(0x60121fc1, 0x9fe3fb24),
+ TOBN(0x5626fccf, 0x1a8e3b3f), TOBN(0x4e568622, 0x6ad1f394),
+ TOBN(0xda7aae0d, 0x196aa5a1), TOBN(0xe0df8c77, 0x1041b5fb),
+ TOBN(0x451465d9, 0x26b318b7), TOBN(0xc29b6e55, 0x7ab136e9),
+ TOBN(0x2c2ab48b, 0x71148463), TOBN(0xb5738de3, 0x64454a76),
+ TOBN(0x54ccf9a0, 0x5a03abe4), TOBN(0x377c0296, 0x0427d58e),
+ TOBN(0x73f5f0b9, 0x2bb39c1f), TOBN(0x14373f2c, 0xe608d8c5),
+ TOBN(0xdcbfd314, 0x00fbb805), TOBN(0xdf18fb20, 0x83afdcfb),
+ TOBN(0x81a57f42, 0x42b3523f), TOBN(0xe958532d, 0x87f650fb),
+ TOBN(0xaa8dc8b6, 0x8b0a7d7c), TOBN(0x1b75dfb7, 0x150166be),
+ TOBN(0x90e4f7c9, 0x2d7d1413), TOBN(0x67e2d6b5, 0x9834f597),
+ TOBN(0x4fd4f4f9, 0xa808c3e8), TOBN(0xaf8237e0, 0xd5281ec1),
+ TOBN(0x25ab5fdc, 0x84687cee), TOBN(0xc5ded6b1, 0xa5b26c09),
+ TOBN(0x8e4a5aec, 0xc8ea7650), TOBN(0x23b73e5c, 0x14cc417f),
+ TOBN(0x2bfb4318, 0x3037bf52), TOBN(0xb61e6db5, 0x78c725d7),
+ TOBN(0x8efd4060, 0xbbb3e5d7), TOBN(0x2e014701, 0xdbac488e),
+ TOBN(0xac75cf9a, 0x360aa449), TOBN(0xb70cfd05, 0x79634d08),
+ TOBN(0xa591536d, 0xfffb15ef), TOBN(0xb2c37582, 0xd07c106c),
+ TOBN(0xb4293fdc, 0xf50225f9), TOBN(0xc52e175c, 0xb0e12b03),
+ TOBN(0xf649c3ba, 0xd0a8bf64), TOBN(0x745a8fef, 0xeb8ae3c6),
+ TOBN(0x30d7e5a3, 0x58321bc3), TOBN(0xb1732be7, 0x0bc4df48),
+ TOBN(0x1f217993, 0xe9ea5058), TOBN(0xf7a71cde, 0x3e4fd745),
+ TOBN(0x86cc533e, 0x894c5bbb), TOBN(0x6915c7d9, 0x69d83082),
+ TOBN(0xa6aa2d05, 0x5815c244), TOBN(0xaeeee592, 0x49b22ce5),
+ TOBN(0x89e39d13, 0x78135486), TOBN(0x3a275c1f, 0x16b76f2f),
+ TOBN(0xdb6bcc1b, 0xe036e8f5), TOBN(0x4df69b21, 0x5e4709f5),
+ TOBN(0xa188b250, 0x2d0f39aa), TOBN(0x622118bb, 0x15a85947),
+ TOBN(0x2ebf520f, 0xfde0f4fa), TOBN(0xa40e9f29, 0x4860e539),
+ TOBN(0x7b6a51eb, 0x22b57f0f), TOBN(0x849a33b9, 0x7e80644a),
+ TOBN(0x50e5d16f, 0x1cf095fe), TOBN(0xd754b54e, 0xec55f002),
+ TOBN(0x5cfbbb22, 0x236f4a98), TOBN(0x0b0c59e9, 0x066800bb),
+ TOBN(0x4ac69a8f, 0x5a9a7774), TOBN(0x2b33f804, 0xd6bec948),
+ TOBN(0xb3729295, 0x32e6c466), TOBN(0x68956d0f, 0x4e599c73),
+ TOBN(0xa47a249f, 0x155c31cc), TOBN(0x24d80f0d, 0xe1ce284e),
+ TOBN(0xcd821dfb, 0x988baf01), TOBN(0xe6331a7d, 0xdbb16647),
+ TOBN(0x1eb8ad33, 0x094cb960), TOBN(0x593cca38, 0xc91bbca5),
+ TOBN(0x384aac8d, 0x26567456), TOBN(0x40fa0309, 0xc04b6490),
+ TOBN(0x97834cd6, 0xdab6c8f6), TOBN(0x68a7318d, 0x3f91e55f),
+ TOBN(0xa00fd04e, 0xfc4d3157), TOBN(0xb56f8ab2, 0x2bf3bdea),
+ TOBN(0x014f5648, 0x4fa57172), TOBN(0x948c5860, 0x450abdb3),
+ TOBN(0x342b5df0, 0x0ebd4f08), TOBN(0x3e5168cd, 0x0e82938e),
+ TOBN(0x7aedc1ce, 0xb0df5dd0), TOBN(0x6bbbc6d9, 0xe5732516),
+ TOBN(0xc7bfd486, 0x605daaa6), TOBN(0x46fd72b7, 0xbb9a6c9e),
+ TOBN(0xe4847fb1, 0xa124fb89), TOBN(0x75959cbd, 0xa2d8ffbc),
+ TOBN(0x42579f65, 0xc8a588ee), TOBN(0x368c92e6, 0xb80b499d),
+ TOBN(0xea4ef6cd, 0x999a5df1), TOBN(0xaa73bb7f, 0x936fe604),
+ TOBN(0xf347a70d, 0x6457d188), TOBN(0x86eda86b, 0x8b7a388b),
+ TOBN(0xb7cdff06, 0x0ccd6013), TOBN(0xbeb1b6c7, 0xd0053fb2),
+ TOBN(0x0b022387, 0x99240a9f), TOBN(0x1bbb384f, 0x776189b2),
+ TOBN(0x8695e71e, 0x9066193a), TOBN(0x2eb50097, 0x06ffac7e),
+ TOBN(0x0654a9c0, 0x4a7d2caa), TOBN(0x6f3fb3d1, 0xa5aaa290),
+ TOBN(0x835db041, 0xff476e8f), TOBN(0x540b8b0b, 0xc42295e4),
+ TOBN(0xa5c73ac9, 0x05e214f5), TOBN(0x9a74075a, 0x56a0b638),
+ TOBN(0x2e4b1090, 0xce9e680b), TOBN(0x57a5b479, 0x6b8d9afa),
+ TOBN(0x0dca48e7, 0x26bfe65c), TOBN(0x097e391c, 0x7290c307),
+ TOBN(0x683c462e, 0x6669e72e), TOBN(0xf505be1e, 0x062559ac),
+ TOBN(0x5fbe3ea1, 0xe3a3035a), TOBN(0x6431ebf6, 0x9cd50da8),
+ TOBN(0xfd169d5c, 0x1f6407f2), TOBN(0x8d838a95, 0x60fce6b8),
+ TOBN(0x2a2bfa7f, 0x650006f0), TOBN(0xdfd7dad3, 0x50c0fbb2),
+ TOBN(0x92452495, 0xccf9ad96), TOBN(0x183bf494, 0xd95635f9),
+ TOBN(0x02d5df43, 0x4a7bd989), TOBN(0x505385cc, 0xa5431095),
+ TOBN(0xdd98e67d, 0xfd43f53e), TOBN(0xd61e1a6c, 0x500c34a9),
+ TOBN(0x5a4b46c6, 0x4a8a3d62), TOBN(0x8469c4d0, 0x247743d2),
+ TOBN(0x2bb3a13d, 0x88f7e433), TOBN(0x62b23a10, 0x01be5849),
+ TOBN(0xe83596b4, 0xa63d1a4c), TOBN(0x454e7fea, 0x7d183f3e),
+ TOBN(0x643fce61, 0x17afb01c), TOBN(0x4e65e5e6, 0x1c4c3638),
+ TOBN(0x41d85ea1, 0xef74c45b), TOBN(0x2cfbfa66, 0xae328506),
+ TOBN(0x98b078f5, 0x3ada7da9), TOBN(0xd985fe37, 0xec752fbb),
+ TOBN(0xeece68fe, 0x5a0148b4), TOBN(0x6f9a55c7, 0x2d78136d),
+ TOBN(0x232dccc4, 0xd2b729ce), TOBN(0xa27e0dfd, 0x90aafbc4),
+ TOBN(0x96474452, 0x12b4603e), TOBN(0xa876c551, 0x6b706d14),
+ TOBN(0xdf145fcf, 0x69a9d412), TOBN(0xe2ab75b7, 0x2d479c34),
+ TOBN(0x12df9a76, 0x1a23ff97), TOBN(0xc6138992, 0x5d359d10),
+ TOBN(0x6e51c7ae, 0xfa835f22), TOBN(0x69a79cb1, 0xc0fcc4d9),
+ TOBN(0xf57f350d, 0x594cc7e1), TOBN(0x3079ca63, 0x3350ab79),
+ TOBN(0x226fb614, 0x9aff594a), TOBN(0x35afec02, 0x6d59a62b),
+ TOBN(0x9bee46f4, 0x06ed2c6e), TOBN(0x58da1735, 0x7d939a57),
+ TOBN(0x44c50402, 0x8fd1797e), TOBN(0xd8853e7c, 0x5ccea6ca),
+ TOBN(0x4065508d, 0xa35fcd5f), TOBN(0x8965df8c, 0x495ccaeb),
+ TOBN(0x0f2da850, 0x12e1a962), TOBN(0xee471b94, 0xc1cf1cc4),
+ TOBN(0xcef19bc8, 0x0a08fb75), TOBN(0x704958f5, 0x81de3591),
+ TOBN(0x2867f8b2, 0x3aef4f88), TOBN(0x8d749384, 0xea9f9a5f),
+ TOBN(0x1b385537, 0x8c9049f4), TOBN(0x5be948f3, 0x7b92d8b6),
+ TOBN(0xd96f725d, 0xb6e2bd6b), TOBN(0x37a222bc, 0x958c454d),
+ TOBN(0xe7c61abb, 0x8809bf61), TOBN(0x46f07fbc, 0x1346f18d),
+ TOBN(0xfb567a7a, 0xe87c0d1c), TOBN(0x84a461c8, 0x7ef3d07a),
+ TOBN(0x0a5adce6, 0xd9278d98), TOBN(0x24d94813, 0x9dfc73e1),
+ TOBN(0x4f3528b6, 0x054321c3), TOBN(0x2e03fdde, 0x692ea706),
+ TOBN(0x10e60619, 0x47b533c0), TOBN(0x1a8bc73f, 0x2ca3c055),
+ TOBN(0xae58d4b2, 0x1bb62b8f), TOBN(0xb2045a73, 0x584a24e3),
+ TOBN(0x3ab3d5af, 0xbd76e195), TOBN(0x478dd1ad, 0x6938a810),
+ TOBN(0x6ffab393, 0x6ee3d5cb), TOBN(0xdfb693db, 0x22b361e4),
+ TOBN(0xf9694496, 0x51dbf1a7), TOBN(0xcab4b4ef, 0x08a2e762),
+ TOBN(0xe8c92f25, 0xd39bba9a), TOBN(0x850e61bc, 0xf1464d96),
+ TOBN(0xb7e830e3, 0xdc09508b), TOBN(0xfaf6d2cf, 0x74317655),
+ TOBN(0x72606ceb, 0xdf690355), TOBN(0x48bb92b3, 0xd0c3ded6),
+ TOBN(0x65b75484, 0x5c7cf892), TOBN(0xf6cd7ac9, 0xd5d5f01f),
+ TOBN(0xc2c30a59, 0x96401d69), TOBN(0x91268650, 0xed921878),
+ TOBN(0x380bf913, 0xb78c558f), TOBN(0x43c0baeb, 0xc8afdaa9),
+ TOBN(0x377f61d5, 0x54f169d3), TOBN(0xf8da07e3, 0xae5ff20b),
+ TOBN(0xb676c49d, 0xa8a90ea8), TOBN(0x81c1ff2b, 0x83a29b21),
+ TOBN(0x383297ac, 0x2ad8d276), TOBN(0x3001122f, 0xba89f982),
+ TOBN(0xe1d794be, 0x6718e448), TOBN(0x246c1482, 0x7c3e6e13),
+ TOBN(0x56646ef8, 0x5d26b5ef), TOBN(0x80f5091e, 0x88069cdd),
+ TOBN(0xc5992e2f, 0x724bdd38), TOBN(0x02e915b4, 0x8471e8c7),
+ TOBN(0x96ff320a, 0x0d0ff2a9), TOBN(0xbf886487, 0x4384d1a0),
+ TOBN(0xbbe1e6a6, 0xc93f72d6), TOBN(0xd5f75d12, 0xcad800ea),
+ TOBN(0xfa40a09f, 0xe7acf117), TOBN(0x32c8cdd5, 0x7581a355),
+ TOBN(0x74221992, 0x7023c499), TOBN(0xa8afe5d7, 0x38ec3901),
+ TOBN(0x5691afcb, 0xa90e83f0), TOBN(0x41bcaa03, 0x0b8f8eac),
+ TOBN(0xe38b5ff9, 0x8d2668d5), TOBN(0x0715281a, 0x7ad81965),
+ TOBN(0x1bc8fc7c, 0x03c6ce11), TOBN(0xcbbee6e2, 0x8b650436),
+ TOBN(0x06b00fe8, 0x0cdb9808), TOBN(0x17d6e066, 0xfe3ed315),
+ TOBN(0x2e9d38c6, 0x4d0b5018), TOBN(0xab8bfd56, 0x844dcaef),
+ TOBN(0x42894a59, 0x513aed8b), TOBN(0xf77f3b6d, 0x314bd07a),
+ TOBN(0xbbdecb8f, 0x8e42b582), TOBN(0xf10e2fa8, 0xd2390fe6),
+ TOBN(0xefb95022, 0x62a2f201), TOBN(0x4d59ea50, 0x50ee32b0),
+ TOBN(0xd87f7728, 0x6da789a8), TOBN(0xcf98a2cf, 0xf79492c4),
+ TOBN(0xf9577239, 0x720943c2), TOBN(0xba044cf5, 0x3990b9d0),
+ TOBN(0x5aa8e823, 0x95f2884a), TOBN(0x834de6ed, 0x0278a0af),
+ TOBN(0xc8e1ee9a, 0x5f25bd12), TOBN(0x9259ceaa, 0x6f7ab271),
+ TOBN(0x7e6d97a2, 0x77d00b76), TOBN(0x5c0c6eea, 0xa437832a),
+ TOBN(0x5232c20f, 0x5606b81d), TOBN(0xabd7b375, 0x0d991ee5),
+ TOBN(0x4d2bfe35, 0x8632d951), TOBN(0x78f85146, 0x98ed9364),
+ TOBN(0x951873f0, 0xf30c3282), TOBN(0x0da8ac80, 0xa789230b),
+ TOBN(0x3ac7789c, 0x5398967f), TOBN(0xa69b8f7f, 0xbdda0fb5),
+ TOBN(0xe5db7717, 0x6add8545), TOBN(0x1b71cb66, 0x72c49b66),
+ TOBN(0xd8560739, 0x68421d77), TOBN(0x03840fe8, 0x83e3afea),
+ TOBN(0xb391dad5, 0x1ec69977), TOBN(0xae243fb9, 0x307f6726),
+ TOBN(0xc88ac87b, 0xe8ca160c), TOBN(0x5174cced, 0x4ce355f4),
+ TOBN(0x98a35966, 0xe58ba37d), TOBN(0xfdcc8da2, 0x7817335d),
+ TOBN(0x5b752830, 0x83fbc7bf), TOBN(0x68e419d4, 0xd9c96984),
+ TOBN(0x409a39f4, 0x02a40380), TOBN(0x88940faf, 0x1fe977bc),
+ TOBN(0xc640a94b, 0x8f8edea6), TOBN(0x1e22cd17, 0xed11547d),
+ TOBN(0xe28568ce, 0x59ffc3e2), TOBN(0x60aa1b55, 0xc1dee4e7),
+ TOBN(0xc67497c8, 0x837cb363), TOBN(0x06fb438a, 0x105a2bf2),
+ TOBN(0x30357ec4, 0x500d8e20), TOBN(0x1ad9095d, 0x0670db10),
+ TOBN(0x7f589a05, 0xc73b7cfd), TOBN(0xf544607d, 0x880d6d28),
+ TOBN(0x17ba93b1, 0xa20ef103), TOBN(0xad859130, 0x6ba6577b),
+ TOBN(0x65c91cf6, 0x6fa214a0), TOBN(0xd7d49c6c, 0x27990da5),
+ TOBN(0xecd9ec8d, 0x20bb569d), TOBN(0xbd4b2502, 0xeeffbc33),
+ TOBN(0x2056ca5a, 0x6bed0467), TOBN(0x7916a1f7, 0x5b63728c),
+ TOBN(0xd4f9497d, 0x53a4f566), TOBN(0x89734664, 0x97b56810),
+ TOBN(0xf8e1da74, 0x0494a621), TOBN(0x82546a93, 0x8d011c68),
+ TOBN(0x1f3acb19, 0xc61ac162), TOBN(0x52f8fa9c, 0xabad0d3e),
+ TOBN(0x15356523, 0xb4b7ea43), TOBN(0x5a16ad61, 0xae608125),
+ TOBN(0xb0bcb87f, 0x4faed184), TOBN(0x5f236b1d, 0x5029f45f),
+ TOBN(0xd42c7607, 0x0bc6b1fc), TOBN(0xc644324e, 0x68aefce3),
+ TOBN(0x8e191d59, 0x5c5d8446), TOBN(0xc0208077, 0x13ae1979),
+ TOBN(0xadcaee55, 0x3ba59cc7), TOBN(0x20ed6d6b, 0xa2cb81ba),
+ TOBN(0x0952ba19, 0xb6efcffc), TOBN(0x60f12d68, 0x97c0b87c),
+ TOBN(0x4ee2c7c4, 0x9caa30bc), TOBN(0x767238b7, 0x97fbff4e),
+ TOBN(0xebc73921, 0x501b5d92), TOBN(0x3279e3df, 0xc2a37737),
+ TOBN(0x9fc12bc8, 0x6d197543), TOBN(0xfa94dc6f, 0x0a40db4e),
+ TOBN(0x7392b41a, 0x530ccbbd), TOBN(0x87c82146, 0xea823525),
+ TOBN(0xa52f984c, 0x05d98d0c), TOBN(0x2ae57d73, 0x5ef6974c),
+ TOBN(0x9377f7bf, 0x3042a6dd), TOBN(0xb1a007c0, 0x19647a64),
+ TOBN(0xfaa9079a, 0x0cca9767), TOBN(0x3d81a25b, 0xf68f72d5),
+ TOBN(0x752067f8, 0xff81578e), TOBN(0x78622150, 0x9045447d),
+ TOBN(0xc0c22fcf, 0x0505aa6f), TOBN(0x1030f0a6, 0x6bed1c77),
+ TOBN(0x31f29f15, 0x1f0bd739), TOBN(0x2d7989c7, 0xe6debe85),
+ TOBN(0x5c070e72, 0x8e677e98), TOBN(0x0a817bd3, 0x06e81fd5),
+ TOBN(0xc110d830, 0xb0f2ac95), TOBN(0x48d0995a, 0xab20e64e),
+ TOBN(0x0f3e00e1, 0x7729cd9a), TOBN(0x2a570c20, 0xdd556946),
+ TOBN(0x912dbcfd, 0x4e86214d), TOBN(0x2d014ee2, 0xcf615498),
+ TOBN(0x55e2b1e6, 0x3530d76e), TOBN(0xc5135ae4, 0xfd0fd6d1),
+ TOBN(0x0066273a, 0xd4f3049f), TOBN(0xbb8e9893, 0xe7087477),
+ TOBN(0x2dba1ddb, 0x14c6e5fd), TOBN(0xdba37886, 0x51f57e6c),
+ TOBN(0x5aaee0a6, 0x5a72f2cf), TOBN(0x1208bfbf, 0x7bea5642),
+ TOBN(0xf5c6aa3b, 0x67872c37), TOBN(0xd726e083, 0x43f93224),
+ TOBN(0x1854daa5, 0x061f1658), TOBN(0xc0016df1, 0xdf0cd2b3),
+ TOBN(0xc2a3f23e, 0x833d50de), TOBN(0x73b681d2, 0xbbbd3017),
+ TOBN(0x2f046dc4, 0x3ac343c0), TOBN(0x9c847e7d, 0x85716421),
+ TOBN(0xe1e13c91, 0x0917eed4), TOBN(0x3fc9eebd, 0x63a1b9c6),
+ TOBN(0x0f816a72, 0x7fe02299), TOBN(0x6335ccc2, 0x294f3319),
+ TOBN(0x3820179f, 0x4745c5be), TOBN(0xe647b782, 0x922f066e),
+ TOBN(0xc22e49de, 0x02cafb8a), TOBN(0x299bc2ff, 0xfcc2eccc),
+ TOBN(0x9a8feea2, 0x6e0e8282), TOBN(0xa627278b, 0xfe893205),
+ TOBN(0xa7e19733, 0x7933e47b), TOBN(0xf4ff6b13, 0x2e766402),
+ TOBN(0xa4d8be0a, 0x98440d9f), TOBN(0x658f5c2f, 0x38938808),
+ TOBN(0x90b75677, 0xc95b3b3e), TOBN(0xfa044269, 0x3137b6ff),
+ TOBN(0x077b039b, 0x43c47c29), TOBN(0xcca95dd3, 0x8a6445b2),
+ TOBN(0x0b498ba4, 0x2333fc4c), TOBN(0x274f8e68, 0xf736a1b1),
+ TOBN(0x6ca348fd, 0x5f1d4b2e), TOBN(0x24d3be78, 0xa8f10199),
+ TOBN(0x8535f858, 0xca14f530), TOBN(0xa6e7f163, 0x5b982e51),
+ TOBN(0x847c8512, 0x36e1bf62), TOBN(0xf6a7c58e, 0x03448418),
+ TOBN(0x583f3703, 0xf9374ab6), TOBN(0x864f9195, 0x6e564145),
+ TOBN(0x33bc3f48, 0x22526d50), TOBN(0x9f323c80, 0x1262a496),
+ TOBN(0xaa97a7ae, 0x3f046a9a), TOBN(0x70da183e, 0xdf8a039a),
+ TOBN(0x5b68f71c, 0x52aa0ba6), TOBN(0x9be0fe51, 0x21459c2d),
+ TOBN(0xc1e17eb6, 0xcbc613e5), TOBN(0x33131d55, 0x497ea61c),
+ TOBN(0x2f69d39e, 0xaf7eded5), TOBN(0x73c2f434, 0xde6af11b),
+ TOBN(0x4ca52493, 0xa4a375fa), TOBN(0x5f06787c, 0xb833c5c2),
+ TOBN(0x814e091f, 0x3e6e71cf), TOBN(0x76451f57, 0x8b746666)}
+ ,
+ {TOBN(0x80f9bdef, 0x694db7e0), TOBN(0xedca8787, 0xb9fcddc6),
+ TOBN(0x51981c34, 0x03b8dce1), TOBN(0x4274dcf1, 0x70e10ba1),
+ TOBN(0xf72743b8, 0x6def6d1a), TOBN(0xd25b1670, 0xebdb1866),
+ TOBN(0xc4491e8c, 0x050c6f58), TOBN(0x2be2b2ab, 0x87fbd7f5),
+ TOBN(0x3e0e5c9d, 0xd111f8ec), TOBN(0xbcc33f8d, 0xb7c4e760),
+ TOBN(0x702f9a91, 0xbd392a51), TOBN(0x7da4a795, 0xc132e92d),
+ TOBN(0x1a0b0ae3, 0x0bb1151b), TOBN(0x54febac8, 0x02e32251),
+ TOBN(0xea3a5082, 0x694e9e78), TOBN(0xe58ffec1, 0xe4fe40b8),
+ TOBN(0xf85592fc, 0xd1e0cf9e), TOBN(0xdea75f0d, 0xc0e7b2e8),
+ TOBN(0xc04215cf, 0xc135584e), TOBN(0x174fc727, 0x2f57092a),
+ TOBN(0xe7277877, 0xeb930bea), TOBN(0x504caccb, 0x5eb02a5a),
+ TOBN(0xf9fe08f7, 0xf5241b9b), TOBN(0xe7fb62f4, 0x8d5ca954),
+ TOBN(0xfbb8349d, 0x29c4120b), TOBN(0x9f94391f, 0xc0d0d915),
+ TOBN(0xc4074fa7, 0x5410ba51), TOBN(0xa66adbf6, 0x150a5911),
+ TOBN(0xc164543c, 0x34bfca38), TOBN(0xe0f27560, 0xb9e1ccfc),
+ TOBN(0x99da0f53, 0xe820219c), TOBN(0xe8234498, 0xc6b4997a),
+ TOBN(0xcfb88b76, 0x9d4c5423), TOBN(0x9e56eb10, 0xb0521c49),
+ TOBN(0x418e0b5e, 0xbe8700a1), TOBN(0x00cbaad6, 0xf93cb58a),
+ TOBN(0xe923fbde, 0xd92a5e67), TOBN(0xca4979ac, 0x1f347f11),
+ TOBN(0x89162d85, 0x6bc0585b), TOBN(0xdd6254af, 0xac3c70e3),
+ TOBN(0x7b23c513, 0x516e19e4), TOBN(0x56e2e847, 0xc5c4d593),
+ TOBN(0x9f727d73, 0x5ce71ef6), TOBN(0x5b6304a6, 0xf79a44c5),
+ TOBN(0x6638a736, 0x3ab7e433), TOBN(0x1adea470, 0xfe742f83),
+ TOBN(0xe054b854, 0x5b7fc19f), TOBN(0xf935381a, 0xba1d0698),
+ TOBN(0x546eab2d, 0x799e9a74), TOBN(0x96239e0e, 0xa949f729),
+ TOBN(0xca274c6b, 0x7090055a), TOBN(0x835142c3, 0x9020c9b0),
+ TOBN(0xa405667a, 0xa2e8807f), TOBN(0x29f2c085, 0x1aa3d39e),
+ TOBN(0xcc555d64, 0x42fc72f5), TOBN(0xe856e0e7, 0xfbeacb3c),
+ TOBN(0xb5504f9d, 0x918e4936), TOBN(0x65035ef6, 0xb2513982),
+ TOBN(0x0553a0c2, 0x6f4d9cb9), TOBN(0x6cb10d56, 0xbea85509),
+ TOBN(0x48d957b7, 0xa242da11), TOBN(0x16a4d3dd, 0x672b7268),
+ TOBN(0x3d7e637c, 0x8502a96b), TOBN(0x27c7032b, 0x730d463b),
+ TOBN(0xbdc02b18, 0xe4136a14), TOBN(0xbacf969d, 0x678e32bf),
+ TOBN(0xc98d89a3, 0xdd9c3c03), TOBN(0x7b92420a, 0x23becc4f),
+ TOBN(0xd4b41f78, 0xc64d565c), TOBN(0x9f969d00, 0x10f28295),
+ TOBN(0xec7f7f76, 0xb13d051a), TOBN(0x08945e1e, 0xa92da585),
+ TOBN(0x55366b7d, 0x5846426f), TOBN(0xe7d09e89, 0x247d441d),
+ TOBN(0x510b404d, 0x736fbf48), TOBN(0x7fa003d0, 0xe784bd7d),
+ TOBN(0x25f7614f, 0x17fd9596), TOBN(0x49e0e0a1, 0x35cb98db),
+ TOBN(0x2c65957b, 0x2e83a76a), TOBN(0x5d40da8d, 0xcddbe0f8),
+ TOBN(0xf2b8c405, 0x050bad24), TOBN(0x8918426d, 0xc2aa4823),
+ TOBN(0x2aeab3dd, 0xa38365a7), TOBN(0x72031717, 0x7c91b690),
+ TOBN(0x8b00d699, 0x60a94120), TOBN(0x478a255d, 0xe99eaeec),
+ TOBN(0xbf656a5f, 0x6f60aafd), TOBN(0xdfd7cb75, 0x5dee77b3),
+ TOBN(0x37f68bb4, 0xa595939d), TOBN(0x03556479, 0x28740217),
+ TOBN(0x8e740e7c, 0x84ad7612), TOBN(0xd89bc843, 0x9044695f),
+ TOBN(0xf7f3da5d, 0x85a9184d), TOBN(0x562563bb, 0x9fc0b074),
+ TOBN(0x06d2e6aa, 0xf88a888e), TOBN(0x612d8643, 0x161fbe7c),
+ TOBN(0x465edba7, 0xf64085e7), TOBN(0xb230f304, 0x29aa8511),
+ TOBN(0x53388426, 0xcda2d188), TOBN(0x90885735, 0x4b666649),
+ TOBN(0x6f02ff9a, 0x652f54f6), TOBN(0x65c82294, 0x5fae2bf0),
+ TOBN(0x7816ade0, 0x62f5eee3), TOBN(0xdcdbdf43, 0xfcc56d70),
+ TOBN(0x9fb3bba3, 0x54530bb2), TOBN(0xbde3ef77, 0xcb0869ea),
+ TOBN(0x89bc9046, 0x0b431163), TOBN(0x4d03d7d2, 0xe4819a35),
+ TOBN(0x33ae4f9e, 0x43b6a782), TOBN(0x216db307, 0x9c88a686),
+ TOBN(0x91dd88e0, 0x00ffedd9), TOBN(0xb280da9f, 0x12bd4840),
+ TOBN(0x32a7cb8a, 0x1635e741), TOBN(0xfe14008a, 0x78be02a7),
+ TOBN(0x3fafb334, 0x1b7ae030), TOBN(0x7fd508e7, 0x5add0ce9),
+ TOBN(0x72c83219, 0xd607ad51), TOBN(0x0f229c0a, 0x8d40964a),
+ TOBN(0x1be2c336, 0x1c878da2), TOBN(0xe0c96742, 0xeab2ab86),
+ TOBN(0x458f8691, 0x3e538cd7), TOBN(0xa7001f6c, 0x8e08ad53),
+ TOBN(0x52b8c6e6, 0xbf5d15ff), TOBN(0x548234a4, 0x011215dd),
+ TOBN(0xff5a9d2d, 0x3d5b4045), TOBN(0xb0ffeeb6, 0x4a904190),
+ TOBN(0x55a3aca4, 0x48607f8b), TOBN(0x8cbd665c, 0x30a0672a),
+ TOBN(0x87f834e0, 0x42583068), TOBN(0x02da2aeb, 0xf3f6e683),
+ TOBN(0x6b763e5d, 0x05c12248), TOBN(0x7230378f, 0x65a8aefc),
+ TOBN(0x93bd80b5, 0x71e8e5ca), TOBN(0x53ab041c, 0xb3b62524),
+ TOBN(0x1b860513, 0x6c9c552e), TOBN(0xe84d402c, 0xd5524e66),
+ TOBN(0xa37f3573, 0xf37f5937), TOBN(0xeb0f6c7d, 0xd1e4fca5),
+ TOBN(0x2965a554, 0xac8ab0fc), TOBN(0x17fbf56c, 0x274676ac),
+ TOBN(0x2e2f6bd9, 0xacf7d720), TOBN(0x41fc8f88, 0x10224766),
+ TOBN(0x517a14b3, 0x85d53bef), TOBN(0xdae327a5, 0x7d76a7d1),
+ TOBN(0x6ad0a065, 0xc4818267), TOBN(0x33aa189b, 0x37c1bbc1),
+ TOBN(0x64970b52, 0x27392a92), TOBN(0x21699a1c, 0x2d1535ea),
+ TOBN(0xcd20779c, 0xc2d7a7fd), TOBN(0xe3186059, 0x99c83cf2),
+ TOBN(0x9b69440b, 0x72c0b8c7), TOBN(0xa81497d7, 0x7b9e0e4d),
+ TOBN(0x515d5c89, 0x1f5f82dc), TOBN(0x9a7f67d7, 0x6361079e),
+ TOBN(0xa8da81e3, 0x11a35330), TOBN(0xe44990c4, 0x4b18be1b),
+ TOBN(0xc7d5ed95, 0xaf103e59), TOBN(0xece8aba7, 0x8dac9261),
+ TOBN(0xbe82b099, 0x9394b8d3), TOBN(0x6830f09a, 0x16adfe83),
+ TOBN(0x250a29b4, 0x88172d01), TOBN(0x8b20bd65, 0xcaff9e02),
+ TOBN(0xb8a7661e, 0xe8a6329a), TOBN(0x4520304d, 0xd3fce920),
+ TOBN(0xae45da1f, 0x2b47f7ef), TOBN(0xe07f5288, 0x5bffc540),
+ TOBN(0xf7997009, 0x3464f874), TOBN(0x2244c2cd, 0xa6fa1f38),
+ TOBN(0x43c41ac1, 0x94d7d9b1), TOBN(0x5bafdd82, 0xc82e7f17),
+ TOBN(0xdf0614c1, 0x5fda0fca), TOBN(0x74b043a7, 0xa8ae37ad),
+ TOBN(0x3ba6afa1, 0x9e71734c), TOBN(0x15d5437e, 0x9c450f2e),
+ TOBN(0x4a5883fe, 0x67e242b1), TOBN(0x5143bdc2, 0x2c1953c2),
+ TOBN(0x542b8b53, 0xfc5e8920), TOBN(0x363bf9a8, 0x9a9cee08),
+ TOBN(0x02375f10, 0xc3486e08), TOBN(0x2037543b, 0x8c5e70d2),
+ TOBN(0x7109bccc, 0x625640b4), TOBN(0xcbc1051e, 0x8bc62c3b),
+ TOBN(0xf8455fed, 0x803f26ea), TOBN(0x6badceab, 0xeb372424),
+ TOBN(0xa2a9ce7c, 0x6b53f5f9), TOBN(0x64246595, 0x1b176d99),
+ TOBN(0xb1298d36, 0xb95c081b), TOBN(0x53505bb8, 0x1d9a9ee6),
+ TOBN(0x3f6f9e61, 0xf2ba70b0), TOBN(0xd07e16c9, 0x8afad453),
+ TOBN(0x9f1694bb, 0xe7eb4a6a), TOBN(0xdfebced9, 0x3cb0bc8e),
+ TOBN(0x92d3dcdc, 0x53868c8b), TOBN(0x174311a2, 0x386107a6),
+ TOBN(0x4109e07c, 0x689b4e64), TOBN(0x30e4587f, 0x2df3dcb6),
+ TOBN(0x841aea31, 0x0811b3b2), TOBN(0x6144d41d, 0x0cce43ea),
+ TOBN(0x464c4581, 0x2a9a7803), TOBN(0xd03d371f, 0x3e158930),
+ TOBN(0xc676d7f2, 0xb1f3390b), TOBN(0x9f7a1b8c, 0xa5b61272),
+ TOBN(0x4ebebfc9, 0xc2e127a9), TOBN(0x4602500c, 0x5dd997bf),
+ TOBN(0x7f09771c, 0x4711230f), TOBN(0x058eb37c, 0x020f09c1),
+ TOBN(0xab693d4b, 0xfee5e38b), TOBN(0x9289eb1f, 0x4653cbc0),
+ TOBN(0xbecf46ab, 0xd51b9cf5), TOBN(0xd2aa9c02, 0x9f0121af),
+ TOBN(0x36aaf7d2, 0xe90dc274), TOBN(0x909e4ea0, 0x48b95a3c),
+ TOBN(0xe6b70496, 0x6f32dbdb), TOBN(0x672188a0, 0x8b030b3e),
+ TOBN(0xeeffe5b3, 0xcfb617e2), TOBN(0x87e947de, 0x7c82709e),
+ TOBN(0xa44d2b39, 0x1770f5a7), TOBN(0xe4d4d791, 0x0e44eb82),
+ TOBN(0x42e69d1e, 0x3f69712a), TOBN(0xbf11c4d6, 0xac6a820e),
+ TOBN(0xb5e7f3e5, 0x42c4224c), TOBN(0xd6b4e81c, 0x449d941c),
+ TOBN(0x5d72bd16, 0x5450e878), TOBN(0x6a61e28a, 0xee25ac54),
+ TOBN(0x33272094, 0xe6f1cd95), TOBN(0x7512f30d, 0x0d18673f),
+ TOBN(0x32f7a4ca, 0x5afc1464), TOBN(0x2f095656, 0x6bbb977b),
+ TOBN(0x586f47ca, 0xa8226200), TOBN(0x02c868ad, 0x1ac07369),
+ TOBN(0x4ef2b845, 0xc613acbe), TOBN(0x43d7563e, 0x0386054c),
+ TOBN(0x54da9dc7, 0xab952578), TOBN(0xb5423df2, 0x26e84d0b),
+ TOBN(0xa8b64eeb, 0x9b872042), TOBN(0xac205782, 0x5990f6df),
+ TOBN(0x4ff696eb, 0x21f4c77a), TOBN(0x1a79c3e4, 0xaab273af),
+ TOBN(0x29bc922e, 0x9436b3f1), TOBN(0xff807ef8, 0xd6d9a27a),
+ TOBN(0x82acea3d, 0x778f22a0), TOBN(0xfb10b2e8, 0x5b5e7469),
+ TOBN(0xc0b16980, 0x2818ee7d), TOBN(0x011afff4, 0xc91c1a2f),
+ TOBN(0x95a6d126, 0xad124418), TOBN(0x31c081a5, 0xe72e295f),
+ TOBN(0x36bb283a, 0xf2f4db75), TOBN(0xd115540f, 0x7acef462),
+ TOBN(0xc7f3a8f8, 0x33f6746c), TOBN(0x21e46f65, 0xfea990ca),
+ TOBN(0x915fd5c5, 0xcaddb0a9), TOBN(0xbd41f016, 0x78614555),
+ TOBN(0x346f4434, 0x426ffb58), TOBN(0x80559436, 0x14dbc204),
+ TOBN(0xf3dd20fe, 0x5a969b7f), TOBN(0x9d59e956, 0xe899a39a),
+ TOBN(0xf1b0971c, 0x8ad4cf4b), TOBN(0x03448860, 0x2ffb8fb8),
+ TOBN(0xf071ac3c, 0x65340ba4), TOBN(0x408d0596, 0xb27fd758),
+ TOBN(0xe7c78ea4, 0x98c364b0), TOBN(0xa4aac4a5, 0x051e8ab5),
+ TOBN(0xb9e1d560, 0x485d9002), TOBN(0x9acd518a, 0x88844455),
+ TOBN(0xe4ca688f, 0xd06f56c0), TOBN(0xa48af70d, 0xdf027972),
+ TOBN(0x691f0f04, 0x5e9a609d), TOBN(0xa9dd82cd, 0xee61270e),
+ TOBN(0x8903ca63, 0xa0ef18d3), TOBN(0x9fb7ee35, 0x3d6ca3bd),
+ TOBN(0xa7b4a09c, 0xabf47d03), TOBN(0x4cdada01, 0x1c67de8e),
+ TOBN(0x52003749, 0x9355a244), TOBN(0xe77fd2b6, 0x4f2151a9),
+ TOBN(0x695d6cf6, 0x66b4efcb), TOBN(0xc5a0cacf, 0xda2cfe25),
+ TOBN(0x104efe5c, 0xef811865), TOBN(0xf52813e8, 0x9ea5cc3d),
+ TOBN(0x855683dc, 0x40b58dbc), TOBN(0x0338ecde, 0x175fcb11),
+ TOBN(0xf9a05637, 0x74921592), TOBN(0xb4f1261d, 0xb9bb9d31),
+ TOBN(0x551429b7, 0x4e9c5459), TOBN(0xbe182e6f, 0x6ea71f53),
+ TOBN(0xd3a3b07c, 0xdfc50573), TOBN(0x9ba1afda, 0x62be8d44),
+ TOBN(0x9bcfd2cb, 0x52ab65d3), TOBN(0xdf11d547, 0xa9571802),
+ TOBN(0x099403ee, 0x02a2404a), TOBN(0x497406f4, 0x21088a71),
+ TOBN(0x99479409, 0x5004ae71), TOBN(0xbdb42078, 0xa812c362),
+ TOBN(0x2b72a30f, 0xd8828442), TOBN(0x283add27, 0xfcb5ed1c),
+ TOBN(0xf7c0e200, 0x66a40015), TOBN(0x3e3be641, 0x08b295ef),
+ TOBN(0xac127dc1, 0xe038a675), TOBN(0x729deff3, 0x8c5c6320),
+ TOBN(0xb7df8fd4, 0xa90d2c53), TOBN(0x9b74b0ec, 0x681e7cd3),
+ TOBN(0x5cb5a623, 0xdab407e5), TOBN(0xcdbd3615, 0x76b340c6),
+ TOBN(0xa184415a, 0x7d28392c), TOBN(0xc184c1d8, 0xe96f7830),
+ TOBN(0xc3204f19, 0x81d3a80f), TOBN(0xfde0c841, 0xc8e02432),
+ TOBN(0x78203b3e, 0x8149e0c1), TOBN(0x5904bdbb, 0x08053a73),
+ TOBN(0x30fc1dd1, 0x101b6805), TOBN(0x43c223bc, 0x49aa6d49),
+ TOBN(0x9ed67141, 0x7a174087), TOBN(0x311469a0, 0xd5997008),
+ TOBN(0xb189b684, 0x5e43fc61), TOBN(0xf3282375, 0xe0d3ab57),
+ TOBN(0x4fa34b67, 0xb1181da8), TOBN(0x621ed0b2, 0x99ee52b8),
+ TOBN(0x9b178de1, 0xad990676), TOBN(0xd51de67b, 0x56d54065),
+ TOBN(0x2a2c27c4, 0x7538c201), TOBN(0x33856ec8, 0x38a40f5c),
+ TOBN(0x2522fc15, 0xbe6cdcde), TOBN(0x1e603f33, 0x9f0c6f89),
+ TOBN(0x7994edc3, 0x103e30a6), TOBN(0x033a00db, 0x220c853e),
+ TOBN(0xd3cfa409, 0xf7bb7fd7), TOBN(0x70f8781e, 0x462d18f6),
+ TOBN(0xbbd82980, 0x687fe295), TOBN(0x6eef4c32, 0x595669f3),
+ TOBN(0x86a9303b, 0x2f7e85c3), TOBN(0x5fce4621, 0x71988f9b),
+ TOBN(0x5b935bf6, 0xc138acb5), TOBN(0x30ea7d67, 0x25661212),
+ TOBN(0xef1eb5f4, 0xe51ab9a2), TOBN(0x0587c98a, 0xae067c78),
+ TOBN(0xb3ce1b3c, 0x77ca9ca6), TOBN(0x2a553d4d, 0x54b5f057),
+ TOBN(0xc7898236, 0x4da29ec2), TOBN(0xdbdd5d13, 0xb9c57316),
+ TOBN(0xc57d6e6b, 0x2cd80d47), TOBN(0x80b460cf, 0xfe9e7391),
+ TOBN(0x98648cab, 0xf963c31e), TOBN(0x67f9f633, 0xcc4d32fd),
+ TOBN(0x0af42a9d, 0xfdf7c687), TOBN(0x55f292a3, 0x0b015ea7),
+ TOBN(0x89e468b2, 0xcd21ab3d), TOBN(0xe504f022, 0xc393d392),
+ TOBN(0xab21e1d4, 0xa5013af9), TOBN(0xe3283f78, 0xc2c28acb),
+ TOBN(0xf38b35f6, 0x226bf99f), TOBN(0xe8354274, 0x0e291e69),
+ TOBN(0x61673a15, 0xb20c162d), TOBN(0xc101dc75, 0xb04fbdbe),
+ TOBN(0x8323b4c2, 0x255bd617), TOBN(0x6c969693, 0x6c2a9154),
+ TOBN(0xc6e65860, 0x62679387), TOBN(0x8e01db0c, 0xb8c88e23),
+ TOBN(0x33c42873, 0x893a5559), TOBN(0x7630f04b, 0x47a3e149),
+ TOBN(0xb5d80805, 0xddcf35f8), TOBN(0x582ca080, 0x77dfe732),
+ TOBN(0x2c7156e1, 0x0b1894a0), TOBN(0x92034001, 0xd81c68c0),
+ TOBN(0xed225d00, 0xc8b115b5), TOBN(0x237f9c22, 0x83b907f2),
+ TOBN(0x0ea2f32f, 0x4470e2c0), TOBN(0xb725f7c1, 0x58be4e95),
+ TOBN(0x0f1dcafa, 0xb1ae5463), TOBN(0x59ed5187, 0x1ba2fc04),
+ TOBN(0xf6e0f316, 0xd0115d4d), TOBN(0x5180b12f, 0xd3691599),
+ TOBN(0x157e32c9, 0x527f0a41), TOBN(0x7b0b081d, 0xa8e0ecc0),
+ TOBN(0x6dbaaa8a, 0xbf4f0dd0), TOBN(0x99b289c7, 0x4d252696),
+ TOBN(0x79b7755e, 0xdbf864fe), TOBN(0x6974e2b1, 0x76cad3ab),
+ TOBN(0x35dbbee2, 0x06ddd657), TOBN(0xe7cbdd11, 0x2ff3a96d),
+ TOBN(0x88381968, 0x076be758), TOBN(0x2d737e72, 0x08c91f5d),
+ TOBN(0x5f83ab62, 0x86ec3776), TOBN(0x98aa649d, 0x945fa7a1),
+ TOBN(0xf477ec37, 0x72ef0933), TOBN(0x66f52b1e, 0x098c17b1),
+ TOBN(0x9eec58fb, 0xd803738b), TOBN(0x91aaade7, 0xe4e86aa4),
+ TOBN(0x6b1ae617, 0xa5b51492), TOBN(0x63272121, 0xbbc45974),
+ TOBN(0x7e0e28f0, 0x862c5129), TOBN(0x0a8f79a9, 0x3321a4a0),
+ TOBN(0xe26d1664, 0x5041c88f), TOBN(0x0571b805, 0x53233e3a),
+ TOBN(0xd1b0ccde, 0xc9520711), TOBN(0x55a9e4ed, 0x3c8b84bf),
+ TOBN(0x9426bd39, 0xa1fef314), TOBN(0x4f5f638e, 0x6eb93f2b),
+ TOBN(0xba2a1ed3, 0x2bf9341b), TOBN(0xd63c1321, 0x4d42d5a9),
+ TOBN(0xd2964a89, 0x316dc7c5), TOBN(0xd1759606, 0xca511851),
+ TOBN(0xd8a9201f, 0xf9e6ed35), TOBN(0xb7b5ee45, 0x6736925a),
+ TOBN(0x0a83fbbc, 0x99581af7), TOBN(0x3076bc40, 0x64eeb051),
+ TOBN(0x5511c98c, 0x02dec312), TOBN(0x270de898, 0x238dcb78),
+ TOBN(0x2cf4cf9c, 0x539c08c9), TOBN(0xa70cb65e, 0x38d3b06e),
+ TOBN(0xb12ec10e, 0xcfe57bbd), TOBN(0x82c7b656, 0x35a0c2b5),
+ TOBN(0xddc7d5cd, 0x161c67bd), TOBN(0xe32e8985, 0xae3a32cc),
+ TOBN(0x7aba9444, 0xd11a5529), TOBN(0xe964ed02, 0x2427fa1a),
+ TOBN(0x1528392d, 0x24a1770a), TOBN(0xa152ce2c, 0x12c72fcd),
+ TOBN(0x714553a4, 0x8ec07649), TOBN(0x18b4c290, 0x459dd453),
+ TOBN(0xea32b714, 0x7b64b110), TOBN(0xb871bfa5, 0x2e6f07a2),
+ TOBN(0xb67112e5, 0x9e2e3c9b), TOBN(0xfbf250e5, 0x44aa90f6),
+ TOBN(0xf77aedb8, 0xbd539006), TOBN(0x3b0cdf9a, 0xd172a66f),
+ TOBN(0xedf69fea, 0xf8c51187), TOBN(0x05bb67ec, 0x741e4da7),
+ TOBN(0x47df0f32, 0x08114345), TOBN(0x56facb07, 0xbb9792b1),
+ TOBN(0xf3e007e9, 0x8f6229e4), TOBN(0x62d103f4, 0x526fba0f),
+ TOBN(0x4f33bef7, 0xb0339d79), TOBN(0x9841357b, 0xb59bfec1),
+ TOBN(0xfa8dbb59, 0xc34e6705), TOBN(0xc3c7180b, 0x7fdaa84c),
+ TOBN(0xf95872fc, 0xa4108537), TOBN(0x8750cc3b, 0x932a3e5a),
+ TOBN(0xb61cc69d, 0xb7275d7d), TOBN(0xffa0168b, 0x2e59b2e9),
+ TOBN(0xca032abc, 0x6ecbb493), TOBN(0x1d86dbd3, 0x2c9082d8),
+ TOBN(0xae1e0b67, 0xe28ef5ba), TOBN(0x2c9a4699, 0xcb18e169),
+ TOBN(0x0ecd0e33, 0x1e6bbd20), TOBN(0x571b360e, 0xaf5e81d2),
+ TOBN(0xcd9fea58, 0x101c1d45), TOBN(0x6651788e, 0x18880452),
+ TOBN(0xa9972635, 0x1f8dd446), TOBN(0x44bed022, 0xe37281d0),
+ TOBN(0x094b2b2d, 0x33da525d), TOBN(0xf193678e, 0x13144fd8),
+ TOBN(0xb8ab5ba4, 0xf4c1061d), TOBN(0x4343b5fa, 0xdccbe0f4),
+ TOBN(0xa8702371, 0x63812713), TOBN(0x47bf6d2d, 0xf7611d93),
+ TOBN(0x46729b8c, 0xbd21e1d7), TOBN(0x7484d4e0, 0xd629e77d),
+ TOBN(0x830e6eea, 0x60dbac1f), TOBN(0x23d8c484, 0xda06a2f7),
+ TOBN(0x896714b0, 0x50ca535b), TOBN(0xdc8d3644, 0xebd97a9b),
+ TOBN(0x106ef9fa, 0xb12177b4), TOBN(0xf79bf464, 0x534d5d9c),
+ TOBN(0x2537a349, 0xa6ab360b), TOBN(0xc7c54253, 0xa00c744f),
+ TOBN(0xb3c7a047, 0xe5911a76), TOBN(0x61ffa5c8, 0x647f1ee7),
+ TOBN(0x15aed36f, 0x8f56ab42), TOBN(0x6a0d41b0, 0xa3ff9ac9),
+ TOBN(0x68f469f5, 0xcc30d357), TOBN(0xbe9adf81, 0x6b72be96),
+ TOBN(0x1cd926fe, 0x903ad461), TOBN(0x7e89e38f, 0xcaca441b),
+ TOBN(0xf0f82de5, 0xfacf69d4), TOBN(0x363b7e76, 0x4775344c),
+ TOBN(0x6894f312, 0xb2e36d04), TOBN(0x3c6cb4fe, 0x11d1c9a5),
+ TOBN(0x85d9c339, 0x4008e1f2), TOBN(0x5e9a85ea, 0x249f326c),
+ TOBN(0xdc35c60a, 0x678c5e06), TOBN(0xc08b944f, 0x9f86fba9),
+ TOBN(0xde40c02c, 0x89f71f0f), TOBN(0xad8f3e31, 0xff3da3c0),
+ TOBN(0x3ea5096b, 0x42125ded), TOBN(0x13879cbf, 0xa7379183),
+ TOBN(0x6f4714a5, 0x6b306a0b), TOBN(0x359c2ea6, 0x67646c5e),
+ TOBN(0xfacf8943, 0x07726368), TOBN(0x07a58935, 0x65ff431e),
+ TOBN(0x24d661d1, 0x68754ab0), TOBN(0x801fce1d, 0x6f429a76),
+ TOBN(0xc068a85f, 0xa58ce769), TOBN(0xedc35c54, 0x5d5eca2b),
+ TOBN(0xea31276f, 0xa3f660d1), TOBN(0xa0184ebe, 0xb8fc7167),
+ TOBN(0x0f20f21a, 0x1d8db0ae), TOBN(0xd96d095f, 0x56c35e12),
+ TOBN(0xedf402b5, 0xf8c2a25b), TOBN(0x1bb772b9, 0x059204b6),
+ TOBN(0x50cbeae2, 0x19b4e34c), TOBN(0x93109d80, 0x3fa0845a),
+ TOBN(0x54f7ccf7, 0x8ef59fb5), TOBN(0x3b438fe2, 0x88070963),
+ TOBN(0x9e28c659, 0x31f3ba9b), TOBN(0x9cc31b46, 0xead9da92),
+ TOBN(0x3c2f0ba9, 0xb733aa5f), TOBN(0xdece47cb, 0xf05af235),
+ TOBN(0xf8e3f715, 0xa2ac82a5), TOBN(0xc97ba641, 0x2203f18a),
+ TOBN(0xc3af5504, 0x09c11060), TOBN(0x56ea2c05, 0x46af512d),
+ TOBN(0xfac28daf, 0xf3f28146), TOBN(0x87fab43a, 0x959ef494)}
+ ,
+ {TOBN(0x09891641, 0xd4c5105f), TOBN(0x1ae80f8e, 0x6d7fbd65),
+ TOBN(0x9d67225f, 0xbee6bdb0), TOBN(0x3b433b59, 0x7fc4d860),
+ TOBN(0x44e66db6, 0x93e85638), TOBN(0xf7b59252, 0xe3e9862f),
+ TOBN(0xdb785157, 0x665c32ec), TOBN(0x702fefd7, 0xae362f50),
+ TOBN(0x3754475d, 0x0fefb0c3), TOBN(0xd48fb56b, 0x46d7c35d),
+ TOBN(0xa070b633, 0x363798a4), TOBN(0xae89f3d2, 0x8fdb98e6),
+ TOBN(0x970b89c8, 0x6363d14c), TOBN(0x89817521, 0x67abd27d),
+ TOBN(0x9bf7d474, 0x44d5a021), TOBN(0xb3083baf, 0xcac72aee),
+ TOBN(0x389741de, 0xbe949a44), TOBN(0x638e9388, 0x546a4fa5),
+ TOBN(0x3fe6419c, 0xa0047bdc), TOBN(0x7047f648, 0xaaea57ca),
+ TOBN(0x54e48a90, 0x41fbab17), TOBN(0xda8e0b28, 0x576bdba2),
+ TOBN(0xe807eebc, 0xc72afddc), TOBN(0x07d3336d, 0xf42577bf),
+ TOBN(0x62a8c244, 0xbfe20925), TOBN(0x91c19ac3, 0x8fdce867),
+ TOBN(0x5a96a5d5, 0xdd387063), TOBN(0x61d587d4, 0x21d324f6),
+ TOBN(0xe87673a2, 0xa37173ea), TOBN(0x23848008, 0x53778b65),
+ TOBN(0x10f8441e, 0x05bab43e), TOBN(0xfa11fe12, 0x4621efbe),
+ TOBN(0x047b772e, 0x81685d7b), TOBN(0x23f27d81, 0xbf34a976),
+ TOBN(0xc27608e2, 0x915f48ef), TOBN(0x3b0b43fa, 0xa521d5c3),
+ TOBN(0x7613fb26, 0x63ca7284), TOBN(0x7f5729b4, 0x1d4db837),
+ TOBN(0x87b14898, 0x583b526b), TOBN(0x00b732a6, 0xbbadd3d1),
+ TOBN(0x8e02f426, 0x2048e396), TOBN(0x436b50b6, 0x383d9de4),
+ TOBN(0xf78d3481, 0x471e85ad), TOBN(0x8b01ea6a, 0xd005c8d6),
+ TOBN(0xd3c7afee, 0x97015c07), TOBN(0x46cdf1a9, 0x4e3ba2ae),
+ TOBN(0x7a42e501, 0x83d3a1d2), TOBN(0xd54b5268, 0xb541dff4),
+ TOBN(0x3f24cf30, 0x4e23e9bc), TOBN(0x4387f816, 0x126e3624),
+ TOBN(0x26a46a03, 0x3b0b6d61), TOBN(0xaf1bc845, 0x8b2d777c),
+ TOBN(0x25c401ba, 0x527de79c), TOBN(0x0e1346d4, 0x4261bbb6),
+ TOBN(0x4b96c44b, 0x287b4bc7), TOBN(0x658493c7, 0x5254562f),
+ TOBN(0x23f949fe, 0xb8a24a20), TOBN(0x17ebfed1, 0xf52ca53f),
+ TOBN(0x9b691bbe, 0xbcfb4853), TOBN(0x5617ff6b, 0x6278a05d),
+ TOBN(0x241b34c5, 0xe3c99ebd), TOBN(0xfc64242e, 0x1784156a),
+ TOBN(0x4206482f, 0x695d67df), TOBN(0xb967ce0e, 0xee27c011),
+ TOBN(0x65db3751, 0x21c80b5d), TOBN(0x2e7a563c, 0xa31ecca0),
+ TOBN(0xe56ffc4e, 0x5238a07e), TOBN(0x3d6c2966, 0x32ced854),
+ TOBN(0xe99d7d1a, 0xaf70b885), TOBN(0xafc3bad9, 0x2d686459),
+ TOBN(0x9c78bf46, 0x0cc8ba5b), TOBN(0x5a439519, 0x18955aa3),
+ TOBN(0xf8b517a8, 0x5fe4e314), TOBN(0xe60234d0, 0xfcb8906f),
+ TOBN(0xffe542ac, 0xf2061b23), TOBN(0x287e191f, 0x6b4cb59c),
+ TOBN(0x21857ddc, 0x09d877d8), TOBN(0x1c23478c, 0x14678941),
+ TOBN(0xbbf0c056, 0xb6e05ea4), TOBN(0x82da4b53, 0xb01594fe),
+ TOBN(0xf7526791, 0xfadb8608), TOBN(0x049e832d, 0x7b74cdf6),
+ TOBN(0xa43581cc, 0xc2b90a34), TOBN(0x73639eb8, 0x9360b10c),
+ TOBN(0x4fba331f, 0xe1e4a71b), TOBN(0x6ffd6b93, 0x8072f919),
+ TOBN(0x6e53271c, 0x65679032), TOBN(0x67206444, 0xf14272ce),
+ TOBN(0xc0f734a3, 0xb2335834), TOBN(0x9526205a, 0x90ef6860),
+ TOBN(0xcb8be717, 0x04e2bb0d), TOBN(0x2418871e, 0x02f383fa),
+ TOBN(0xd7177681, 0x4082c157), TOBN(0xcc914ad0, 0x29c20073),
+ TOBN(0xf186c1eb, 0xe587e728), TOBN(0x6fdb3c22, 0x61bcd5fd),
+ TOBN(0x30d014a6, 0xf2f9f8e9), TOBN(0x963ece23, 0x4fec49d2),
+ TOBN(0x862025c5, 0x9605a8d9), TOBN(0x39874445, 0x19f8929a),
+ TOBN(0x01b6ff65, 0x12bf476a), TOBN(0x598a64d8, 0x09cf7d91),
+ TOBN(0xd7ec7749, 0x93be56ca), TOBN(0x10899785, 0xcbb33615),
+ TOBN(0xb8a092fd, 0x02eee3ad), TOBN(0xa86b3d35, 0x30145270),
+ TOBN(0x323d98c6, 0x8512b675), TOBN(0x4b8bc785, 0x62ebb40f),
+ TOBN(0x7d301f54, 0x413f9cde), TOBN(0xa5e4fb4f, 0x2bab5664),
+ TOBN(0x1d2b252d, 0x1cbfec23), TOBN(0xfcd576bb, 0xe177120d),
+ TOBN(0x04427d3e, 0x83731a34), TOBN(0x2bb9028e, 0xed836e8e),
+ TOBN(0xb36acff8, 0xb612ca7c), TOBN(0xb88fe5ef, 0xd3d9c73a),
+ TOBN(0xbe2a6bc6, 0xedea4eb3), TOBN(0x43b93133, 0x488eec77),
+ TOBN(0xf41ff566, 0xb17106e1), TOBN(0x469e9172, 0x654efa32),
+ TOBN(0xb4480f04, 0x41c23fa3), TOBN(0xb4712eb0, 0xc1989a2e),
+ TOBN(0x3ccbba0f, 0x93a29ca7), TOBN(0x6e205c14, 0xd619428c),
+ TOBN(0x90db7957, 0xb3641686), TOBN(0x0432691d, 0x45ac8b4e),
+ TOBN(0x07a759ac, 0xf64e0350), TOBN(0x0514d89c, 0x9c972517),
+ TOBN(0x1701147f, 0xa8e67fc3), TOBN(0x9e2e0b8b, 0xab2085be),
+ TOBN(0xd5651824, 0xac284e57), TOBN(0x890d4325, 0x74893664),
+ TOBN(0x8a7c5e6e, 0xc55e68a3), TOBN(0xbf12e90b, 0x4339c85a),
+ TOBN(0x31846b85, 0xf922b655), TOBN(0x9a54ce4d, 0x0bf4d700),
+ TOBN(0xd7f4e83a, 0xf1a14295), TOBN(0x916f955c, 0xb285d4f9),
+ TOBN(0xe57bb0e0, 0x99ffdaba), TOBN(0x28a43034, 0xeab0d152),
+ TOBN(0x0a36ffa2, 0xb8a9cef8), TOBN(0x5517407e, 0xb9ec051a),
+ TOBN(0x9c796096, 0xea68e672), TOBN(0x853db5fb, 0xfb3c77fb),
+ TOBN(0x21474ba9, 0xe864a51a), TOBN(0x6c267699, 0x6e8a1b8b),
+ TOBN(0x7c823626, 0x94120a28), TOBN(0xe61e9a48, 0x8383a5db),
+ TOBN(0x7dd75003, 0x9f84216d), TOBN(0xab020d07, 0xad43cd85),
+ TOBN(0x9437ae48, 0xda12c659), TOBN(0x6449c2eb, 0xe65452ad),
+ TOBN(0xcc7c4c1c, 0x2cf9d7c1), TOBN(0x1320886a, 0xee95e5ab),
+ TOBN(0xbb7b9056, 0xbeae170c), TOBN(0xc8a5b250, 0xdbc0d662),
+ TOBN(0x4ed81432, 0xc11d2303), TOBN(0x7da66912, 0x1f03769f),
+ TOBN(0x3ac7a5fd, 0x84539828), TOBN(0x14dada94, 0x3bccdd02),
+ TOBN(0x8b84c321, 0x7ef6b0d1), TOBN(0x52a9477a, 0x7c933f22),
+ TOBN(0x5ef6728a, 0xfd440b82), TOBN(0x5c3bd859, 0x6ce4bd5e),
+ TOBN(0x918b80f5, 0xf22c2d3e), TOBN(0x368d5040, 0xb7bb6cc5),
+ TOBN(0xb66142a1, 0x2695a11c), TOBN(0x60ac583a, 0xeb19ea70),
+ TOBN(0x317cbb98, 0x0eab2437), TOBN(0x8cc08c55, 0x5e2654c8),
+ TOBN(0xfe2d6520, 0xe6d8307f), TOBN(0xe9f147f3, 0x57428993),
+ TOBN(0x5f9c7d14, 0xd2fd6cf1), TOBN(0xa3ecd064, 0x2d4fcbb0),
+ TOBN(0xad83fef0, 0x8e7341f7), TOBN(0x643f23a0, 0x3a63115c),
+ TOBN(0xd38a78ab, 0xe65ab743), TOBN(0xbf7c75b1, 0x35edc89c),
+ TOBN(0x3dd8752e, 0x530df568), TOBN(0xf85c4a76, 0xe308c682),
+ TOBN(0x4c9955b2, 0xe68acf37), TOBN(0xa544df3d, 0xab32af85),
+ TOBN(0x4b8ec3f5, 0xa25cf493), TOBN(0x4d8f2764, 0x1a622feb),
+ TOBN(0x7bb4f7aa, 0xf0dcbc49), TOBN(0x7de551f9, 0x70bbb45b),
+ TOBN(0xcfd0f3e4, 0x9f2ca2e5), TOBN(0xece58709, 0x1f5c76ef),
+ TOBN(0x32920edd, 0x167d79ae), TOBN(0x039df8a2, 0xfa7d7ec1),
+ TOBN(0xf46206c0, 0xbb30af91), TOBN(0x1ff5e2f5, 0x22676b59),
+ TOBN(0x11f4a039, 0x6ea51d66), TOBN(0x506c1445, 0x807d7a26),
+ TOBN(0x60da5705, 0x755a9b24), TOBN(0x8fc8cc32, 0x1f1a319e),
+ TOBN(0x83642d4d, 0x9433d67d), TOBN(0x7fa5cb8f, 0x6a7dd296),
+ TOBN(0x576591db, 0x9b7bde07), TOBN(0x13173d25, 0x419716fb),
+ TOBN(0xea30599d, 0xd5b340ff), TOBN(0xfc6b5297, 0xb0fe76c5),
+ TOBN(0x1c6968c8, 0xab8f5adc), TOBN(0xf723c7f5, 0x901c928d),
+ TOBN(0x4203c321, 0x9773d402), TOBN(0xdf7c6aa3, 0x1b51dd47),
+ TOBN(0x3d49e37a, 0x552be23c), TOBN(0x57febee8, 0x0b5a6e87),
+ TOBN(0xc5ecbee4, 0x7bd8e739), TOBN(0x79d44994, 0xae63bf75),
+ TOBN(0x168bd00f, 0x38fb8923), TOBN(0x75d48ee4, 0xd0533130),
+ TOBN(0x554f77aa, 0xdb5cdf33), TOBN(0x3396e896, 0x3c696769),
+ TOBN(0x2fdddbf2, 0xd3fd674e), TOBN(0xbbb8f6ee, 0x99d0e3e5),
+ TOBN(0x51b90651, 0xcbae2f70), TOBN(0xefc4bc05, 0x93aaa8eb),
+ TOBN(0x8ecd8689, 0xdd1df499), TOBN(0x1aee99a8, 0x22f367a5),
+ TOBN(0x95d485b9, 0xae8274c5), TOBN(0x6c14d445, 0x7d30b39c),
+ TOBN(0xbafea90b, 0xbcc1ef81), TOBN(0x7c5f317a, 0xa459a2ed),
+ TOBN(0x01211075, 0x4ef44227), TOBN(0xa17bed6e, 0xdc20f496),
+ TOBN(0x0cdfe424, 0x819853cd), TOBN(0x13793298, 0xf71e2ce7),
+ TOBN(0x3c1f3078, 0xdbbe307b), TOBN(0x6dd1c20e, 0x76ee9936),
+ TOBN(0x23ee4b57, 0x423caa20), TOBN(0x4ac3793b, 0x8efb840e),
+ TOBN(0x934438eb, 0xed1f8ca0), TOBN(0x3e546658, 0x4ebb25a2),
+ TOBN(0xc415af0e, 0xc069896f), TOBN(0xc13eddb0, 0x9a5aa43d),
+ TOBN(0x7a04204f, 0xd49eb8f6), TOBN(0xd0d5bdfc, 0xd74f1670),
+ TOBN(0x3697e286, 0x56fc0558), TOBN(0x10207371, 0x01cebade),
+ TOBN(0x5f87e690, 0x0647a82b), TOBN(0x908e0ed4, 0x8f40054f),
+ TOBN(0xa9f633d4, 0x79853803), TOBN(0x8ed13c9a, 0x4a28b252),
+ TOBN(0x3e2ef676, 0x1f460f64), TOBN(0x53930b9b, 0x36d06336),
+ TOBN(0x347073ac, 0x8fc4979b), TOBN(0x84380e0e, 0x5ecd5597),
+ TOBN(0xe3b22c6b, 0xc4fe3c39), TOBN(0xba4a8153, 0x6c7bebdf),
+ TOBN(0xf23ab6b7, 0x25693459), TOBN(0x53bc3770, 0x14922b11),
+ TOBN(0x4645c8ab, 0x5afc60db), TOBN(0xaa022355, 0x20b9f2a3),
+ TOBN(0x52a2954c, 0xce0fc507), TOBN(0x8c2731bb, 0x7ce1c2e7),
+ TOBN(0xf39608ab, 0x18a0339d), TOBN(0xac7a658d, 0x3735436c),
+ TOBN(0xb22c2b07, 0xcd992b4f), TOBN(0x4e83daec, 0xf40dcfd4),
+ TOBN(0x8a34c7be, 0x2f39ea3e), TOBN(0xef0c005f, 0xb0a56d2e),
+ TOBN(0x62731f6a, 0x6edd8038), TOBN(0x5721d740, 0x4e3cb075),
+ TOBN(0x1ea41511, 0xfbeeee1b), TOBN(0xd1ef5e73, 0xef1d0c05),
+ TOBN(0x42feefd1, 0x73c07d35), TOBN(0xe530a00a, 0x8a329493),
+ TOBN(0x5d55b7fe, 0xf15ebfb0), TOBN(0x549de03c, 0xd322491a),
+ TOBN(0xf7b5f602, 0x745b3237), TOBN(0x3632a3a2, 0x1ab6e2b6),
+ TOBN(0x0d3bba89, 0x0ef59f78), TOBN(0x0dfc6443, 0xc9e52b9a),
+ TOBN(0x1dc79699, 0x72631447), TOBN(0xef033917, 0xb3be20b1),
+ TOBN(0x0c92735d, 0xb1383948), TOBN(0xc1fc29a2, 0xc0dd7d7d),
+ TOBN(0x6485b697, 0x403ed068), TOBN(0x13bfaab3, 0xaac93bdc),
+ TOBN(0x410dc6a9, 0x0deeaf52), TOBN(0xb003fb02, 0x4c641c15),
+ TOBN(0x1384978c, 0x5bc504c4), TOBN(0x37640487, 0x864a6a77),
+ TOBN(0x05991bc6, 0x222a77da), TOBN(0x62260a57, 0x5e47eb11),
+ TOBN(0xc7af6613, 0xf21b432c), TOBN(0x22f3acc9, 0xab4953e9),
+ TOBN(0x52934922, 0x8e41d155), TOBN(0x4d024568, 0x3ac059ef),
+ TOBN(0xb0201755, 0x4d884411), TOBN(0xce8055cf, 0xa59a178f),
+ TOBN(0xcd77d1af, 0xf6204549), TOBN(0xa0a00a3e, 0xc7066759),
+ TOBN(0x471071ef, 0x0272c229), TOBN(0x009bcf6b, 0xd3c4b6b0),
+ TOBN(0x2a2638a8, 0x22305177), TOBN(0xd51d59df, 0x41645bbf),
+ TOBN(0xa81142fd, 0xc0a7a3c0), TOBN(0xa17eca6d, 0x4c7063ee),
+ TOBN(0x0bb887ed, 0x60d9dcec), TOBN(0xd6d28e51, 0x20ad2455),
+ TOBN(0xebed6308, 0xa67102ba), TOBN(0x042c3114, 0x8bffa408),
+ TOBN(0xfd099ac5, 0x8aa68e30), TOBN(0x7a6a3d7c, 0x1483513e),
+ TOBN(0xffcc6b75, 0xba2d8f0c), TOBN(0x54dacf96, 0x1e78b954),
+ TOBN(0xf645696f, 0xa4a9af89), TOBN(0x3a411940, 0x06ac98ec),
+ TOBN(0x41b8b3f6, 0x22a67a20), TOBN(0x2d0b1e0f, 0x99dec626),
+ TOBN(0x27c89192, 0x40be34e8), TOBN(0xc7162b37, 0x91907f35),
+ TOBN(0x90188ec1, 0xa956702b), TOBN(0xca132f7d, 0xdf93769c),
+ TOBN(0x3ece44f9, 0x0e2025b4), TOBN(0x67aaec69, 0x0c62f14c),
+ TOBN(0xad741418, 0x22e3cc11), TOBN(0xcf9b75c3, 0x7ff9a50e),
+ TOBN(0x02fa2b16, 0x4d348272), TOBN(0xbd99d61a, 0x9959d56d),
+ TOBN(0xbc4f19db, 0x18762916), TOBN(0xcc7cce50, 0x49c1ac80),
+ TOBN(0x4d59ebaa, 0xd846bd83), TOBN(0x8775a9dc, 0xa9202849),
+ TOBN(0x07ec4ae1, 0x6e1f4ca9), TOBN(0x27eb5875, 0xba893f11),
+ TOBN(0x00284d51, 0x662cc565), TOBN(0x82353a6b, 0x0db4138d),
+ TOBN(0xd9c7aaaa, 0xaa32a594), TOBN(0xf5528b5e, 0xa5669c47),
+ TOBN(0xf3220231, 0x2f23c5ff), TOBN(0xe3e8147a, 0x6affa3a1),
+ TOBN(0xfb423d5c, 0x202ddda0), TOBN(0x3d6414ac, 0x6b871bd4),
+ TOBN(0x586f82e1, 0xa51a168a), TOBN(0xb712c671, 0x48ae5448),
+ TOBN(0x9a2e4bd1, 0x76233eb8), TOBN(0x0188223a, 0x78811ca9),
+ TOBN(0x553c5e21, 0xf7c18de1), TOBN(0x7682e451, 0xb27bb286),
+ TOBN(0x3ed036b3, 0x0e51e929), TOBN(0xf487211b, 0xec9cb34f),
+ TOBN(0x0d094277, 0x0c24efc8), TOBN(0x0349fd04, 0xbef737a4),
+ TOBN(0x6d1c9dd2, 0x514cdd28), TOBN(0x29c135ff, 0x30da9521),
+ TOBN(0xea6e4508, 0xf78b0b6f), TOBN(0x176f5dd2, 0x678c143c),
+ TOBN(0x08148418, 0x4be21e65), TOBN(0x27f7525c, 0xe7df38c4),
+ TOBN(0x1fb70e09, 0x748ab1a4), TOBN(0x9cba50a0, 0x5efe4433),
+ TOBN(0x7846c7a6, 0x15f75af2), TOBN(0x2a7c2c57, 0x5ee73ea8),
+ TOBN(0x42e566a4, 0x3f0a449a), TOBN(0x45474c3b, 0xad90fc3d),
+ TOBN(0x7447be3d, 0x8b61d057), TOBN(0x3e9d1cf1, 0x3a4ec092),
+ TOBN(0x1603e453, 0xf380a6e6), TOBN(0x0b86e431, 0x9b1437c2),
+ TOBN(0x7a4173f2, 0xef29610a), TOBN(0x8fa729a7, 0xf03d57f7),
+ TOBN(0x3e186f6e, 0x6c9c217e), TOBN(0xbe1d3079, 0x91919524),
+ TOBN(0x92a62a70, 0x153d4fb1), TOBN(0x32ed3e34, 0xd68c2f71),
+ TOBN(0xd785027f, 0x9eb1a8b7), TOBN(0xbc37eb77, 0xc5b22fe8),
+ TOBN(0x466b34f0, 0xb9d6a191), TOBN(0x008a89af, 0x9a05f816),
+ TOBN(0x19b028fb, 0x7d42c10a), TOBN(0x7fe8c92f, 0x49b3f6b8),
+ TOBN(0x58907cc0, 0xa5a0ade3), TOBN(0xb3154f51, 0x559d1a7c),
+ TOBN(0x5066efb6, 0xd9790ed6), TOBN(0xa77a0cbc, 0xa6aa793b),
+ TOBN(0x1a915f3c, 0x223e042e), TOBN(0x1c5def04, 0x69c5874b),
+ TOBN(0x0e830078, 0x73b6c1da), TOBN(0x55cf85d2, 0xfcd8557a),
+ TOBN(0x0f7c7c76, 0x0460f3b1), TOBN(0x87052acb, 0x46e58063),
+ TOBN(0x09212b80, 0x907eae66), TOBN(0x3cb068e0, 0x4d721c89),
+ TOBN(0xa87941ae, 0xdd45ac1c), TOBN(0xde8d5c0d, 0x0daa0dbb),
+ TOBN(0xda421fdc, 0xe3502e6e), TOBN(0xc8944201, 0x4d89a084),
+ TOBN(0x7307ba5e, 0xf0c24bfb), TOBN(0xda212beb, 0x20bde0ef),
+ TOBN(0xea2da24b, 0xf82ce682), TOBN(0x058d3816, 0x07f71fe4),
+ TOBN(0x35a02462, 0x5ffad8de), TOBN(0xcd7b05dc, 0xaadcefab),
+ TOBN(0xd442f8ed, 0x1d9f54ec), TOBN(0x8be3d618, 0xb2d3b5ca),
+ TOBN(0xe2220ed0, 0xe06b2ce2), TOBN(0x82699a5f, 0x1b0da4c0),
+ TOBN(0x3ff106f5, 0x71c0c3a7), TOBN(0x8f580f5a, 0x0d34180c),
+ TOBN(0x4ebb120e, 0x22d7d375), TOBN(0x5e5782cc, 0xe9513675),
+ TOBN(0x2275580c, 0x99c82a70), TOBN(0xe8359fbf, 0x15ea8c4c),
+ TOBN(0x53b48db8, 0x7b415e70), TOBN(0xaacf2240, 0x100c6014),
+ TOBN(0x9faaccf5, 0xe4652f1d), TOBN(0xbd6fdd2a, 0xd56157b2),
+ TOBN(0xa4f4fb1f, 0x6261ec50), TOBN(0x244e55ad, 0x476bcd52),
+ TOBN(0x881c9305, 0x047d320b), TOBN(0x1ca983d5, 0x6181263f),
+ TOBN(0x354e9a44, 0x278fb8ee), TOBN(0xad2dbc0f, 0x396e4964),
+ TOBN(0x723f3aa2, 0x9268b3de), TOBN(0x0d1ca29a, 0xe6e0609a),
+ TOBN(0x794866aa, 0x6cf44252), TOBN(0x0b59f3e3, 0x01af87ed),
+ TOBN(0xe234e5ff, 0x7f4a6c51), TOBN(0xa8768fd2, 0x61dc2f7e),
+ TOBN(0xdafc7332, 0x0a94d81f), TOBN(0xd7f84282, 0x06938ce1),
+ TOBN(0xae0b3c0e, 0x0546063e), TOBN(0x7fbadcb2, 0x5d61abc6),
+ TOBN(0xd5d7a2c9, 0x369ac400), TOBN(0xa5978d09, 0xae67d10c),
+ TOBN(0x290f211e, 0x4f85eaac), TOBN(0xe61e2ad1, 0xfacac681),
+ TOBN(0xae125225, 0x388384cd), TOBN(0xa7fb68e9, 0xccfde30f),
+ TOBN(0x7a59b936, 0x3daed4c2), TOBN(0x80a9aa40, 0x2606f789),
+ TOBN(0xb40c1ea5, 0xf6a6d90a), TOBN(0x948364d3, 0x514d5885),
+ TOBN(0x062ebc60, 0x70985182), TOBN(0xa6db5b0e, 0x33310895),
+ TOBN(0x64a12175, 0xe329c2f5), TOBN(0xc5f25bd2, 0x90ea237e),
+ TOBN(0x7915c524, 0x2d0a4c23), TOBN(0xeb5d26e4, 0x6bb3cc52),
+ TOBN(0x369a9116, 0xc09e2c92), TOBN(0x0c527f92, 0xcf182cf8),
+ TOBN(0x9e591938, 0x2aede0ac), TOBN(0xb2922208, 0x6cc34939),
+ TOBN(0x3c9d8962, 0x99a34361), TOBN(0x3c81836d, 0xc1905fe6),
+ TOBN(0x4bfeb57f, 0xa001ec5a), TOBN(0xe993f5bb, 0xa0dc5dba),
+ TOBN(0x47884109, 0x724a1380), TOBN(0x8a0369ab, 0x32fe9a04),
+ TOBN(0xea068d60, 0x8c927db8), TOBN(0xbf5f37cf, 0x94655741),
+ TOBN(0x47d402a2, 0x04b6c7ea), TOBN(0x4551c295, 0x6af259cb),
+ TOBN(0x698b71e7, 0xed77ee8b), TOBN(0xbddf7bd0, 0xf309d5c7),
+ TOBN(0x6201c22c, 0x34e780ca), TOBN(0xab04f7d8, 0x4c295ef4),
+ TOBN(0x1c947294, 0x4313a8ce), TOBN(0xe532e4ac, 0x92ca4cfe),
+ TOBN(0x89738f80, 0xd0a7a97a), TOBN(0xec088c88, 0xa580fd5b),
+ TOBN(0x612b1ecc, 0x42ce9e51), TOBN(0x8f9840fd, 0xb25fdd2a),
+ TOBN(0x3cda78c0, 0x01e7f839), TOBN(0x546b3d3a, 0xece05480),
+ TOBN(0x271719a9, 0x80d30916), TOBN(0x45497107, 0x584c20c4),
+ TOBN(0xaf8f9478, 0x5bc78608), TOBN(0x28c7d484, 0x277e2a4c),
+ TOBN(0xfce01767, 0x88a2ffe4), TOBN(0xdc506a35, 0x28e169a5),
+ TOBN(0x0ea10861, 0x7af9c93a), TOBN(0x1ed24361, 0x03fa0e08),
+ TOBN(0x96eaaa92, 0xa3d694e7), TOBN(0xc0f43b4d, 0xef50bc74),
+ TOBN(0xce6aa58c, 0x64114db4), TOBN(0x8218e8ea, 0x7c000fd4),
+ TOBN(0xac815dfb, 0x185f8844), TOBN(0xcd7e90cb, 0x1557abfb),
+ TOBN(0x23d16655, 0xafbfecdf), TOBN(0x80f3271f, 0x085cac4a),
+ TOBN(0x7fc39aa7, 0xd0e62f47), TOBN(0x88d519d1, 0x460a48e5),
+ TOBN(0x59559ac4, 0xd28f101e), TOBN(0x7981d9e9, 0xca9ae816),
+ TOBN(0x5c38652c, 0x9ac38203), TOBN(0x86eaf87f, 0x57657fe5),
+ TOBN(0x568fc472, 0xe21f5416), TOBN(0x2afff39c, 0xe7e597b5),
+ TOBN(0x3adbbb07, 0x256d4eab), TOBN(0x22598692, 0x8285ab89),
+ TOBN(0x35f8112a, 0x041caefe), TOBN(0x95df02e3, 0xa5064c8b),
+ TOBN(0x4d63356e, 0xc7004bf3), TOBN(0x230a08f4, 0xdb83c7de),
+ TOBN(0xca27b270, 0x8709a7b7), TOBN(0x0d1c4cc4, 0xcb9abd2d),
+ TOBN(0x8a0bc66e, 0x7550fee8), TOBN(0x369cd4c7, 0x9cf7247e),
+ TOBN(0x75562e84, 0x92b5b7e7), TOBN(0x8fed0da0, 0x5802af7b),
+ TOBN(0x6a7091c2, 0xe48fb889), TOBN(0x26882c13, 0x7b8a9d06),
+ TOBN(0xa2498663, 0x1b82a0e2), TOBN(0x844ed736, 0x3518152d),
+ TOBN(0x282f476f, 0xd86e27c7), TOBN(0xa04edaca, 0x04afefdc),
+ TOBN(0x8b256ebc, 0x6119e34d), TOBN(0x56a413e9, 0x0787d78b)}
+ ,
+ {TOBN(0x82ee061d, 0x5a74be50), TOBN(0xe41781c4, 0xdea16ff5),
+ TOBN(0xe0b0c81e, 0x99bfc8a2), TOBN(0x624f4d69, 0x0b547e2d),
+ TOBN(0x3a83545d, 0xbdcc9ae4), TOBN(0x2573dbb6, 0x409b1e8e),
+ TOBN(0x482960c4, 0xa6c93539), TOBN(0xf01059ad, 0x5ae18798),
+ TOBN(0x715c9f97, 0x3112795f), TOBN(0xe8244437, 0x984e6ee1),
+ TOBN(0x55cb4858, 0xecb66bcd), TOBN(0x7c136735, 0xabaffbee),
+ TOBN(0x54661595, 0x5dbec38e), TOBN(0x51c0782c, 0x388ad153),
+ TOBN(0x9ba4c53a, 0xc6e0952f), TOBN(0x27e6782a, 0x1b21dfa8),
+ TOBN(0x682f903d, 0x4ed2dbc2), TOBN(0x0eba59c8, 0x7c3b2d83),
+ TOBN(0x8e9dc84d, 0x9c7e9335), TOBN(0x5f9b21b0, 0x0eb226d7),
+ TOBN(0xe33bd394, 0xaf267bae), TOBN(0xaa86cc25, 0xbe2e15ae),
+ TOBN(0x4f0bf67d, 0x6a8ec500), TOBN(0x5846aa44, 0xf9630658),
+ TOBN(0xfeb09740, 0xe2c2bf15), TOBN(0x627a2205, 0xa9e99704),
+ TOBN(0xec8d73d0, 0xc2fbc565), TOBN(0x223eed8f, 0xc20c8de8),
+ TOBN(0x1ee32583, 0xa8363b49), TOBN(0x1a0b6cb9, 0xc9c2b0a6),
+ TOBN(0x49f7c3d2, 0x90dbc85c), TOBN(0xa8dfbb97, 0x1ef4c1ac),
+ TOBN(0xafb34d4c, 0x65c7c2ab), TOBN(0x1d4610e7, 0xe2c5ea84),
+ TOBN(0x893f6d1b, 0x973c4ab5), TOBN(0xa3cdd7e9, 0x945ba5c4),
+ TOBN(0x60514983, 0x064417ee), TOBN(0x1459b23c, 0xad6bdf2b),
+ TOBN(0x23b2c341, 0x5cf726c3), TOBN(0x3a829635, 0x32d6354a),
+ TOBN(0x294f901f, 0xab192c18), TOBN(0xec5fcbfe, 0x7030164f),
+ TOBN(0xe2e2fcb7, 0xe2246ba6), TOBN(0x1e7c88b3, 0x221a1a0c),
+ TOBN(0x72c7dd93, 0xc92d88c5), TOBN(0x41c2148e, 0x1106fb59),
+ TOBN(0x547dd4f5, 0xa0f60f14), TOBN(0xed9b52b2, 0x63960f31),
+ TOBN(0x6c8349eb, 0xb0a5b358), TOBN(0xb154c5c2, 0x9e7e2ed6),
+ TOBN(0xcad5eccf, 0xeda462db), TOBN(0xf2d6dbe4, 0x2de66b69),
+ TOBN(0x426aedf3, 0x8665e5b2), TOBN(0x488a8513, 0x7b7f5723),
+ TOBN(0x15cc43b3, 0x8bcbb386), TOBN(0x27ad0af3, 0xd791d879),
+ TOBN(0xc16c236e, 0x846e364f), TOBN(0x7f33527c, 0xdea50ca0),
+ TOBN(0xc4810775, 0x0926b86d), TOBN(0x6c2a3609, 0x0598e70c),
+ TOBN(0xa6755e52, 0xf024e924), TOBN(0xe0fa07a4, 0x9db4afca),
+ TOBN(0x15c3ce7d, 0x66831790), TOBN(0x5b4ef350, 0xa6cbb0d6),
+ TOBN(0x2c4aafc4, 0xb6205969), TOBN(0x42563f02, 0xf6c7854f),
+ TOBN(0x016aced5, 0x1d983b48), TOBN(0xfeb356d8, 0x99949755),
+ TOBN(0x8c2a2c81, 0xd1a39bd7), TOBN(0x8f44340f, 0xe6934ae9),
+ TOBN(0x148cf91c, 0x447904da), TOBN(0x7340185f, 0x0f51a926),
+ TOBN(0x2f8f00fb, 0x7409ab46), TOBN(0x057e78e6, 0x80e289b2),
+ TOBN(0x03e5022c, 0xa888e5d1), TOBN(0x3c87111a, 0x9dede4e2),
+ TOBN(0x5b9b0e1c, 0x7809460b), TOBN(0xe751c852, 0x71c9abc7),
+ TOBN(0x8b944e28, 0xc7cc1dc9), TOBN(0x4f201ffa, 0x1d3cfa08),
+ TOBN(0x02fc905c, 0x3e6721ce), TOBN(0xd52d70da, 0xd0b3674c),
+ TOBN(0x5dc2e5ca, 0x18810da4), TOBN(0xa984b273, 0x5c69dd99),
+ TOBN(0x63b92527, 0x84de5ca4), TOBN(0x2f1c9872, 0xc852dec4),
+ TOBN(0x18b03593, 0xc2e3de09), TOBN(0x19d70b01, 0x9813dc2f),
+ TOBN(0x42806b2d, 0xa6dc1d29), TOBN(0xd3030009, 0xf871e144),
+ TOBN(0xa1feb333, 0xaaf49276), TOBN(0xb5583b9e, 0xc70bc04b),
+ TOBN(0x1db0be78, 0x95695f20), TOBN(0xfc841811, 0x89d012b5),
+ TOBN(0x6409f272, 0x05f61643), TOBN(0x40d34174, 0xd5883128),
+ TOBN(0xd79196f5, 0x67419833), TOBN(0x6059e252, 0x863b7b08),
+ TOBN(0x84da1817, 0x1c56700c), TOBN(0x5758ee56, 0xb28d3ec4),
+ TOBN(0x7da2771d, 0x013b0ea6), TOBN(0xfddf524b, 0x54c5e9b9),
+ TOBN(0x7df4faf8, 0x24305d80), TOBN(0x58f5c1bf, 0x3a97763f),
+ TOBN(0xa5af37f1, 0x7c696042), TOBN(0xd4cba22c, 0x4a2538de),
+ TOBN(0x211cb995, 0x9ea42600), TOBN(0xcd105f41, 0x7b069889),
+ TOBN(0xb1e1cf19, 0xddb81e74), TOBN(0x472f2d89, 0x5157b8ca),
+ TOBN(0x086fb008, 0xee9db885), TOBN(0x365cd570, 0x0f26d131),
+ TOBN(0x284b02bb, 0xa2be7053), TOBN(0xdcbbf7c6, 0x7ab9a6d6),
+ TOBN(0x4425559c, 0x20f7a530), TOBN(0x961f2dfa, 0x188767c8),
+ TOBN(0xe2fd9435, 0x70dc80c4), TOBN(0x104d6b63, 0xf0784120),
+ TOBN(0x7f592bc1, 0x53567122), TOBN(0xf6bc1246, 0xf688ad77),
+ TOBN(0x05214c05, 0x0f15dde9), TOBN(0xa47a76a8, 0x0d5f2b82),
+ TOBN(0xbb254d30, 0x62e82b62), TOBN(0x11a05fe0, 0x3ec955ee),
+ TOBN(0x7eaff46e, 0x9d529b36), TOBN(0x55ab1301, 0x8f9e3df6),
+ TOBN(0xc463e371, 0x99317698), TOBN(0xfd251438, 0xccda47ad),
+ TOBN(0xca9c3547, 0x23d695ea), TOBN(0x48ce626e, 0x16e589b5),
+ TOBN(0x6b5b64c7, 0xb187d086), TOBN(0xd02e1794, 0xb2207948),
+ TOBN(0x8b58e98f, 0x7198111d), TOBN(0x90ca6305, 0xdcf9c3cc),
+ TOBN(0x5691fe72, 0xf34089b0), TOBN(0x60941af1, 0xfc7c80ff),
+ TOBN(0xa09bc0a2, 0x22eb51e5), TOBN(0xc0bb7244, 0xaa9cf09a),
+ TOBN(0x36a8077f, 0x80159f06), TOBN(0x8b5c989e, 0xdddc560e),
+ TOBN(0x19d2f316, 0x512e1f43), TOBN(0x02eac554, 0xad08ff62),
+ TOBN(0x012ab84c, 0x07d20b4e), TOBN(0x37d1e115, 0xd6d4e4e1),
+ TOBN(0xb6443e1a, 0xab7b19a8), TOBN(0xf08d067e, 0xdef8cd45),
+ TOBN(0x63adf3e9, 0x685e03da), TOBN(0xcf15a10e, 0x4792b916),
+ TOBN(0xf44bcce5, 0xb738a425), TOBN(0xebe131d5, 0x9636b2fd),
+ TOBN(0x94068841, 0x7850d605), TOBN(0x09684eaa, 0xb40d749d),
+ TOBN(0x8c3c669c, 0x72ba075b), TOBN(0x89f78b55, 0xba469015),
+ TOBN(0x5706aade, 0x3e9f8ba8), TOBN(0x6d8bd565, 0xb32d7ed7),
+ TOBN(0x25f4e63b, 0x805f08d6), TOBN(0x7f48200d, 0xc3bcc1b5),
+ TOBN(0x4e801968, 0xb025d847), TOBN(0x74afac04, 0x87cbe0a8),
+ TOBN(0x43ed2c2b, 0x7e63d690), TOBN(0xefb6bbf0, 0x0223cdb8),
+ TOBN(0x4fec3cae, 0x2884d3fe), TOBN(0x065ecce6, 0xd75e25a4),
+ TOBN(0x6c2294ce, 0x69f79071), TOBN(0x0d9a8e5f, 0x044b8666),
+ TOBN(0x5009f238, 0x17b69d8f), TOBN(0x3c29f8fe, 0xc5dfdaf7),
+ TOBN(0x9067528f, 0xebae68c4), TOBN(0x5b385632, 0x30c5ba21),
+ TOBN(0x540df119, 0x1fdd1aec), TOBN(0xcf37825b, 0xcfba4c78),
+ TOBN(0x77eff980, 0xbeb11454), TOBN(0x40a1a991, 0x60c1b066),
+ TOBN(0xe8018980, 0xf889a1c7), TOBN(0xb9c52ae9, 0x76c24be0),
+ TOBN(0x05fbbcce, 0x45650ef4), TOBN(0xae000f10, 0x8aa29ac7),
+ TOBN(0x884b7172, 0x4f04c470), TOBN(0x7cd4fde2, 0x19bb5c25),
+ TOBN(0x6477b22a, 0xe8840869), TOBN(0xa8868859, 0x5fbd0686),
+ TOBN(0xf23cc02e, 0x1116dfba), TOBN(0x76cd563f, 0xd87d7776),
+ TOBN(0xe2a37598, 0xa9d82abf), TOBN(0x5f188ccb, 0xe6c170f5),
+ TOBN(0x81682200, 0x5066b087), TOBN(0xda22c212, 0xc7155ada),
+ TOBN(0x151e5d3a, 0xfbddb479), TOBN(0x4b606b84, 0x6d715b99),
+ TOBN(0x4a73b54b, 0xf997cb2e), TOBN(0x9a1bfe43, 0x3ecd8b66),
+ TOBN(0x1c312809, 0x2a67d48a), TOBN(0xcd6a671e, 0x031fa9e2),
+ TOBN(0xbec3312a, 0x0e43a34a), TOBN(0x1d935639, 0x55ef47d3),
+ TOBN(0x5ea02489, 0x8fea73ea), TOBN(0x8247b364, 0xa035afb2),
+ TOBN(0xb58300a6, 0x5265b54c), TOBN(0x3286662f, 0x722c7148),
+ TOBN(0xb77fd76b, 0xb4ec4c20), TOBN(0xf0a12fa7, 0x0f3fe3fd),
+ TOBN(0xf845bbf5, 0x41d8c7e8), TOBN(0xe4d969ca, 0x5ec10aa8),
+ TOBN(0x4c0053b7, 0x43e232a3), TOBN(0xdc7a3fac, 0x37f8a45a),
+ TOBN(0x3c4261c5, 0x20d81c8f), TOBN(0xfd4b3453, 0xb00eab00),
+ TOBN(0x76d48f86, 0xd36e3062), TOBN(0x626c5277, 0xa143ff02),
+ TOBN(0x538174de, 0xaf76f42e), TOBN(0x2267aa86, 0x6407ceac),
+ TOBN(0xfad76351, 0x72e572d5), TOBN(0xab861af7, 0xba7330eb),
+ TOBN(0xa0a1c8c7, 0x418d8657), TOBN(0x988821cb, 0x20289a52),
+ TOBN(0x79732522, 0xcccc18ad), TOBN(0xaadf3f8d, 0xf1a6e027),
+ TOBN(0xf7382c93, 0x17c2354d), TOBN(0x5ce1680c, 0xd818b689),
+ TOBN(0x359ebbfc, 0xd9ecbee9), TOBN(0x4330689c, 0x1cae62ac),
+ TOBN(0xb55ce5b4, 0xc51ac38a), TOBN(0x7921dfea, 0xfe238ee8),
+ TOBN(0x3972bef8, 0x271d1ca5), TOBN(0x3e423bc7, 0xe8aabd18),
+ TOBN(0x57b09f3f, 0x44a3e5e3), TOBN(0x5da886ae, 0x7b444d66),
+ TOBN(0x68206634, 0xa9964375), TOBN(0x356a2fa3, 0x699cd0ff),
+ TOBN(0xaf0faa24, 0xdba515e9), TOBN(0x536e1f5c, 0xb321d79a),
+ TOBN(0xd3b9913a, 0x5c04e4ea), TOBN(0xd549dcfe, 0xd6f11513),
+ TOBN(0xee227bf5, 0x79fd1d94), TOBN(0x9f35afee, 0xb43f2c67),
+ TOBN(0xd2638d24, 0xf1314f53), TOBN(0x62baf948, 0xcabcd822),
+ TOBN(0x5542de29, 0x4ef48db0), TOBN(0xb3eb6a04, 0xfc5f6bb2),
+ TOBN(0x23c110ae, 0x1208e16a), TOBN(0x1a4d15b5, 0xf8363e24),
+ TOBN(0x30716844, 0x164be00b), TOBN(0xa8e24824, 0xf6f4690d),
+ TOBN(0x548773a2, 0x90b170cf), TOBN(0xa1bef331, 0x42f191f4),
+ TOBN(0x70f418d0, 0x9247aa97), TOBN(0xea06028e, 0x48be9147),
+ TOBN(0xe13122f3, 0xdbfb894e), TOBN(0xbe9b79f6, 0xce274b18),
+ TOBN(0x85a49de5, 0xca58aadf), TOBN(0x24957758, 0x11487351),
+ TOBN(0x111def61, 0xbb939099), TOBN(0x1d6a974a, 0x26d13694),
+ TOBN(0x4474b4ce, 0xd3fc253b), TOBN(0x3a1485e6, 0x4c5db15e),
+ TOBN(0xe79667b4, 0x147c15b4), TOBN(0xe34f553b, 0x7bc61301),
+ TOBN(0x032b80f8, 0x17094381), TOBN(0x55d8bafd, 0x723eaa21),
+ TOBN(0x5a987995, 0xf1c0e74e), TOBN(0x5a9b292e, 0xebba289c),
+ TOBN(0x413cd4b2, 0xeb4c8251), TOBN(0x98b5d243, 0xd162db0a),
+ TOBN(0xbb47bf66, 0x68342520), TOBN(0x08d68949, 0xbaa862d1),
+ TOBN(0x11f349c7, 0xe906abcd), TOBN(0x454ce985, 0xed7bf00e),
+ TOBN(0xacab5c9e, 0xb55b803b), TOBN(0xb03468ea, 0x31e3c16d),
+ TOBN(0x5c24213d, 0xd273bf12), TOBN(0x211538eb, 0x71587887),
+ TOBN(0x198e4a2f, 0x731dea2d), TOBN(0xd5856cf2, 0x74ed7b2a),
+ TOBN(0x86a632eb, 0x13a664fe), TOBN(0x932cd909, 0xbda41291),
+ TOBN(0x850e95d4, 0xc0c4ddc0), TOBN(0xc0f422f8, 0x347fc2c9),
+ TOBN(0xe68cbec4, 0x86076bcb), TOBN(0xf9e7c0c0, 0xcd6cd286),
+ TOBN(0x65994ddb, 0x0f5f27ca), TOBN(0xe85461fb, 0xa80d59ff),
+ TOBN(0xff05481a, 0x66601023), TOBN(0xc665427a, 0xfc9ebbfb),
+ TOBN(0xb0571a69, 0x7587fd52), TOBN(0x935289f8, 0x8d49efce),
+ TOBN(0x61becc60, 0xea420688), TOBN(0xb22639d9, 0x13a786af),
+ TOBN(0x1a8e6220, 0x361ecf90), TOBN(0x001f23e0, 0x25506463),
+ TOBN(0xe4ae9b5d, 0x0a5c2b79), TOBN(0xebc9cdad, 0xd8149db5),
+ TOBN(0xb33164a1, 0x934aa728), TOBN(0x750eb00e, 0xae9b60f3),
+ TOBN(0x5a91615b, 0x9b9cfbfd), TOBN(0x97015cbf, 0xef45f7f6),
+ TOBN(0xb462c4a5, 0xbf5151df), TOBN(0x21adcc41, 0xb07118f2),
+ TOBN(0xd60c545b, 0x043fa42c), TOBN(0xfc21aa54, 0xe96be1ab),
+ TOBN(0xe84bc32f, 0x4e51ea80), TOBN(0x3dae45f0, 0x259b5d8d),
+ TOBN(0xbb73c7eb, 0xc38f1b5e), TOBN(0xe405a74a, 0xe8ae617d),
+ TOBN(0xbb1ae9c6, 0x9f1c56bd), TOBN(0x8c176b98, 0x49f196a4),
+ TOBN(0xc448f311, 0x6875092b), TOBN(0xb5afe3de, 0x9f976033),
+ TOBN(0xa8dafd49, 0x145813e5), TOBN(0x687fc4d9, 0xe2b34226),
+ TOBN(0xf2dfc92d, 0x4c7ff57f), TOBN(0x004e3fc1, 0x401f1b46),
+ TOBN(0x5afddab6, 0x1430c9ab), TOBN(0x0bdd41d3, 0x2238e997),
+ TOBN(0xf0947430, 0x418042ae), TOBN(0x71f9adda, 0xcdddc4cb),
+ TOBN(0x7090c016, 0xc52dd907), TOBN(0xd9bdf44d, 0x29e2047f),
+ TOBN(0xe6f1fe80, 0x1b1011a6), TOBN(0xb63accbc, 0xd9acdc78),
+ TOBN(0xcfc7e235, 0x1272a95b), TOBN(0x0c667717, 0xa6276ac8),
+ TOBN(0x3c0d3709, 0xe2d7eef7), TOBN(0x5add2b06, 0x9a685b3e),
+ TOBN(0x363ad32d, 0x14ea5d65), TOBN(0xf8e01f06, 0x8d7dd506),
+ TOBN(0xc9ea2213, 0x75b4aac6), TOBN(0xed2a2bf9, 0x0d353466),
+ TOBN(0x439d79b5, 0xe9d3a7c3), TOBN(0x8e0ee5a6, 0x81b7f34b),
+ TOBN(0xcf3dacf5, 0x1dc4ba75), TOBN(0x1d3d1773, 0xeb3310c7),
+ TOBN(0xa8e67112, 0x7747ae83), TOBN(0x31f43160, 0x197d6b40),
+ TOBN(0x0521ccee, 0xcd961400), TOBN(0x67246f11, 0xf6535768),
+ TOBN(0x702fcc5a, 0xef0c3133), TOBN(0x247cc45d, 0x7e16693b),
+ TOBN(0xfd484e49, 0xc729b749), TOBN(0x522cef7d, 0xb218320f),
+ TOBN(0xe56ef405, 0x59ab93b3), TOBN(0x225fba11, 0x9f181071),
+ TOBN(0x33bd6595, 0x15330ed0), TOBN(0xc4be69d5, 0x1ddb32f7),
+ TOBN(0x264c7668, 0x0448087c), TOBN(0xac30903f, 0x71432dae),
+ TOBN(0x3851b266, 0x00f9bf47), TOBN(0x400ed311, 0x6cdd6d03),
+ TOBN(0x045e79fe, 0xf8fd2424), TOBN(0xfdfd974a, 0xfa6da98b),
+ TOBN(0x45c9f641, 0x0c1e673a), TOBN(0x76f2e733, 0x5b2c5168),
+ TOBN(0x1adaebb5, 0x2a601753), TOBN(0xb286514c, 0xc57c2d49),
+ TOBN(0xd8769670, 0x1e0bfd24), TOBN(0x950c547e, 0x04478922),
+ TOBN(0xd1d41969, 0xe5d32bfe), TOBN(0x30bc1472, 0x750d6c3e),
+ TOBN(0x8f3679fe, 0xe0e27f3a), TOBN(0x8f64a7dc, 0xa4a6ee0c),
+ TOBN(0x2fe59937, 0x633dfb1f), TOBN(0xea82c395, 0x977f2547),
+ TOBN(0xcbdfdf1a, 0x661ea646), TOBN(0xc7ccc591, 0xb9085451),
+ TOBN(0x82177962, 0x81761e13), TOBN(0xda57596f, 0x9196885c),
+ TOBN(0xbc17e849, 0x28ffbd70), TOBN(0x1e6e0a41, 0x2671d36f),
+ TOBN(0x61ae872c, 0x4152fcf5), TOBN(0x441c87b0, 0x9e77e754),
+ TOBN(0xd0799dd5, 0xa34dff09), TOBN(0x766b4e44, 0x88a6b171),
+ TOBN(0xdc06a512, 0x11f1c792), TOBN(0xea02ae93, 0x4be35c3e),
+ TOBN(0xe5ca4d6d, 0xe90c469e), TOBN(0x4df4368e, 0x56e4ff5c),
+ TOBN(0x7817acab, 0x4baef62e), TOBN(0x9f5a2202, 0xa85b91e8),
+ TOBN(0x9666ebe6, 0x6ce57610), TOBN(0x32ad31f3, 0xf73bfe03),
+ TOBN(0x628330a4, 0x25bcf4d6), TOBN(0xea950593, 0x515056e6),
+ TOBN(0x59811c89, 0xe1332156), TOBN(0xc89cf1fe, 0x8c11b2d7),
+ TOBN(0x75b63913, 0x04e60cc0), TOBN(0xce811e8d, 0x4625d375),
+ TOBN(0x030e43fc, 0x2d26e562), TOBN(0xfbb30b4b, 0x608d36a0),
+ TOBN(0x634ff82c, 0x48528118), TOBN(0x7c6fe085, 0xcd285911),
+ TOBN(0x7f2830c0, 0x99358f28), TOBN(0x2e60a95e, 0x665e6c09),
+ TOBN(0x08407d3d, 0x9b785dbf), TOBN(0x530889ab, 0xa759bce7),
+ TOBN(0xf228e0e6, 0x52f61239), TOBN(0x2b6d1461, 0x6879be3c),
+ TOBN(0xe6902c04, 0x51a7bbf7), TOBN(0x30ad99f0, 0x76f24a64),
+ TOBN(0x66d9317a, 0x98bc6da0), TOBN(0xf4f877f3, 0xcb596ac0),
+ TOBN(0xb05ff62d, 0x4c44f119), TOBN(0x4555f536, 0xe9b77416),
+ TOBN(0xc7c0d059, 0x8caed63b), TOBN(0x0cd2b7ce, 0xc358b2a9),
+ TOBN(0x3f33287b, 0x46945fa3), TOBN(0xf8785b20, 0xd67c8791),
+ TOBN(0xc54a7a61, 0x9637bd08), TOBN(0x54d4598c, 0x18be79d7),
+ TOBN(0x889e5acb, 0xc46d7ce1), TOBN(0x9a515bb7, 0x8b085877),
+ TOBN(0xfac1a03d, 0x0b7a5050), TOBN(0x7d3e738a, 0xf2926035),
+ TOBN(0x861cc2ce, 0x2a6cb0eb), TOBN(0x6f2e2955, 0x8f7adc79),
+ TOBN(0x61c4d451, 0x33016376), TOBN(0xd9fd2c80, 0x5ad59090),
+ TOBN(0xe5a83738, 0xb2b836a1), TOBN(0x855b41a0, 0x7c0d6622),
+ TOBN(0x186fe317, 0x7cc19af1), TOBN(0x6465c1ff, 0xfdd99acb),
+ TOBN(0x46e5c23f, 0x6974b99e), TOBN(0x75a7cf8b, 0xa2717cbe),
+ TOBN(0x4d2ebc3f, 0x062be658), TOBN(0x094b4447, 0x5f209c98),
+ TOBN(0x4af285ed, 0xb940cb5a), TOBN(0x6706d792, 0x7cc82f10),
+ TOBN(0xc8c8776c, 0x030526fa), TOBN(0xfa8e6f76, 0xa0da9140),
+ TOBN(0x77ea9d34, 0x591ee4f0), TOBN(0x5f46e337, 0x40274166),
+ TOBN(0x1bdf98bb, 0xea671457), TOBN(0xd7c08b46, 0x862a1fe2),
+ TOBN(0x46cc303c, 0x1c08ad63), TOBN(0x99543440, 0x4c845e7b),
+ TOBN(0x1b8fbdb5, 0x48f36bf7), TOBN(0x5b82c392, 0x8c8273a7),
+ TOBN(0x08f712c4, 0x928435d5), TOBN(0x071cf0f1, 0x79330380),
+ TOBN(0xc74c2d24, 0xa8da054a), TOBN(0xcb0e7201, 0x43c46b5c),
+ TOBN(0x0ad7337a, 0xc0b7eff3), TOBN(0x8552225e, 0xc5e48b3c),
+ TOBN(0xe6f78b0c, 0x73f13a5f), TOBN(0x5e70062e, 0x82349cbe),
+ TOBN(0x6b8d5048, 0xe7073969), TOBN(0x392d2a29, 0xc33cb3d2),
+ TOBN(0xee4f727c, 0x4ecaa20f), TOBN(0xa068c99e, 0x2ccde707),
+ TOBN(0xfcd5651f, 0xb87a2913), TOBN(0xea3e3c15, 0x3cc252f0),
+ TOBN(0x777d92df, 0x3b6cd3e4), TOBN(0x7a414143, 0xc5a732e7),
+ TOBN(0xa895951a, 0xa71ff493), TOBN(0xfe980c92, 0xbbd37cf6),
+ TOBN(0x45bd5e64, 0xdecfeeff), TOBN(0x910dc2a9, 0xa44c43e9),
+ TOBN(0xcb403f26, 0xcca9f54d), TOBN(0x928bbdfb, 0x9303f6db),
+ TOBN(0x3c37951e, 0xa9eee67c), TOBN(0x3bd61a52, 0xf79961c3),
+ TOBN(0x09a238e6, 0x395c9a79), TOBN(0x6940ca2d, 0x61eb352d),
+ TOBN(0x7d1e5c5e, 0xc1875631), TOBN(0x1e19742c, 0x1e1b20d1),
+ TOBN(0x4633d908, 0x23fc2e6e), TOBN(0xa76e29a9, 0x08959149),
+ TOBN(0x61069d9c, 0x84ed7da5), TOBN(0x0baa11cf, 0x5dbcad51),
+ TOBN(0xd01eec64, 0x961849da), TOBN(0x93b75f1f, 0xaf3d8c28),
+ TOBN(0x57bc4f9f, 0x1ca2ee44), TOBN(0x5a26322d, 0x00e00558),
+ TOBN(0x1888d658, 0x61a023ef), TOBN(0x1d72aab4, 0xb9e5246e),
+ TOBN(0xa9a26348, 0xe5563ec0), TOBN(0xa0971963, 0xc3439a43),
+ TOBN(0x567dd54b, 0xadb9b5b7), TOBN(0x73fac1a1, 0xc45a524b),
+ TOBN(0x8fe97ef7, 0xfe38e608), TOBN(0x608748d2, 0x3f384f48),
+ TOBN(0xb0571794, 0xc486094f), TOBN(0x869254a3, 0x8bf3a8d6),
+ TOBN(0x148a8dd1, 0x310b0e25), TOBN(0x99ab9f3f, 0x9aa3f7d8),
+ TOBN(0x0927c68a, 0x6706c02e), TOBN(0x22b5e76c, 0x69790e6c),
+ TOBN(0x6c325260, 0x6c71376c), TOBN(0x53a57690, 0x09ef6657),
+ TOBN(0x8d63f852, 0xedffcf3a), TOBN(0xb4d2ed04, 0x3c0a6f55),
+ TOBN(0xdb3aa8de, 0x12519b9e), TOBN(0x5d38e9c4, 0x1e0a569a),
+ TOBN(0x871528bf, 0x303747e2), TOBN(0xa208e77c, 0xf5b5c18d),
+ TOBN(0x9d129c88, 0xca6bf923), TOBN(0xbcbf197f, 0xbf02839f),
+ TOBN(0x9b9bf030, 0x27323194), TOBN(0x3b055a8b, 0x339ca59d),
+ TOBN(0xb46b2312, 0x0f669520), TOBN(0x19789f1f, 0x497e5f24),
+ TOBN(0x9c499468, 0xaaf01801), TOBN(0x72ee1190, 0x8b69d59c),
+ TOBN(0x8bd39595, 0xacf4c079), TOBN(0x3ee11ece, 0x8e0cd048),
+ TOBN(0xebde86ec, 0x1ed66f18), TOBN(0x225d906b, 0xd61fce43),
+ TOBN(0x5cab07d6, 0xe8bed74d), TOBN(0x16e4617f, 0x27855ab7),
+ TOBN(0x6568aadd, 0xb2fbc3dd), TOBN(0xedb5484f, 0x8aeddf5b),
+ TOBN(0x878f20e8, 0x6dcf2fad), TOBN(0x3516497c, 0x615f5699),}
+ ,
+ {TOBN(0xef0a3fec, 0xfa181e69), TOBN(0x9ea02f81, 0x30d69a98),
+ TOBN(0xb2e9cf8e, 0x66eab95d), TOBN(0x520f2beb, 0x24720021),
+ TOBN(0x621c540a, 0x1df84361), TOBN(0x12037721, 0x71fa6d5d),
+ TOBN(0x6e3c7b51, 0x0ff5f6ff), TOBN(0x817a069b, 0xabb2bef3),
+ TOBN(0x83572fb6, 0xb294cda6), TOBN(0x6ce9bf75, 0xb9039f34),
+ TOBN(0x20e012f0, 0x095cbb21), TOBN(0xa0aecc1b, 0xd063f0da),
+ TOBN(0x57c21c3a, 0xf02909e5), TOBN(0xc7d59ecf, 0x48ce9cdc),
+ TOBN(0x2732b844, 0x8ae336f8), TOBN(0x056e3723, 0x3f4f85f4),
+ TOBN(0x8a10b531, 0x89e800ca), TOBN(0x50fe0c17, 0x145208fd),
+ TOBN(0x9e43c0d3, 0xb714ba37), TOBN(0x427d200e, 0x34189acc),
+ TOBN(0x05dee24f, 0xe616e2c0), TOBN(0x9c25f4c8, 0xee1854c1),
+ TOBN(0x4d3222a5, 0x8f342a73), TOBN(0x0807804f, 0xa027c952),
+ TOBN(0xc222653a, 0x4f0d56f3), TOBN(0x961e4047, 0xca28b805),
+ TOBN(0x2c03f8b0, 0x4a73434b), TOBN(0x4c966787, 0xab712a19),
+ TOBN(0xcc196c42, 0x864fee42), TOBN(0xc1be93da, 0x5b0ece5c),
+ TOBN(0xa87d9f22, 0xc131c159), TOBN(0x2bb6d593, 0xdce45655),
+ TOBN(0x22c49ec9, 0xb809b7ce), TOBN(0x8a41486b, 0xe2c72c2c),
+ TOBN(0x813b9420, 0xfea0bf36), TOBN(0xb3d36ee9, 0xa66dac69),
+ TOBN(0x6fddc08a, 0x328cc987), TOBN(0x0a3bcd2c, 0x3a326461),
+ TOBN(0x7103c49d, 0xd810dbba), TOBN(0xf9d81a28, 0x4b78a4c4),
+ TOBN(0x3de865ad, 0xe4d55941), TOBN(0xdedafa5e, 0x30384087),
+ TOBN(0x6f414abb, 0x4ef18b9b), TOBN(0x9ee9ea42, 0xfaee5268),
+ TOBN(0x260faa16, 0x37a55a4a), TOBN(0xeb19a514, 0x015f93b9),
+ TOBN(0x51d7ebd2, 0x9e9c3598), TOBN(0x523fc56d, 0x1932178e),
+ TOBN(0x501d070c, 0xb98fe684), TOBN(0xd60fbe9a, 0x124a1458),
+ TOBN(0xa45761c8, 0x92bc6b3f), TOBN(0xf5384858, 0xfe6f27cb),
+ TOBN(0x4b0271f7, 0xb59e763b), TOBN(0x3d4606a9, 0x5b5a8e5e),
+ TOBN(0x1eda5d9b, 0x05a48292), TOBN(0xda7731d0, 0xe6fec446),
+ TOBN(0xa3e33693, 0x90d45871), TOBN(0xe9764040, 0x06166d8d),
+ TOBN(0xb5c33682, 0x89a90403), TOBN(0x4bd17983, 0x72f1d637),
+ TOBN(0xa616679e, 0xd5d2c53a), TOBN(0x5ec4bcd8, 0xfdcf3b87),
+ TOBN(0xae6d7613, 0xb66a694e), TOBN(0x7460fc76, 0xe3fc27e5),
+ TOBN(0x70469b82, 0x95caabee), TOBN(0xde024ca5, 0x889501e3),
+ TOBN(0x6bdadc06, 0x076ed265), TOBN(0x0cb1236b, 0x5a0ef8b2),
+ TOBN(0x4065ddbf, 0x0972ebf9), TOBN(0xf1dd3875, 0x22aca432),
+ TOBN(0xa88b97cf, 0x744aff76), TOBN(0xd1359afd, 0xfe8e3d24),
+ TOBN(0x52a3ba2b, 0x91502cf3), TOBN(0x2c3832a8, 0x084db75d),
+ TOBN(0x04a12ddd, 0xde30b1c9), TOBN(0x7802eabc, 0xe31fd60c),
+ TOBN(0x33707327, 0xa37fddab), TOBN(0x65d6f2ab, 0xfaafa973),
+ TOBN(0x3525c5b8, 0x11e6f91a), TOBN(0x76aeb0c9, 0x5f46530b),
+ TOBN(0xe8815ff6, 0x2f93a675), TOBN(0xa6ec9684, 0x05f48679),
+ TOBN(0x6dcbb556, 0x358ae884), TOBN(0x0af61472, 0xe19e3873),
+ TOBN(0x72334372, 0xa5f696be), TOBN(0xc65e57ea, 0x6f22fb70),
+ TOBN(0x268da30c, 0x946cea90), TOBN(0x136a8a87, 0x65681b2a),
+ TOBN(0xad5e81dc, 0x0f9f44d4), TOBN(0xf09a6960, 0x2c46585a),
+ TOBN(0xd1649164, 0xc447d1b1), TOBN(0x3b4b36c8, 0x879dc8b1),
+ TOBN(0x20d4177b, 0x3b6b234c), TOBN(0x096a2505, 0x1730d9d0),
+ TOBN(0x0611b9b8, 0xef80531d), TOBN(0xba904b3b, 0x64bb495d),
+ TOBN(0x1192d9d4, 0x93a3147a), TOBN(0x9f30a5dc, 0x9a565545),
+ TOBN(0x90b1f9cb, 0x6ef07212), TOBN(0x29958546, 0x0d87fc13),
+ TOBN(0xd3323eff, 0xc17db9ba), TOBN(0xcb18548c, 0xcb1644a8),
+ TOBN(0x18a306d4, 0x4f49ffbc), TOBN(0x28d658f1, 0x4c2e8684),
+ TOBN(0x44ba60cd, 0xa99f8c71), TOBN(0x67b7abdb, 0x4bf742ff),
+ TOBN(0x66310f9c, 0x914b3f99), TOBN(0xae430a32, 0xf412c161),
+ TOBN(0x1e6776d3, 0x88ace52f), TOBN(0x4bc0fa24, 0x52d7067d),
+ TOBN(0x03c286aa, 0x8f07cd1b), TOBN(0x4cb8f38c, 0xa985b2c1),
+ TOBN(0x83ccbe80, 0x8c3bff36), TOBN(0x005a0bd2, 0x5263e575),
+ TOBN(0x460d7dda, 0x259bdcd1), TOBN(0x4a1c5642, 0xfa5cab6b),
+ TOBN(0x2b7bdbb9, 0x9fe4fc88), TOBN(0x09418e28, 0xcc97bbb5),
+ TOBN(0xd8274fb4, 0xa12321ae), TOBN(0xb137007d, 0x5c87b64e),
+ TOBN(0x80531fe1, 0xc63c4962), TOBN(0x50541e89, 0x981fdb25),
+ TOBN(0xdc1291a1, 0xfd4c2b6b), TOBN(0xc0693a17, 0xa6df4fca),
+ TOBN(0xb2c4604e, 0x0117f203), TOBN(0x245f1963, 0x0a99b8d0),
+ TOBN(0xaedc20aa, 0xc6212c44), TOBN(0xb1ed4e56, 0x520f52a8),
+ TOBN(0xfe48f575, 0xf8547be3), TOBN(0x0a7033cd, 0xa9e45f98),
+ TOBN(0x4b45d3a9, 0x18c50100), TOBN(0xb2a6cd6a, 0xa61d41da),
+ TOBN(0x60bbb4f5, 0x57933c6b), TOBN(0xa7538ebd, 0x2b0d7ffc),
+ TOBN(0x9ea3ab8d, 0x8cd626b6), TOBN(0x8273a484, 0x3601625a),
+ TOBN(0x88859845, 0x0168e508), TOBN(0x8cbc9bb2, 0x99a94abd),
+ TOBN(0x713ac792, 0xfab0a671), TOBN(0xa3995b19, 0x6c9ebffc),
+ TOBN(0xe711668e, 0x1239e152), TOBN(0x56892558, 0xbbb8dff4),
+ TOBN(0x8bfc7dab, 0xdbf17963), TOBN(0x5b59fe5a, 0xb3de1253),
+ TOBN(0x7e3320eb, 0x34a9f7ae), TOBN(0xe5e8cf72, 0xd751efe4),
+ TOBN(0x7ea003bc, 0xd9be2f37), TOBN(0xc0f551a0, 0xb6c08ef7),
+ TOBN(0x56606268, 0x038f6725), TOBN(0x1dd38e35, 0x6d92d3b6),
+ TOBN(0x07dfce7c, 0xc3cbd686), TOBN(0x4e549e04, 0x651c5da8),
+ TOBN(0x4058f93b, 0x08b19340), TOBN(0xc2fae6f4, 0xcac6d89d),
+ TOBN(0x4bad8a8c, 0x8f159cc7), TOBN(0x0ddba4b3, 0xcb0b601c),
+ TOBN(0xda4fc7b5, 0x1dd95f8c), TOBN(0x1d163cd7, 0xcea5c255),
+ TOBN(0x30707d06, 0x274a8c4c), TOBN(0x79d9e008, 0x2802e9ce),
+ TOBN(0x02a29ebf, 0xe6ddd505), TOBN(0x37064e74, 0xb50bed1a),
+ TOBN(0x3f6bae65, 0xa7327d57), TOBN(0x3846f5f1, 0xf83920bc),
+ TOBN(0x87c37491, 0x60df1b9b), TOBN(0x4cfb2895, 0x2d1da29f),
+ TOBN(0x10a478ca, 0x4ed1743c), TOBN(0x390c6030, 0x3edd47c6),
+ TOBN(0x8f3e5312, 0x8c0a78de), TOBN(0xccd02bda, 0x1e85df70),
+ TOBN(0xd6c75c03, 0xa61b6582), TOBN(0x0762921c, 0xfc0eebd1),
+ TOBN(0xd34d0823, 0xd85010c0), TOBN(0xd73aaacb, 0x0044cf1f),
+ TOBN(0xfb4159bb, 0xa3b5e78a), TOBN(0x2287c7f7, 0xe5826f3f),
+ TOBN(0x4aeaf742, 0x580b1a01), TOBN(0xf080415d, 0x60423b79),
+ TOBN(0xe12622cd, 0xa7dea144), TOBN(0x49ea4996, 0x59d62472),
+ TOBN(0xb42991ef, 0x571f3913), TOBN(0x0610f214, 0xf5b25a8a),
+ TOBN(0x47adc585, 0x30b79e8f), TOBN(0xf90e3df6, 0x07a065a2),
+ TOBN(0x5d0a5deb, 0x43e2e034), TOBN(0x53fb5a34, 0x444024aa),
+ TOBN(0xa8628c68, 0x6b0c9f7f), TOBN(0x9c69c29c, 0xac563656),
+ TOBN(0x5a231feb, 0xbace47b6), TOBN(0xbdce0289, 0x9ea5a2ec),
+ TOBN(0x05da1fac, 0x9463853e), TOBN(0x96812c52, 0x509e78aa),
+ TOBN(0xd3fb5771, 0x57151692), TOBN(0xeb2721f8, 0xd98e1c44),
+ TOBN(0xc0506087, 0x32399be1), TOBN(0xda5a5511, 0xd979d8b8),
+ TOBN(0x737ed55d, 0xc6f56780), TOBN(0xe20d3004, 0x0dc7a7f4),
+ TOBN(0x02ce7301, 0xf5941a03), TOBN(0x91ef5215, 0xed30f83a),
+ TOBN(0x28727fc1, 0x4092d85f), TOBN(0x72d223c6, 0x5c49e41a),
+ TOBN(0xa7cf30a2, 0xba6a4d81), TOBN(0x7c086209, 0xb030d87d),
+ TOBN(0x04844c7d, 0xfc588b09), TOBN(0x728cd499, 0x5874bbb0),
+ TOBN(0xcc1281ee, 0xe84c0495), TOBN(0x0769b5ba, 0xec31958f),
+ TOBN(0x665c228b, 0xf99c2471), TOBN(0xf2d8a11b, 0x191eb110),
+ TOBN(0x4594f494, 0xd36d7024), TOBN(0x482ded8b, 0xcdcb25a1),
+ TOBN(0xc958a9d8, 0xdadd4885), TOBN(0x7004477e, 0xf1d2b547),
+ TOBN(0x0a45f6ef, 0x2a0af550), TOBN(0x4fc739d6, 0x2f8d6351),
+ TOBN(0x75cdaf27, 0x786f08a9), TOBN(0x8700bb26, 0x42c2737f),
+ TOBN(0x855a7141, 0x1c4e2670), TOBN(0x810188c1, 0x15076fef),
+ TOBN(0xc251d0c9, 0xabcd3297), TOBN(0xae4c8967, 0xf48108eb),
+ TOBN(0xbd146de7, 0x18ceed30), TOBN(0xf9d4f07a, 0xc986bced),
+ TOBN(0x5ad98ed5, 0x83fa1e08), TOBN(0x7780d33e, 0xbeabd1fb),
+ TOBN(0xe330513c, 0x903b1196), TOBN(0xba11de9e, 0xa47bc8c4),
+ TOBN(0x684334da, 0x02c2d064), TOBN(0x7ecf360d, 0xa48de23b),
+ TOBN(0x57a1b474, 0x0a9089d8), TOBN(0xf28fa439, 0xff36734c),
+ TOBN(0xf2a482cb, 0xea4570b3), TOBN(0xee65d68b, 0xa5ebcee9),
+ TOBN(0x988d0036, 0xb9694cd5), TOBN(0x53edd0e9, 0x37885d32),
+ TOBN(0xe37e3307, 0xbeb9bc6d), TOBN(0xe9abb907, 0x9f5c6768),
+ TOBN(0x4396ccd5, 0x51f2160f), TOBN(0x2500888c, 0x47336da6),
+ TOBN(0x383f9ed9, 0x926fce43), TOBN(0x809dd1c7, 0x04da2930),
+ TOBN(0x30f6f596, 0x8a4cb227), TOBN(0x0d700c7f, 0x73a56b38),
+ TOBN(0x1825ea33, 0xab64a065), TOBN(0xaab9b735, 0x1338df80),
+ TOBN(0x1516100d, 0x9b63f57f), TOBN(0x2574395a, 0x27a6a634),
+ TOBN(0xb5560fb6, 0x700a1acd), TOBN(0xe823fd73, 0xfd999681),
+ TOBN(0xda915d1f, 0x6cb4e1ba), TOBN(0x0d030118, 0x6ebe00a3),
+ TOBN(0x744fb0c9, 0x89fca8cd), TOBN(0x970d01db, 0xf9da0e0b),
+ TOBN(0x0ad8c564, 0x7931d76f), TOBN(0xb15737bf, 0xf659b96a),
+ TOBN(0xdc9933e8, 0xa8b484e7), TOBN(0xb2fdbdf9, 0x7a26dec7),
+ TOBN(0x2349e9a4, 0x9f1f0136), TOBN(0x7860368e, 0x70fddddb),
+ TOBN(0xd93d2c1c, 0xf9ad3e18), TOBN(0x6d6c5f17, 0x689f4e79),
+ TOBN(0x7a544d91, 0xb24ff1b6), TOBN(0x3e12a5eb, 0xfe16cd8c),
+ TOBN(0x543574e9, 0xa56b872f), TOBN(0xa1ad550c, 0xfcf68ea2),
+ TOBN(0x689e37d2, 0x3f560ef7), TOBN(0x8c54b9ca, 0xc9d47a8b),
+ TOBN(0x46d40a4a, 0x088ac342), TOBN(0xec450c7c, 0x1576c6d0),
+ TOBN(0xb589e31c, 0x1f9689e9), TOBN(0xdacf2602, 0xb8781718),
+ TOBN(0xa89237c6, 0xc8cb6b42), TOBN(0x1326fc93, 0xb96ef381),
+ TOBN(0x55d56c6d, 0xb5f07825), TOBN(0xacba2eea, 0x7449e22d),
+ TOBN(0x74e0887a, 0x633c3000), TOBN(0xcb6cd172, 0xd7cbcf71),
+ TOBN(0x309e81de, 0xc36cf1be), TOBN(0x07a18a6d, 0x60ae399b),
+ TOBN(0xb36c2679, 0x9edce57e), TOBN(0x52b892f4, 0xdf001d41),
+ TOBN(0xd884ae5d, 0x16a1f2c6), TOBN(0x9b329424, 0xefcc370a),
+ TOBN(0x3120daf2, 0xbd2e21df), TOBN(0x55298d2d, 0x02470a99),
+ TOBN(0x0b78af6c, 0xa05db32e), TOBN(0x5c76a331, 0x601f5636),
+ TOBN(0xaae861ff, 0xf8a4f29c), TOBN(0x70dc9240, 0xd68f8d49),
+ TOBN(0x960e649f, 0x81b1321c), TOBN(0x3d2c801b, 0x8792e4ce),
+ TOBN(0xf479f772, 0x42521876), TOBN(0x0bed93bc, 0x416c79b1),
+ TOBN(0xa67fbc05, 0x263e5bc9), TOBN(0x01e8e630, 0x521db049),
+ TOBN(0x76f26738, 0xc6f3431e), TOBN(0xe609cb02, 0xe3267541),
+ TOBN(0xb10cff2d, 0x818c877c), TOBN(0x1f0e75ce, 0x786a13cb),
+ TOBN(0xf4fdca64, 0x1158544d), TOBN(0x5d777e89, 0x6cb71ed0),
+ TOBN(0x3c233737, 0xa9aa4755), TOBN(0x7b453192, 0xe527ab40),
+ TOBN(0xdb59f688, 0x39f05ffe), TOBN(0x8f4f4be0, 0x6d82574e),
+ TOBN(0xcce3450c, 0xee292d1b), TOBN(0xaa448a12, 0x61ccd086),
+ TOBN(0xabce91b3, 0xf7914967), TOBN(0x4537f09b, 0x1908a5ed),
+ TOBN(0xa812421e, 0xf51042e7), TOBN(0xfaf5cebc, 0xec0b3a34),
+ TOBN(0x730ffd87, 0x4ca6b39a), TOBN(0x70fb72ed, 0x02efd342),
+ TOBN(0xeb4735f9, 0xd75c8edb), TOBN(0xc11f2157, 0xc278aa51),
+ TOBN(0xc459f635, 0xbf3bfebf), TOBN(0x3a1ff0b4, 0x6bd9601f),
+ TOBN(0xc9d12823, 0xc420cb73), TOBN(0x3e9af3e2, 0x3c2915a3),
+ TOBN(0xe0c82c72, 0xb41c3440), TOBN(0x175239e5, 0xe3039a5f),
+ TOBN(0xe1084b8a, 0x558795a3), TOBN(0x328d0a1d, 0xd01e5c60),
+ TOBN(0x0a495f2e, 0xd3788a04), TOBN(0x25d8ff16, 0x66c11a9f),
+ TOBN(0xf5155f05, 0x9ed692d6), TOBN(0x954fa107, 0x4f425fe4),
+ TOBN(0xd16aabf2, 0xe98aaa99), TOBN(0x90cd8ba0, 0x96b0f88a),
+ TOBN(0x957f4782, 0xc154026a), TOBN(0x54ee0734, 0x52af56d2),
+ TOBN(0xbcf89e54, 0x45b4147a), TOBN(0x3d102f21, 0x9a52816c),
+ TOBN(0x6808517e, 0x39b62e77), TOBN(0x92e25421, 0x69169ad8),
+ TOBN(0xd721d871, 0xbb608558), TOBN(0x60e4ebae, 0xf6d4ff9b),
+ TOBN(0x0ba10819, 0x41f2763e), TOBN(0xca2e45be, 0x51ee3247),
+ TOBN(0x66d172ec, 0x2bfd7a5f), TOBN(0x528a8f2f, 0x74d0b12d),
+ TOBN(0xe17f1e38, 0xdabe70dc), TOBN(0x1d5d7316, 0x9f93983c),
+ TOBN(0x51b2184a, 0xdf423e31), TOBN(0xcb417291, 0xaedb1a10),
+ TOBN(0x2054ca93, 0x625bcab9), TOBN(0x54396860, 0xa98998f0),
+ TOBN(0x4e53f6c4, 0xa54ae57e), TOBN(0x0ffeb590, 0xee648e9d),
+ TOBN(0xfbbdaadc, 0x6afaf6bc), TOBN(0xf88ae796, 0xaa3bfb8a),
+ TOBN(0x209f1d44, 0xd2359ed9), TOBN(0xac68dd03, 0xf3544ce2),
+ TOBN(0xf378da47, 0xfd51e569), TOBN(0xe1abd860, 0x2cc80097),
+ TOBN(0x23ca18d9, 0x343b6e3a), TOBN(0x480797e8, 0xb40a1bae),
+ TOBN(0xd1f0c717, 0x533f3e67), TOBN(0x44896970, 0x06e6cdfc),
+ TOBN(0x8ca21055, 0x52a82e8d), TOBN(0xb2caf785, 0x78460cdc),
+ TOBN(0x4c1b7b62, 0xe9037178), TOBN(0xefc09d2c, 0xdb514b58),
+ TOBN(0x5f2df9ee, 0x9113be5c), TOBN(0x2fbda78f, 0xb3f9271c),
+ TOBN(0xe09a81af, 0x8f83fc54), TOBN(0x06b13866, 0x8afb5141),
+ TOBN(0x38f6480f, 0x43e3865d), TOBN(0x72dd77a8, 0x1ddf47d9),
+ TOBN(0xf2a8e971, 0x4c205ff7), TOBN(0x46d449d8, 0x9d088ad8),
+ TOBN(0x926619ea, 0x185d706f), TOBN(0xe47e02eb, 0xc7dd7f62),
+ TOBN(0xe7f120a7, 0x8cbc2031), TOBN(0xc18bef00, 0x998d4ac9),
+ TOBN(0x18f37a9c, 0x6bdf22da), TOBN(0xefbc432f, 0x90dc82df),
+ TOBN(0xc52cef8e, 0x5d703651), TOBN(0x82887ba0, 0xd99881a5),
+ TOBN(0x7cec9dda, 0xb920ec1d), TOBN(0xd0d7e8c3, 0xec3e8d3b),
+ TOBN(0x445bc395, 0x4ca88747), TOBN(0xedeaa2e0, 0x9fd53535),
+ TOBN(0x461b1d93, 0x6cc87475), TOBN(0xd92a52e2, 0x6d2383bd),
+ TOBN(0xfabccb59, 0xd7903546), TOBN(0x6111a761, 0x3d14b112),
+ TOBN(0x0ae584fe, 0xb3d5f612), TOBN(0x5ea69b8d, 0x60e828ec),
+ TOBN(0x6c078985, 0x54087030), TOBN(0x649cab04, 0xac4821fe),
+ TOBN(0x25ecedcf, 0x8bdce214), TOBN(0xb5622f72, 0x86af7361),
+ TOBN(0x0e1227aa, 0x7038b9e2), TOBN(0xd0efb273, 0xac20fa77),
+ TOBN(0x817ff88b, 0x79df975b), TOBN(0x856bf286, 0x1999503e),
+ TOBN(0xb4d5351f, 0x5038ec46), TOBN(0x740a52c5, 0xfc42af6e),
+ TOBN(0x2e38bb15, 0x2cbb1a3f), TOBN(0xc3eb99fe, 0x17a83429),
+ TOBN(0xca4fcbf1, 0xdd66bb74), TOBN(0x880784d6, 0xcde5e8fc),
+ TOBN(0xddc84c1c, 0xb4e7a0be), TOBN(0x8780510d, 0xbd15a72f),
+ TOBN(0x44bcf1af, 0x81ec30e1), TOBN(0x141e50a8, 0x0a61073e),
+ TOBN(0x0d955718, 0x47be87ae), TOBN(0x68a61417, 0xf76a4372),
+ TOBN(0xf57e7e87, 0xc607c3d3), TOBN(0x043afaf8, 0x5252f332),
+ TOBN(0xcc14e121, 0x1552a4d2), TOBN(0xb6dee692, 0xbb4d4ab4),
+ TOBN(0xb6ab74c8, 0xa03816a4), TOBN(0x84001ae4, 0x6f394a29),
+ TOBN(0x5bed8344, 0xd795fb45), TOBN(0x57326e7d, 0xb79f55a5),
+ TOBN(0xc9533ce0, 0x4accdffc), TOBN(0x53473caf, 0x3993fa04),
+ TOBN(0x7906eb93, 0xa13df4c8), TOBN(0xa73e51f6, 0x97cbe46f),
+ TOBN(0xd1ab3ae1, 0x0ae4ccf8), TOBN(0x25614508, 0x8a5b3dbc),
+ TOBN(0x61eff962, 0x11a71b27), TOBN(0xdf71412b, 0x6bb7fa39),
+ TOBN(0xb31ba6b8, 0x2bd7f3ef), TOBN(0xb0b9c415, 0x69180d29),
+ TOBN(0xeec14552, 0x014cdde5), TOBN(0x702c624b, 0x227b4bbb),
+ TOBN(0x2b15e8c2, 0xd3e988f3), TOBN(0xee3bcc6d, 0xa4f7fd04),
+ TOBN(0x9d00822a, 0x42ac6c85), TOBN(0x2db0cea6, 0x1df9f2b7),
+ TOBN(0xd7cad2ab, 0x42de1e58), TOBN(0x346ed526, 0x2d6fbb61),
+ TOBN(0xb3962995, 0x1a2faf09), TOBN(0x2fa8a580, 0x7c25612e),
+ TOBN(0x30ae04da, 0x7cf56490), TOBN(0x75662908, 0x0eea3961),
+ TOBN(0x3609f5c5, 0x3d080847), TOBN(0xcb081d39, 0x5241d4f6),
+ TOBN(0xb4fb3810, 0x77961a63), TOBN(0xc20c5984, 0x2abb66fc),
+ TOBN(0x3d40aa7c, 0xf902f245), TOBN(0x9cb12736, 0x4e536b1e),
+ TOBN(0x5eda24da, 0x99b3134f), TOBN(0xafbd9c69, 0x5cd011af),
+ TOBN(0x9a16e30a, 0xc7088c7d), TOBN(0x5ab65710, 0x3207389f),
+ TOBN(0x1b09547f, 0xe7407a53), TOBN(0x2322f9d7, 0x4fdc6eab),
+ TOBN(0xc0f2f22d, 0x7430de4d), TOBN(0x19382696, 0xe68ca9a9),
+ TOBN(0x17f1eff1, 0x918e5868), TOBN(0xe3b5b635, 0x586f4204),
+ TOBN(0x146ef980, 0x3fbc4341), TOBN(0x359f2c80, 0x5b5eed4e),
+ TOBN(0x9f35744e, 0x7482e41d), TOBN(0x9a9ac3ec, 0xf3b224c2),
+ TOBN(0x9161a6fe, 0x91fc50ae), TOBN(0x89ccc66b, 0xc613fa7c),
+ TOBN(0x89268b14, 0xc732f15a), TOBN(0x7cd6f4e2, 0xb467ed03),
+ TOBN(0xfbf79869, 0xce56b40e), TOBN(0xf93e094c, 0xc02dde98),
+ TOBN(0xefe0c3a8, 0xedee2cd7), TOBN(0x90f3ffc0, 0xb268fd42),
+ TOBN(0x81a7fd56, 0x08241aed), TOBN(0x95ab7ad8, 0x00b1afe8),
+ TOBN(0x40127056, 0x3e310d52), TOBN(0xd3ffdeb1, 0x09d9fc43),
+ TOBN(0xc8f85c91, 0xd11a8594), TOBN(0x2e74d258, 0x31cf6db8),
+ TOBN(0x829c7ca3, 0x02b5dfd0), TOBN(0xe389cfbe, 0x69143c86),
+ TOBN(0xd01b6405, 0x941768d8), TOBN(0x45103995, 0x03bf825d),
+ TOBN(0xcc4ee166, 0x56cd17e2), TOBN(0xbea3c283, 0xba037e79),
+ TOBN(0x4e1ac06e, 0xd9a47520), TOBN(0xfbfe18aa, 0xaf852404),
+ TOBN(0x5615f8e2, 0x8087648a), TOBN(0x7301e47e, 0xb9d150d9),
+ TOBN(0x79f9f9dd, 0xb299b977), TOBN(0x76697a7b, 0xa5b78314),
+ TOBN(0x10d67468, 0x7d7c90e7), TOBN(0x7afffe03, 0x937210b5),
+ TOBN(0x5aef3e4b, 0x28c22cee), TOBN(0xefb0ecd8, 0x09fd55ae),
+ TOBN(0x4cea7132, 0x0d2a5d6a), TOBN(0x9cfb5fa1, 0x01db6357),
+ TOBN(0x395e0b57, 0xf36e1ac5), TOBN(0x008fa9ad, 0x36cafb7d),
+ TOBN(0x8f6cdf70, 0x5308c4db), TOBN(0x51527a37, 0x95ed2477),
+ TOBN(0xba0dee30, 0x5bd21311), TOBN(0x6ed41b22, 0x909c90d7),
+ TOBN(0xc5f6b758, 0x7c8696d3), TOBN(0x0db8eaa8, 0x3ce83a80),
+ TOBN(0xd297fe37, 0xb24b4b6f), TOBN(0xfe58afe8, 0x522d1f0d),
+ TOBN(0x97358736, 0x8c98dbd9), TOBN(0x6bc226ca, 0x9454a527),
+ TOBN(0xa12b384e, 0xce53c2d0), TOBN(0x779d897d, 0x5e4606da),
+ TOBN(0xa53e47b0, 0x73ec12b0), TOBN(0x462dbbba, 0x5756f1ad),
+ TOBN(0x69fe09f2, 0xcafe37b6), TOBN(0x273d1ebf, 0xecce2e17),
+ TOBN(0x8ac1d538, 0x3cf607fd), TOBN(0x8035f7ff, 0x12e10c25),}
+ ,
+ {TOBN(0x854d34c7, 0x7e6c5520), TOBN(0xc27df9ef, 0xdcb9ea58),
+ TOBN(0x405f2369, 0xd686666d), TOBN(0x29d1febf, 0x0417aa85),
+ TOBN(0x9846819e, 0x93470afe), TOBN(0x3e6a9669, 0xe2a27f9e),
+ TOBN(0x24d008a2, 0xe31e6504), TOBN(0xdba7cecf, 0x9cb7680a),
+ TOBN(0xecaff541, 0x338d6e43), TOBN(0x56f7dd73, 0x4541d5cc),
+ TOBN(0xb5d426de, 0x96bc88ca), TOBN(0x48d94f6b, 0x9ed3a2c3),
+ TOBN(0x6354a3bb, 0x2ef8279c), TOBN(0xd575465b, 0x0b1867f2),
+ TOBN(0xef99b0ff, 0x95225151), TOBN(0xf3e19d88, 0xf94500d8),
+ TOBN(0x92a83268, 0xe32dd620), TOBN(0x913ec99f, 0x627849a2),
+ TOBN(0xedd8fdfa, 0x2c378882), TOBN(0xaf96f33e, 0xee6f8cfe),
+ TOBN(0xc06737e5, 0xdc3fa8a5), TOBN(0x236bb531, 0xb0b03a1d),
+ TOBN(0x33e59f29, 0x89f037b0), TOBN(0x13f9b5a7, 0xd9a12a53),
+ TOBN(0x0d0df6ce, 0x51efb310), TOBN(0xcb5b2eb4, 0x958df5be),
+ TOBN(0xd6459e29, 0x36158e59), TOBN(0x82aae2b9, 0x1466e336),
+ TOBN(0xfb658a39, 0x411aa636), TOBN(0x7152ecc5, 0xd4c0a933),
+ TOBN(0xf10c758a, 0x49f026b7), TOBN(0xf4837f97, 0xcb09311f),
+ TOBN(0xddfb02c4, 0xc753c45f), TOBN(0x18ca81b6, 0xf9c840fe),
+ TOBN(0x846fd09a, 0xb0f8a3e6), TOBN(0xb1162add, 0xe7733dbc),
+ TOBN(0x7070ad20, 0x236e3ab6), TOBN(0xf88cdaf5, 0xb2a56326),
+ TOBN(0x05fc8719, 0x997cbc7a), TOBN(0x442cd452, 0x4b665272),
+ TOBN(0x7807f364, 0xb71698f5), TOBN(0x6ba418d2, 0x9f7b605e),
+ TOBN(0xfd20b00f, 0xa03b2cbb), TOBN(0x883eca37, 0xda54386f),
+ TOBN(0xff0be43f, 0xf3437f24), TOBN(0xe910b432, 0xa48bb33c),
+ TOBN(0x4963a128, 0x329df765), TOBN(0xac1dd556, 0xbe2fe6f7),
+ TOBN(0x557610f9, 0x24a0a3fc), TOBN(0x38e17bf4, 0xe881c3f9),
+ TOBN(0x6ba84faf, 0xed0dac99), TOBN(0xd4a222c3, 0x59eeb918),
+ TOBN(0xc79c1dbe, 0x13f542b6), TOBN(0x1fc65e0d, 0xe425d457),
+ TOBN(0xeffb754f, 0x1debb779), TOBN(0x638d8fd0, 0x9e08af60),
+ TOBN(0x994f523a, 0x626332d5), TOBN(0x7bc38833, 0x5561bb44),
+ TOBN(0x005ed4b0, 0x3d845ea2), TOBN(0xd39d3ee1, 0xc2a1f08a),
+ TOBN(0x6561fdd3, 0xe7676b0d), TOBN(0x620e35ff, 0xfb706017),
+ TOBN(0x36ce424f, 0xf264f9a8), TOBN(0xc4c3419f, 0xda2681f7),
+ TOBN(0xfb6afd2f, 0x69beb6e8), TOBN(0x3a50b993, 0x6d700d03),
+ TOBN(0xc840b2ad, 0x0c83a14f), TOBN(0x573207be, 0x54085bef),
+ TOBN(0x5af882e3, 0x09fe7e5b), TOBN(0x957678a4, 0x3b40a7e1),
+ TOBN(0x172d4bdd, 0x543056e2), TOBN(0x9c1b26b4, 0x0df13c0a),
+ TOBN(0x1c30861c, 0xf405ff06), TOBN(0xebac86bd, 0x486e828b),
+ TOBN(0xe791a971, 0x636933fc), TOBN(0x50e7c2be, 0x7aeee947),
+ TOBN(0xc3d4a095, 0xfa90d767), TOBN(0xae60eb7b, 0xe670ab7b),
+ TOBN(0x17633a64, 0x397b056d), TOBN(0x93a21f33, 0x105012aa),
+ TOBN(0x663c370b, 0xabb88643), TOBN(0x91df36d7, 0x22e21599),
+ TOBN(0x183ba835, 0x8b761671), TOBN(0x381eea1d, 0x728f3bf1),
+ TOBN(0xb9b2f1ba, 0x39966e6c), TOBN(0x7c464a28, 0xe7295492),
+ TOBN(0x0fd5f70a, 0x09b26b7f), TOBN(0xa9aba1f9, 0xfbe009df),
+ TOBN(0x857c1f22, 0x369b87ad), TOBN(0x3c00e5d9, 0x32fca556),
+ TOBN(0x1ad74cab, 0x90b06466), TOBN(0xa7112386, 0x550faaf2),
+ TOBN(0x7435e198, 0x6d9bd5f5), TOBN(0x2dcc7e38, 0x59c3463f),
+ TOBN(0xdc7df748, 0xca7bd4b2), TOBN(0x13cd4c08, 0x9dec2f31),
+ TOBN(0x0d3b5df8, 0xe3237710), TOBN(0x0dadb26e, 0xcbd2f7b0),
+ TOBN(0x9f5966ab, 0xe4aa082b), TOBN(0x666ec8de, 0x350e966e),
+ TOBN(0x1bfd1ed5, 0xee524216), TOBN(0xcd93c59b, 0x41dab0b6),
+ TOBN(0x658a8435, 0xd186d6ba), TOBN(0x1b7d34d2, 0x159d1195),
+ TOBN(0x5936e460, 0x22caf46b), TOBN(0x6a45dd8f, 0x9a96fe4f),
+ TOBN(0xf7925434, 0xb98f474e), TOBN(0x41410412, 0x0053ef15),
+ TOBN(0x71cf8d12, 0x41de97bf), TOBN(0xb8547b61, 0xbd80bef4),
+ TOBN(0xb47d3970, 0xc4db0037), TOBN(0xf1bcd328, 0xfef20dff),
+ TOBN(0x31a92e09, 0x10caad67), TOBN(0x1f591960, 0x5531a1e1),
+ TOBN(0x3bb852e0, 0x5f4fc840), TOBN(0x63e297ca, 0x93a72c6c),
+ TOBN(0x3c2b0b2e, 0x49abad67), TOBN(0x6ec405fc, 0xed3db0d9),
+ TOBN(0xdc14a530, 0x7fef1d40), TOBN(0xccd19846, 0x280896fc),
+ TOBN(0x00f83176, 0x9bb81648), TOBN(0xd69eb485, 0x653120d0),
+ TOBN(0xd17d75f4, 0x4ccabc62), TOBN(0x34a07f82, 0xb749fcb1),
+ TOBN(0x2c3af787, 0xbbfb5554), TOBN(0xb06ed4d0, 0x62e283f8),
+ TOBN(0x5722889f, 0xa19213a0), TOBN(0x162b085e, 0xdcf3c7b4),
+ TOBN(0xbcaecb31, 0xe0dd3eca), TOBN(0xc6237fbc, 0xe52f13a5),
+ TOBN(0xcc2b6b03, 0x27bac297), TOBN(0x2ae1cac5, 0xb917f54a),
+ TOBN(0x474807d4, 0x7845ae4f), TOBN(0xfec7dd92, 0xce5972e0),
+ TOBN(0xc3bd2541, 0x1d7915bb), TOBN(0x66f85dc4, 0xd94907ca),
+ TOBN(0xd981b888, 0xbdbcf0ca), TOBN(0xd75f5da6, 0xdf279e9f),
+ TOBN(0x128bbf24, 0x7054e934), TOBN(0x3c6ff6e5, 0x81db134b),
+ TOBN(0x795b7cf4, 0x047d26e4), TOBN(0xf370f7b8, 0x5049ec37),
+ TOBN(0xc6712d4d, 0xced945af), TOBN(0xdf30b5ec, 0x095642bc),
+ TOBN(0x9b034c62, 0x4896246e), TOBN(0x5652c016, 0xee90bbd1),
+ TOBN(0xeb38636f, 0x87fedb73), TOBN(0x5e32f847, 0x0135a613),
+ TOBN(0x0703b312, 0xcf933c83), TOBN(0xd05bb76e, 0x1a7f47e6),
+ TOBN(0x825e4f0c, 0x949c2415), TOBN(0x569e5622, 0x7250d6f8),
+ TOBN(0xbbe9eb3a, 0x6568013e), TOBN(0x8dbd203f, 0x22f243fc),
+ TOBN(0x9dbd7694, 0xb342734a), TOBN(0x8f6d12f8, 0x46afa984),
+ TOBN(0xb98610a2, 0xc9eade29), TOBN(0xbab4f323, 0x47dd0f18),
+ TOBN(0x5779737b, 0x671c0d46), TOBN(0x10b6a7c6, 0xd3e0a42a),
+ TOBN(0xfb19ddf3, 0x3035b41c), TOBN(0xd336343f, 0x99c45895),
+ TOBN(0x61fe4938, 0x54c857e5), TOBN(0xc4d506be, 0xae4e57d5),
+ TOBN(0x3cd8c8cb, 0xbbc33f75), TOBN(0x7281f08a, 0x9262c77d),
+ TOBN(0x083f4ea6, 0xf11a2823), TOBN(0x8895041e, 0x9fba2e33),
+ TOBN(0xfcdfea49, 0x9c438edf), TOBN(0x7678dcc3, 0x91edba44),
+ TOBN(0xf07b3b87, 0xe2ba50f0), TOBN(0xc13888ef, 0x43948c1b),
+ TOBN(0xc2135ad4, 0x1140af42), TOBN(0x8e5104f3, 0x926ed1a7),
+ TOBN(0xf24430cb, 0x88f6695f), TOBN(0x0ce0637b, 0x6d73c120),
+ TOBN(0xb2db01e6, 0xfe631e8f), TOBN(0x1c5563d7, 0xd7bdd24b),
+ TOBN(0x8daea3ba, 0x369ad44f), TOBN(0x000c81b6, 0x8187a9f9),
+ TOBN(0x5f48a951, 0xaae1fd9a), TOBN(0xe35626c7, 0x8d5aed8a),
+ TOBN(0x20952763, 0x0498c622), TOBN(0x76d17634, 0x773aa504),
+ TOBN(0x36d90dda, 0xeb300f7a), TOBN(0x9dcf7dfc, 0xedb5e801),
+ TOBN(0x645cb268, 0x74d5244c), TOBN(0xa127ee79, 0x348e3aa2),
+ TOBN(0x488acc53, 0x575f1dbb), TOBN(0x95037e85, 0x80e6161e),
+ TOBN(0x57e59283, 0x292650d0), TOBN(0xabe67d99, 0x14938216),
+ TOBN(0x3c7f944b, 0x3f8e1065), TOBN(0xed908cb6, 0x330e8924),
+ TOBN(0x08ee8fd5, 0x6f530136), TOBN(0x2227b7d5, 0xd7ffc169),
+ TOBN(0x4f55c893, 0xb5cd6dd5), TOBN(0x82225e11, 0xa62796e8),
+ TOBN(0x5c6cead1, 0xcb18e12c), TOBN(0x4381ae0c, 0x84f5a51a),
+ TOBN(0x345913d3, 0x7fafa4c8), TOBN(0x3d918082, 0x0491aac0),
+ TOBN(0x9347871f, 0x3e69264c), TOBN(0xbea9dd3c, 0xb4f4f0cd),
+ TOBN(0xbda5d067, 0x3eadd3e7), TOBN(0x0033c1b8, 0x0573bcd8),
+ TOBN(0x25589379, 0x5da2486c), TOBN(0xcb89ee5b, 0x86abbee7),
+ TOBN(0x8fe0a8f3, 0x22532e5d), TOBN(0xb6410ff0, 0x727dfc4c),
+ TOBN(0x619b9d58, 0x226726db), TOBN(0x5ec25669, 0x7a2b2dc7),
+ TOBN(0xaf4d2e06, 0x4c3beb01), TOBN(0x852123d0, 0x7acea556),
+ TOBN(0x0e9470fa, 0xf783487a), TOBN(0x75a7ea04, 0x5664b3eb),
+ TOBN(0x4ad78f35, 0x6798e4ba), TOBN(0x9214e6e5, 0xc7d0e091),
+ TOBN(0xc420b488, 0xb1290403), TOBN(0x64049e0a, 0xfc295749),
+ TOBN(0x03ef5af1, 0x3ae9841f), TOBN(0xdbe4ca19, 0xb0b662a6),
+ TOBN(0x46845c5f, 0xfa453458), TOBN(0xf8dabf19, 0x10b66722),
+ TOBN(0xb650f0aa, 0xcce2793b), TOBN(0x71db851e, 0xc5ec47c1),
+ TOBN(0x3eb78f3e, 0x3b234fa9), TOBN(0xb0c60f35, 0xfc0106ce),
+ TOBN(0x05427121, 0x774eadbd), TOBN(0x25367faf, 0xce323863),
+ TOBN(0x7541b5c9, 0xcd086976), TOBN(0x4ff069e2, 0xdc507ad1),
+ TOBN(0x74145256, 0x8776e667), TOBN(0x6e76142c, 0xb23c6bb5),
+ TOBN(0xdbf30712, 0x1b3a8a87), TOBN(0x60e7363e, 0x98450836),
+ TOBN(0x5741450e, 0xb7366d80), TOBN(0xe4ee14ca, 0x4837dbdf),
+ TOBN(0xa765eb9b, 0x69d4316f), TOBN(0x04548dca, 0x8ef43825),
+ TOBN(0x9c9f4e4c, 0x5ae888eb), TOBN(0x733abb51, 0x56e9ac99),
+ TOBN(0xdaad3c20, 0xba6ac029), TOBN(0x9b8dd3d3, 0x2ba3e38e),
+ TOBN(0xa9bb4c92, 0x0bc5d11a), TOBN(0xf20127a7, 0x9c5f88a3),
+ TOBN(0x4f52b06e, 0x161d3cb8), TOBN(0x26c1ff09, 0x6afaf0a6),
+ TOBN(0x32670d2f, 0x7189e71f), TOBN(0xc6438748, 0x5ecf91e7),
+ TOBN(0x15758e57, 0xdb757a21), TOBN(0x427d09f8, 0x290a9ce5),
+ TOBN(0x846a308f, 0x38384a7a), TOBN(0xaac3acb4, 0xb0732b99),
+ TOBN(0x9e941009, 0x17845819), TOBN(0x95cba111, 0xa7ce5e03),
+ TOBN(0x6f3d4f7f, 0xb00009c4), TOBN(0xb8396c27, 0x8ff28b5f),
+ TOBN(0xb1a9ae43, 0x1c97975d), TOBN(0x9d7ba8af, 0xe5d9fed5),
+ TOBN(0x338cf09f, 0x34f485b6), TOBN(0xbc0ddacc, 0x64122516),
+ TOBN(0xa450da12, 0x05d471fe), TOBN(0x4c3a6250, 0x628dd8c9),
+ TOBN(0x69c7d103, 0xd1295837), TOBN(0xa2893e50, 0x3807eb2f),
+ TOBN(0xd6e1e1de, 0xbdb41491), TOBN(0xc630745b, 0x5e138235),
+ TOBN(0xc892109e, 0x48661ae1), TOBN(0x8d17e7eb, 0xea2b2674),
+ TOBN(0x00ec0f87, 0xc328d6b5), TOBN(0x6d858645, 0xf079ff9e),
+ TOBN(0x6cdf243e, 0x19115ead), TOBN(0x1ce1393e, 0x4bac4fcf),
+ TOBN(0x2c960ed0, 0x9c29f25b), TOBN(0x59be4d8e, 0x9d388a05),
+ TOBN(0x0d46e06c, 0xd0def72b), TOBN(0xb923db5d, 0xe0342748),
+ TOBN(0xf7d3aacd, 0x936d4a3d), TOBN(0x558519cc, 0x0b0b099e),
+ TOBN(0x3ea8ebf8, 0x827097ef), TOBN(0x259353db, 0xd054f55d),
+ TOBN(0x84c89abc, 0x6d2ed089), TOBN(0x5c548b69, 0x8e096a7c),
+ TOBN(0xd587f616, 0x994b995d), TOBN(0x4d1531f6, 0xa5845601),
+ TOBN(0x792ab31e, 0x451fd9f0), TOBN(0xc8b57bb2, 0x65adf6ca),
+ TOBN(0x68440fcb, 0x1cd5ad73), TOBN(0xb9c860e6, 0x6144da4f),
+ TOBN(0x2ab286aa, 0x8462beb8), TOBN(0xcc6b8fff, 0xef46797f),
+ TOBN(0xac820da4, 0x20c8a471), TOBN(0x69ae05a1, 0x77ff7faf),
+ TOBN(0xb9163f39, 0xbfb5da77), TOBN(0xbd03e590, 0x2c73ab7a),
+ TOBN(0x7e862b5e, 0xb2940d9e), TOBN(0x3c663d86, 0x4b9af564),
+ TOBN(0xd8309031, 0xbde3033d), TOBN(0x298231b2, 0xd42c5bc6),
+ TOBN(0x42090d2c, 0x552ad093), TOBN(0xa4799d1c, 0xff854695),
+ TOBN(0x0a88b5d6, 0xd31f0d00), TOBN(0xf8b40825, 0xa2f26b46),
+ TOBN(0xec29b1ed, 0xf1bd7218), TOBN(0xd491c53b, 0x4b24c86e),
+ TOBN(0xd2fe588f, 0x3395ea65), TOBN(0x6f3764f7, 0x4456ef15),
+ TOBN(0xdb43116d, 0xcdc34800), TOBN(0xcdbcd456, 0xc1e33955),
+ TOBN(0xefdb5540, 0x74ab286b), TOBN(0x948c7a51, 0xd18c5d7c),
+ TOBN(0xeb81aa37, 0x7378058e), TOBN(0x41c746a1, 0x04411154),
+ TOBN(0xa10c73bc, 0xfb828ac7), TOBN(0x6439be91, 0x9d972b29),
+ TOBN(0x4bf3b4b0, 0x43a2fbad), TOBN(0x39e6dadf, 0x82b5e840),
+ TOBN(0x4f716408, 0x6397bd4c), TOBN(0x0f7de568, 0x7f1eeccb),
+ TOBN(0x5865c5a1, 0xd2ffbfc1), TOBN(0xf74211fa, 0x4ccb6451),
+ TOBN(0x66368a88, 0xc0b32558), TOBN(0x5b539dc2, 0x9ad7812e),
+ TOBN(0x579483d0, 0x2f3af6f6), TOBN(0x52132078, 0x99934ece),
+ TOBN(0x50b9650f, 0xdcc9e983), TOBN(0xca989ec9, 0xaee42b8a),
+ TOBN(0x6a44c829, 0xd6f62f99), TOBN(0x8f06a309, 0x4c2a7c0c),
+ TOBN(0x4ea2b3a0, 0x98a0cb0a), TOBN(0x5c547b70, 0xbeee8364),
+ TOBN(0x461d40e1, 0x682afe11), TOBN(0x9e0fc77a, 0x7b41c0a8),
+ TOBN(0x79e4aefd, 0xe20d5d36), TOBN(0x2916e520, 0x32dd9f63),
+ TOBN(0xf59e52e8, 0x3f883faf), TOBN(0x396f9639, 0x2b868d35),
+ TOBN(0xc902a9df, 0x4ca19881), TOBN(0x0fc96822, 0xdb2401a6),
+ TOBN(0x41237587, 0x66f1c68d), TOBN(0x10fc6de3, 0xfb476c0d),
+ TOBN(0xf8b6b579, 0x841f5d90), TOBN(0x2ba8446c, 0xfa24f44a),
+ TOBN(0xa237b920, 0xef4a9975), TOBN(0x60bb6004, 0x2330435f),
+ TOBN(0xd6f4ab5a, 0xcfb7e7b5), TOBN(0xb2ac5097, 0x83435391),
+ TOBN(0xf036ee2f, 0xb0d1ea67), TOBN(0xae779a6a, 0x74c56230),
+ TOBN(0x59bff8c8, 0xab838ae6), TOBN(0xcd83ca99, 0x9b38e6f0),
+ TOBN(0xbb27bef5, 0xe33deed3), TOBN(0xe6356f6f, 0x001892a8),
+ TOBN(0xbf3be6cc, 0x7adfbd3e), TOBN(0xaecbc81c, 0x33d1ac9d),
+ TOBN(0xe4feb909, 0xe6e861dc), TOBN(0x90a247a4, 0x53f5f801),
+ TOBN(0x01c50acb, 0x27346e57), TOBN(0xce29242e, 0x461acc1b),
+ TOBN(0x04dd214a, 0x2f998a91), TOBN(0x271ee9b1, 0xd4baf27b),
+ TOBN(0x7e3027d1, 0xe8c26722), TOBN(0x21d1645c, 0x1820dce5),
+ TOBN(0x086f242c, 0x7501779c), TOBN(0xf0061407, 0xfa0e8009),
+ TOBN(0xf23ce477, 0x60187129), TOBN(0x05bbdedb, 0x0fde9bd0),
+ TOBN(0x682f4832, 0x25d98473), TOBN(0xf207fe85, 0x5c658427),
+ TOBN(0xb6fdd7ba, 0x4166ffa1), TOBN(0x0c314056, 0x9eed799d),
+ TOBN(0x0db8048f, 0x4107e28f), TOBN(0x74ed3871, 0x41216840),
+ TOBN(0x74489f8f, 0x56a3c06e), TOBN(0x1e1c005b, 0x12777134),
+ TOBN(0xdb332a73, 0xf37ec3c3), TOBN(0xc65259bd, 0xdd59eba0),
+ TOBN(0x2291709c, 0xdb4d3257), TOBN(0x9a793b25, 0xbd389390),
+ TOBN(0xf39fe34b, 0xe43756f0), TOBN(0x2f76bdce, 0x9afb56c9),
+ TOBN(0x9f37867a, 0x61208b27), TOBN(0xea1d4307, 0x089972c3),
+ TOBN(0x8c595330, 0x8bdf623a), TOBN(0x5f5accda, 0x8441fb7d),
+ TOBN(0xfafa9418, 0x32ddfd95), TOBN(0x6ad40c5a, 0x0fde9be7),
+ TOBN(0x43faba89, 0xaeca8709), TOBN(0xc64a7cf1, 0x2c248a9d),
+ TOBN(0x16620252, 0x72637a76), TOBN(0xaee1c791, 0x22b8d1bb),
+ TOBN(0xf0f798fd, 0x21a843b2), TOBN(0x56e4ed4d, 0x8d005cb1),
+ TOBN(0x355f7780, 0x1f0d8abe), TOBN(0x197b04cf, 0x34522326),
+ TOBN(0x41f9b31f, 0xfd42c13f), TOBN(0x5ef7feb2, 0xb40f933d),
+ TOBN(0x27326f42, 0x5d60bad4), TOBN(0x027ecdb2, 0x8c92cf89),
+ TOBN(0x04aae4d1, 0x4e3352fe), TOBN(0x08414d2f, 0x73591b90),
+ TOBN(0x5ed6124e, 0xb7da7d60), TOBN(0xb985b931, 0x4d13d4ec),
+ TOBN(0xa592d3ab, 0x96bf36f9), TOBN(0x012dbed5, 0xbbdf51df),
+ TOBN(0xa57963c0, 0xdf6c177d), TOBN(0x010ec869, 0x87ca29cf),
+ TOBN(0xba1700f6, 0xbf926dff), TOBN(0x7c9fdbd1, 0xf4bf6bc2),
+ TOBN(0xdc18dc8f, 0x64da11f5), TOBN(0xa6074b7a, 0xd938ae75),
+ TOBN(0x14270066, 0xe84f44a4), TOBN(0x99998d38, 0xd27b954e),
+ TOBN(0xc1be8ab2, 0xb4f38e9a), TOBN(0x8bb55bbf, 0x15c01016),
+ TOBN(0xf73472b4, 0x0ea2ab30), TOBN(0xd365a340, 0xf73d68dd),
+ TOBN(0xc01a7168, 0x19c2e1eb), TOBN(0x32f49e37, 0x34061719),
+ TOBN(0xb73c57f1, 0x01d8b4d6), TOBN(0x03c8423c, 0x26b47700),
+ TOBN(0x321d0bc8, 0xa4d8826a), TOBN(0x6004213c, 0x4bc0e638),
+ TOBN(0xf78c64a1, 0xc1c06681), TOBN(0x16e0a16f, 0xef018e50),
+ TOBN(0x31cbdf91, 0xdb42b2b3), TOBN(0xf8f4ffce, 0xe0d36f58),
+ TOBN(0xcdcc71cd, 0x4cc5e3e0), TOBN(0xd55c7cfa, 0xa129e3e0),
+ TOBN(0xccdb6ba0, 0x0fb2cbf1), TOBN(0x6aba0005, 0xc4bce3cb),
+ TOBN(0x501cdb30, 0xd232cfc4), TOBN(0x9ddcf12e, 0xd58a3cef),
+ TOBN(0x02d2cf9c, 0x87e09149), TOBN(0xdc5d7ec7, 0x2c976257),
+ TOBN(0x6447986e, 0x0b50d7dd), TOBN(0x88fdbaf7, 0x807f112a),
+ TOBN(0x58c9822a, 0xb00ae9f6), TOBN(0x6abfb950, 0x6d3d27e0),
+ TOBN(0xd0a74487, 0x8a429f4f), TOBN(0x0649712b, 0xdb516609),
+ TOBN(0xb826ba57, 0xe769b5df), TOBN(0x82335df2, 0x1fc7aaf2),
+ TOBN(0x2389f067, 0x5c93d995), TOBN(0x59ac367a, 0x68677be6),
+ TOBN(0xa77985ff, 0x21d9951b), TOBN(0x038956fb, 0x85011cce),
+ TOBN(0x608e48cb, 0xbb734e37), TOBN(0xc08c0bf2, 0x2be5b26f),
+ TOBN(0x17bbdd3b, 0xf9b1a0d9), TOBN(0xeac7d898, 0x10483319),
+ TOBN(0xc95c4baf, 0xbc1a6dea), TOBN(0xfdd0e2bf, 0x172aafdb),
+ TOBN(0x40373cbc, 0x8235c41a), TOBN(0x14303f21, 0xfb6f41d5),
+ TOBN(0xba063621, 0x0408f237), TOBN(0xcad3b09a, 0xecd2d1ed),
+ TOBN(0x4667855a, 0x52abb6a2), TOBN(0xba9157dc, 0xaa8b417b),
+ TOBN(0xfe7f3507, 0x4f013efb), TOBN(0x1b112c4b, 0xaa38c4a2),
+ TOBN(0xa1406a60, 0x9ba64345), TOBN(0xe53cba33, 0x6993c80b),
+ TOBN(0x45466063, 0xded40d23), TOBN(0x3d5f1f4d, 0x54908e25),
+ TOBN(0x9ebefe62, 0x403c3c31), TOBN(0x274ea0b5, 0x0672a624),
+ TOBN(0xff818d99, 0x451d1b71), TOBN(0x80e82643, 0x8f79cf79),
+ TOBN(0xa165df13, 0x73ce37f5), TOBN(0xa744ef4f, 0xfe3a21fd),
+ TOBN(0x73f1e7f5, 0xcf551396), TOBN(0xc616898e, 0x868c676b),
+ TOBN(0x671c28c7, 0x8c442c36), TOBN(0xcfe5e558, 0x5e0a317d),
+ TOBN(0x1242d818, 0x7051f476), TOBN(0x56fad2a6, 0x14f03442),
+ TOBN(0x262068bc, 0x0a44d0f6), TOBN(0xdfa2cd6e, 0xce6edf4e),
+ TOBN(0x0f43813a, 0xd15d1517), TOBN(0x61214cb2, 0x377d44f5),
+ TOBN(0xd399aa29, 0xc639b35f), TOBN(0x42136d71, 0x54c51c19),
+ TOBN(0x9774711b, 0x08417221), TOBN(0x0a5546b3, 0x52545a57),
+ TOBN(0x80624c41, 0x1150582d), TOBN(0x9ec5c418, 0xfbc555bc),
+ TOBN(0x2c87dcad, 0x771849f1), TOBN(0xb0c932c5, 0x01d7bf6f),
+ TOBN(0x6aa5cd3e, 0x89116eb2), TOBN(0xd378c25a, 0x51ca7bd3),
+ TOBN(0xc612a0da, 0x9e6e3e31), TOBN(0x0417a54d, 0xb68ad5d0),
+ TOBN(0x00451e4a, 0x22c6edb8), TOBN(0x9fbfe019, 0xb42827ce),
+ TOBN(0x2fa92505, 0xba9384a2), TOBN(0x21b8596e, 0x64ad69c1),
+ TOBN(0x8f4fcc49, 0x983b35a6), TOBN(0xde093760, 0x72754672),
+ TOBN(0x2f14ccc8, 0xf7bffe6d), TOBN(0x27566bff, 0x5d94263d),
+ TOBN(0xb5b4e9c6, 0x2df3ec30), TOBN(0x94f1d7d5, 0x3e6ea6ba),
+ TOBN(0x97b7851a, 0xaaca5e9b), TOBN(0x518aa521, 0x56713b97),
+ TOBN(0x3357e8c7, 0x150a61f6), TOBN(0x7842e7e2, 0xec2c2b69),
+ TOBN(0x8dffaf65, 0x6868a548), TOBN(0xd963bd82, 0xe068fc81),
+ TOBN(0x64da5c8b, 0x65917733), TOBN(0x927090ff, 0x7b247328),}
+ ,
+ {TOBN(0x214bc9a7, 0xd298c241), TOBN(0xe3b697ba, 0x56807cfd),
+ TOBN(0xef1c7802, 0x4564eadb), TOBN(0xdde8cdcf, 0xb48149c5),
+ TOBN(0x946bf0a7, 0x5a4d2604), TOBN(0x27154d7f, 0x6c1538af),
+ TOBN(0x95cc9230, 0xde5b1fcc), TOBN(0xd88519e9, 0x66864f82),
+ TOBN(0xb828dd1a, 0x7cb1282c), TOBN(0xa08d7626, 0xbe46973a),
+ TOBN(0x6baf8d40, 0xe708d6b2), TOBN(0x72571fa1, 0x4daeb3f3),
+ TOBN(0x85b1732f, 0xf22dfd98), TOBN(0x87ab01a7, 0x0087108d),
+ TOBN(0xaaaafea8, 0x5988207a), TOBN(0xccc832f8, 0x69f00755),
+ TOBN(0x964d950e, 0x36ff3bf0), TOBN(0x8ad20f6f, 0xf0b34638),
+ TOBN(0x4d9177b3, 0xb5d7585f), TOBN(0xcf839760, 0xef3f019f),
+ TOBN(0x582fc5b3, 0x8288c545), TOBN(0x2f8e4e9b, 0x13116bd1),
+ TOBN(0xf91e1b2f, 0x332120ef), TOBN(0xcf568724, 0x2a17dd23),
+ TOBN(0x488f1185, 0xca8d9d1a), TOBN(0xadf2c77d, 0xd987ded2),
+ TOBN(0x5f3039f0, 0x60c46124), TOBN(0xe5d70b75, 0x71e095f4),
+ TOBN(0x82d58650, 0x6260e70f), TOBN(0x39d75ea7, 0xf750d105),
+ TOBN(0x8cf3d0b1, 0x75bac364), TOBN(0xf3a7564d, 0x21d01329),
+ TOBN(0x182f04cd, 0x2f52d2a7), TOBN(0x4fde149a, 0xe2df565a),
+ TOBN(0xb80c5eec, 0xa79fb2f7), TOBN(0xab491d7b, 0x22ddc897),
+ TOBN(0x99d76c18, 0xc6312c7f), TOBN(0xca0d5f3d, 0x6aa41a57),
+ TOBN(0x71207325, 0xd15363a0), TOBN(0xe82aa265, 0xbeb252c2),
+ TOBN(0x94ab4700, 0xec3128c2), TOBN(0x6c76d862, 0x8e383f49),
+ TOBN(0xdc36b150, 0xc03024eb), TOBN(0xfb439477, 0x53daac69),
+ TOBN(0xfc68764a, 0x8dc79623), TOBN(0x5b86995d, 0xb440fbb2),
+ TOBN(0xd66879bf, 0xccc5ee0d), TOBN(0x05228942, 0x95aa8bd3),
+ TOBN(0xb51a40a5, 0x1e6a75c1), TOBN(0x24327c76, 0x0ea7d817),
+ TOBN(0x06630182, 0x07774597), TOBN(0xd6fdbec3, 0x97fa7164),
+ TOBN(0x20c99dfb, 0x13c90f48), TOBN(0xd6ac5273, 0x686ef263),
+ TOBN(0xc6a50bdc, 0xfef64eeb), TOBN(0xcd87b281, 0x86fdfc32),
+ TOBN(0xb24aa43e, 0x3fcd3efc), TOBN(0xdd26c034, 0xb8088e9a),
+ TOBN(0xa5ef4dc9, 0xbd3d46ea), TOBN(0xa2f99d58, 0x8a4c6a6f),
+ TOBN(0xddabd355, 0x2f1da46c), TOBN(0x72c3f8ce, 0x1afacdd1),
+ TOBN(0xd90c4eee, 0x92d40578), TOBN(0xd28bb41f, 0xca623b94),
+ TOBN(0x50fc0711, 0x745edc11), TOBN(0x9dd9ad7d, 0x3dc87558),
+ TOBN(0xce6931fb, 0xb49d1e64), TOBN(0x6c77a0a2, 0xc98bd0f9),
+ TOBN(0x62b9a629, 0x6baf7cb1), TOBN(0xcf065f91, 0xccf72d22),
+ TOBN(0x7203cce9, 0x79639071), TOBN(0x09ae4885, 0xf9cb732f),
+ TOBN(0x5e7c3bec, 0xee8314f3), TOBN(0x1c068aed, 0xdbea298f),
+ TOBN(0x08d381f1, 0x7c80acec), TOBN(0x03b56be8, 0xe330495b),
+ TOBN(0xaeffb8f2, 0x9222882d), TOBN(0x95ff38f6, 0xc4af8bf7),
+ TOBN(0x50e32d35, 0x1fc57d8c), TOBN(0x6635be52, 0x17b444f0),
+ TOBN(0x04d15276, 0xa5177900), TOBN(0x4e1dbb47, 0xf6858752),
+ TOBN(0x5b475622, 0xc615796c), TOBN(0xa6fa0387, 0x691867bf),
+ TOBN(0xed7f5d56, 0x2844c6d0), TOBN(0xc633cf9b, 0x03a2477d),
+ TOBN(0xf6be5c40, 0x2d3721d6), TOBN(0xaf312eb7, 0xe9fd68e6),
+ TOBN(0x242792d2, 0xe7417ce1), TOBN(0xff42bc71, 0x970ee7f5),
+ TOBN(0x1ff4dc6d, 0x5c67a41e), TOBN(0x77709b7b, 0x20882a58),
+ TOBN(0x3554731d, 0xbe217f2c), TOBN(0x2af2a8cd, 0x5bb72177),
+ TOBN(0x58eee769, 0x591dd059), TOBN(0xbb2930c9, 0x4bba6477),
+ TOBN(0x863ee047, 0x7d930cfc), TOBN(0x4c262ad1, 0x396fd1f4),
+ TOBN(0xf4765bc8, 0x039af7e1), TOBN(0x2519834b, 0x5ba104f6),
+ TOBN(0x7cd61b4c, 0xd105f961), TOBN(0xa5415da5, 0xd63bca54),
+ TOBN(0x778280a0, 0x88a1f17c), TOBN(0xc4968949, 0x2329512c),
+ TOBN(0x174a9126, 0xcecdaa7a), TOBN(0xfc8c7e0e, 0x0b13247b),
+ TOBN(0x29c110d2, 0x3484c1c4), TOBN(0xf8eb8757, 0x831dfc3b),
+ TOBN(0x022f0212, 0xc0067452), TOBN(0x3f6f69ee, 0x7b9b926c),
+ TOBN(0x09032da0, 0xef42daf4), TOBN(0x79f00ade, 0x83f80de4),
+ TOBN(0x6210db71, 0x81236c97), TOBN(0x74f7685b, 0x3ee0781f),
+ TOBN(0x4df7da7b, 0xa3e41372), TOBN(0x2aae38b1, 0xb1a1553e),
+ TOBN(0x1688e222, 0xf6dd9d1b), TOBN(0x57695448, 0x5b8b6487),
+ TOBN(0x478d2127, 0x4b2edeaa), TOBN(0xb2818fa5, 0x1e85956a),
+ TOBN(0x1e6addda, 0xf176f2c0), TOBN(0x01ca4604, 0xe2572658),
+ TOBN(0x0a404ded, 0x85342ffb), TOBN(0x8cf60f96, 0x441838d6),
+ TOBN(0x9bbc691c, 0xc9071c4a), TOBN(0xfd588744, 0x34442803),
+ TOBN(0x97101c85, 0x809c0d81), TOBN(0xa7fb754c, 0x8c456f7f),
+ TOBN(0xc95f3c5c, 0xd51805e1), TOBN(0xab4ccd39, 0xb299dca8),
+ TOBN(0x3e03d20b, 0x47eaf500), TOBN(0xfa3165c1, 0xd7b80893),
+ TOBN(0x005e8b54, 0xe160e552), TOBN(0xdc4972ba, 0x9019d11f),
+ TOBN(0x21a6972e, 0x0c9a4a7a), TOBN(0xa52c258f, 0x37840fd7),
+ TOBN(0xf8559ff4, 0xc1e99d81), TOBN(0x08e1a7d6, 0xa3c617c0),
+ TOBN(0xb398fd43, 0x248c6ba7), TOBN(0x6ffedd91, 0xd1283794),
+ TOBN(0x8a6a59d2, 0xd629d208), TOBN(0xa9d141d5, 0x3490530e),
+ TOBN(0x42f6fc18, 0x38505989), TOBN(0x09bf250d, 0x479d94ee),
+ TOBN(0x223ad3b1, 0xb3822790), TOBN(0x6c5926c0, 0x93b8971c),
+ TOBN(0x609efc7e, 0x75f7fa62), TOBN(0x45d66a6d, 0x1ec2d989),
+ TOBN(0x4422d663, 0x987d2792), TOBN(0x4a73caad, 0x3eb31d2b),
+ TOBN(0xf06c2ac1, 0xa32cb9e6), TOBN(0xd9445c5f, 0x91aeba84),
+ TOBN(0x6af7a1d5, 0xaf71013f), TOBN(0xe68216e5, 0x0bedc946),
+ TOBN(0xf4cba30b, 0xd27370a0), TOBN(0x7981afbf, 0x870421cc),
+ TOBN(0x02496a67, 0x9449f0e1), TOBN(0x86cfc4be, 0x0a47edae),
+ TOBN(0x3073c936, 0xb1feca22), TOBN(0xf5694612, 0x03f8f8fb),
+ TOBN(0xd063b723, 0x901515ea), TOBN(0x4c6c77a5, 0x749cf038),
+ TOBN(0x6361e360, 0xab9e5059), TOBN(0x596cf171, 0xa76a37c0),
+ TOBN(0x800f53fa, 0x6530ae7a), TOBN(0x0f5e631e, 0x0792a7a6),
+ TOBN(0x5cc29c24, 0xefdb81c9), TOBN(0xa269e868, 0x3f9c40ba),
+ TOBN(0xec14f9e1, 0x2cb7191e), TOBN(0x78ea1bd8, 0xe5b08ea6),
+ TOBN(0x3c65aa9b, 0x46332bb9), TOBN(0x84cc22b3, 0xbf80ce25),
+ TOBN(0x0098e9e9, 0xd49d5bf1), TOBN(0xcd4ec1c6, 0x19087da4),
+ TOBN(0x3c9d07c5, 0xaef6e357), TOBN(0x839a0268, 0x9f8f64b8),
+ TOBN(0xc5e9eb62, 0xc6d8607f), TOBN(0x759689f5, 0x6aa995e4),
+ TOBN(0x70464669, 0xbbb48317), TOBN(0x921474bf, 0xe402417d),
+ TOBN(0xcabe135b, 0x2a354c8c), TOBN(0xd51e52d2, 0x812fa4b5),
+ TOBN(0xec741096, 0x53311fe8), TOBN(0x4f774535, 0xb864514b),
+ TOBN(0xbcadd671, 0x5bde48f8), TOBN(0xc9703873, 0x2189bc7d),
+ TOBN(0x5d45299e, 0xc709ee8a), TOBN(0xd1287ee2, 0x845aaff8),
+ TOBN(0x7d1f8874, 0xdb1dbf1f), TOBN(0xea46588b, 0x990c88d6),
+ TOBN(0x60ba649a, 0x84368313), TOBN(0xd5fdcbce, 0x60d543ae),
+ TOBN(0x90b46d43, 0x810d5ab0), TOBN(0x6739d8f9, 0x04d7e5cc),
+ TOBN(0x021c1a58, 0x0d337c33), TOBN(0x00a61162, 0x68e67c40),
+ TOBN(0x95ef413b, 0x379f0a1f), TOBN(0xfe126605, 0xe9e2ab95),
+ TOBN(0x67578b85, 0x2f5f199c), TOBN(0xf5c00329, 0x2cb84913),
+ TOBN(0xf7956430, 0x37577dd8), TOBN(0x83b82af4, 0x29c5fe88),
+ TOBN(0x9c1bea26, 0xcdbdc132), TOBN(0x589fa086, 0x9c04339e),
+ TOBN(0x033e9538, 0xb13799df), TOBN(0x85fa8b21, 0xd295d034),
+ TOBN(0xdf17f73f, 0xbd9ddcca), TOBN(0xf32bd122, 0xddb66334),
+ TOBN(0x55ef88a7, 0x858b044c), TOBN(0x1f0d69c2, 0x5aa9e397),
+ TOBN(0x55fd9cc3, 0x40d85559), TOBN(0xc774df72, 0x7785ddb2),
+ TOBN(0x5dcce9f6, 0xd3bd2e1c), TOBN(0xeb30da20, 0xa85dfed0),
+ TOBN(0x5ed7f5bb, 0xd3ed09c4), TOBN(0x7d42a35c, 0x82a9c1bd),
+ TOBN(0xcf3de995, 0x9890272d), TOBN(0x75f3432a, 0x3e713a10),
+ TOBN(0x5e13479f, 0xe28227b8), TOBN(0xb8561ea9, 0xfefacdc8),
+ TOBN(0xa6a297a0, 0x8332aafd), TOBN(0x9b0d8bb5, 0x73809b62),
+ TOBN(0xd2fa1cfd, 0x0c63036f), TOBN(0x7a16eb55, 0xbd64bda8),
+ TOBN(0x3f5cf5f6, 0x78e62ddc), TOBN(0x2267c454, 0x07fd752b),
+ TOBN(0x5e361b6b, 0x5e437bbe), TOBN(0x95c59501, 0x8354e075),
+ TOBN(0xec725f85, 0xf2b254d9), TOBN(0x844b617d, 0x2cb52b4e),
+ TOBN(0xed8554f5, 0xcf425fb5), TOBN(0xab67703e, 0x2af9f312),
+ TOBN(0x4cc34ec1, 0x3cf48283), TOBN(0xb09daa25, 0x9c8a705e),
+ TOBN(0xd1e9d0d0, 0x5b7d4f84), TOBN(0x4df6ef64, 0xdb38929d),
+ TOBN(0xe16b0763, 0xaa21ba46), TOBN(0xc6b1d178, 0xa293f8fb),
+ TOBN(0x0ff5b602, 0xd520aabf), TOBN(0x94d671bd, 0xc339397a),
+ TOBN(0x7c7d98cf, 0x4f5792fa), TOBN(0x7c5e0d67, 0x11215261),
+ TOBN(0x9b19a631, 0xa7c5a6d4), TOBN(0xc8511a62, 0x7a45274d),
+ TOBN(0x0c16621c, 0xa5a60d99), TOBN(0xf7fbab88, 0xcf5e48cb),
+ TOBN(0xab1e6ca2, 0xf7ddee08), TOBN(0x83bd08ce, 0xe7867f3c),
+ TOBN(0xf7e48e8a, 0x2ac13e27), TOBN(0x4494f6df, 0x4eb1a9f5),
+ TOBN(0xedbf84eb, 0x981f0a62), TOBN(0x49badc32, 0x536438f0),
+ TOBN(0x50bea541, 0x004f7571), TOBN(0xbac67d10, 0xdf1c94ee),
+ TOBN(0x253d73a1, 0xb727bc31), TOBN(0xb3d01cf2, 0x30686e28),
+ TOBN(0x51b77b1b, 0x55fd0b8b), TOBN(0xa099d183, 0xfeec3173),
+ TOBN(0x202b1fb7, 0x670e72b7), TOBN(0xadc88b33, 0xa8e1635f),
+ TOBN(0x34e8216a, 0xf989d905), TOBN(0xc2e68d20, 0x29b58d01),
+ TOBN(0x11f81c92, 0x6fe55a93), TOBN(0x15f1462a, 0x8f296f40),
+ TOBN(0x1915d375, 0xea3d62f2), TOBN(0xa17765a3, 0x01c8977d),
+ TOBN(0x7559710a, 0xe47b26f6), TOBN(0xe0bd29c8, 0x535077a5),
+ TOBN(0x615f976d, 0x08d84858), TOBN(0x370dfe85, 0x69ced5c1),
+ TOBN(0xbbc7503c, 0xa734fa56), TOBN(0xfbb9f1ec, 0x91ac4574),
+ TOBN(0x95d7ec53, 0x060dd7ef), TOBN(0xeef2dacd, 0x6e657979),
+ TOBN(0x54511af3, 0xe2a08235), TOBN(0x1e324aa4, 0x1f4aea3d),
+ TOBN(0x550e7e71, 0xe6e67671), TOBN(0xbccd5190, 0xbf52faf7),
+ TOBN(0xf880d316, 0x223cc62a), TOBN(0x0d402c7e, 0x2b32eb5d),
+ TOBN(0xa40bc039, 0x306a5a3b), TOBN(0x4e0a41fd, 0x96783a1b),
+ TOBN(0xa1e8d39a, 0x0253cdd4), TOBN(0x6480be26, 0xc7388638),
+ TOBN(0xee365e1d, 0x2285f382), TOBN(0x188d8d8f, 0xec0b5c36),
+ TOBN(0x34ef1a48, 0x1f0f4d82), TOBN(0x1a8f43e1, 0xa487d29a),
+ TOBN(0x8168226d, 0x77aefb3a), TOBN(0xf69a751e, 0x1e72c253),
+ TOBN(0x8e04359a, 0xe9594df1), TOBN(0x475ffd7d, 0xd14c0467),
+ TOBN(0xb5a2c2b1, 0x3844e95c), TOBN(0x85caf647, 0xdd12ef94),
+ TOBN(0x1ecd2a9f, 0xf1063d00), TOBN(0x1dd2e229, 0x23843311),
+ TOBN(0x38f0e09d, 0x73d17244), TOBN(0x3ede7746, 0x8fc653f1),
+ TOBN(0xae4459f5, 0xdc20e21c), TOBN(0x00db2ffa, 0x6a8599ea),
+ TOBN(0x11682c39, 0x30cfd905), TOBN(0x4934d074, 0xa5c112a6),
+ TOBN(0xbdf063c5, 0x568bfe95), TOBN(0x779a440a, 0x016c441a),
+ TOBN(0x0c23f218, 0x97d6fbdc), TOBN(0xd3a5cd87, 0xe0776aac),
+ TOBN(0xcee37f72, 0xd712e8db), TOBN(0xfb28c70d, 0x26f74e8d),
+ TOBN(0xffe0c728, 0xb61301a0), TOBN(0xa6282168, 0xd3724354),
+ TOBN(0x7ff4cb00, 0x768ffedc), TOBN(0xc51b3088, 0x03b02de9),
+ TOBN(0xa5a8147c, 0x3902dda5), TOBN(0x35d2f706, 0xfe6973b4),
+ TOBN(0x5ac2efcf, 0xc257457e), TOBN(0x933f48d4, 0x8700611b),
+ TOBN(0xc365af88, 0x4912beb2), TOBN(0x7f5a4de6, 0x162edf94),
+ TOBN(0xc646ba7c, 0x0c32f34b), TOBN(0x632c6af3, 0xb2091074),
+ TOBN(0x58d4f2e3, 0x753e43a9), TOBN(0x70e1d217, 0x24d4e23f),
+ TOBN(0xb24bf729, 0xafede6a6), TOBN(0x7f4a94d8, 0x710c8b60),
+ TOBN(0xaad90a96, 0x8d4faa6a), TOBN(0xd9ed0b32, 0xb066b690),
+ TOBN(0x52fcd37b, 0x78b6dbfd), TOBN(0x0b64615e, 0x8bd2b431),
+ TOBN(0x228e2048, 0xcfb9fad5), TOBN(0xbeaa386d, 0x240b76bd),
+ TOBN(0x2d6681c8, 0x90dad7bc), TOBN(0x3e553fc3, 0x06d38f5e),
+ TOBN(0xf27cdb9b, 0x9d5f9750), TOBN(0x3e85c52a, 0xd28c5b0e),
+ TOBN(0x190795af, 0x5247c39b), TOBN(0x547831eb, 0xbddd6828),
+ TOBN(0xf327a227, 0x4a82f424), TOBN(0x36919c78, 0x7e47f89d),
+ TOBN(0xe4783919, 0x43c7392c), TOBN(0xf101b9aa, 0x2316fefe),
+ TOBN(0xbcdc9e9c, 0x1c5009d2), TOBN(0xfb55ea13, 0x9cd18345),
+ TOBN(0xf5b5e231, 0xa3ce77c7), TOBN(0xde6b4527, 0xd2f2cb3d),
+ TOBN(0x10f6a333, 0x9bb26f5f), TOBN(0x1e85db8e, 0x044d85b6),
+ TOBN(0xc3697a08, 0x94197e54), TOBN(0x65e18cc0, 0xa7cb4ea8),
+ TOBN(0xa38c4f50, 0xa471fe6e), TOBN(0xf031747a, 0x2f13439c),
+ TOBN(0x53c4a6ba, 0xc007318b), TOBN(0xa8da3ee5, 0x1deccb3d),
+ TOBN(0x0555b31c, 0x558216b1), TOBN(0x90c7810c, 0x2f79e6c2),
+ TOBN(0x9b669f4d, 0xfe8eed3c), TOBN(0x70398ec8, 0xe0fac126),
+ TOBN(0xa96a449e, 0xf701b235), TOBN(0x0ceecdb3, 0xeb94f395),
+ TOBN(0x285fc368, 0xd0cb7431), TOBN(0x0d37bb52, 0x16a18c64),
+ TOBN(0x05110d38, 0xb880d2dd), TOBN(0xa60f177b, 0x65930d57),
+ TOBN(0x7da34a67, 0xf36235f5), TOBN(0x47f5e17c, 0x183816b9),
+ TOBN(0xc7664b57, 0xdb394af4), TOBN(0x39ba215d, 0x7036f789),
+ TOBN(0x46d2ca0e, 0x2f27b472), TOBN(0xc42647ee, 0xf73a84b7),
+ TOBN(0x44bc7545, 0x64488f1d), TOBN(0xaa922708, 0xf4cf85d5),
+ TOBN(0x721a01d5, 0x53e4df63), TOBN(0x649c0c51, 0x5db46ced),
+ TOBN(0x6bf0d64e, 0x3cffcb6c), TOBN(0xe3bf93fe, 0x50f71d96),
+ TOBN(0x75044558, 0xbcc194a0), TOBN(0x16ae3372, 0x6afdc554),
+ TOBN(0xbfc01adf, 0x5ca48f3f), TOBN(0x64352f06, 0xe22a9b84),
+ TOBN(0xcee54da1, 0xc1099e4a), TOBN(0xbbda54e8, 0xfa1b89c0),
+ TOBN(0x166a3df5, 0x6f6e55fb), TOBN(0x1ca44a24, 0x20176f88),
+ TOBN(0x936afd88, 0xdfb7b5ff), TOBN(0xe34c2437, 0x8611d4a0),
+ TOBN(0x7effbb75, 0x86142103), TOBN(0x6704ba1b, 0x1f34fc4d),
+ TOBN(0x7c2a468f, 0x10c1b122), TOBN(0x36b3a610, 0x8c6aace9),
+ TOBN(0xabfcc0a7, 0x75a0d050), TOBN(0x066f9197, 0x3ce33e32),
+ TOBN(0xce905ef4, 0x29fe09be), TOBN(0x89ee25ba, 0xa8376351),
+ TOBN(0x2a3ede22, 0xfd29dc76), TOBN(0x7fd32ed9, 0x36f17260),
+ TOBN(0x0cadcf68, 0x284b4126), TOBN(0x63422f08, 0xa7951fc8),
+ TOBN(0x562b24f4, 0x0807e199), TOBN(0xfe9ce5d1, 0x22ad4490),
+ TOBN(0xc2f51b10, 0x0db2b1b4), TOBN(0xeb3613ff, 0xe4541d0d),
+ TOBN(0xbd2c4a05, 0x2680813b), TOBN(0x527aa55d, 0x561b08d6),
+ TOBN(0xa9f8a40e, 0xa7205558), TOBN(0xe3eea56f, 0x243d0bec),
+ TOBN(0x7b853817, 0xa0ff58b3), TOBN(0xb67d3f65, 0x1a69e627),
+ TOBN(0x0b76bbb9, 0xa869b5d6), TOBN(0xa3afeb82, 0x546723ed),
+ TOBN(0x5f24416d, 0x3e554892), TOBN(0x8413b53d, 0x430e2a45),
+ TOBN(0x99c56aee, 0x9032a2a0), TOBN(0x09432bf6, 0xeec367b1),
+ TOBN(0x552850c6, 0xdaf0ecc1), TOBN(0x49ebce55, 0x5bc92048),
+ TOBN(0xdfb66ba6, 0x54811307), TOBN(0x1b84f797, 0x6f298597),
+ TOBN(0x79590481, 0x8d1d7a0d), TOBN(0xd9fabe03, 0x3a6fa556),
+ TOBN(0xa40f9c59, 0xba9e5d35), TOBN(0xcb1771c1, 0xf6247577),
+ TOBN(0x542a47ca, 0xe9a6312b), TOBN(0xa34b3560, 0x552dd8c5),
+ TOBN(0xfdf94de0, 0x0d794716), TOBN(0xd46124a9, 0x9c623094),
+ TOBN(0x56b7435d, 0x68afe8b4), TOBN(0x27f20540, 0x6c0d8ea1),
+ TOBN(0x12b77e14, 0x73186898), TOBN(0xdbc3dd46, 0x7479490f),
+ TOBN(0x951a9842, 0xc03b0c05), TOBN(0x8b1b3bb3, 0x7921bc96),
+ TOBN(0xa573b346, 0x2b202e0a), TOBN(0x77e4665d, 0x47254d56),
+ TOBN(0x08b70dfc, 0xd23e3984), TOBN(0xab86e8bc, 0xebd14236),
+ TOBN(0xaa3e07f8, 0x57114ba7), TOBN(0x5ac71689, 0xab0ef4f2),
+ TOBN(0x88fca384, 0x0139d9af), TOBN(0x72733f88, 0x76644af0),
+ TOBN(0xf122f72a, 0x65d74f4a), TOBN(0x13931577, 0xa5626c7a),
+ TOBN(0xd5b5d9eb, 0x70f8d5a4), TOBN(0x375adde7, 0xd7bbb228),
+ TOBN(0x31e88b86, 0x0c1c0b32), TOBN(0xd1f568c4, 0x173edbaa),
+ TOBN(0x1592fc83, 0x5459df02), TOBN(0x2beac0fb, 0x0fcd9a7e),
+ TOBN(0xb0a6fdb8, 0x1b473b0a), TOBN(0xe3224c6f, 0x0fe8fc48),
+ TOBN(0x680bd00e, 0xe87edf5b), TOBN(0x30385f02, 0x20e77cf5),
+ TOBN(0xe9ab98c0, 0x4d42d1b2), TOBN(0x72d191d2, 0xd3816d77),
+ TOBN(0x1564daca, 0x0917d9e5), TOBN(0x394eab59, 0x1f8fed7f),
+ TOBN(0xa209aa8d, 0x7fbb3896), TOBN(0x5564f3b9, 0xbe6ac98e),
+ TOBN(0xead21d05, 0xd73654ef), TOBN(0x68d1a9c4, 0x13d78d74),
+ TOBN(0x61e01708, 0x6d4973a0), TOBN(0x83da3500, 0x46e6d32a),
+ TOBN(0x6a3dfca4, 0x68ae0118), TOBN(0xa1b9a4c9, 0xd02da069),
+ TOBN(0x0b2ff9c7, 0xebab8302), TOBN(0x98af07c3, 0x944ba436),
+ TOBN(0x85997326, 0x995f0f9f), TOBN(0x467fade0, 0x71b58bc6),
+ TOBN(0x47e4495a, 0xbd625a2b), TOBN(0xfdd2d01d, 0x33c3b8cd),
+ TOBN(0x2c38ae28, 0xc693f9fa), TOBN(0x48622329, 0x348f7999),
+ TOBN(0x97bf738e, 0x2161f583), TOBN(0x15ee2fa7, 0x565e8cc9),
+ TOBN(0xa1a5c845, 0x5777e189), TOBN(0xcc10bee0, 0x456f2829),
+ TOBN(0x8ad95c56, 0xda762bd5), TOBN(0x152e2214, 0xe9d91da8),
+ TOBN(0x975b0e72, 0x7cb23c74), TOBN(0xfd5d7670, 0xa90c66df),
+ TOBN(0xb5b5b8ad, 0x225ffc53), TOBN(0xab6dff73, 0xfaded2ae),
+ TOBN(0xebd56781, 0x6f4cbe9d), TOBN(0x0ed8b249, 0x6a574bd7),
+ TOBN(0x41c246fe, 0x81a881fa), TOBN(0x91564805, 0xc3db9c70),
+ TOBN(0xd7c12b08, 0x5b862809), TOBN(0x1facd1f1, 0x55858d7b),
+ TOBN(0x7693747c, 0xaf09e92a), TOBN(0x3b69dcba, 0x189a425f),
+ TOBN(0x0be28e9f, 0x967365ef), TOBN(0x57300eb2, 0xe801f5c9),
+ TOBN(0x93b8ac6a, 0xd583352f), TOBN(0xa2cf1f89, 0xcd05b2b7),
+ TOBN(0x7c0c9b74, 0x4dcc40cc), TOBN(0xfee38c45, 0xada523fb),
+ TOBN(0xb49a4dec, 0x1099cc4d), TOBN(0x325c377f, 0x69f069c6),
+ TOBN(0xe12458ce, 0x476cc9ff), TOBN(0x580e0b6c, 0xc6d4cb63),
+ TOBN(0xd561c8b7, 0x9072289b), TOBN(0x0377f264, 0xa619e6da),
+ TOBN(0x26685362, 0x88e591a5), TOBN(0xa453a7bd, 0x7523ca2b),
+ TOBN(0x8a9536d2, 0xc1df4533), TOBN(0xc8e50f2f, 0xbe972f79),
+ TOBN(0xd433e50f, 0x6d3549cf), TOBN(0x6f33696f, 0xfacd665e),
+ TOBN(0x695bfdac, 0xce11fcb4), TOBN(0x810ee252, 0xaf7c9860),
+ TOBN(0x65450fe1, 0x7159bb2c), TOBN(0xf7dfbebe, 0x758b357b),
+ TOBN(0x2b057e74, 0xd69fea72), TOBN(0xd485717a, 0x92731745),}
+ ,
+ {TOBN(0x896c42e8, 0xee36860c), TOBN(0xdaf04dfd, 0x4113c22d),
+ TOBN(0x1adbb7b7, 0x44104213), TOBN(0xe5fd5fa1, 0x1fd394ea),
+ TOBN(0x68235d94, 0x1a4e0551), TOBN(0x6772cfbe, 0x18d10151),
+ TOBN(0x276071e3, 0x09984523), TOBN(0xe4e879de, 0x5a56ba98),
+ TOBN(0xaaafafb0, 0x285b9491), TOBN(0x01a0be88, 0x1e4c705e),
+ TOBN(0xff1d4f5d, 0x2ad9caab), TOBN(0x6e349a4a, 0xc37a233f),
+ TOBN(0xcf1c1246, 0x4a1c6a16), TOBN(0xd99e6b66, 0x29383260),
+ TOBN(0xea3d4366, 0x5f6d5471), TOBN(0x36974d04, 0xff8cc89b),
+ TOBN(0xc26c49a1, 0xcfe89d80), TOBN(0xb42c026d, 0xda9c8371),
+ TOBN(0xca6c013a, 0xdad066d2), TOBN(0xfb8f7228, 0x56a4f3ee),
+ TOBN(0x08b579ec, 0xd850935b), TOBN(0x34c1a74c, 0xd631e1b3),
+ TOBN(0xcb5fe596, 0xac198534), TOBN(0x39ff21f6, 0xe1f24f25),
+ TOBN(0x27f29e14, 0x8f929057), TOBN(0x7a64ae06, 0xc0c853df),
+ TOBN(0x256cd183, 0x58e9c5ce), TOBN(0x9d9cce82, 0xded092a5),
+ TOBN(0xcc6e5979, 0x6e93b7c7), TOBN(0xe1e47092, 0x31bb9e27),
+ TOBN(0xb70b3083, 0xaa9e29a0), TOBN(0xbf181a75, 0x3785e644),
+ TOBN(0xf53f2c65, 0x8ead09f7), TOBN(0x1335e1d5, 0x9780d14d),
+ TOBN(0x69cc20e0, 0xcd1b66bc), TOBN(0x9b670a37, 0xbbe0bfc8),
+ TOBN(0xce53dc81, 0x28efbeed), TOBN(0x0c74e77c, 0x8326a6e5),
+ TOBN(0x3604e0d2, 0xb88e9a63), TOBN(0xbab38fca, 0x13dc2248),
+ TOBN(0x8ed6e8c8, 0x5c0a3f1e), TOBN(0xbcad2492, 0x7c87c37f),
+ TOBN(0xfdfb62bb, 0x9ee3b78d), TOBN(0xeba8e477, 0xcbceba46),
+ TOBN(0x37d38cb0, 0xeeaede4b), TOBN(0x0bc498e8, 0x7976deb6),
+ TOBN(0xb2944c04, 0x6b6147fb), TOBN(0x8b123f35, 0xf71f9609),
+ TOBN(0xa155dcc7, 0xde79dc24), TOBN(0xf1168a32, 0x558f69cd),
+ TOBN(0xbac21595, 0x0d1850df), TOBN(0x15c8295b, 0xb204c848),
+ TOBN(0xf661aa36, 0x7d8184ff), TOBN(0xc396228e, 0x30447bdb),
+ TOBN(0x11cd5143, 0xbde4a59e), TOBN(0xe3a26e3b, 0x6beab5e6),
+ TOBN(0xd3b3a13f, 0x1402b9d0), TOBN(0x573441c3, 0x2c7bc863),
+ TOBN(0x4b301ec4, 0x578c3e6e), TOBN(0xc26fc9c4, 0x0adaf57e),
+ TOBN(0x96e71bfd, 0x7493cea3), TOBN(0xd05d4b3f, 0x1af81456),
+ TOBN(0xdaca2a8a, 0x6a8c608f), TOBN(0x53ef07f6, 0x0725b276),
+ TOBN(0x07a5fbd2, 0x7824fc56), TOBN(0x34675218, 0x13289077),
+ TOBN(0x5bf69fd5, 0xe0c48349), TOBN(0xa613ddd3, 0xb6aa7875),
+ TOBN(0x7f78c19c, 0x5450d866), TOBN(0x46f4409c, 0x8f84a481),
+ TOBN(0x9f1d1928, 0x90fce239), TOBN(0x016c4168, 0xb2ce44b9),
+ TOBN(0xbae023f0, 0xc7435978), TOBN(0xb152c888, 0x20e30e19),
+ TOBN(0x9c241645, 0xe3fa6faf), TOBN(0x735d95c1, 0x84823e60),
+ TOBN(0x03197573, 0x03955317), TOBN(0x0b4b02a9, 0xf03b4995),
+ TOBN(0x076bf559, 0x70274600), TOBN(0x32c5cc53, 0xaaf57508),
+ TOBN(0xe8af6d1f, 0x60624129), TOBN(0xb7bc5d64, 0x9a5e2b5e),
+ TOBN(0x3814b048, 0x5f082d72), TOBN(0x76f267f2, 0xce19677a),
+ TOBN(0x626c630f, 0xb36eed93), TOBN(0x55230cd7, 0x3bf56803),
+ TOBN(0x78837949, 0xce2736a0), TOBN(0x0d792d60, 0xaa6c55f1),
+ TOBN(0x0318dbfd, 0xd5c7c5d2), TOBN(0xb38f8da7, 0x072b342d),
+ TOBN(0x3569bddc, 0x7b8de38a), TOBN(0xf25b5887, 0xa1c94842),
+ TOBN(0xb2d5b284, 0x2946ad60), TOBN(0x854f29ad, 0xe9d1707e),
+ TOBN(0xaa5159dc, 0x2c6a4509), TOBN(0x899f94c0, 0x57189837),
+ TOBN(0xcf6adc51, 0xf4a55b03), TOBN(0x261762de, 0x35e3b2d5),
+ TOBN(0x4cc43012, 0x04827b51), TOBN(0xcd22a113, 0xc6021442),
+ TOBN(0xce2fd61a, 0x247c9569), TOBN(0x59a50973, 0xd152beca),
+ TOBN(0x6c835a11, 0x63a716d4), TOBN(0xc26455ed, 0x187dedcf),
+ TOBN(0x27f536e0, 0x49ce89e7), TOBN(0x18908539, 0xcc890cb5),
+ TOBN(0x308909ab, 0xd83c2aa1), TOBN(0xecd3142b, 0x1ab73bd3),
+ TOBN(0x6a85bf59, 0xb3f5ab84), TOBN(0x3c320a68, 0xf2bea4c6),
+ TOBN(0xad8dc538, 0x6da4541f), TOBN(0xeaf34eb0, 0xb7c41186),
+ TOBN(0x1c780129, 0x977c97c4), TOBN(0x5ff9beeb, 0xc57eb9fa),
+ TOBN(0xa24d0524, 0xc822c478), TOBN(0xfd8eec2a, 0x461cd415),
+ TOBN(0xfbde194e, 0xf027458c), TOBN(0xb4ff5319, 0x1d1be115),
+ TOBN(0x63f874d9, 0x4866d6f4), TOBN(0x35c75015, 0xb21ad0c9),
+ TOBN(0xa6b5c9d6, 0x46ac49d2), TOBN(0x42c77c0b, 0x83137aa9),
+ TOBN(0x24d000fc, 0x68225a38), TOBN(0x0f63cfc8, 0x2fe1e907),
+ TOBN(0x22d1b01b, 0xc6441f95), TOBN(0x7d38f719, 0xec8e448f),
+ TOBN(0x9b33fa5f, 0x787fb1ba), TOBN(0x94dcfda1, 0x190158df),
+ TOBN(0xc47cb339, 0x5f6d4a09), TOBN(0x6b4f355c, 0xee52b826),
+ TOBN(0x3d100f5d, 0xf51b930a), TOBN(0xf4512fac, 0x9f668f69),
+ TOBN(0x546781d5, 0x206c4c74), TOBN(0xd021d4d4, 0xcb4d2e48),
+ TOBN(0x494a54c2, 0xca085c2d), TOBN(0xf1dbaca4, 0x520850a8),
+ TOBN(0x63c79326, 0x490a1aca), TOBN(0xcb64dd9c, 0x41526b02),
+ TOBN(0xbb772591, 0xa2979258), TOBN(0x3f582970, 0x48d97846),
+ TOBN(0xd66b70d1, 0x7c213ba7), TOBN(0xc28febb5, 0xe8a0ced4),
+ TOBN(0x6b911831, 0xc10338c1), TOBN(0x0d54e389, 0xbf0126f3),
+ TOBN(0x7048d460, 0x4af206ee), TOBN(0x786c88f6, 0x77e97cb9),
+ TOBN(0xd4375ae1, 0xac64802e), TOBN(0x469bcfe1, 0xd53ec11c),
+ TOBN(0xfc9b340d, 0x47062230), TOBN(0xe743bb57, 0xc5b4a3ac),
+ TOBN(0xfe00b4aa, 0x59ef45ac), TOBN(0x29a4ef23, 0x59edf188),
+ TOBN(0x40242efe, 0xb483689b), TOBN(0x2575d3f6, 0x513ac262),
+ TOBN(0xf30037c8, 0x0ca6db72), TOBN(0xc9fcce82, 0x98864be2),
+ TOBN(0x84a112ff, 0x0149362d), TOBN(0x95e57582, 0x1c4ae971),
+ TOBN(0x1fa4b1a8, 0x945cf86c), TOBN(0x4525a734, 0x0b024a2f),
+ TOBN(0xe76c8b62, 0x8f338360), TOBN(0x483ff593, 0x28edf32b),
+ TOBN(0x67e8e90a, 0x298b1aec), TOBN(0x9caab338, 0x736d9a21),
+ TOBN(0x5c09d2fd, 0x66892709), TOBN(0x2496b4dc, 0xb55a1d41),
+ TOBN(0x93f5fb1a, 0xe24a4394), TOBN(0x08c75049, 0x6fa8f6c1),
+ TOBN(0xcaead1c2, 0xc905d85f), TOBN(0xe9d7f790, 0x0733ae57),
+ TOBN(0x24c9a65c, 0xf07cdd94), TOBN(0x7389359c, 0xa4b55931),
+ TOBN(0xf58709b7, 0x367e45f7), TOBN(0x1f203067, 0xcb7e7adc),
+ TOBN(0x82444bff, 0xc7b72818), TOBN(0x07303b35, 0xbaac8033),
+ TOBN(0x1e1ee4e4, 0xd13b7ea1), TOBN(0xe6489b24, 0xe0e74180),
+ TOBN(0xa5f2c610, 0x7e70ef70), TOBN(0xa1655412, 0xbdd10894),
+ TOBN(0x555ebefb, 0x7af4194e), TOBN(0x533c1c3c, 0x8e89bd9c),
+ TOBN(0x735b9b57, 0x89895856), TOBN(0x15fb3cd2, 0x567f5c15),
+ TOBN(0x057fed45, 0x526f09fd), TOBN(0xe8a4f10c, 0x8128240a),
+ TOBN(0x9332efc4, 0xff2bfd8d), TOBN(0x214e77a0, 0xbd35aa31),
+ TOBN(0x32896d73, 0x14faa40e), TOBN(0x767867ec, 0x01e5f186),
+ TOBN(0xc9adf8f1, 0x17a1813e), TOBN(0xcb6cda78, 0x54741795),
+ TOBN(0xb7521b6d, 0x349d51aa), TOBN(0xf56b5a9e, 0xe3c7b8e9),
+ TOBN(0xc6f1e5c9, 0x32a096df), TOBN(0x083667c4, 0xa3635024),
+ TOBN(0x365ea135, 0x18087f2f), TOBN(0xf1b8eaac, 0xd136e45d),
+ TOBN(0xc8a0e484, 0x73aec989), TOBN(0xd75a324b, 0x142c9259),
+ TOBN(0xb7b4d001, 0x01dae185), TOBN(0x45434e0b, 0x9b7a94bc),
+ TOBN(0xf54339af, 0xfbd8cb0b), TOBN(0xdcc4569e, 0xe98ef49e),
+ TOBN(0x7789318a, 0x09a51299), TOBN(0x81b4d206, 0xb2b025d8),
+ TOBN(0xf64aa418, 0xfae85792), TOBN(0x3e50258f, 0xacd7baf7),
+ TOBN(0xdce84cdb, 0x2996864b), TOBN(0xa2e67089, 0x1f485fa4),
+ TOBN(0xb28b2bb6, 0x534c6a5a), TOBN(0x31a7ec6b, 0xc94b9d39),
+ TOBN(0x1d217766, 0xd6bc20da), TOBN(0x4acdb5ec, 0x86761190),
+ TOBN(0x68726328, 0x73701063), TOBN(0x4d24ee7c, 0x2128c29b),
+ TOBN(0xc072ebd3, 0xa19fd868), TOBN(0x612e481c, 0xdb8ddd3b),
+ TOBN(0xb4e1d754, 0x1a64d852), TOBN(0x00ef95ac, 0xc4c6c4ab),
+ TOBN(0x1536d2ed, 0xaa0a6c46), TOBN(0x61294086, 0x43774790),
+ TOBN(0x54af25e8, 0x343fda10), TOBN(0x9ff9d98d, 0xfd25d6f2),
+ TOBN(0x0746af7c, 0x468b8835), TOBN(0x977a31cb, 0x730ecea7),
+ TOBN(0xa5096b80, 0xc2cf4a81), TOBN(0xaa986833, 0x6458c37a),
+ TOBN(0x6af29bf3, 0xa6bd9d34), TOBN(0x6a62fe9b, 0x33c5d854),
+ TOBN(0x50e6c304, 0xb7133b5e), TOBN(0x04b60159, 0x7d6e6848),
+ TOBN(0x4cd296df, 0x5579bea4), TOBN(0x10e35ac8, 0x5ceedaf1),
+ TOBN(0x04c4c5fd, 0xe3bcc5b1), TOBN(0x95f9ee8a, 0x89412cf9),
+ TOBN(0x2c9459ee, 0x82b6eb0f), TOBN(0x2e845765, 0x95c2aadd),
+ TOBN(0x774a84ae, 0xd327fcfe), TOBN(0xd8c93722, 0x0368d476),
+ TOBN(0x0dbd5748, 0xf83e8a3b), TOBN(0xa579aa96, 0x8d2495f3),
+ TOBN(0x535996a0, 0xae496e9b), TOBN(0x07afbfe9, 0xb7f9bcc2),
+ TOBN(0x3ac1dc6d, 0x5b7bd293), TOBN(0x3b592cff, 0x7022323d),
+ TOBN(0xba0deb98, 0x9c0a3e76), TOBN(0x18e78e9f, 0x4b197acb),
+ TOBN(0x211cde10, 0x296c36ef), TOBN(0x7ee89672, 0x82c4da77),
+ TOBN(0xb617d270, 0xa57836da), TOBN(0xf0cd9c31, 0x9cb7560b),
+ TOBN(0x01fdcbf7, 0xe455fe90), TOBN(0x3fb53cbb, 0x7e7334f3),
+ TOBN(0x781e2ea4, 0x4e7de4ec), TOBN(0x8adab3ad, 0x0b384fd0),
+ TOBN(0x129eee2f, 0x53d64829), TOBN(0x7a471e17, 0xa261492b),
+ TOBN(0xe4f9adb9, 0xe4cb4a2c), TOBN(0x3d359f6f, 0x97ba2c2d),
+ TOBN(0x346c6786, 0x0aacd697), TOBN(0x92b444c3, 0x75c2f8a8),
+ TOBN(0xc79fa117, 0xd85df44e), TOBN(0x56782372, 0x398ddf31),
+ TOBN(0x60e690f2, 0xbbbab3b8), TOBN(0x4851f8ae, 0x8b04816b),
+ TOBN(0xc72046ab, 0x9c92e4d2), TOBN(0x518c74a1, 0x7cf3136b),
+ TOBN(0xff4eb50a, 0xf9877d4c), TOBN(0x14578d90, 0xa919cabb),
+ TOBN(0x8218f8c4, 0xac5eb2b6), TOBN(0xa3ccc547, 0x542016e4),
+ TOBN(0x025bf48e, 0x327f8349), TOBN(0xf3e97346, 0xf43cb641),
+ TOBN(0xdc2bafdf, 0x500f1085), TOBN(0x57167876, 0x2f063055),
+ TOBN(0x5bd914b9, 0x411925a6), TOBN(0x7c078d48, 0xa1123de5),
+ TOBN(0xee6bf835, 0x182b165d), TOBN(0xb11b5e5b, 0xba519727),
+ TOBN(0xe33ea76c, 0x1eea7b85), TOBN(0x2352b461, 0x92d4f85e),
+ TOBN(0xf101d334, 0xafe115bb), TOBN(0xfabc1294, 0x889175a3),
+ TOBN(0x7f6bcdc0, 0x5233f925), TOBN(0xe0a802db, 0xe77fec55),
+ TOBN(0xbdb47b75, 0x8069b659), TOBN(0x1c5e12de, 0xf98fbd74),
+ TOBN(0x869c58c6, 0x4b8457ee), TOBN(0xa5360f69, 0x4f7ea9f7),
+ TOBN(0xe576c09f, 0xf460b38f), TOBN(0x6b70d548, 0x22b7fb36),
+ TOBN(0x3fd237f1, 0x3bfae315), TOBN(0x33797852, 0xcbdff369),
+ TOBN(0x97df25f5, 0x25b516f9), TOBN(0x46f388f2, 0xba38ad2d),
+ TOBN(0x656c4658, 0x89d8ddbb), TOBN(0x8830b26e, 0x70f38ee8),
+ TOBN(0x4320fd5c, 0xde1212b0), TOBN(0xc34f30cf, 0xe4a2edb2),
+ TOBN(0xabb131a3, 0x56ab64b8), TOBN(0x7f77f0cc, 0xd99c5d26),
+ TOBN(0x66856a37, 0xbf981d94), TOBN(0x19e76d09, 0x738bd76e),
+ TOBN(0xe76c8ac3, 0x96238f39), TOBN(0xc0a482be, 0xa830b366),
+ TOBN(0xb7b8eaff, 0x0b4eb499), TOBN(0x8ecd83bc, 0x4bfb4865),
+ TOBN(0x971b2cb7, 0xa2f3776f), TOBN(0xb42176a4, 0xf4b88adf),
+ TOBN(0xb9617df5, 0xbe1fa446), TOBN(0x8b32d508, 0xcd031bd2),
+ TOBN(0x1c6bd47d, 0x53b618c0), TOBN(0xc424f46c, 0x6a227923),
+ TOBN(0x7303ffde, 0xdd92d964), TOBN(0xe9712878, 0x71b5abf2),
+ TOBN(0x8f48a632, 0xf815561d), TOBN(0x85f48ff5, 0xd3c055d1),
+ TOBN(0x222a1427, 0x7525684f), TOBN(0xd0d841a0, 0x67360cc3),
+ TOBN(0x4245a926, 0x0b9267c6), TOBN(0xc78913f1, 0xcf07f863),
+ TOBN(0xaa844c8e, 0x4d0d9e24), TOBN(0xa42ad522, 0x3d5f9017),
+ TOBN(0xbd371749, 0xa2c989d5), TOBN(0x928292df, 0xe1f5e78e),
+ TOBN(0x493b383e, 0x0a1ea6da), TOBN(0x5136fd8d, 0x13aee529),
+ TOBN(0x860c44b1, 0xf2c34a99), TOBN(0x3b00aca4, 0xbf5855ac),
+ TOBN(0xabf6aaa0, 0xfaaf37be), TOBN(0x65f43682, 0x2a53ec08),
+ TOBN(0x1d9a5801, 0xa11b12e1), TOBN(0x78a7ab2c, 0xe20ed475),
+ TOBN(0x0de1067e, 0x9a41e0d5), TOBN(0x30473f5f, 0x305023ea),
+ TOBN(0xdd3ae09d, 0x169c7d97), TOBN(0x5cd5baa4, 0xcfaef9cd),
+ TOBN(0x5cd7440b, 0x65a44803), TOBN(0xdc13966a, 0x47f364de),
+ TOBN(0x077b2be8, 0x2b8357c1), TOBN(0x0cb1b4c5, 0xe9d57c2a),
+ TOBN(0x7a4ceb32, 0x05ff363e), TOBN(0xf310fa4d, 0xca35a9ef),
+ TOBN(0xdbb7b352, 0xf97f68c6), TOBN(0x0c773b50, 0x0b02cf58),
+ TOBN(0xea2e4821, 0x3c1f96d9), TOBN(0xffb357b0, 0xeee01815),
+ TOBN(0xb9c924cd, 0xe0f28039), TOBN(0x0b36c95a, 0x46a3fbe4),
+ TOBN(0x1faaaea4, 0x5e46db6c), TOBN(0xcae575c3, 0x1928aaff),
+ TOBN(0x7f671302, 0xa70dab86), TOBN(0xfcbd12a9, 0x71c58cfc),
+ TOBN(0xcbef9acf, 0xbee0cb92), TOBN(0x573da0b9, 0xf8c1b583),
+ TOBN(0x4752fcfe, 0x0d41d550), TOBN(0xe7eec0e3, 0x2155cffe),
+ TOBN(0x0fc39fcb, 0x545ae248), TOBN(0x522cb8d1, 0x8065f44e),
+ TOBN(0x263c962a, 0x70cbb96c), TOBN(0xe034362a, 0xbcd124a9),
+ TOBN(0xf120db28, 0x3c2ae58d), TOBN(0xb9a38d49, 0xfef6d507),
+ TOBN(0xb1fd2a82, 0x1ff140fd), TOBN(0xbd162f30, 0x20aee7e0),
+ TOBN(0x4e17a5d4, 0xcb251949), TOBN(0x2aebcb83, 0x4f7e1c3d),
+ TOBN(0x608eb25f, 0x937b0527), TOBN(0xf42e1e47, 0xeb7d9997),
+ TOBN(0xeba699c4, 0xb8a53a29), TOBN(0x1f921c71, 0xe091b536),
+ TOBN(0xcce29e7b, 0x5b26bbd5), TOBN(0x7a8ef5ed, 0x3b61a680),
+ TOBN(0xe5ef8043, 0xba1f1c7e), TOBN(0x16ea8217, 0x18158dda),
+ TOBN(0x01778a2b, 0x599ff0f9), TOBN(0x68a923d7, 0x8104fc6b),
+ TOBN(0x5bfa44df, 0xda694ff3), TOBN(0x4f7199db, 0xf7667f12),
+ TOBN(0xc06d8ff6, 0xe46f2a79), TOBN(0x08b5dead, 0xe9f8131d),
+ TOBN(0x02519a59, 0xabb4ce7c), TOBN(0xc4f710bc, 0xb42aec3e),
+ TOBN(0x3d77b057, 0x78bde41a), TOBN(0x6474bf80, 0xb4186b5a),
+ TOBN(0x048b3f67, 0x88c65741), TOBN(0xc64519de, 0x03c7c154),
+ TOBN(0xdf073846, 0x0edfcc4f), TOBN(0x319aa737, 0x48f1aa6b),
+ TOBN(0x8b9f8a02, 0xca909f77), TOBN(0x90258139, 0x7580bfef),
+ TOBN(0xd8bfd3ca, 0xc0c22719), TOBN(0xc60209e4, 0xc9ca151e),
+ TOBN(0x7a744ab5, 0xd9a1a69c), TOBN(0x6de5048b, 0x14937f8f),
+ TOBN(0x171938d8, 0xe115ac04), TOBN(0x7df70940, 0x1c6b16d2),
+ TOBN(0xa6aeb663, 0x7f8e94e7), TOBN(0xc130388e, 0x2a2cf094),
+ TOBN(0x1850be84, 0x77f54e6e), TOBN(0x9f258a72, 0x65d60fe5),
+ TOBN(0xff7ff0c0, 0x6c9146d6), TOBN(0x039aaf90, 0xe63a830b),
+ TOBN(0x38f27a73, 0x9460342f), TOBN(0x4703148c, 0x3f795f8a),
+ TOBN(0x1bb5467b, 0x9681a97e), TOBN(0x00931ba5, 0xecaeb594),
+ TOBN(0xcdb6719d, 0x786f337c), TOBN(0xd9c01cd2, 0xe704397d),
+ TOBN(0x0f4a3f20, 0x555c2fef), TOBN(0x00452509, 0x7c0af223),
+ TOBN(0x54a58047, 0x84db8e76), TOBN(0x3bacf1aa, 0x93c8aa06),
+ TOBN(0x11ca957c, 0xf7919422), TOBN(0x50641053, 0x78cdaa40),
+ TOBN(0x7a303874, 0x9f7144ae), TOBN(0x170c963f, 0x43d4acfd),
+ TOBN(0x5e148149, 0x58ddd3ef), TOBN(0xa7bde582, 0x9e72dba8),
+ TOBN(0x0769da8b, 0x6fa68750), TOBN(0xfa64e532, 0x572e0249),
+ TOBN(0xfcaadf9d, 0x2619ad31), TOBN(0x87882daa, 0xa7b349cd),
+ TOBN(0x9f6eb731, 0x6c67a775), TOBN(0xcb10471a, 0xefc5d0b1),
+ TOBN(0xb433750c, 0xe1b806b2), TOBN(0x19c5714d, 0x57b1ae7e),
+ TOBN(0xc0dc8b7b, 0xed03fd3f), TOBN(0xdd03344f, 0x31bc194e),
+ TOBN(0xa66c52a7, 0x8c6320b5), TOBN(0x8bc82ce3, 0xd0b6fd93),
+ TOBN(0xf8e13501, 0xb35f1341), TOBN(0xe53156dd, 0x25a43e42),
+ TOBN(0xd3adf27e, 0x4daeb85c), TOBN(0xb81d8379, 0xbbeddeb5),
+ TOBN(0x1b0b546e, 0x2e435867), TOBN(0x9020eb94, 0xeba5dd60),
+ TOBN(0x37d91161, 0x8210cb9d), TOBN(0x4c596b31, 0x5c91f1cf),
+ TOBN(0xb228a90f, 0x0e0b040d), TOBN(0xbaf02d82, 0x45ff897f),
+ TOBN(0x2aac79e6, 0x00fa6122), TOBN(0x24828817, 0x8e36f557),
+ TOBN(0xb9521d31, 0x113ec356), TOBN(0x9e48861e, 0x15eff1f8),
+ TOBN(0x2aa1d412, 0xe0d41715), TOBN(0x71f86203, 0x53f131b8),
+ TOBN(0xf60da8da, 0x3fd19408), TOBN(0x4aa716dc, 0x278d9d99),
+ TOBN(0x394531f7, 0xa8c51c90), TOBN(0xb560b0e8, 0xf59db51c),
+ TOBN(0xa28fc992, 0xfa34bdad), TOBN(0xf024fa14, 0x9cd4f8bd),
+ TOBN(0x5cf530f7, 0x23a9d0d3), TOBN(0x615ca193, 0xe28c9b56),
+ TOBN(0x6d2a483d, 0x6f73c51e), TOBN(0xa4cb2412, 0xea0dc2dd),
+ TOBN(0x50663c41, 0x1eb917ff), TOBN(0x3d3a74cf, 0xeade299e),
+ TOBN(0x29b3990f, 0x4a7a9202), TOBN(0xa9bccf59, 0xa7b15c3d),
+ TOBN(0x66a3ccdc, 0xa5df9208), TOBN(0x48027c14, 0x43f2f929),
+ TOBN(0xd385377c, 0x40b557f0), TOBN(0xe001c366, 0xcd684660),
+ TOBN(0x1b18ed6b, 0xe2183a27), TOBN(0x879738d8, 0x63210329),
+ TOBN(0xa687c74b, 0xbda94882), TOBN(0xd1bbcc48, 0xa684b299),
+ TOBN(0xaf6f1112, 0x863b3724), TOBN(0x6943d1b4, 0x2c8ce9f8),
+ TOBN(0xe044a3bb, 0x098cafb4), TOBN(0x27ed2310, 0x60d48caf),
+ TOBN(0x542b5675, 0x3a31b84d), TOBN(0xcbf3dd50, 0xfcddbed7),
+ TOBN(0x25031f16, 0x41b1d830), TOBN(0xa7ec851d, 0xcb0c1e27),
+ TOBN(0xac1c8fe0, 0xb5ae75db), TOBN(0xb24c7557, 0x08c52120),
+ TOBN(0x57f811dc, 0x1d4636c3), TOBN(0xf8436526, 0x681a9939),
+ TOBN(0x1f6bc6d9, 0x9c81adb3), TOBN(0x840f8ac3, 0x5b7d80d4),
+ TOBN(0x731a9811, 0xf4387f1a), TOBN(0x7c501cd3, 0xb5156880),
+ TOBN(0xa5ca4a07, 0xdfe68867), TOBN(0xf123d8f0, 0x5fcea120),
+ TOBN(0x1fbb0e71, 0xd607039e), TOBN(0x2b70e215, 0xcd3a4546),
+ TOBN(0x32d2f01d, 0x53324091), TOBN(0xb796ff08, 0x180ab19b),
+ TOBN(0x32d87a86, 0x3c57c4aa), TOBN(0x2aed9caf, 0xb7c49a27),
+ TOBN(0x9fb35eac, 0x31630d98), TOBN(0x338e8cdf, 0x5c3e20a3),
+ TOBN(0x80f16182, 0x66cde8db), TOBN(0x4e159980, 0x2d72fd36),
+ TOBN(0xd7b8f13b, 0x9b6e5072), TOBN(0xf5213907, 0x3b7b5dc1),
+ TOBN(0x4d431f1d, 0x8ce4396e), TOBN(0x37a1a680, 0xa7ed2142),
+ TOBN(0xbf375696, 0xd01aaf6b), TOBN(0xaa1c0c54, 0xe63aab66),
+ TOBN(0x3014368b, 0x4ed80940), TOBN(0x67e6d056, 0x7a6fcedd),
+ TOBN(0x7c208c49, 0xca97579f), TOBN(0xfe3d7a81, 0xa23597f6),
+ TOBN(0x5e203202, 0x7e096ae2), TOBN(0xb1f3e1e7, 0x24b39366),
+ TOBN(0x26da26f3, 0x2fdcdffc), TOBN(0x79422f1d, 0x6097be83),}
+ ,
+ {TOBN(0x263a2cfb, 0x9db3b381), TOBN(0x9c3a2dee, 0xd4df0a4b),
+ TOBN(0x728d06e9, 0x7d04e61f), TOBN(0x8b1adfbc, 0x42449325),
+ TOBN(0x6ec1d939, 0x7e053a1b), TOBN(0xee2be5c7, 0x66daf707),
+ TOBN(0x80ba1e14, 0x810ac7ab), TOBN(0xdd2ae778, 0xf530f174),
+ TOBN(0x0435d97a, 0x205b9d8b), TOBN(0x6eb8f064, 0x056756d4),
+ TOBN(0xd5e88a8b, 0xb6f8210e), TOBN(0x070ef12d, 0xec9fd9ea),
+ TOBN(0x4d849505, 0x3bcc876a), TOBN(0x12a75338, 0xa7404ce3),
+ TOBN(0xd22b49e1, 0xb8a1db5e), TOBN(0xec1f2051, 0x14bfa5ad),
+ TOBN(0xadbaeb79, 0xb6828f36), TOBN(0x9d7a0258, 0x01bd5b9e),
+ TOBN(0xeda01e0d, 0x1e844b0c), TOBN(0x4b625175, 0x887edfc9),
+ TOBN(0x14109fdd, 0x9669b621), TOBN(0x88a2ca56, 0xf6f87b98),
+ TOBN(0xfe2eb788, 0x170df6bc), TOBN(0x0cea06f4, 0xffa473f9),
+ TOBN(0x43ed81b5, 0xc4e83d33), TOBN(0xd9f35879, 0x5efd488b),
+ TOBN(0x164a620f, 0x9deb4d0f), TOBN(0xc6927bdb, 0xac6a7394),
+ TOBN(0x45c28df7, 0x9f9e0f03), TOBN(0x2868661e, 0xfcd7e1a9),
+ TOBN(0x7cf4e8d0, 0xffa348f1), TOBN(0x6bd4c284, 0x398538e0),
+ TOBN(0x2618a091, 0x289a8619), TOBN(0xef796e60, 0x6671b173),
+ TOBN(0x664e46e5, 0x9090c632), TOBN(0xa38062d4, 0x1e66f8fb),
+ TOBN(0x6c744a20, 0x0573274e), TOBN(0xd07b67e4, 0xa9271394),
+ TOBN(0x391223b2, 0x6bdc0e20), TOBN(0xbe2d93f1, 0xeb0a05a7),
+ TOBN(0xf23e2e53, 0x3f36d141), TOBN(0xe84bb3d4, 0x4dfca442),
+ TOBN(0xb804a48d, 0x6b7c023a), TOBN(0x1e16a8fa, 0x76431c3b),
+ TOBN(0x1b5452ad, 0xddd472e0), TOBN(0x7d405ee7, 0x0d1ee127),
+ TOBN(0x50fc6f1d, 0xffa27599), TOBN(0x351ac53c, 0xbf391b35),
+ TOBN(0x7efa14b8, 0x4444896b), TOBN(0x64974d2f, 0xf94027fb),
+ TOBN(0xefdcd0e8, 0xde84487d), TOBN(0x8c45b260, 0x2b48989b),
+ TOBN(0xa8fcbbc2, 0xd8463487), TOBN(0xd1b2b3f7, 0x3fbc476c),
+ TOBN(0x21d005b7, 0xc8f443c0), TOBN(0x518f2e67, 0x40c0139c),
+ TOBN(0x56036e8c, 0x06d75fc1), TOBN(0x2dcf7bb7, 0x3249a89f),
+ TOBN(0x81dd1d3d, 0xe245e7dd), TOBN(0xf578dc4b, 0xebd6e2a7),
+ TOBN(0x4c028903, 0xdf2ce7a0), TOBN(0xaee36288, 0x9c39afac),
+ TOBN(0xdc847c31, 0x146404ab), TOBN(0x6304c0d8, 0xa4e97818),
+ TOBN(0xae51dca2, 0xa91f6791), TOBN(0x2abe4190, 0x9baa9efc),
+ TOBN(0xd9d2e2f4, 0x559c7ac1), TOBN(0xe82f4b51, 0xfc9f773a),
+ TOBN(0xa7713027, 0x4073e81c), TOBN(0xc0276fac, 0xfbb596fc),
+ TOBN(0x1d819fc9, 0xa684f70c), TOBN(0x29b47fdd, 0xc9f7b1e0),
+ TOBN(0x358de103, 0x459b1940), TOBN(0xec881c59, 0x5b013e93),
+ TOBN(0x51574c93, 0x49532ad3), TOBN(0x2db1d445, 0xb37b46de),
+ TOBN(0xc6445b87, 0xdf239fd8), TOBN(0xc718af75, 0x151d24ee),
+ TOBN(0xaea1c4a4, 0xf43c6259), TOBN(0x40c0e5d7, 0x70be02f7),
+ TOBN(0x6a4590f4, 0x721b33f2), TOBN(0x2124f1fb, 0xfedf04ea),
+ TOBN(0xf8e53cde, 0x9745efe7), TOBN(0xe7e10432, 0x65f046d9),
+ TOBN(0xc3fca28e, 0xe4d0c7e6), TOBN(0x847e339a, 0x87253b1b),
+ TOBN(0x9b595348, 0x3743e643), TOBN(0xcb6a0a0b, 0x4fd12fc5),
+ TOBN(0xfb6836c3, 0x27d02dcc), TOBN(0x5ad00982, 0x7a68bcc2),
+ TOBN(0x1b24b44c, 0x005e912d), TOBN(0xcc83d20f, 0x811fdcfe),
+ TOBN(0x36527ec1, 0x666fba0c), TOBN(0x69948197, 0x14754635),
+ TOBN(0xfcdcb1a8, 0x556da9c2), TOBN(0xa5934267, 0x81a732b2),
+ TOBN(0xec1214ed, 0xa714181d), TOBN(0x609ac13b, 0x6067b341),
+ TOBN(0xff4b4c97, 0xa545df1f), TOBN(0xa1240501, 0x34d2076b),
+ TOBN(0x6efa0c23, 0x1409ca97), TOBN(0x254cc1a8, 0x20638c43),
+ TOBN(0xd4e363af, 0xdcfb46cd), TOBN(0x62c2adc3, 0x03942a27),
+ TOBN(0xc67b9df0, 0x56e46483), TOBN(0xa55abb20, 0x63736356),
+ TOBN(0xab93c098, 0xc551bc52), TOBN(0x382b49f9, 0xb15fe64b),
+ TOBN(0x9ec221ad, 0x4dff8d47), TOBN(0x79caf615, 0x437df4d6),
+ TOBN(0x5f13dc64, 0xbb456509), TOBN(0xe4c589d9, 0x191f0714),
+ TOBN(0x27b6a8ab, 0x3fd40e09), TOBN(0xe455842e, 0x77313ea9),
+ TOBN(0x8b51d1e2, 0x1f55988b), TOBN(0x5716dd73, 0x062bbbfc),
+ TOBN(0x633c11e5, 0x4e8bf3de), TOBN(0x9a0e77b6, 0x1b85be3b),
+ TOBN(0x56510729, 0x0911cca6), TOBN(0x27e76495, 0xefa6590f),
+ TOBN(0xe4ac8b33, 0x070d3aab), TOBN(0x2643672b, 0x9a2cd5e5),
+ TOBN(0x52eff79b, 0x1cfc9173), TOBN(0x665ca49b, 0x90a7c13f),
+ TOBN(0x5a8dda59, 0xb3efb998), TOBN(0x8a5b922d, 0x052f1341),
+ TOBN(0xae9ebbab, 0x3cf9a530), TOBN(0x35986e7b, 0xf56da4d7),
+ TOBN(0x3a636b5c, 0xff3513cc), TOBN(0xbb0cf8ba, 0x3198f7dd),
+ TOBN(0xb8d40522, 0x41f16f86), TOBN(0x760575d8, 0xde13a7bf),
+ TOBN(0x36f74e16, 0x9f7aa181), TOBN(0x163a3ecf, 0xf509ed1c),
+ TOBN(0x6aead61f, 0x3c40a491), TOBN(0x158c95fc, 0xdfe8fcaa),
+ TOBN(0xa3991b6e, 0x13cda46f), TOBN(0x79482415, 0x342faed0),
+ TOBN(0xf3ba5bde, 0x666b5970), TOBN(0x1d52e6bc, 0xb26ab6dd),
+ TOBN(0x768ba1e7, 0x8608dd3d), TOBN(0x4930db2a, 0xea076586),
+ TOBN(0xd9575714, 0xe7dc1afa), TOBN(0x1fc7bf7d, 0xf7c58817),
+ TOBN(0x6b47accd, 0xd9eee96c), TOBN(0x0ca277fb, 0xe58cec37),
+ TOBN(0x113fe413, 0xe702c42a), TOBN(0xdd1764ee, 0xc47cbe51),
+ TOBN(0x041e7cde, 0x7b3ed739), TOBN(0x50cb7459, 0x5ce9e1c0),
+ TOBN(0x35568513, 0x2925b212), TOBN(0x7cff95c4, 0x001b081c),
+ TOBN(0x63ee4cbd, 0x8088b454), TOBN(0xdb7f32f7, 0x9a9e0c8a),
+ TOBN(0xb377d418, 0x6b2447cb), TOBN(0xe3e982aa, 0xd370219b),
+ TOBN(0x06ccc1e4, 0xc2a2a593), TOBN(0x72c36865, 0x0773f24f),
+ TOBN(0xa13b4da7, 0x95859423), TOBN(0x8bbf1d33, 0x75040c8f),
+ TOBN(0x726f0973, 0xda50c991), TOBN(0x48afcd5b, 0x822d6ee2),
+ TOBN(0xe5fc718b, 0x20fd7771), TOBN(0xb9e8e77d, 0xfd0807a1),
+ TOBN(0x7f5e0f44, 0x99a7703d), TOBN(0x6972930e, 0x618e36f3),
+ TOBN(0x2b7c77b8, 0x23807bbe), TOBN(0xe5b82405, 0xcb27ff50),
+ TOBN(0xba8b8be3, 0xbd379062), TOBN(0xd64b7a1d, 0x2dce4a92),
+ TOBN(0x040a73c5, 0xb2952e37), TOBN(0x0a9e252e, 0xd438aeca),
+ TOBN(0xdd43956b, 0xc39d3bcb), TOBN(0x1a31ca00, 0xb32b2d63),
+ TOBN(0xd67133b8, 0x5c417a18), TOBN(0xd08e4790, 0x2ef442c8),
+ TOBN(0x98cb1ae9, 0x255c0980), TOBN(0x4bd86381, 0x2b4a739f),
+ TOBN(0x5a5c31e1, 0x1e4a45a1), TOBN(0x1e5d55fe, 0x9cb0db2f),
+ TOBN(0x74661b06, 0x8ff5cc29), TOBN(0x026b389f, 0x0eb8a4f4),
+ TOBN(0x536b21a4, 0x58848c24), TOBN(0x2e5bf8ec, 0x81dc72b0),
+ TOBN(0x03c187d0, 0xad886aac), TOBN(0x5c16878a, 0xb771b645),
+ TOBN(0xb07dfc6f, 0xc74045ab), TOBN(0x2c6360bf, 0x7800caed),
+ TOBN(0x24295bb5, 0xb9c972a3), TOBN(0xc9e6f88e, 0x7c9a6dba),
+ TOBN(0x90ffbf24, 0x92a79aa6), TOBN(0xde29d50a, 0x41c26ac2),
+ TOBN(0x9f0af483, 0xd309cbe6), TOBN(0x5b020d8a, 0xe0bced4f),
+ TOBN(0x606e986d, 0xb38023e3), TOBN(0xad8f2c9d, 0x1abc6933),
+ TOBN(0x19292e1d, 0xe7400e93), TOBN(0xfe3e18a9, 0x52be5e4d),
+ TOBN(0xe8e9771d, 0x2e0680bf), TOBN(0x8c5bec98, 0xc54db063),
+ TOBN(0x2af9662a, 0x74a55d1f), TOBN(0xe3fbf28f, 0x046f66d8),
+ TOBN(0xa3a72ab4, 0xd4dc4794), TOBN(0x09779f45, 0x5c7c2dd8),
+ TOBN(0xd893bdaf, 0xc3d19d8d), TOBN(0xd5a75094, 0x57d6a6df),
+ TOBN(0x8cf8fef9, 0x952e6255), TOBN(0x3da67cfb, 0xda9a8aff),
+ TOBN(0x4c23f62a, 0x2c160dcd), TOBN(0x34e6c5e3, 0x8f90eaef),
+ TOBN(0x35865519, 0xa9a65d5a), TOBN(0x07c48aae, 0x8fd38a3d),
+ TOBN(0xb7e7aeda, 0x50068527), TOBN(0x2c09ef23, 0x1c90936a),
+ TOBN(0x31ecfeb6, 0xe879324c), TOBN(0xa0871f6b, 0xfb0ec938),
+ TOBN(0xb1f0fb68, 0xd84d835d), TOBN(0xc90caf39, 0x861dc1e6),
+ TOBN(0x12e5b046, 0x7594f8d7), TOBN(0x26897ae2, 0x65012b92),
+ TOBN(0xbcf68a08, 0xa4d6755d), TOBN(0x403ee41c, 0x0991fbda),
+ TOBN(0x733e343e, 0x3bbf17e8), TOBN(0xd2c7980d, 0x679b3d65),
+ TOBN(0x33056232, 0xd2e11305), TOBN(0x966be492, 0xf3c07a6f),
+ TOBN(0x6a8878ff, 0xbb15509d), TOBN(0xff221101, 0x0a9b59a4),
+ TOBN(0x6c9f564a, 0xabe30129), TOBN(0xc6f2c940, 0x336e64cf),
+ TOBN(0x0fe75262, 0x8b0c8022), TOBN(0xbe0267e9, 0x6ae8db87),
+ TOBN(0x22e192f1, 0x93bc042b), TOBN(0xf085b534, 0xb237c458),
+ TOBN(0xa0d192bd, 0x832c4168), TOBN(0x7a76e9e3, 0xbdf6271d),
+ TOBN(0x52a882fa, 0xb88911b5), TOBN(0xc85345e4, 0xb4db0eb5),
+ TOBN(0xa3be02a6, 0x81a7c3ff), TOBN(0x51889c8c, 0xf0ec0469),
+ TOBN(0x9d031369, 0xa5e829e5), TOBN(0xcbb4c6fc, 0x1607aa41),
+ TOBN(0x75ac59a6, 0x241d84c1), TOBN(0xc043f2bf, 0x8829e0ee),
+ TOBN(0x82a38f75, 0x8ea5e185), TOBN(0x8bda40b9, 0xd87cbd9f),
+ TOBN(0x9e65e75e, 0x2d8fc601), TOBN(0x3d515f74, 0xa35690b3),
+ TOBN(0x534acf4f, 0xda79e5ac), TOBN(0x68b83b3a, 0x8630215f),
+ TOBN(0x5c748b2e, 0xd085756e), TOBN(0xb0317258, 0xe5d37cb2),
+ TOBN(0x6735841a, 0xc5ccc2c4), TOBN(0x7d7dc96b, 0x3d9d5069),
+ TOBN(0xa147e410, 0xfd1754bd), TOBN(0x65296e94, 0xd399ddd5),
+ TOBN(0xf6b5b2d0, 0xbc8fa5bc), TOBN(0x8a5ead67, 0x500c277b),
+ TOBN(0x214625e6, 0xdfa08a5d), TOBN(0x51fdfedc, 0x959cf047),
+ TOBN(0x6bc9430b, 0x289fca32), TOBN(0xe36ff0cf, 0x9d9bdc3f),
+ TOBN(0x2fe187cb, 0x58ea0ede), TOBN(0xed66af20, 0x5a900b3f),
+ TOBN(0x00e0968b, 0x5fa9f4d6), TOBN(0x2d4066ce, 0x37a362e7),
+ TOBN(0xa99a9748, 0xbd07e772), TOBN(0x710989c0, 0x06a4f1d0),
+ TOBN(0xd5dedf35, 0xce40cbd8), TOBN(0xab55c5f0, 0x1743293d),
+ TOBN(0x766f1144, 0x8aa24e2c), TOBN(0x94d874f8, 0x605fbcb4),
+ TOBN(0xa365f0e8, 0xa518001b), TOBN(0xee605eb6, 0x9d04ef0f),
+ TOBN(0x5a3915cd, 0xba8d4d25), TOBN(0x44c0e1b8, 0xb5113472),
+ TOBN(0xcbb024e8, 0x8b6740dc), TOBN(0x89087a53, 0xee1d4f0c),
+ TOBN(0xa88fa05c, 0x1fc4e372), TOBN(0x8bf395cb, 0xaf8b3af2),
+ TOBN(0x1e71c9a1, 0xdeb8568b), TOBN(0xa35daea0, 0x80fb3d32),
+ TOBN(0xe8b6f266, 0x2cf8fb81), TOBN(0x6d51afe8, 0x9490696a),
+ TOBN(0x81beac6e, 0x51803a19), TOBN(0xe3d24b7f, 0x86219080),
+ TOBN(0x727cfd9d, 0xdf6f463c), TOBN(0x8c6865ca, 0x72284ee8),
+ TOBN(0x32c88b7d, 0xb743f4ef), TOBN(0x3793909b, 0xe7d11dce),
+ TOBN(0xd398f922, 0x2ff2ebe8), TOBN(0x2c70ca44, 0xe5e49796),
+ TOBN(0xdf4d9929, 0xcb1131b1), TOBN(0x7826f298, 0x25888e79),
+ TOBN(0x4d3a112c, 0xf1d8740a), TOBN(0x00384cb6, 0x270afa8b),
+ TOBN(0xcb64125b, 0x3ab48095), TOBN(0x3451c256, 0x62d05106),
+ TOBN(0xd73d577d, 0xa4955845), TOBN(0x39570c16, 0xbf9f4433),
+ TOBN(0xd7dfaad3, 0xadecf263), TOBN(0xf1c3d8d1, 0xdc76e102),
+ TOBN(0x5e774a58, 0x54c6a836), TOBN(0xdad4b672, 0x3e92d47b),
+ TOBN(0xbe7e990f, 0xf0d796a0), TOBN(0x5fc62478, 0xdf0e8b02),
+ TOBN(0x8aae8bf4, 0x030c00ad), TOBN(0x3d2db93b, 0x9004ba0f),
+ TOBN(0xe48c8a79, 0xd85d5ddc), TOBN(0xe907caa7, 0x6bb07f34),
+ TOBN(0x58db343a, 0xa39eaed5), TOBN(0x0ea6e007, 0xadaf5724),
+ TOBN(0xe00df169, 0xd23233f3), TOBN(0x3e322796, 0x77cb637f),
+ TOBN(0x1f897c0e, 0x1da0cf6c), TOBN(0xa651f5d8, 0x31d6bbdd),
+ TOBN(0xdd61af19, 0x1a230c76), TOBN(0xbd527272, 0xcdaa5e4a),
+ TOBN(0xca753636, 0xd0abcd7e), TOBN(0x78bdd37c, 0x370bd8dc),
+ TOBN(0xc23916c2, 0x17cd93fe), TOBN(0x65b97a4d, 0xdadce6e2),
+ TOBN(0xe04ed4eb, 0x174e42f8), TOBN(0x1491ccaa, 0xbb21480a),
+ TOBN(0x145a8280, 0x23196332), TOBN(0x3c3862d7, 0x587b479a),
+ TOBN(0x9f4a88a3, 0x01dcd0ed), TOBN(0x4da2b7ef, 0x3ea12f1f),
+ TOBN(0xf8e7ae33, 0xb126e48e), TOBN(0x404a0b32, 0xf494e237),
+ TOBN(0x9beac474, 0xc55acadb), TOBN(0x4ee5cf3b, 0xcbec9fd9),
+ TOBN(0x336b33b9, 0x7df3c8c3), TOBN(0xbd905fe3, 0xb76808fd),
+ TOBN(0x8f436981, 0xaa45c16a), TOBN(0x255c5bfa, 0x3dd27b62),
+ TOBN(0x71965cbf, 0xc3dd9b4d), TOBN(0xce23edbf, 0xfc068a87),
+ TOBN(0xb78d4725, 0x745b029b), TOBN(0x74610713, 0xcefdd9bd),
+ TOBN(0x7116f75f, 0x1266bf52), TOBN(0x02046722, 0x18e49bb6),
+ TOBN(0xdf43df9f, 0x3d6f19e3), TOBN(0xef1bc7d0, 0xe685cb2f),
+ TOBN(0xcddb27c1, 0x7078c432), TOBN(0xe1961b9c, 0xb77fedb7),
+ TOBN(0x1edc2f5c, 0xc2290570), TOBN(0x2c3fefca, 0x19cbd886),
+ TOBN(0xcf880a36, 0xc2af389a), TOBN(0x96c610fd, 0xbda71cea),
+ TOBN(0xf03977a9, 0x32aa8463), TOBN(0x8eb7763f, 0x8586d90a),
+ TOBN(0x3f342454, 0x2a296e77), TOBN(0xc8718683, 0x42837a35),
+ TOBN(0x7dc71090, 0x6a09c731), TOBN(0x54778ffb, 0x51b816db),
+ TOBN(0x6b33bfec, 0xaf06defd), TOBN(0xfe3c105f, 0x8592b70b),
+ TOBN(0xf937fda4, 0x61da6114), TOBN(0x3c13e651, 0x4c266ad7),
+ TOBN(0xe363a829, 0x855938e8), TOBN(0x2eeb5d9e, 0x9de54b72),
+ TOBN(0xbeb93b0e, 0x20ccfab9), TOBN(0x3dffbb5f, 0x25e61a25),
+ TOBN(0x7f655e43, 0x1acc093d), TOBN(0x0cb6cc3d, 0x3964ce61),
+ TOBN(0x6ab283a1, 0xe5e9b460), TOBN(0x55d787c5, 0xa1c7e72d),
+ TOBN(0x4d2efd47, 0xdeadbf02), TOBN(0x11e80219, 0xac459068),
+ TOBN(0x810c7626, 0x71f311f0), TOBN(0xfa17ef8d, 0x4ab6ef53),
+ TOBN(0xaf47fd25, 0x93e43bff), TOBN(0x5cb5ff3f, 0x0be40632),
+ TOBN(0x54687106, 0x8ee61da3), TOBN(0x7764196e, 0xb08afd0f),
+ TOBN(0x831ab3ed, 0xf0290a8f), TOBN(0xcae81966, 0xcb47c387),
+ TOBN(0xaad7dece, 0x184efb4f), TOBN(0xdcfc53b3, 0x4749110e),
+ TOBN(0x6698f23c, 0x4cb632f9), TOBN(0xc42a1ad6, 0xb91f8067),
+ TOBN(0xb116a81d, 0x6284180a), TOBN(0xebedf5f8, 0xe901326f),
+ TOBN(0xf2274c9f, 0x97e3e044), TOBN(0x42018520, 0x11d09fc9),
+ TOBN(0x56a65f17, 0xd18e6e23), TOBN(0x2ea61e2a, 0x352b683c),
+ TOBN(0x27d291bc, 0x575eaa94), TOBN(0x9e7bc721, 0xb8ff522d),
+ TOBN(0x5f7268bf, 0xa7f04d6f), TOBN(0x5868c73f, 0xaba41748),
+ TOBN(0x9f85c2db, 0x7be0eead), TOBN(0x511e7842, 0xff719135),
+ TOBN(0x5a06b1e9, 0xc5ea90d7), TOBN(0x0c19e283, 0x26fab631),
+ TOBN(0x8af8f0cf, 0xe9206c55), TOBN(0x89389cb4, 0x3553c06a),
+ TOBN(0x39dbed97, 0xf65f8004), TOBN(0x0621b037, 0xc508991d),
+ TOBN(0x1c52e635, 0x96e78cc4), TOBN(0x5385c8b2, 0x0c06b4a8),
+ TOBN(0xd84ddfdb, 0xb0e87d03), TOBN(0xc49dfb66, 0x934bafad),
+ TOBN(0x7071e170, 0x59f70772), TOBN(0x3a073a84, 0x3a1db56b),
+ TOBN(0x03494903, 0x3b8af190), TOBN(0x7d882de3, 0xd32920f0),
+ TOBN(0x91633f0a, 0xb2cf8940), TOBN(0x72b0b178, 0x6f948f51),
+ TOBN(0x2d28dc30, 0x782653c8), TOBN(0x88829849, 0xdb903a05),
+ TOBN(0xb8095d0c, 0x6a19d2bb), TOBN(0x4b9e7f0c, 0x86f782cb),
+ TOBN(0x7af73988, 0x2d907064), TOBN(0xd12be0fe, 0x8b32643c),
+ TOBN(0x358ed23d, 0x0e165dc3), TOBN(0x3d47ce62, 0x4e2378ce),
+ TOBN(0x7e2bb0b9, 0xfeb8a087), TOBN(0x3246e8ae, 0xe29e10b9),
+ TOBN(0x459f4ec7, 0x03ce2b4d), TOBN(0xe9b4ca1b, 0xbbc077cf),
+ TOBN(0x2613b4f2, 0x0e9940c1), TOBN(0xfc598bb9, 0x047d1eb1),
+ TOBN(0x9744c62b, 0x45036099), TOBN(0xa9dee742, 0x167c65d8),
+ TOBN(0x0c511525, 0xdabe1943), TOBN(0xda110554, 0x93c6c624),
+ TOBN(0xae00a52c, 0x651a3be2), TOBN(0xcda5111d, 0x884449a6),
+ TOBN(0x063c06f4, 0xff33bed1), TOBN(0x73baaf9a, 0x0d3d76b4),
+ TOBN(0x52fb0c9d, 0x7fc63668), TOBN(0x6886c9dd, 0x0c039cde),
+ TOBN(0x602bd599, 0x55b22351), TOBN(0xb00cab02, 0x360c7c13),
+ TOBN(0x8cb616bc, 0x81b69442), TOBN(0x41486700, 0xb55c3cee),
+ TOBN(0x71093281, 0xf49ba278), TOBN(0xad956d9c, 0x64a50710),
+ TOBN(0x9561f28b, 0x638a7e81), TOBN(0x54155cdf, 0x5980ddc3),
+ TOBN(0xb2db4a96, 0xd26f247a), TOBN(0x9d774e4e, 0x4787d100),
+ TOBN(0x1a9e6e2e, 0x078637d2), TOBN(0x1c363e2d, 0x5e0ae06a),
+ TOBN(0x7493483e, 0xe9cfa354), TOBN(0x76843cb3, 0x7f74b98d),
+ TOBN(0xbaca6591, 0xd4b66947), TOBN(0xb452ce98, 0x04460a8c),
+ TOBN(0x6830d246, 0x43768f55), TOBN(0xf4197ed8, 0x7dff12df),
+ TOBN(0x6521b472, 0x400dd0f7), TOBN(0x59f5ca8f, 0x4b1e7093),
+ TOBN(0x6feff11b, 0x080338ae), TOBN(0x0ada31f6, 0xa29ca3c6),
+ TOBN(0x24794eb6, 0x94a2c215), TOBN(0xd83a43ab, 0x05a57ab4),
+ TOBN(0x264a543a, 0x2a6f89fe), TOBN(0x2c2a3868, 0xdd5ec7c2),
+ TOBN(0xd3373940, 0x8439d9b2), TOBN(0x715ea672, 0x0acd1f11),
+ TOBN(0x42c1d235, 0xe7e6cc19), TOBN(0x81ce6e96, 0xb990585c),
+ TOBN(0x04e5dfe0, 0xd809c7bd), TOBN(0xd7b2580c, 0x8f1050ab),
+ TOBN(0x6d91ad78, 0xd8a4176f), TOBN(0x0af556ee, 0x4e2e897c),
+ TOBN(0x162a8b73, 0x921de0ac), TOBN(0x52ac9c22, 0x7ea78400),
+ TOBN(0xee2a4eea, 0xefce2174), TOBN(0xbe61844e, 0x6d637f79),
+ TOBN(0x0491f1bc, 0x789a283b), TOBN(0x72d3ac3d, 0x880836f4),
+ TOBN(0xaa1c5ea3, 0x88e5402d), TOBN(0x1b192421, 0xd5cc473d),
+ TOBN(0x5c0b9998, 0x9dc84cac), TOBN(0xb0a8482d, 0x9c6e75b8),
+ TOBN(0x639961d0, 0x3a191ce2), TOBN(0xda3bc865, 0x6d837930),
+ TOBN(0xca990653, 0x056e6f8f), TOBN(0x84861c41, 0x64d133a7),
+ TOBN(0x8b403276, 0x746abe40), TOBN(0xb7b4d51a, 0xebf8e303),
+ TOBN(0x05b43211, 0x220a255d), TOBN(0xc997152c, 0x02419e6e),
+ TOBN(0x76ff47b6, 0x630c2fea), TOBN(0x50518677, 0x281fdade),
+ TOBN(0x3283b8ba, 0xcf902b0b), TOBN(0x8d4b4eb5, 0x37db303b),
+ TOBN(0xcc89f42d, 0x755011bc), TOBN(0xb43d74bb, 0xdd09d19b),
+ TOBN(0x65746bc9, 0x8adba350), TOBN(0x364eaf8c, 0xb51c1927),
+ TOBN(0x13c76596, 0x10ad72ec), TOBN(0x30045121, 0xf8d40c20),
+ TOBN(0x6d2d99b7, 0xea7b979b), TOBN(0xcd78cd74, 0xe6fb3bcd),
+ TOBN(0x11e45a9e, 0x86cffbfe), TOBN(0x78a61cf4, 0x637024f6),
+ TOBN(0xd06bc872, 0x3d502295), TOBN(0xf1376854, 0x458cb288),
+ TOBN(0xb9db26a1, 0x342f8586), TOBN(0xf33effcf, 0x4beee09e),
+ TOBN(0xd7e0c4cd, 0xb30cfb3a), TOBN(0x6d09b8c1, 0x6c9db4c8),
+ TOBN(0x40ba1a42, 0x07c8d9df), TOBN(0x6fd495f7, 0x1c52c66d),
+ TOBN(0xfb0e169f, 0x275264da), TOBN(0x80c2b746, 0xe57d8362),
+ TOBN(0xedd987f7, 0x49ad7222), TOBN(0xfdc229af, 0x4398ec7b),}
+ ,
+ {TOBN(0xb0d1ed84, 0x52666a58), TOBN(0x4bcb6e00, 0xe6a9c3c2),
+ TOBN(0x3c57411c, 0x26906408), TOBN(0xcfc20755, 0x13556400),
+ TOBN(0xa08b1c50, 0x5294dba3), TOBN(0xa30ba286, 0x8b7dd31e),
+ TOBN(0xd70ba90e, 0x991eca74), TOBN(0x094e142c, 0xe762c2b9),
+ TOBN(0xb81d783e, 0x979f3925), TOBN(0x1efd130a, 0xaf4c89a7),
+ TOBN(0x525c2144, 0xfd1bf7fa), TOBN(0x4b296904, 0x1b265a9e),
+ TOBN(0xed8e9634, 0xb9db65b6), TOBN(0x35c82e32, 0x03599d8a),
+ TOBN(0xdaa7a54f, 0x403563f3), TOBN(0x9df088ad, 0x022c38ab),
+ TOBN(0xe5cfb066, 0xbb3fd30a), TOBN(0x429169da, 0xeff0354e),
+ TOBN(0x809cf852, 0x3524e36c), TOBN(0x136f4fb3, 0x0155be1d),
+ TOBN(0x4826af01, 0x1fbba712), TOBN(0x6ef0f0b4, 0x506ba1a1),
+ TOBN(0xd9928b31, 0x77aea73e), TOBN(0xe2bf6af2, 0x5eaa244e),
+ TOBN(0x8d084f12, 0x4237b64b), TOBN(0x688ebe99, 0xe3ecfd07),
+ TOBN(0x57b8a70c, 0xf6845dd8), TOBN(0x808fc59c, 0x5da4a325),
+ TOBN(0xa9032b2b, 0xa3585862), TOBN(0xb66825d5, 0xedf29386),
+ TOBN(0xb5a5a8db, 0x431ec29b), TOBN(0xbb143a98, 0x3a1e8dc8),
+ TOBN(0x35ee94ce, 0x12ae381b), TOBN(0x3a7f176c, 0x86ccda90),
+ TOBN(0xc63a657e, 0x4606eaca), TOBN(0x9ae5a380, 0x43cd04df),
+ TOBN(0x9bec8d15, 0xed251b46), TOBN(0x1f5d6d30, 0xcaca5e64),
+ TOBN(0x347b3b35, 0x9ff20f07), TOBN(0x4d65f034, 0xf7e4b286),
+ TOBN(0x9e93ba24, 0xf111661e), TOBN(0xedced484, 0xb105eb04),
+ TOBN(0x96dc9ba1, 0xf424b578), TOBN(0xbf8f66b7, 0xe83e9069),
+ TOBN(0x872d4df4, 0xd7ed8216), TOBN(0xbf07f377, 0x8e2cbecf),
+ TOBN(0x4281d899, 0x98e73754), TOBN(0xfec85fbb, 0x8aab8708),
+ TOBN(0x9a3c0dee, 0xa5ba5b0b), TOBN(0xe6a116ce, 0x42d05299),
+ TOBN(0xae9775fe, 0xe9b02d42), TOBN(0x72b05200, 0xa1545cb6),
+ TOBN(0xbc506f7d, 0x31a3b4ea), TOBN(0xe5893078, 0x8bbd9b32),
+ TOBN(0xc8bc5f37, 0xe4b12a97), TOBN(0x6b000c06, 0x4a73b671),
+ TOBN(0x13b5bf22, 0x765fa7d0), TOBN(0x59805bf0, 0x1d6a5370),
+ TOBN(0x67a5e29d, 0x4280db98), TOBN(0x4f53916f, 0x776b1ce3),
+ TOBN(0x714ff61f, 0x33ddf626), TOBN(0x4206238e, 0xa085d103),
+ TOBN(0x1c50d4b7, 0xe5809ee3), TOBN(0x999f450d, 0x85f8eb1d),
+ TOBN(0x658a6051, 0xe4c79e9b), TOBN(0x1394cb73, 0xc66a9fea),
+ TOBN(0x27f31ed5, 0xc6be7b23), TOBN(0xf4c88f36, 0x5aa6f8fe),
+ TOBN(0x0fb0721f, 0x4aaa499e), TOBN(0x68b3a7d5, 0xe3fb2a6b),
+ TOBN(0xa788097d, 0x3a92851d), TOBN(0x060e7f8a, 0xe96f4913),
+ TOBN(0x82eebe73, 0x1a3a93bc), TOBN(0x42bbf465, 0xa21adc1a),
+ TOBN(0xc10b6fa4, 0xef030efd), TOBN(0x247aa4c7, 0x87b097bb),
+ TOBN(0x8b8dc632, 0xf60c77da), TOBN(0x6ffbc26a, 0xc223523e),
+ TOBN(0xa4f6ff11, 0x344579cf), TOBN(0x5825653c, 0x980250f6),
+ TOBN(0xb2dd097e, 0xbc1aa2b9), TOBN(0x07889393, 0x37a0333a),
+ TOBN(0x1cf55e71, 0x37a0db38), TOBN(0x2648487f, 0x792c1613),
+ TOBN(0xdad01336, 0x3fcef261), TOBN(0x6239c81d, 0x0eabf129),
+ TOBN(0x8ee761de, 0x9d276be2), TOBN(0x406a7a34, 0x1eda6ad3),
+ TOBN(0x4bf367ba, 0x4a493b31), TOBN(0x54f20a52, 0x9bf7f026),
+ TOBN(0xb696e062, 0x9795914b), TOBN(0xcddab96d, 0x8bf236ac),
+ TOBN(0x4ff2c70a, 0xed25ea13), TOBN(0xfa1d09eb, 0x81cbbbe7),
+ TOBN(0x88fc8c87, 0x468544c5), TOBN(0x847a670d, 0x696b3317),
+ TOBN(0xf133421e, 0x64bcb626), TOBN(0xaea638c8, 0x26dee0b5),
+ TOBN(0xd6e7680b, 0xb310346c), TOBN(0xe06f4097, 0xd5d4ced3),
+ TOBN(0x09961452, 0x7512a30b), TOBN(0xf3d867fd, 0xe589a59a),
+ TOBN(0x2e73254f, 0x52d0c180), TOBN(0x9063d8a3, 0x333c74ac),
+ TOBN(0xeda6c595, 0xd314e7bc), TOBN(0x2ee7464b, 0x467899ed),
+ TOBN(0x1cef423c, 0x0a1ed5d3), TOBN(0x217e76ea, 0x69cc7613),
+ TOBN(0x27ccce1f, 0xe7cda917), TOBN(0x12d8016b, 0x8a893f16),
+ TOBN(0xbcd6de84, 0x9fc74f6b), TOBN(0xfa5817e2, 0xf3144e61),
+ TOBN(0x1f354164, 0x0821ee4c), TOBN(0x1583eab4, 0x0bc61992),
+ TOBN(0x7490caf6, 0x1d72879f), TOBN(0x998ad9f3, 0xf76ae7b2),
+ TOBN(0x1e181950, 0xa41157f7), TOBN(0xa9d7e1e6, 0xe8da3a7e),
+ TOBN(0x963784eb, 0x8426b95f), TOBN(0x0ee4ed6e, 0x542e2a10),
+ TOBN(0xb79d4cc5, 0xac751e7b), TOBN(0x93f96472, 0xfd4211bd),
+ TOBN(0x8c72d3d2, 0xc8de4fc6), TOBN(0x7b69cbf5, 0xdf44f064),
+ TOBN(0x3da90ca2, 0xf4bf94e1), TOBN(0x1a5325f8, 0xf12894e2),
+ TOBN(0x0a437f6c, 0x7917d60b), TOBN(0x9be70486, 0x96c9cb5d),
+ TOBN(0xb4d880bf, 0xe1dc5c05), TOBN(0xd738adda, 0xeebeeb57),
+ TOBN(0x6f0119d3, 0xdf0fe6a3), TOBN(0x5c686e55, 0x66eaaf5a),
+ TOBN(0x9cb10b50, 0xdfd0b7ec), TOBN(0xbdd0264b, 0x6a497c21),
+ TOBN(0xfc093514, 0x8c546c96), TOBN(0x58a947fa, 0x79dbf42a),
+ TOBN(0xc0b48d4e, 0x49ccd6d7), TOBN(0xff8fb02c, 0x88bd5580),
+ TOBN(0xc75235e9, 0x07d473b2), TOBN(0x4fab1ac5, 0xa2188af3),
+ TOBN(0x030fa3bc, 0x97576ec0), TOBN(0xe8c946e8, 0x0b7e7d2f),
+ TOBN(0x40a5c9cc, 0x70305600), TOBN(0x6d8260a9, 0xc8b013b4),
+ TOBN(0x0368304f, 0x70bba85c), TOBN(0xad090da1, 0xa4a0d311),
+ TOBN(0x7170e870, 0x2415eec1), TOBN(0xbfba35fe, 0x8461ea47),
+ TOBN(0x6279019a, 0xc1e91938), TOBN(0xa47638f3, 0x1afc415f),
+ TOBN(0x36c65cbb, 0xbcba0e0f), TOBN(0x02160efb, 0x034e2c48),
+ TOBN(0xe6c51073, 0x615cd9e4), TOBN(0x498ec047, 0xf1243c06),
+ TOBN(0x3e5a8809, 0xb17b3d8c), TOBN(0x5cd99e61, 0x0cc565f1),
+ TOBN(0x81e312df, 0x7851dafe), TOBN(0xf156f5ba, 0xa79061e2),
+ TOBN(0x80d62b71, 0x880c590e), TOBN(0xbec9746f, 0x0a39faa1),
+ TOBN(0x1d98a9c1, 0xc8ed1f7a), TOBN(0x09e43bb5, 0xa81d5ff2),
+ TOBN(0xd5f00f68, 0x0da0794a), TOBN(0x412050d9, 0x661aa836),
+ TOBN(0xa89f7c4e, 0x90747e40), TOBN(0x6dc05ebb, 0xb62a3686),
+ TOBN(0xdf4de847, 0x308e3353), TOBN(0x53868fbb, 0x9fb53bb9),
+ TOBN(0x2b09d2c3, 0xcfdcf7dd), TOBN(0x41a9fce3, 0x723fcab4),
+ TOBN(0x73d905f7, 0x07f57ca3), TOBN(0x080f9fb1, 0xac8e1555),
+ TOBN(0x7c088e84, 0x9ba7a531), TOBN(0x07d35586, 0xed9a147f),
+ TOBN(0x602846ab, 0xaf48c336), TOBN(0x7320fd32, 0x0ccf0e79),
+ TOBN(0xaa780798, 0xb18bd1ff), TOBN(0x52c2e300, 0xafdd2905),
+ TOBN(0xf27ea3d6, 0x434267cd), TOBN(0x8b96d16d, 0x15605b5f),
+ TOBN(0x7bb31049, 0x4b45706b), TOBN(0xe7f58b8e, 0x743d25f8),
+ TOBN(0xe9b5e45b, 0x87f30076), TOBN(0xd19448d6, 0x5d053d5a),
+ TOBN(0x1ecc8cb9, 0xd3210a04), TOBN(0x6bc7d463, 0xdafb5269),
+ TOBN(0x3e59b10a, 0x67c3489f), TOBN(0x1769788c, 0x65641e1b),
+ TOBN(0x8a53b82d, 0xbd6cb838), TOBN(0x7066d6e6, 0x236d5f22),
+ TOBN(0x03aa1c61, 0x6908536e), TOBN(0xc971da0d, 0x66ae9809),
+ TOBN(0x01b3a86b, 0xc49a2fac), TOBN(0x3b8420c0, 0x3092e77a),
+ TOBN(0x02057300, 0x7d6fb556), TOBN(0x6941b2a1, 0xbff40a87),
+ TOBN(0x140b6308, 0x0658ff2a), TOBN(0x87804363, 0x3424ab36),
+ TOBN(0x0253bd51, 0x5751e299), TOBN(0xc75bcd76, 0x449c3e3a),
+ TOBN(0x92eb4090, 0x7f8f875d), TOBN(0x9c9d754e, 0x56c26bbf),
+ TOBN(0x158cea61, 0x8110bbe7), TOBN(0x62a6b802, 0x745f91ea),
+ TOBN(0xa79c41aa, 0xc6e7394b), TOBN(0x445b6a83, 0xad57ef10),
+ TOBN(0x0c5277eb, 0x6ea6f40c), TOBN(0x319fe96b, 0x88633365),
+ TOBN(0x0b0fc61f, 0x385f63cb), TOBN(0x41250c84, 0x22bdd127),
+ TOBN(0x67d153f1, 0x09e942c2), TOBN(0x60920d08, 0xc021ad5d),
+ TOBN(0x229f5746, 0x724d81a5), TOBN(0xb7ffb892, 0x5bba3299),
+ TOBN(0x518c51a1, 0xde413032), TOBN(0x2a9bfe77, 0x3c2fd94c),
+ TOBN(0xcbcde239, 0x3191f4fd), TOBN(0x43093e16, 0xd3d6ada1),
+ TOBN(0x184579f3, 0x58769606), TOBN(0x2c94a8b3, 0xd236625c),
+ TOBN(0x6922b9c0, 0x5c437d8e), TOBN(0x3d4ae423, 0xd8d9f3c8),
+ TOBN(0xf72c31c1, 0x2e7090a2), TOBN(0x4ac3f5f3, 0xd76a55bd),
+ TOBN(0x342508fc, 0x6b6af991), TOBN(0x0d527100, 0x1b5cebbd),
+ TOBN(0xb84740d0, 0xdd440dd7), TOBN(0x748ef841, 0x780162fd),
+ TOBN(0xa8dbfe0e, 0xdfc6fafb), TOBN(0xeadfdf05, 0xf7300f27),
+ TOBN(0x7d06555f, 0xfeba4ec9), TOBN(0x12c56f83, 0x9e25fa97),
+ TOBN(0x77f84203, 0xd39b8c34), TOBN(0xed8b1be6, 0x3125eddb),
+ TOBN(0x5bbf2441, 0xf6e39dc5), TOBN(0xb00f6ee6, 0x6a5d678a),
+ TOBN(0xba456ecf, 0x57d0ea99), TOBN(0xdcae0f58, 0x17e06c43),
+ TOBN(0x01643de4, 0x0f5b4baa), TOBN(0x2c324341, 0xd161b9be),
+ TOBN(0x80177f55, 0xe126d468), TOBN(0xed325f1f, 0x76748e09),
+ TOBN(0x6116004a, 0xcfa9bdc2), TOBN(0x2d8607e6, 0x3a9fb468),
+ TOBN(0x0e573e27, 0x6009d660), TOBN(0x3a525d2e, 0x8d10c5a1),
+ TOBN(0xd26cb45c, 0x3b9009a0), TOBN(0xb6b0cdc0, 0xde9d7448),
+ TOBN(0x949c9976, 0xe1337c26), TOBN(0x6faadebd, 0xd73d68e5),
+ TOBN(0x9e158614, 0xf1b768d9), TOBN(0x22dfa557, 0x9cc4f069),
+ TOBN(0xccd6da17, 0xbe93c6d6), TOBN(0x24866c61, 0xa504f5b9),
+ TOBN(0x2121353c, 0x8d694da1), TOBN(0x1c6ca580, 0x0140b8c6),
+ TOBN(0xc245ad8c, 0xe964021e), TOBN(0xb83bffba, 0x032b82b3),
+ TOBN(0xfaa220c6, 0x47ef9898), TOBN(0x7e8d3ac6, 0x982c948a),
+ TOBN(0x1faa2091, 0xbc2d124a), TOBN(0xbd54c3dd, 0x05b15ff4),
+ TOBN(0x386bf3ab, 0xc87c6fb7), TOBN(0xfb2b0563, 0xfdeb6f66),
+ TOBN(0x4e77c557, 0x5b45afb4), TOBN(0xe9ded649, 0xefb8912d),
+ TOBN(0x7ec9bbf5, 0x42f6e557), TOBN(0x2570dfff, 0x62671f00),
+ TOBN(0x2b3bfb78, 0x88e084bd), TOBN(0xa024b238, 0xf37fe5b4),
+ TOBN(0x44e7dc04, 0x95649aee), TOBN(0x498ca255, 0x5e7ec1d8),
+ TOBN(0x3bc766ea, 0xaaa07e86), TOBN(0x0db6facb, 0xf3608586),
+ TOBN(0xbadd2549, 0xbdc259c8), TOBN(0x95af3c6e, 0x041c649f),
+ TOBN(0xb36a928c, 0x02e30afb), TOBN(0x9b5356ad, 0x008a88b8),
+ TOBN(0x4b67a5f1, 0xcf1d9e9d), TOBN(0xc6542e47, 0xa5d8d8ce),
+ TOBN(0x73061fe8, 0x7adfb6cc), TOBN(0xcc826fd3, 0x98678141),
+ TOBN(0x00e758b1, 0x3c80515a), TOBN(0x6afe3247, 0x41485083),
+ TOBN(0x0fcb08b9, 0xb6ae8a75), TOBN(0xb8cf388d, 0x4acf51e1),
+ TOBN(0x344a5560, 0x6961b9d6), TOBN(0x1a6778b8, 0x6a97fd0c),
+ TOBN(0xd840fdc1, 0xecc4c7e3), TOBN(0xde9fe47d, 0x16db68cc),
+ TOBN(0xe95f89de, 0xa3e216aa), TOBN(0x84f1a6a4, 0x9594a8be),
+ TOBN(0x7ddc7d72, 0x5a7b162b), TOBN(0xc5cfda19, 0xadc817a3),
+ TOBN(0x80a5d350, 0x78b58d46), TOBN(0x93365b13, 0x82978f19),
+ TOBN(0x2e44d225, 0x26a1fc90), TOBN(0x0d6d10d2, 0x4d70705d),
+ TOBN(0xd94b6b10, 0xd70c45f4), TOBN(0x0f201022, 0xb216c079),
+ TOBN(0xcec966c5, 0x658fde41), TOBN(0xa8d2bc7d, 0x7e27601d),
+ TOBN(0xbfcce3e1, 0xff230be7), TOBN(0x3394ff6b, 0x0033ffb5),
+ TOBN(0xd890c509, 0x8132c9af), TOBN(0xaac4b0eb, 0x361e7868),
+ TOBN(0x5194ded3, 0xe82d15aa), TOBN(0x4550bd2e, 0x23ae6b7d),
+ TOBN(0x3fda318e, 0xea5399d4), TOBN(0xd989bffa, 0x91638b80),
+ TOBN(0x5ea124d0, 0xa14aa12d), TOBN(0x1fb1b899, 0x3667b944),
+ TOBN(0x95ec7969, 0x44c44d6a), TOBN(0x91df144a, 0x57e86137),
+ TOBN(0x915fd620, 0x73adac44), TOBN(0x8f01732d, 0x59a83801),
+ TOBN(0xec579d25, 0x3aa0a633), TOBN(0x06de5e7c, 0xc9d6d59c),
+ TOBN(0xc132f958, 0xb1ef8010), TOBN(0x29476f96, 0xe65c1a02),
+ TOBN(0x336a77c0, 0xd34c3565), TOBN(0xef1105b2, 0x1b9f1e9e),
+ TOBN(0x63e6d08b, 0xf9e08002), TOBN(0x9aff2f21, 0xc613809e),
+ TOBN(0xb5754f85, 0x3a80e75d), TOBN(0xde71853e, 0x6bbda681),
+ TOBN(0x86f041df, 0x8197fd7a), TOBN(0x8b332e08, 0x127817fa),
+ TOBN(0x05d99be8, 0xb9c20cda), TOBN(0x89f7aad5, 0xd5cd0c98),
+ TOBN(0x7ef936fe, 0x5bb94183), TOBN(0x92ca0753, 0xb05cd7f2),
+ TOBN(0x9d65db11, 0x74a1e035), TOBN(0x02628cc8, 0x13eaea92),
+ TOBN(0xf2d9e242, 0x49e4fbf2), TOBN(0x94fdfd9b, 0xe384f8b7),
+ TOBN(0x65f56054, 0x63428c6b), TOBN(0x2f7205b2, 0x90b409a5),
+ TOBN(0xf778bb78, 0xff45ae11), TOBN(0xa13045be, 0xc5ee53b2),
+ TOBN(0xe00a14ff, 0x03ef77fe), TOBN(0x689cd59f, 0xffef8bef),
+ TOBN(0x3578f0ed, 0x1e9ade22), TOBN(0xe99f3ec0, 0x6268b6a8),
+ TOBN(0xa2057d91, 0xea1b3c3e), TOBN(0x2d1a7053, 0xb8823a4a),
+ TOBN(0xabbb336a, 0x2cca451e), TOBN(0xcd2466e3, 0x2218bb5d),
+ TOBN(0x3ac1f42f, 0xc8cb762d), TOBN(0x7e312aae, 0x7690211f),
+ TOBN(0xebb9bd73, 0x45d07450), TOBN(0x207c4b82, 0x46c2213f),
+ TOBN(0x99d425c1, 0x375913ec), TOBN(0x94e45e96, 0x67908220),
+ TOBN(0xc08f3087, 0xcd67dbf6), TOBN(0xa5670fbe, 0xc0887056),
+ TOBN(0x6717b64a, 0x66f5b8fc), TOBN(0xd5a56aea, 0x786fec28),
+ TOBN(0xa8c3f55f, 0xc0ff4952), TOBN(0xa77fefae, 0x457ac49b),
+ TOBN(0x29882d7c, 0x98379d44), TOBN(0xd000bdfb, 0x509edc8a),
+ TOBN(0xc6f95979, 0xe66fe464), TOBN(0x504a6115, 0xfa61bde0),
+ TOBN(0x56b3b871, 0xeffea31a), TOBN(0x2d3de26d, 0xf0c21a54),
+ TOBN(0x21dbff31, 0x834753bf), TOBN(0xe67ecf49, 0x69269d86),
+ TOBN(0x7a176952, 0x151fe690), TOBN(0x03515804, 0x7f2adb5f),
+ TOBN(0xee794b15, 0xd1b62a8d), TOBN(0xf004ceec, 0xaae454e6),
+ TOBN(0x0897ea7c, 0xf0386fac), TOBN(0x3b62ff12, 0xd1fca751),
+ TOBN(0x154181df, 0x1b7a04ec), TOBN(0x2008e04a, 0xfb5847ec),
+ TOBN(0xd147148e, 0x41dbd772), TOBN(0x2b419f73, 0x22942654),
+ TOBN(0x669f30d3, 0xe9c544f7), TOBN(0x52a2c223, 0xc8540149),
+ TOBN(0x5da9ee14, 0x634dfb02), TOBN(0x5f074ff0, 0xf47869f3),
+ TOBN(0x74ee878d, 0xa3933acc), TOBN(0xe6510651, 0x4fe35ed1),
+ TOBN(0xb3eb9482, 0xf1012e7a), TOBN(0x51013cc0, 0xa8a566ae),
+ TOBN(0xdd5e9243, 0x47c00d3b), TOBN(0x7fde089d, 0x946bb0e5),
+ TOBN(0x030754fe, 0xc731b4b3), TOBN(0x12a136a4, 0x99fda062),
+ TOBN(0x7c1064b8, 0x5a1a35bc), TOBN(0xbf1f5763, 0x446c84ef),
+ TOBN(0xed29a56d, 0xa16d4b34), TOBN(0x7fba9d09, 0xdca21c4f),
+ TOBN(0x66d7ac00, 0x6d8de486), TOBN(0x60061987, 0x73a2a5e1),
+ TOBN(0x8b400f86, 0x9da28ff0), TOBN(0x3133f708, 0x43c4599c),
+ TOBN(0x9911c9b8, 0xee28cb0d), TOBN(0xcd7e2874, 0x8e0af61d),
+ TOBN(0x5a85f0f2, 0x72ed91fc), TOBN(0x85214f31, 0x9cd4a373),
+ TOBN(0x881fe5be, 0x1925253c), TOBN(0xd8dc98e0, 0x91e8bc76),
+ TOBN(0x7120affe, 0x585cc3a2), TOBN(0x724952ed, 0x735bf97a),
+ TOBN(0x5581e7dc, 0x3eb34581), TOBN(0x5cbff4f2, 0xe52ee57d),
+ TOBN(0x8d320a0e, 0x87d8cc7b), TOBN(0x9beaa7f3, 0xf1d280d0),
+ TOBN(0x7a0b9571, 0x9beec704), TOBN(0x9126332e, 0x5b7f0057),
+ TOBN(0x01fbc1b4, 0x8ed3bd6d), TOBN(0x35bb2c12, 0xd945eb24),
+ TOBN(0x6404694e, 0x9a8ae255), TOBN(0xb6092eec, 0x8d6abfb3),
+ TOBN(0x4d76143f, 0xcc058865), TOBN(0x7b0a5af2, 0x6e249922),
+ TOBN(0x8aef9440, 0x6a50d353), TOBN(0xe11e4bcc, 0x64f0e07a),
+ TOBN(0x4472993a, 0xa14a90fa), TOBN(0x7706e20c, 0xba0c51d4),
+ TOBN(0xf403292f, 0x1532672d), TOBN(0x52573bfa, 0x21829382),
+ TOBN(0x6a7bb6a9, 0x3b5bdb83), TOBN(0x08da65c0, 0xa4a72318),
+ TOBN(0xc58d22aa, 0x63eb065f), TOBN(0x1717596c, 0x1b15d685),
+ TOBN(0x112df0d0, 0xb266d88b), TOBN(0xf688ae97, 0x5941945a),
+ TOBN(0x487386e3, 0x7c292cac), TOBN(0x42f3b50d, 0x57d6985c),
+ TOBN(0x6da4f998, 0x6a90fc34), TOBN(0xc8f257d3, 0x65ca8a8d),
+ TOBN(0xc2feabca, 0x6951f762), TOBN(0xe1bc81d0, 0x74c323ac),
+ TOBN(0x1bc68f67, 0x251a2a12), TOBN(0x10d86587, 0xbe8a70dc),
+ TOBN(0xd648af7f, 0xf0f84d2e), TOBN(0xf0aa9ebc, 0x6a43ac92),
+ TOBN(0x69e3be04, 0x27596893), TOBN(0xb6bb02a6, 0x45bf452b),
+ TOBN(0x0875c11a, 0xf4c698c8), TOBN(0x6652b5c7, 0xbece3794),
+ TOBN(0x7b3755fd, 0x4f5c0499), TOBN(0x6ea16558, 0xb5532b38),
+ TOBN(0xd1c69889, 0xa2e96ef7), TOBN(0x9c773c3a, 0x61ed8f48),
+ TOBN(0x2b653a40, 0x9b323abc), TOBN(0xe26605e1, 0xf0e1d791),
+ TOBN(0x45d41064, 0x4a87157a), TOBN(0x8f9a78b7, 0xcbbce616),
+ TOBN(0xcf1e44aa, 0xc407eddd), TOBN(0x81ddd1d8, 0xa35b964f),
+ TOBN(0x473e339e, 0xfd083999), TOBN(0x6c94bdde, 0x8e796802),
+ TOBN(0x5a304ada, 0x8545d185), TOBN(0x82ae44ea, 0x738bb8cb),
+ TOBN(0x628a35e3, 0xdf87e10e), TOBN(0xd3624f3d, 0xa15b9fe3),
+ TOBN(0xcc44209b, 0x14be4254), TOBN(0x7d0efcbc, 0xbdbc2ea5),
+ TOBN(0x1f603362, 0x04c37bbe), TOBN(0x21f363f5, 0x56a5852c),
+ TOBN(0xa1503d1c, 0xa8501550), TOBN(0x2251e0e1, 0xd8ab10bb),
+ TOBN(0xde129c96, 0x6961c51c), TOBN(0x1f7246a4, 0x81910f68),
+ TOBN(0x2eb744ee, 0x5f2591f2), TOBN(0x3c47d33f, 0x5e627157),
+ TOBN(0x4d6d62c9, 0x22f3bd68), TOBN(0x6120a64b, 0xcb8df856),
+ TOBN(0x3a9ac6c0, 0x7b5d07df), TOBN(0xa92b9558, 0x7ef39783),
+ TOBN(0xe128a134, 0xab3a9b4f), TOBN(0x41c18807, 0xb1252f05),
+ TOBN(0xfc7ed089, 0x80ba9b1c), TOBN(0xac8dc6de, 0xc532a9dd),
+ TOBN(0xbf829cef, 0x55246809), TOBN(0x101b784f, 0x5b4ee80f),
+ TOBN(0xc09945bb, 0xb6f11603), TOBN(0x57b09dbe, 0x41d2801e),
+ TOBN(0xfba5202f, 0xa97534a8), TOBN(0x7fd8ae5f, 0xc17b9614),
+ TOBN(0xa50ba666, 0x78308435), TOBN(0x9572f77c, 0xd3868c4d),
+ TOBN(0x0cef7bfd, 0x2dd7aab0), TOBN(0xe7958e08, 0x2c7c79ff),
+ TOBN(0x81262e42, 0x25346689), TOBN(0x716da290, 0xb07c7004),
+ TOBN(0x35f911ea, 0xb7950ee3), TOBN(0x6fd72969, 0x261d21b5),
+ TOBN(0x52389803, 0x08b640d3), TOBN(0x5b0026ee, 0x887f12a1),
+ TOBN(0x20e21660, 0x742e9311), TOBN(0x0ef6d541, 0x5ff77ff7),
+ TOBN(0x969127f0, 0xf9c41135), TOBN(0xf21d60c9, 0x68a64993),
+ TOBN(0x656e5d0c, 0xe541875c), TOBN(0xf1e0f84e, 0xa1d3c233),
+ TOBN(0x9bcca359, 0x06002d60), TOBN(0xbe2da60c, 0x06191552),
+ TOBN(0x5da8bbae, 0x61181ec3), TOBN(0x9f04b823, 0x65806f19),
+ TOBN(0xf1604a7d, 0xd4b79bb8), TOBN(0xaee806fb, 0x52c878c8),
+ TOBN(0x34144f11, 0x8d47b8e8), TOBN(0x72edf52b, 0x949f9054),
+ TOBN(0xebfca84e, 0x2127015a), TOBN(0x9051d0c0, 0x9cb7cef3),
+ TOBN(0x86e8fe58, 0x296deec8), TOBN(0x33b28188, 0x41010d74),}
+ ,
+ {TOBN(0x01079383, 0x171b445f), TOBN(0x9bcf21e3, 0x8131ad4c),
+ TOBN(0x8cdfe205, 0xc93987e8), TOBN(0xe63f4152, 0xc92e8c8f),
+ TOBN(0x729462a9, 0x30add43d), TOBN(0x62ebb143, 0xc980f05a),
+ TOBN(0x4f3954e5, 0x3b06e968), TOBN(0xfe1d75ad, 0x242cf6b1),
+ TOBN(0x5f95c6c7, 0xaf8685c8), TOBN(0xd4c1c8ce, 0x2f8f01aa),
+ TOBN(0xc44bbe32, 0x2574692a), TOBN(0xb8003478, 0xd4a4a068),
+ TOBN(0x7c8fc6e5, 0x2eca3cdb), TOBN(0xea1db16b, 0xec04d399),
+ TOBN(0xb05bc82e, 0x8f2bc5cf), TOBN(0x763d517f, 0xf44793d2),
+ TOBN(0x4451c1b8, 0x08bd98d0), TOBN(0x644b1cd4, 0x6575f240),
+ TOBN(0x6907eb33, 0x7375d270), TOBN(0x56c8bebd, 0xfa2286bd),
+ TOBN(0xc713d2ac, 0xc4632b46), TOBN(0x17da427a, 0xafd60242),
+ TOBN(0x313065b7, 0xc95c7546), TOBN(0xf8239898, 0xbf17a3de),
+ TOBN(0xf3b7963f, 0x4c830320), TOBN(0x842c7aa0, 0x903203e3),
+ TOBN(0xaf22ca0a, 0xe7327afb), TOBN(0x38e13092, 0x967609b6),
+ TOBN(0x73b8fb62, 0x757558f1), TOBN(0x3cc3e831, 0xf7eca8c1),
+ TOBN(0xe4174474, 0xf6331627), TOBN(0xa77989ca, 0xc3c40234),
+ TOBN(0xe5fd17a1, 0x44a081e0), TOBN(0xd797fb7d, 0xb70e296a),
+ TOBN(0x2b472b30, 0x481f719c), TOBN(0x0e632a98, 0xfe6f8c52),
+ TOBN(0x89ccd116, 0xc5f0c284), TOBN(0xf51088af, 0x2d987c62),
+ TOBN(0x2a2bccda, 0x4c2de6cf), TOBN(0x810f9efe, 0xf679f0f9),
+ TOBN(0xb0f394b9, 0x7ffe4b3e), TOBN(0x0b691d21, 0xe5fa5d21),
+ TOBN(0xb0bd7747, 0x9dfbbc75), TOBN(0xd2830fda, 0xfaf78b00),
+ TOBN(0xf78c249c, 0x52434f57), TOBN(0x4b1f7545, 0x98096dab),
+ TOBN(0x73bf6f94, 0x8ff8c0b3), TOBN(0x34aef03d, 0x454e134c),
+ TOBN(0xf8d151f4, 0xb7ac7ec5), TOBN(0xd6ceb95a, 0xe50da7d5),
+ TOBN(0xa1b492b0, 0xdc3a0eb8), TOBN(0x75157b69, 0xb3dd2863),
+ TOBN(0xe2c4c74e, 0xc5413d62), TOBN(0xbe329ff7, 0xbc5fc4c7),
+ TOBN(0x835a2aea, 0x60fa9dda), TOBN(0xf117f5ad, 0x7445cb87),
+ TOBN(0xae8317f4, 0xb0166f7a), TOBN(0xfbd3e3f7, 0xceec74e6),
+ TOBN(0xfdb516ac, 0xe0874bfd), TOBN(0x3d846019, 0xc681f3a3),
+ TOBN(0x0b12ee5c, 0x7c1620b0), TOBN(0xba68b4dd, 0x2b63c501),
+ TOBN(0xac03cd32, 0x6668c51e), TOBN(0x2a6279f7, 0x4e0bcb5b),
+ TOBN(0x17bd69b0, 0x6ae85c10), TOBN(0x72946979, 0x1dfdd3a6),
+ TOBN(0xd9a03268, 0x2c078bec), TOBN(0x41c6a658, 0xbfd68a52),
+ TOBN(0xcdea1024, 0x0e023900), TOBN(0xbaeec121, 0xb10d144d),
+ TOBN(0x5a600e74, 0x058ab8dc), TOBN(0x1333af21, 0xbb89ccdd),
+ TOBN(0xdf25eae0, 0x3aaba1f1), TOBN(0x2cada16e, 0x3b7144cf),
+ TOBN(0x657ee27d, 0x71ab98bc), TOBN(0x99088b4c, 0x7a6fc96e),
+ TOBN(0x05d5c0a0, 0x3549dbd4), TOBN(0x42cbdf8f, 0xf158c3ac),
+ TOBN(0x3fb6b3b0, 0x87edd685), TOBN(0x22071cf6, 0x86f064d0),
+ TOBN(0xd2d6721f, 0xff2811e5), TOBN(0xdb81b703, 0xfe7fae8c),
+ TOBN(0x3cfb74ef, 0xd3f1f7bb), TOBN(0x0cdbcd76, 0x16cdeb5d),
+ TOBN(0x4f39642a, 0x566a808c), TOBN(0x02b74454, 0x340064d6),
+ TOBN(0xfabbadca, 0x0528fa6f), TOBN(0xe4c3074c, 0xd3fc0bb6),
+ TOBN(0xb32cb8b0, 0xb796d219), TOBN(0xc3e95f4f, 0x34741dd9),
+ TOBN(0x87212125, 0x68edf6f5), TOBN(0x7a03aee4, 0xa2b9cb8e),
+ TOBN(0x0cd3c376, 0xf53a89aa), TOBN(0x0d8af9b1, 0x948a28dc),
+ TOBN(0xcf86a3f4, 0x902ab04f), TOBN(0x8aacb62a, 0x7f42002d),
+ TOBN(0x106985eb, 0xf62ffd52), TOBN(0xe670b54e, 0x5797bf10),
+ TOBN(0x4b405209, 0xc5e30aef), TOBN(0x12c97a20, 0x4365b5e9),
+ TOBN(0x104646ce, 0x1fe32093), TOBN(0x13cb4ff6, 0x3907a8c9),
+ TOBN(0x8b9f30d1, 0xd46e726b), TOBN(0xe1985e21, 0xaba0f499),
+ TOBN(0xc573dea9, 0x10a230cd), TOBN(0x24f46a93, 0xcd30f947),
+ TOBN(0xf2623fcf, 0xabe2010a), TOBN(0x3f278cb2, 0x73f00e4f),
+ TOBN(0xed55c67d, 0x50b920eb), TOBN(0xf1cb9a2d, 0x8e760571),
+ TOBN(0x7c50d109, 0x0895b709), TOBN(0x4207cf07, 0x190d4369),
+ TOBN(0x3b027e81, 0xc4127fe1), TOBN(0xa9f8b9ad, 0x3ae9c566),
+ TOBN(0x5ab10851, 0xacbfbba5), TOBN(0xa747d648, 0x569556f5),
+ TOBN(0xcc172b5c, 0x2ba97bf7), TOBN(0x15e0f77d, 0xbcfa3324),
+ TOBN(0xa345b797, 0x7686279d), TOBN(0x5a723480, 0xe38003d3),
+ TOBN(0xfd8e139f, 0x8f5fcda8), TOBN(0xf3e558c4, 0xbdee5bfd),
+ TOBN(0xd76cbaf4, 0xe33f9f77), TOBN(0x3a4c97a4, 0x71771969),
+ TOBN(0xda27e84b, 0xf6dce6a7), TOBN(0xff373d96, 0x13e6c2d1),
+ TOBN(0xf115193c, 0xd759a6e9), TOBN(0x3f9b7025, 0x63d2262c),
+ TOBN(0xd9764a31, 0x317cd062), TOBN(0x30779d8e, 0x199f8332),
+ TOBN(0xd8074106, 0x16b11b0b), TOBN(0x7917ab9f, 0x78aeaed8),
+ TOBN(0xb67a9cbe, 0x28fb1d8e), TOBN(0x2e313563, 0x136eda33),
+ TOBN(0x010b7069, 0xa371a86c), TOBN(0x44d90fa2, 0x6744e6b7),
+ TOBN(0x68190867, 0xd6b3e243), TOBN(0x9fe6cd9d, 0x59048c48),
+ TOBN(0xb900b028, 0x95731538), TOBN(0xa012062f, 0x32cae04f),
+ TOBN(0x8107c8bc, 0x9399d082), TOBN(0x47e8c54a, 0x41df12e2),
+ TOBN(0x14ba5117, 0xb6ef3f73), TOBN(0x22260bea, 0x81362f0b),
+ TOBN(0x90ea261e, 0x1a18cc20), TOBN(0x2192999f, 0x2321d636),
+ TOBN(0xef64d314, 0xe311b6a0), TOBN(0xd7401e4c, 0x3b54a1f5),
+ TOBN(0x19019983, 0x6fbca2ba), TOBN(0x46ad3293, 0x8fbffc4b),
+ TOBN(0xa142d3f6, 0x3786bf40), TOBN(0xeb5cbc26, 0xb67039fc),
+ TOBN(0x9cb0ae6c, 0x252bd479), TOBN(0x05e0f88a, 0x12b5848f),
+ TOBN(0x78f6d2b2, 0xa5c97663), TOBN(0x6f6e149b, 0xc162225c),
+ TOBN(0xe602235c, 0xde601a89), TOBN(0xd17bbe98, 0xf373be1f),
+ TOBN(0xcaf49a5b, 0xa8471827), TOBN(0x7e1a0a85, 0x18aaa116),
+ TOBN(0x6c833196, 0x270580c3), TOBN(0x1e233839, 0xf1c98a14),
+ TOBN(0x67b2f7b4, 0xae34e0a5), TOBN(0x47ac8745, 0xd8ce7289),
+ TOBN(0x2b74779a, 0x100dd467), TOBN(0x274a4337, 0x4ee50d09),
+ TOBN(0x603dcf13, 0x83608bc9), TOBN(0xcd9da6c3, 0xc89e8388),
+ TOBN(0x2660199f, 0x355116ac), TOBN(0xcc38bb59, 0xb6d18eed),
+ TOBN(0x3075f31f, 0x2f4bc071), TOBN(0x9774457f, 0x265dc57e),
+ TOBN(0x06a6a9c8, 0xc6db88bb), TOBN(0x6429d07f, 0x4ec98e04),
+ TOBN(0x8d05e57b, 0x05ecaa8b), TOBN(0x20f140b1, 0x7872ea7b),
+ TOBN(0xdf8c0f09, 0xca494693), TOBN(0x48d3a020, 0xf252e909),
+ TOBN(0x4c5c29af, 0x57b14b12), TOBN(0x7e6fa37d, 0xbf47ad1c),
+ TOBN(0x66e7b506, 0x49a0c938), TOBN(0xb72c0d48, 0x6be5f41f),
+ TOBN(0x6a6242b8, 0xb2359412), TOBN(0xcd35c774, 0x8e859480),
+ TOBN(0x12536fea, 0x87baa627), TOBN(0x58c1fec1, 0xf72aa680),
+ TOBN(0x6c29b637, 0x601e5dc9), TOBN(0x9e3c3c1c, 0xde9e01b9),
+ TOBN(0xefc8127b, 0x2bcfe0b0), TOBN(0x35107102, 0x2a12f50d),
+ TOBN(0x6ccd6cb1, 0x4879b397), TOBN(0xf792f804, 0xf8a82f21),
+ TOBN(0x509d4804, 0xa9b46402), TOBN(0xedddf85d, 0xc10f0850),
+ TOBN(0x928410dc, 0x4b6208aa), TOBN(0xf6229c46, 0x391012dc),
+ TOBN(0xc5a7c41e, 0x7727b9b6), TOBN(0x289e4e4b, 0xaa444842),
+ TOBN(0x049ba1d9, 0xe9a947ea), TOBN(0x44f9e47f, 0x83c8debc),
+ TOBN(0xfa77a1fe, 0x611f8b8e), TOBN(0xfd2e416a, 0xf518f427),
+ TOBN(0xc5fffa70, 0x114ebac3), TOBN(0xfe57c4e9, 0x5d89697b),
+ TOBN(0xfdd053ac, 0xb1aaf613), TOBN(0x31df210f, 0xea585a45),
+ TOBN(0x318cc10e, 0x24985034), TOBN(0x1a38efd1, 0x5f1d6130),
+ TOBN(0xbf86f237, 0x0b1e9e21), TOBN(0xb258514d, 0x1dbe88aa),
+ TOBN(0x1e38a588, 0x90c1baf9), TOBN(0x2936a01e, 0xbdb9b692),
+ TOBN(0xd576de98, 0x6dd5b20c), TOBN(0xb586bf71, 0x70f98ecf),
+ TOBN(0xcccf0f12, 0xc42d2fd7), TOBN(0x8717e61c, 0xfb35bd7b),
+ TOBN(0x8b1e5722, 0x35e6fc06), TOBN(0x3477728f, 0x0b3e13d5),
+ TOBN(0x150c294d, 0xaa8a7372), TOBN(0xc0291d43, 0x3bfa528a),
+ TOBN(0xc6c8bc67, 0xcec5a196), TOBN(0xdeeb31e4, 0x5c2e8a7c),
+ TOBN(0xba93e244, 0xfb6e1c51), TOBN(0xb9f8b71b, 0x2e28e156),
+ TOBN(0xce65a287, 0x968a2ab9), TOBN(0xe3c5ce69, 0x46bbcb1f),
+ TOBN(0xf8c835b9, 0xe7ae3f30), TOBN(0x16bbee26, 0xff72b82b),
+ TOBN(0x665e2017, 0xfd42cd22), TOBN(0x1e139970, 0xf8b1d2a0),
+ TOBN(0x125cda29, 0x79204932), TOBN(0x7aee94a5, 0x49c3bee5),
+ TOBN(0x68c70160, 0x89821a66), TOBN(0xf7c37678, 0x8f981669),
+ TOBN(0xd90829fc, 0x48cc3645), TOBN(0x346af049, 0xd70addfc),
+ TOBN(0x2057b232, 0x370bf29c), TOBN(0xf90c73ce, 0x42e650ee),
+ TOBN(0xe03386ea, 0xa126ab90), TOBN(0x0e266e7e, 0x975a087b),
+ TOBN(0x80578eb9, 0x0fca65d9), TOBN(0x7e2989ea, 0x16af45b8),
+ TOBN(0x7438212d, 0xcac75a4e), TOBN(0x38c7ca39, 0x4fef36b8),
+ TOBN(0x8650c494, 0xd402676a), TOBN(0x26ab5a66, 0xf72c7c48),
+ TOBN(0x4e6cb426, 0xce3a464e), TOBN(0xf8f99896, 0x2b72f841),
+ TOBN(0x8c318491, 0x1a335cc8), TOBN(0x563459ba, 0x6a5913e4),
+ TOBN(0x1b920d61, 0xc7b32919), TOBN(0x805ab8b6, 0xa02425ad),
+ TOBN(0x2ac512da, 0x8d006086), TOBN(0x6ca4846a, 0xbcf5c0fd),
+ TOBN(0xafea51d8, 0xac2138d7), TOBN(0xcb647545, 0x344cd443),
+ TOBN(0x0429ee8f, 0xbd7d9040), TOBN(0xee66a2de, 0x819b9c96),
+ TOBN(0x54f9ec25, 0xdea7d744), TOBN(0x2ffea642, 0x671721bb),
+ TOBN(0x4f19dbd1, 0x114344ea), TOBN(0x04304536, 0xfd0dbc8b),
+ TOBN(0x014b50aa, 0x29ec7f91), TOBN(0xb5fc22fe, 0xbb06014d),
+ TOBN(0x60d963a9, 0x1ee682e0), TOBN(0xdf48abc0, 0xfe85c727),
+ TOBN(0x0cadba13, 0x2e707c2d), TOBN(0xde608d3a, 0xa645aeff),
+ TOBN(0x05f1c28b, 0xedafd883), TOBN(0x3c362ede, 0xbd94de1f),
+ TOBN(0x8dd0629d, 0x13593e41), TOBN(0x0a5e736f, 0x766d6eaf),
+ TOBN(0xbfa92311, 0xf68cf9d1), TOBN(0xa4f9ef87, 0xc1797556),
+ TOBN(0x10d75a1f, 0x5601c209), TOBN(0x651c374c, 0x09b07361),
+ TOBN(0x49950b58, 0x88b5cead), TOBN(0x0ef00058, 0x6fa9dbaa),
+ TOBN(0xf51ddc26, 0x4e15f33a), TOBN(0x1f8b5ca6, 0x2ef46140),
+ TOBN(0x343ac0a3, 0xee9523f0), TOBN(0xbb75eab2, 0x975ea978),
+ TOBN(0x1bccf332, 0x107387f4), TOBN(0x790f9259, 0x9ab0062e),
+ TOBN(0xf1a363ad, 0x1e4f6a5f), TOBN(0x06e08b84, 0x62519a50),
+ TOBN(0x60915187, 0x7265f1ee), TOBN(0x6a80ca34, 0x93ae985e),
+ TOBN(0x81b29768, 0xaaba4864), TOBN(0xb13cabf2, 0x8d52a7d6),
+ TOBN(0xb5c36348, 0x8ead03f1), TOBN(0xc932ad95, 0x81c7c1c0),
+ TOBN(0x5452708e, 0xcae1e27b), TOBN(0x9dac4269, 0x1b0df648),
+ TOBN(0x233e3f0c, 0xdfcdb8bc), TOBN(0xe6ceccdf, 0xec540174),
+ TOBN(0xbd0d845e, 0x95081181), TOBN(0xcc8a7920, 0x699355d5),
+ TOBN(0x111c0f6d, 0xc3b375a8), TOBN(0xfd95bc6b, 0xfd51e0dc),
+ TOBN(0x4a106a26, 0x6888523a), TOBN(0x4d142bd6, 0xcb01a06d),
+ TOBN(0x79bfd289, 0xadb9b397), TOBN(0x0bdbfb94, 0xe9863914),
+ TOBN(0x29d8a229, 0x1660f6a6), TOBN(0x7f6abcd6, 0x551c042d),
+ TOBN(0x13039deb, 0x0ac3ffe8), TOBN(0xa01be628, 0xec8523fb),
+ TOBN(0x6ea34103, 0x0ca1c328), TOBN(0xc74114bd, 0xb903928e),
+ TOBN(0x8aa4ff4e, 0x9e9144b0), TOBN(0x7064091f, 0x7f9a4b17),
+ TOBN(0xa3f4f521, 0xe447f2c4), TOBN(0x81b8da7a, 0x604291f0),
+ TOBN(0xd680bc46, 0x7d5926de), TOBN(0x84f21fd5, 0x34a1202f),
+ TOBN(0x1d1e3181, 0x4e9df3d8), TOBN(0x1ca4861a, 0x39ab8d34),
+ TOBN(0x809ddeec, 0x5b19aa4a), TOBN(0x59f72f7e, 0x4d329366),
+ TOBN(0xa2f93f41, 0x386d5087), TOBN(0x40bf739c, 0xdd67d64f),
+ TOBN(0xb4494205, 0x66702158), TOBN(0xc33c65be, 0x73b1e178),
+ TOBN(0xcdcd657c, 0x38ca6153), TOBN(0x97f4519a, 0xdc791976),
+ TOBN(0xcc7c7f29, 0xcd6e1f39), TOBN(0x38de9cfb, 0x7e3c3932),
+ TOBN(0xe448eba3, 0x7b793f85), TOBN(0xe9f8dbf9, 0xf067e914),
+ TOBN(0xc0390266, 0xf114ae87), TOBN(0x39ed75a7, 0xcd6a8e2a),
+ TOBN(0xadb14848, 0x7ffba390), TOBN(0x67f8cb8b, 0x6af9bc09),
+ TOBN(0x322c3848, 0x9c7476db), TOBN(0xa320fecf, 0x52a538d6),
+ TOBN(0xe0493002, 0xb2aced2b), TOBN(0xdfba1809, 0x616bd430),
+ TOBN(0x531c4644, 0xc331be70), TOBN(0xbc04d32e, 0x90d2e450),
+ TOBN(0x1805a0d1, 0x0f9f142d), TOBN(0x2c44a0c5, 0x47ee5a23),
+ TOBN(0x31875a43, 0x3989b4e3), TOBN(0x6b1949fd, 0x0c063481),
+ TOBN(0x2dfb9e08, 0xbe0f4492), TOBN(0x3ff0da03, 0xe9d5e517),
+ TOBN(0x03dbe9a1, 0xf79466a8), TOBN(0x0b87bcd0, 0x15ea9932),
+ TOBN(0xeb64fc83, 0xab1f58ab), TOBN(0x6d9598da, 0x817edc8a),
+ TOBN(0x699cff66, 0x1d3b67e5), TOBN(0x645c0f29, 0x92635853),
+ TOBN(0x253cdd82, 0xeabaf21c), TOBN(0x82b9602a, 0x2241659e),
+ TOBN(0x2cae07ec, 0x2d9f7091), TOBN(0xbe4c720c, 0x8b48cd9b),
+ TOBN(0x6ce5bc03, 0x6f08d6c9), TOBN(0x36e8a997, 0xaf10bf40),
+ TOBN(0x83422d21, 0x3e10ff12), TOBN(0x7b26d3eb, 0xbcc12494),
+ TOBN(0xb240d2d0, 0xc9469ad6), TOBN(0xc4a11b4d, 0x30afa05b),
+ TOBN(0x4b604ace, 0xdd6ba286), TOBN(0x18486600, 0x3ee2864c),
+ TOBN(0x5869d6ba, 0x8d9ce5be), TOBN(0x0d8f68c5, 0xff4bfb0d),
+ TOBN(0xb69f210b, 0x5700cf73), TOBN(0x61f6653a, 0x6d37c135),
+ TOBN(0xff3d432b, 0x5aff5a48), TOBN(0x0d81c4b9, 0x72ba3a69),
+ TOBN(0xee879ae9, 0xfa1899ef), TOBN(0xbac7e2a0, 0x2d6acafd),
+ TOBN(0xd6d93f6c, 0x1c664399), TOBN(0x4c288de1, 0x5bcb135d),
+ TOBN(0x83031dab, 0x9dab7cbf), TOBN(0xfe23feb0, 0x3abbf5f0),
+ TOBN(0x9f1b2466, 0xcdedca85), TOBN(0x140bb710, 0x1a09538c),
+ TOBN(0xac8ae851, 0x5e11115d), TOBN(0x0d63ff67, 0x6f03f59e),
+ TOBN(0x755e5551, 0x7d234afb), TOBN(0x61c2db4e, 0x7e208fc1),
+ TOBN(0xaa9859ce, 0xf28a4b5d), TOBN(0xbdd6d4fc, 0x34af030f),
+ TOBN(0xd1c4a26d, 0x3be01cb1), TOBN(0x9ba14ffc, 0x243aa07c),
+ TOBN(0xf95cd3a9, 0xb2503502), TOBN(0xe379bc06, 0x7d2a93ab),
+ TOBN(0x3efc18e9, 0xd4ca8d68), TOBN(0x083558ec, 0x80bb412a),
+ TOBN(0xd903b940, 0x9645a968), TOBN(0xa499f0b6, 0x9ba6054f),
+ TOBN(0x208b573c, 0xb8349abe), TOBN(0x3baab3e5, 0x30b4fc1c),
+ TOBN(0x87e978ba, 0xcb524990), TOBN(0x3524194e, 0xccdf0e80),
+ TOBN(0x62711725, 0x7d4bcc42), TOBN(0xe90a3d9b, 0xb90109ba),
+ TOBN(0x3b1bdd57, 0x1323e1e0), TOBN(0xb78e9bd5, 0x5eae1599),
+ TOBN(0x0794b746, 0x9e03d278), TOBN(0x80178605, 0xd70e6297),
+ TOBN(0x171792f8, 0x99c97855), TOBN(0x11b393ee, 0xf5a86b5c),
+ TOBN(0x48ef6582, 0xd8884f27), TOBN(0xbd44737a, 0xbf19ba5f),
+ TOBN(0x8698de4c, 0xa42062c6), TOBN(0x8975eb80, 0x61ce9c54),
+ TOBN(0xd50e57c7, 0xd7fe71f3), TOBN(0x15342190, 0xbc97ce38),
+ TOBN(0x51bda2de, 0x4df07b63), TOBN(0xba12aeae, 0x200eb87d),
+ TOBN(0xabe135d2, 0xa9b4f8f6), TOBN(0x04619d65, 0xfad6d99c),
+ TOBN(0x4a6683a7, 0x7994937c), TOBN(0x7a778c8b, 0x6f94f09a),
+ TOBN(0x8c508623, 0x20a71b89), TOBN(0x241a2aed, 0x1c229165),
+ TOBN(0x352be595, 0xaaf83a99), TOBN(0x9fbfee7f, 0x1562bac8),
+ TOBN(0xeaf658b9, 0x5c4017e3), TOBN(0x1dc7f9e0, 0x15120b86),
+ TOBN(0xd84f13dd, 0x4c034d6f), TOBN(0x283dd737, 0xeaea3038),
+ TOBN(0x197f2609, 0xcd85d6a2), TOBN(0x6ebbc345, 0xfae60177),
+ TOBN(0xb80f031b, 0x4e12fede), TOBN(0xde55d0c2, 0x07a2186b),
+ TOBN(0x1fb3e37f, 0x24dcdd5a), TOBN(0x8d602da5, 0x7ed191fb),
+ TOBN(0x108fb056, 0x76023e0d), TOBN(0x70178c71, 0x459c20c0),
+ TOBN(0xfad5a386, 0x3fe54cf0), TOBN(0xa4a3ec4f, 0x02bbb475),
+ TOBN(0x1aa5ec20, 0x919d94d7), TOBN(0x5d3b63b5, 0xa81e4ab3),
+ TOBN(0x7fa733d8, 0x5ad3d2af), TOBN(0xfbc586dd, 0xd1ac7a37),
+ TOBN(0x282925de, 0x40779614), TOBN(0xfe0ffffb, 0xe74a242a),
+ TOBN(0x3f39e67f, 0x906151e5), TOBN(0xcea27f5f, 0x55e10649),
+ TOBN(0xdca1d4e1, 0xc17cf7b7), TOBN(0x0c326d12, 0x2fe2362d),
+ TOBN(0x05f7ac33, 0x7dd35df3), TOBN(0x0c3b7639, 0xc396dbdf),
+ TOBN(0x0912f5ac, 0x03b7db1c), TOBN(0x9dea4b70, 0x5c9ed4a9),
+ TOBN(0x475e6e53, 0xaae3f639), TOBN(0xfaba0e7c, 0xfc278bac),
+ TOBN(0x16f9e221, 0x9490375f), TOBN(0xaebf9746, 0xa5a7ed0a),
+ TOBN(0x45f9af3f, 0xf41ad5d6), TOBN(0x03c4623c, 0xb2e99224),
+ TOBN(0x82c5bb5c, 0xb3cf56aa), TOBN(0x64311819, 0x34567ed3),
+ TOBN(0xec57f211, 0x8be489ac), TOBN(0x2821895d, 0xb9a1104b),
+ TOBN(0x610dc875, 0x6064e007), TOBN(0x8e526f3f, 0x5b20d0fe),
+ TOBN(0x6e71ca77, 0x5b645aee), TOBN(0x3d1dcb9f, 0x800e10ff),
+ TOBN(0x36b51162, 0x189cf6de), TOBN(0x2c5a3e30, 0x6bb17353),
+ TOBN(0xc186cd3e, 0x2a6c6fbf), TOBN(0xa74516fa, 0x4bf97906),
+ TOBN(0x5b4b8f4b, 0x279d6901), TOBN(0x0c4e57b4, 0x2b573743),
+ TOBN(0x75fdb229, 0xb6e386b6), TOBN(0xb46793fd, 0x99deac27),
+ TOBN(0xeeec47ea, 0xcf712629), TOBN(0xe965f3c4, 0xcbc3b2dd),
+ TOBN(0x8dd1fb83, 0x425c6559), TOBN(0x7fc00ee6, 0x0af06fda),
+ TOBN(0xe98c9225, 0x33d956df), TOBN(0x0f1ef335, 0x4fbdc8a2),
+ TOBN(0x2abb5145, 0xb79b8ea2), TOBN(0x40fd2945, 0xbdbff288),
+ TOBN(0x6a814ac4, 0xd7185db7), TOBN(0xc4329d6f, 0xc084609a),
+ TOBN(0xc9ba7b52, 0xed1be45d), TOBN(0x891dd20d, 0xe4cd2c74),
+ TOBN(0x5a4d4a7f, 0x824139b1), TOBN(0x66c17716, 0xb873c710),
+ TOBN(0x5e5bc141, 0x2843c4e0), TOBN(0xd5ac4817, 0xb97eb5bf),
+ TOBN(0xc0f8af54, 0x450c95c7), TOBN(0xc91b3fa0, 0x318406c5),
+ TOBN(0x360c340a, 0xab9d97f8), TOBN(0xfb57bd07, 0x90a2d611),
+ TOBN(0x4339ae3c, 0xa6a6f7e5), TOBN(0x9c1fcd2a, 0x2feb8a10),
+ TOBN(0x972bcca9, 0xc7ea7432), TOBN(0x1b0b924c, 0x308076f6),
+ TOBN(0x80b2814a, 0x2a5b4ca5), TOBN(0x2f78f55b, 0x61ef3b29),
+ TOBN(0xf838744a, 0xc18a414f), TOBN(0xc611eaae, 0x903d0a86),
+ TOBN(0x94dabc16, 0x2a453f55), TOBN(0xe6f2e3da, 0x14efb279),
+ TOBN(0x5b7a6017, 0x9320dc3c), TOBN(0x692e382f, 0x8df6b5a4),
+ TOBN(0x3f5e15e0, 0x2d40fa90), TOBN(0xc87883ae, 0x643dd318),
+ TOBN(0x511053e4, 0x53544774), TOBN(0x834d0ecc, 0x3adba2bc),
+ TOBN(0x4215d7f7, 0xbae371f5), TOBN(0xfcfd57bf, 0x6c8663bc),
+ TOBN(0xded2383d, 0xd6901b1d), TOBN(0x3b49fbb4, 0xb5587dc3),
+ TOBN(0xfd44a08d, 0x07625f62), TOBN(0x3ee4d65b, 0x9de9b762),}
+ ,
+ {TOBN(0x64e5137d, 0x0d63d1fa), TOBN(0x658fc052, 0x02a9d89f),
+ TOBN(0x48894874, 0x50436309), TOBN(0xe9ae30f8, 0xd598da61),
+ TOBN(0x2ed710d1, 0x818baf91), TOBN(0xe27e9e06, 0x8b6a0c20),
+ TOBN(0x1e28dcfb, 0x1c1a6b44), TOBN(0x883acb64, 0xd6ac57dc),
+ TOBN(0x8735728d, 0xc2c6ff70), TOBN(0x79d6122f, 0xc5dc2235),
+ TOBN(0x23f5d003, 0x19e277f9), TOBN(0x7ee84e25, 0xdded8cc7),
+ TOBN(0x91a8afb0, 0x63cd880a), TOBN(0x3f3ea7c6, 0x3574af60),
+ TOBN(0x0cfcdc84, 0x02de7f42), TOBN(0x62d0792f, 0xb31aa152),
+ TOBN(0x8e1b4e43, 0x8a5807ce), TOBN(0xad283893, 0xe4109a7e),
+ TOBN(0xc30cc9cb, 0xafd59dda), TOBN(0xf65f36c6, 0x3d8d8093),
+ TOBN(0xdf31469e, 0xa60d32b2), TOBN(0xee93df4b, 0x3e8191c8),
+ TOBN(0x9c1017c5, 0x355bdeb5), TOBN(0xd2623185, 0x8616aa28),
+ TOBN(0xb02c83f9, 0xdec31a21), TOBN(0x988c8b23, 0x6ad9d573),
+ TOBN(0x53e983ae, 0xa57be365), TOBN(0xe968734d, 0x646f834e),
+ TOBN(0x9137ea8f, 0x5da6309b), TOBN(0x10f3a624, 0xc1f1ce16),
+ TOBN(0x782a9ea2, 0xca440921), TOBN(0xdf94739e, 0x5b46f1b5),
+ TOBN(0x9f9be006, 0xcce85c9b), TOBN(0x360e70d6, 0xa4c7c2d3),
+ TOBN(0x2cd5beea, 0xaefa1e60), TOBN(0x64cf63c0, 0x8c3d2b6d),
+ TOBN(0xfb107fa3, 0xe1cf6f90), TOBN(0xb7e937c6, 0xd5e044e6),
+ TOBN(0x74e8ca78, 0xce34db9f), TOBN(0x4f8b36c1, 0x3e210bd0),
+ TOBN(0x1df165a4, 0x34a35ea8), TOBN(0x3418e0f7, 0x4d4412f6),
+ TOBN(0x5af1f8af, 0x518836c3), TOBN(0x42ceef4d, 0x130e1965),
+ TOBN(0x5560ca0b, 0x543a1957), TOBN(0xc33761e5, 0x886cb123),
+ TOBN(0x66624b1f, 0xfe98ed30), TOBN(0xf772f4bf, 0x1090997d),
+ TOBN(0xf4e540bb, 0x4885d410), TOBN(0x7287f810, 0x9ba5f8d7),
+ TOBN(0x22d0d865, 0xde98dfb1), TOBN(0x49ff51a1, 0xbcfbb8a3),
+ TOBN(0xb6b6fa53, 0x6bc3012e), TOBN(0x3d31fd72, 0x170d541d),
+ TOBN(0x8018724f, 0x4b0f4966), TOBN(0x79e7399f, 0x87dbde07),
+ TOBN(0x56f8410e, 0xf4f8b16a), TOBN(0x97241afe, 0xc47b266a),
+ TOBN(0x0a406b8e, 0x6d9c87c1), TOBN(0x803f3e02, 0xcd42ab1b),
+ TOBN(0x7f0309a8, 0x04dbec69), TOBN(0xa83b85f7, 0x3bbad05f),
+ TOBN(0xc6097273, 0xad8e197f), TOBN(0xc097440e, 0x5067adc1),
+ TOBN(0x730eafb6, 0x3524ff16), TOBN(0xd7f9b51e, 0x823fc6ce),
+ TOBN(0x27bd0d32, 0x443e4ac0), TOBN(0x40c59ad9, 0x4d66f217),
+ TOBN(0x6c33136f, 0x17c387a4), TOBN(0x5043b8d5, 0xeb86804d),
+ TOBN(0x74970312, 0x675a73c9), TOBN(0x838fdb31, 0xf16669b6),
+ TOBN(0xc507b6dd, 0x418e7ddd), TOBN(0x39888d93, 0x472f19d6),
+ TOBN(0x7eae26be, 0x0c27eb4d), TOBN(0x17b53ed3, 0xfbabb884),
+ TOBN(0xfc27021b, 0x2b01ae4f), TOBN(0x88462e87, 0xcf488682),
+ TOBN(0xbee096ec, 0x215e2d87), TOBN(0xeb2fea9a, 0xd242e29b),
+ TOBN(0x5d985b5f, 0xb821fc28), TOBN(0x89d2e197, 0xdc1e2ad2),
+ TOBN(0x55b566b8, 0x9030ba62), TOBN(0xe3fd41b5, 0x4f41b1c6),
+ TOBN(0xb738ac2e, 0xb9a96d61), TOBN(0x7f8567ca, 0x369443f4),
+ TOBN(0x8698622d, 0xf803a440), TOBN(0x2b586236, 0x8fe2f4dc),
+ TOBN(0xbbcc00c7, 0x56b95bce), TOBN(0x5ec03906, 0x616da680),
+ TOBN(0x79162ee6, 0x72214252), TOBN(0x43132b63, 0x86a892d2),
+ TOBN(0x4bdd3ff2, 0x2f3263bf), TOBN(0xd5b3733c, 0x9cd0a142),
+ TOBN(0x592eaa82, 0x44415ccb), TOBN(0x663e8924, 0x8d5474ea),
+ TOBN(0x8058a25e, 0x5236344e), TOBN(0x82e8df9d, 0xbda76ee6),
+ TOBN(0xdcf6efd8, 0x11cc3d22), TOBN(0x00089cda, 0x3b4ab529),
+ TOBN(0x91d3a071, 0xbd38a3db), TOBN(0x4ea97fc0, 0xef72b925),
+ TOBN(0x0c9fc15b, 0xea3edf75), TOBN(0x5a6297cd, 0xa4348ed3),
+ TOBN(0x0d38ab35, 0xce7c42d4), TOBN(0x9fd493ef, 0x82feab10),
+ TOBN(0x46056b6d, 0x82111b45), TOBN(0xda11dae1, 0x73efc5c3),
+ TOBN(0xdc740278, 0x5545a7fb), TOBN(0xbdb2601c, 0x40d507e6),
+ TOBN(0x121dfeeb, 0x7066fa58), TOBN(0x214369a8, 0x39ae8c2a),
+ TOBN(0x195709cb, 0x06e0956c), TOBN(0x4c9d254f, 0x010cd34b),
+ TOBN(0xf51e13f7, 0x0471a532), TOBN(0xe19d6791, 0x1e73054d),
+ TOBN(0xf702a628, 0xdb5c7be3), TOBN(0xc7141218, 0xb24dde05),
+ TOBN(0xdc18233c, 0xf29b2e2e), TOBN(0x3a6bd1e8, 0x85342dba),
+ TOBN(0x3f747fa0, 0xb311898c), TOBN(0xe2a272e4, 0xcd0eac65),
+ TOBN(0x4bba5851, 0xf914d0bc), TOBN(0x7a1a9660, 0xc4a43ee3),
+ TOBN(0xe5a367ce, 0xa1c8cde9), TOBN(0x9d958ba9, 0x7271abe3),
+ TOBN(0xf3ff7eb6, 0x3d1615cd), TOBN(0xa2280dce, 0xf5ae20b0),
+ TOBN(0x56dba5c1, 0xcf640147), TOBN(0xea5a2e3d, 0x5e83d118),
+ TOBN(0x04cd6b6d, 0xda24c511), TOBN(0x1c0f4671, 0xe854d214),
+ TOBN(0x91a6b7a9, 0x69565381), TOBN(0xdc966240, 0xdecf1f5b),
+ TOBN(0x1b22d21c, 0xfcf5d009), TOBN(0x2a05f641, 0x9021dbd5),
+ TOBN(0x8c0ed566, 0xd4312483), TOBN(0x5179a95d, 0x643e216f),
+ TOBN(0xcc185fec, 0x17044493), TOBN(0xb3063339, 0x54991a21),
+ TOBN(0xd801ecdb, 0x0081a726), TOBN(0x0149b0c6, 0x4fa89bbb),
+ TOBN(0xafe9065a, 0x4391b6b9), TOBN(0xedc92786, 0xd633f3a3),
+ TOBN(0xe408c24a, 0xae6a8e13), TOBN(0x85833fde, 0x9f3897ab),
+ TOBN(0x43800e7e, 0xd81a0715), TOBN(0xde08e346, 0xb44ffc5f),
+ TOBN(0x7094184c, 0xcdeff2e0), TOBN(0x49f9387b, 0x165eaed1),
+ TOBN(0x635d6129, 0x777c468a), TOBN(0x8c0dcfd1, 0x538c2dd8),
+ TOBN(0xd6d9d9e3, 0x7a6a308b), TOBN(0x62375830, 0x4c2767d3),
+ TOBN(0x874a8bc6, 0xf38cbeb6), TOBN(0xd94d3f1a, 0xccb6fd9e),
+ TOBN(0x92a9735b, 0xba21f248), TOBN(0x272ad0e5, 0x6cd1efb0),
+ TOBN(0x7437b69c, 0x05b03284), TOBN(0xe7f04702, 0x6948c225),
+ TOBN(0x8a56c04a, 0xcba2ecec), TOBN(0x0c181270, 0xe3a73e41),
+ TOBN(0x6cb34e9d, 0x03e93725), TOBN(0xf77c8713, 0x496521a9),
+ TOBN(0x94569183, 0xfa7f9f90), TOBN(0xf2e7aa4c, 0x8c9707ad),
+ TOBN(0xced2c9ba, 0x26c1c9a3), TOBN(0x9109fe96, 0x40197507),
+ TOBN(0x9ae868a9, 0xe9adfe1c), TOBN(0x3984403d, 0x314e39bb),
+ TOBN(0xb5875720, 0xf2fe378f), TOBN(0x33f901e0, 0xba44a628),
+ TOBN(0xea1125fe, 0x3652438c), TOBN(0xae9ec4e6, 0x9dd1f20b),
+ TOBN(0x1e740d9e, 0xbebf7fbd), TOBN(0x6dbd3ddc, 0x42dbe79c),
+ TOBN(0x62082aec, 0xedd36776), TOBN(0xf612c478, 0xe9859039),
+ TOBN(0xa493b201, 0x032f7065), TOBN(0xebd4d8f2, 0x4ff9b211),
+ TOBN(0x3f23a0aa, 0xaac4cb32), TOBN(0xea3aadb7, 0x15ed4005),
+ TOBN(0xacf17ea4, 0xafa27e63), TOBN(0x56125c1a, 0xc11fd66c),
+ TOBN(0x266344a4, 0x3794f8dc), TOBN(0xdcca923a, 0x483c5c36),
+ TOBN(0x2d6b6bbf, 0x3f9d10a0), TOBN(0xb320c5ca, 0x81d9bdf3),
+ TOBN(0x620e28ff, 0x47b50a95), TOBN(0x933e3b01, 0xcef03371),
+ TOBN(0xf081bf85, 0x99100153), TOBN(0x183be9a0, 0xc3a8c8d6),
+ TOBN(0x4e3ddc5a, 0xd6bbe24d), TOBN(0xc6c74630, 0x53843795),
+ TOBN(0x78193dd7, 0x65ec2d4c), TOBN(0xb8df26cc, 0xcd3c89b2),
+ TOBN(0x98dbe399, 0x5a483f8d), TOBN(0x72d8a957, 0x7dd3313a),
+ TOBN(0x65087294, 0xab0bd375), TOBN(0xfcd89248, 0x7c259d16),
+ TOBN(0x8a9443d7, 0x7613aa81), TOBN(0x80100800, 0x85fe6584),
+ TOBN(0x70fc4dbc, 0x7fb10288), TOBN(0xf58280d3, 0xe86beee8),
+ TOBN(0x14fdd82f, 0x7c978c38), TOBN(0xdf1204c1, 0x0de44d7b),
+ TOBN(0xa08a1c84, 0x4160252f), TOBN(0x591554ca, 0xc17646a5),
+ TOBN(0x214a37d6, 0xa05bd525), TOBN(0x48d5f09b, 0x07957b3c),
+ TOBN(0x0247cdcb, 0xd7109bc9), TOBN(0x40f9e4bb, 0x30599ce7),
+ TOBN(0xc325fa03, 0xf46ad2ec), TOBN(0x00f766cf, 0xc3e3f9ee),
+ TOBN(0xab556668, 0xd43a4577), TOBN(0x68d30a61, 0x3ee03b93),
+ TOBN(0x7ddc81ea, 0x77b46a08), TOBN(0xcf5a6477, 0xc7480699),
+ TOBN(0x43a8cb34, 0x6633f683), TOBN(0x1b867e6b, 0x92363c60),
+ TOBN(0x43921114, 0x1f60558e), TOBN(0xcdbcdd63, 0x2f41450e),
+ TOBN(0x7fc04601, 0xcc630e8b), TOBN(0xea7c66d5, 0x97038b43),
+ TOBN(0x7259b8a5, 0x04e99fd8), TOBN(0x98a8dd12, 0x4785549a),
+ TOBN(0x0e459a7c, 0x840552e1), TOBN(0xcdfcf4d0, 0x4bb0909e),
+ TOBN(0x34a86db2, 0x53758da7), TOBN(0xe643bb83, 0xeac997e1),
+ TOBN(0x96400bd7, 0x530c5b7e), TOBN(0x9f97af87, 0xb41c8b52),
+ TOBN(0x34fc8820, 0xfbeee3f9), TOBN(0x93e53490, 0x49091afd),
+ TOBN(0x764b9be5, 0x9a31f35c), TOBN(0x71f37864, 0x57e3d924),
+ TOBN(0x02fb34e0, 0x943aa75e), TOBN(0xa18c9c58, 0xab8ff6e4),
+ TOBN(0x080f31b1, 0x33cf0d19), TOBN(0x5c9682db, 0x083518a7),
+ TOBN(0x873d4ca6, 0xb709c3de), TOBN(0x64a84262, 0x3575b8f0),
+ TOBN(0x6275da1f, 0x020154bb), TOBN(0x97678caa, 0xd17cf1ab),
+ TOBN(0x8779795f, 0x951a95c3), TOBN(0xdd35b163, 0x50fccc08),
+ TOBN(0x32709627, 0x33d8f031), TOBN(0x3c5ab10a, 0x498dd85c),
+ TOBN(0xb6c185c3, 0x41dca566), TOBN(0x7de7feda, 0xd8622aa3),
+ TOBN(0x99e84d92, 0x901b6dfb), TOBN(0x30a02b0e, 0x7c4ad288),
+ TOBN(0xc7c81daa, 0x2fd3cf36), TOBN(0xd1319547, 0xdf89e59f),
+ TOBN(0xb2be8184, 0xcd496733), TOBN(0xd5f449eb, 0x93d3412b),
+ TOBN(0x7ea41b1b, 0x25fe531d), TOBN(0xf9797432, 0x6a1d5646),
+ TOBN(0x86067f72, 0x2bde501a), TOBN(0xf91481c0, 0x0c85e89c),
+ TOBN(0xca8ee465, 0xf8b05bc6), TOBN(0x1844e1cf, 0x02e83cda),
+ TOBN(0xca82114a, 0xb4dbe33b), TOBN(0x0f9f8769, 0x4eabfde2),
+ TOBN(0x4936b1c0, 0x38b27fe2), TOBN(0x63b6359b, 0xaba402df),
+ TOBN(0x40c0ea2f, 0x656bdbab), TOBN(0x9c992a89, 0x6580c39c),
+ TOBN(0x600e8f15, 0x2a60aed1), TOBN(0xeb089ca4, 0xe0bf49df),
+ TOBN(0x9c233d7d, 0x2d42d99a), TOBN(0x648d3f95, 0x4c6bc2fa),
+ TOBN(0xdcc383a8, 0xe1add3f3), TOBN(0xf42c0c6a, 0x4f64a348),
+ TOBN(0x2abd176f, 0x0030dbdb), TOBN(0x4de501a3, 0x7d6c215e),
+ TOBN(0x4a107c1f, 0x4b9a64bc), TOBN(0xa77f0ad3, 0x2496cd59),
+ TOBN(0xfb78ac62, 0x7688dffb), TOBN(0x7025a2ca, 0x67937d8e),
+ TOBN(0xfde8b2d1, 0xd1a8f4e7), TOBN(0xf5b3da47, 0x7354927c),
+ TOBN(0xe48606a3, 0xd9205735), TOBN(0xac477cc6, 0xe177b917),
+ TOBN(0xfb1f73d2, 0xa883239a), TOBN(0xe12572f6, 0xcc8b8357),
+ TOBN(0x9d355e9c, 0xfb1f4f86), TOBN(0x89b795f8, 0xd9f3ec6e),
+ TOBN(0x27be56f1, 0xb54398dc), TOBN(0x1890efd7, 0x3fedeed5),
+ TOBN(0x62f77f1f, 0x9c6d0140), TOBN(0x7ef0e314, 0x596f0ee4),
+ TOBN(0x50ca6631, 0xcc61dab3), TOBN(0x4a39801d, 0xf4866e4f),
+ TOBN(0x66c8d032, 0xae363b39), TOBN(0x22c591e5, 0x2ead66aa),
+ TOBN(0x954ba308, 0xde02a53e), TOBN(0x2a6c060f, 0xd389f357),
+ TOBN(0xe6cfcde8, 0xfbf40b66), TOBN(0x8e02fc56, 0xc6340ce1),
+ TOBN(0xe4957795, 0x73adb4ba), TOBN(0x7b86122c, 0xa7b03805),
+ TOBN(0x63f83512, 0x0c8e6fa6), TOBN(0x83660ea0, 0x057d7804),
+ TOBN(0xbad79105, 0x21ba473c), TOBN(0xb6c50bee, 0xded5389d),
+ TOBN(0xee2caf4d, 0xaa7c9bc0), TOBN(0xd97b8de4, 0x8c4e98a7),
+ TOBN(0xa9f63e70, 0xab3bbddb), TOBN(0x3898aabf, 0x2597815a),
+ TOBN(0x7659af89, 0xac15b3d9), TOBN(0xedf7725b, 0x703ce784),
+ TOBN(0x25470fab, 0xe085116b), TOBN(0x04a43375, 0x87285310),
+ TOBN(0x4e39187e, 0xe2bfd52f), TOBN(0x36166b44, 0x7d9ebc74),
+ TOBN(0x92ad433c, 0xfd4b322c), TOBN(0x726aa817, 0xba79ab51),
+ TOBN(0xf96eacd8, 0xc1db15eb), TOBN(0xfaf71e91, 0x0476be63),
+ TOBN(0xdd69a640, 0x641fad98), TOBN(0xb7995918, 0x29622559),
+ TOBN(0x03c6daa5, 0xde4199dc), TOBN(0x92cadc97, 0xad545eb4),
+ TOBN(0x1028238b, 0x256534e4), TOBN(0x73e80ce6, 0x8595409a),
+ TOBN(0x690d4c66, 0xd05dc59b), TOBN(0xc95f7b8f, 0x981dee80),
+ TOBN(0xf4337014, 0xd856ac25), TOBN(0x441bd9dd, 0xac524dca),
+ TOBN(0x640b3d85, 0x5f0499f5), TOBN(0x39cf84a9, 0xd5fda182),
+ TOBN(0x04e7b055, 0xb2aa95a0), TOBN(0x29e33f0a, 0x0ddf1860),
+ TOBN(0x082e74b5, 0x423f6b43), TOBN(0x217edeb9, 0x0aaa2b0f),
+ TOBN(0x58b83f35, 0x83cbea55), TOBN(0xc485ee4d, 0xbc185d70),
+ TOBN(0x833ff03b, 0x1e5f6992), TOBN(0xb5b9b9cc, 0xcf0c0dd5),
+ TOBN(0x7caaee8e, 0x4e9e8a50), TOBN(0x462e907b, 0x6269dafd),
+ TOBN(0x6ed5cee9, 0xfbe791c6), TOBN(0x68ca3259, 0xed430790),
+ TOBN(0x2b72bdf2, 0x13b5ba88), TOBN(0x60294c8a, 0x35ef0ac4),
+ TOBN(0x9c3230ed, 0x19b99b08), TOBN(0x560fff17, 0x6c2589aa),
+ TOBN(0x552b8487, 0xd6770374), TOBN(0xa373202d, 0x9a56f685),
+ TOBN(0xd3e7f907, 0x45f175d9), TOBN(0x3c2f315f, 0xd080d810),
+ TOBN(0x1130e9dd, 0x7b9520e8), TOBN(0xc078f9e2, 0x0af037b5),
+ TOBN(0x38cd2ec7, 0x1e9c104c), TOBN(0x0f684368, 0xc472fe92),
+ TOBN(0xd3f1b5ed, 0x6247e7ef), TOBN(0xb32d33a9, 0x396dfe21),
+ TOBN(0x46f59cf4, 0x4a9aa2c2), TOBN(0x69cd5168, 0xff0f7e41),
+ TOBN(0x3f59da0f, 0x4b3234da), TOBN(0xcf0b0235, 0xb4579ebe),
+ TOBN(0x6d1cbb25, 0x6d2476c7), TOBN(0x4f0837e6, 0x9dc30f08),
+ TOBN(0x9a4075bb, 0x906f6e98), TOBN(0x253bb434, 0xc761e7d1),
+ TOBN(0xde2e645f, 0x6e73af10), TOBN(0xb89a4060, 0x0c5f131c),
+ TOBN(0xd12840c5, 0xb8cc037f), TOBN(0x3d093a5b, 0x7405bb47),
+ TOBN(0x6202c253, 0x206348b8), TOBN(0xbf5d57fc, 0xc55a3ca7),
+ TOBN(0x89f6c90c, 0x8c3bef48), TOBN(0x23ac7623, 0x5a0a960a),
+ TOBN(0xdfbd3d6b, 0x552b42ab), TOBN(0x3ef22458, 0x132061f6),
+ TOBN(0xd74e9bda, 0xc97e6516), TOBN(0x88779360, 0xc230f49e),
+ TOBN(0xa6ec1de3, 0x1e74ea49), TOBN(0x581dcee5, 0x3fb645a2),
+ TOBN(0xbaef2391, 0x8f483f14), TOBN(0x6d2dddfc, 0xd137d13b),
+ TOBN(0x54cde50e, 0xd2743a42), TOBN(0x89a34fc5, 0xe4d97e67),
+ TOBN(0x13f1f5b3, 0x12e08ce5), TOBN(0xa80540b8, 0xa7f0b2ca),
+ TOBN(0x854bcf77, 0x01982805), TOBN(0xb8653ffd, 0x233bea04),
+ TOBN(0x8e7b8787, 0x02b0b4c9), TOBN(0x2675261f, 0x9acb170a),
+ TOBN(0x061a9d90, 0x930c14e5), TOBN(0xb59b30e0, 0xdef0abea),
+ TOBN(0x1dc19ea6, 0x0200ec7d), TOBN(0xb6f4a3f9, 0x0bce132b),
+ TOBN(0xb8d5de90, 0xf13e27e0), TOBN(0xbaee5ef0, 0x1fade16f),
+ TOBN(0x6f406aaa, 0xe4c6cf38), TOBN(0xab4cfe06, 0xd1369815),
+ TOBN(0x0dcffe87, 0xefd550c6), TOBN(0x9d4f59c7, 0x75ff7d39),
+ TOBN(0xb02553b1, 0x51deb6ad), TOBN(0x812399a4, 0xb1877749),
+ TOBN(0xce90f71f, 0xca6006e1), TOBN(0xc32363a6, 0xb02b6e77),
+ TOBN(0x02284fbe, 0xdc36c64d), TOBN(0x86c81e31, 0xa7e1ae61),
+ TOBN(0x2576c7e5, 0xb909d94a), TOBN(0x8b6f7d02, 0x818b2bb0),
+ TOBN(0xeca3ed07, 0x56faa38a), TOBN(0xa3790e6c, 0x9305bb54),
+ TOBN(0xd784eeda, 0x7bc73061), TOBN(0xbd56d369, 0x6dd50614),
+ TOBN(0xd6575949, 0x229a8aa9), TOBN(0xdcca8f47, 0x4595ec28),
+ TOBN(0x814305c1, 0x06ab4fe6), TOBN(0xc8c39768, 0x24f43f16),
+ TOBN(0xe2a45f36, 0x523f2b36), TOBN(0x995c6493, 0x920d93bb),
+ TOBN(0xf8afdab7, 0x90f1632b), TOBN(0x79ebbecd, 0x1c295954),
+ TOBN(0xc7bb3ddb, 0x79592f48), TOBN(0x67216a7b, 0x5f88e998),
+ TOBN(0xd91f098b, 0xbc01193e), TOBN(0xf7d928a5, 0xb1db83fc),
+ TOBN(0x55e38417, 0xe991f600), TOBN(0x2a91113e, 0x2981a934),
+ TOBN(0xcbc9d648, 0x06b13bde), TOBN(0xb011b6ac, 0x0755ff44),
+ TOBN(0x6f4cb518, 0x045ec613), TOBN(0x522d2d31, 0xc2f5930a),
+ TOBN(0x5acae1af, 0x382e65de), TOBN(0x57643067, 0x27bc966f),
+ TOBN(0x5e12705d, 0x1c7193f0), TOBN(0xf0f32f47, 0x3be8858e),
+ TOBN(0x785c3d7d, 0x96c6dfc7), TOBN(0xd75b4a20, 0xbf31795d),
+ TOBN(0x91acf17b, 0x342659d4), TOBN(0xe596ea34, 0x44f0378f),
+ TOBN(0x4515708f, 0xce52129d), TOBN(0x17387e1e, 0x79f2f585),
+ TOBN(0x72cfd2e9, 0x49dee168), TOBN(0x1ae05223, 0x3e2af239),
+ TOBN(0x009e75be, 0x1d94066a), TOBN(0x6cca31c7, 0x38abf413),
+ TOBN(0xb50bd61d, 0x9bc49908), TOBN(0x4a9b4a8c, 0xf5e2bc1e),
+ TOBN(0xeb6cc5f7, 0x946f83ac), TOBN(0x27da93fc, 0xebffab28),
+ TOBN(0xea314c96, 0x4821c8c5), TOBN(0x8de49ded, 0xa83c15f4),
+ TOBN(0x7a64cf20, 0x7af33004), TOBN(0x45f1bfeb, 0xc9627e10),
+ TOBN(0x878b0626, 0x54b9df60), TOBN(0x5e4fdc3c, 0xa95c0b33),
+ TOBN(0xe54a37ca, 0xc2035d8e), TOBN(0x9087cda9, 0x80f20b8c),
+ TOBN(0x36f61c23, 0x8319ade4), TOBN(0x766f287a, 0xde8cfdf8),
+ TOBN(0x48821948, 0x346f3705), TOBN(0x49a7b853, 0x16e4f4a2),
+ TOBN(0xb9b3f8a7, 0x5cedadfd), TOBN(0x8f562815, 0x8db2a815),
+ TOBN(0xc0b7d554, 0x01f68f95), TOBN(0x12971e27, 0x688a208e),
+ TOBN(0xc9f8b696, 0xd0ff34fc), TOBN(0x20824de2, 0x1222718c),
+ TOBN(0x7213cf9f, 0x0c95284d), TOBN(0xe2ad741b, 0xdc158240),
+ TOBN(0x0ee3a6df, 0x54043ccf), TOBN(0x16ff479b, 0xd84412b3),
+ TOBN(0xf6c74ee0, 0xdfc98af0), TOBN(0xa78a169f, 0x52fcd2fb),
+ TOBN(0xd8ae8746, 0x99c930e9), TOBN(0x1d33e858, 0x49e117a5),
+ TOBN(0x7581fcb4, 0x6624759f), TOBN(0xde50644f, 0x5bedc01d),
+ TOBN(0xbeec5d00, 0xcaf3155e), TOBN(0x672d66ac, 0xbc73e75f),
+ TOBN(0x86b9d8c6, 0x270b01db), TOBN(0xd249ef83, 0x50f55b79),
+ TOBN(0x6131d6d4, 0x73978fe3), TOBN(0xcc4e4542, 0x754b00a1),
+ TOBN(0x4e05df05, 0x57dfcfe9), TOBN(0x94b29cdd, 0x51ef6bf0),
+ TOBN(0xe4530cff, 0x9bc7edf2), TOBN(0x8ac236fd, 0xd3da65f3),
+ TOBN(0x0faf7d5f, 0xc8eb0b48), TOBN(0x4d2de14c, 0x660eb039),
+ TOBN(0xc006bba7, 0x60430e54), TOBN(0x10a2d0d6, 0xda3289ab),
+ TOBN(0x9c037a5d, 0xd7979c59), TOBN(0x04d1f3d3, 0xa116d944),
+ TOBN(0x9ff22473, 0x8a0983cd), TOBN(0x28e25b38, 0xc883cabb),
+ TOBN(0xe968dba5, 0x47a58995), TOBN(0x2c80b505, 0x774eebdf),
+ TOBN(0xee763b71, 0x4a953beb), TOBN(0x502e223f, 0x1642e7f6),
+ TOBN(0x6fe4b641, 0x61d5e722), TOBN(0x9d37c5b0, 0xdbef5316),
+ TOBN(0x0115ed70, 0xf8330bc7), TOBN(0x139850e6, 0x75a72789),
+ TOBN(0x27d7faec, 0xffceccc2), TOBN(0x3016a860, 0x4fd9f7f6),
+ TOBN(0xc492ec64, 0x4cd8f64c), TOBN(0x58a2d790, 0x279d7b51),
+ TOBN(0x0ced1fc5, 0x1fc75256), TOBN(0x3e658aed, 0x8f433017),
+ TOBN(0x0b61942e, 0x05da59eb), TOBN(0xba3d60a3, 0x0ddc3722),
+ TOBN(0x7c311cd1, 0x742e7f87), TOBN(0x6473ffee, 0xf6b01b6e),}
+ ,
+ {TOBN(0x8303604f, 0x692ac542), TOBN(0xf079ffe1, 0x227b91d3),
+ TOBN(0x19f63e63, 0x15aaf9bd), TOBN(0xf99ee565, 0xf1f344fb),
+ TOBN(0x8a1d661f, 0xd6219199), TOBN(0x8c883bc6, 0xd48ce41c),
+ TOBN(0x1065118f, 0x3c74d904), TOBN(0x713889ee, 0x0faf8b1b),
+ TOBN(0x972b3f8f, 0x81a1b3be), TOBN(0x4f3ce145, 0xce2764a0),
+ TOBN(0xe2d0f1cc, 0x28c4f5f7), TOBN(0xdeee0c0d, 0xc7f3985b),
+ TOBN(0x7df4adc0, 0xd39e25c3), TOBN(0x40619820, 0xc467a080),
+ TOBN(0x440ebc93, 0x61cf5a58), TOBN(0x527729a6, 0x422ad600),
+ TOBN(0xca6c0937, 0xb1b76ba6), TOBN(0x1a2eab85, 0x4d2026dc),
+ TOBN(0xb1715e15, 0x19d9ae0a), TOBN(0xf1ad9199, 0xbac4a026),
+ TOBN(0x35b3dfb8, 0x07ea7b0e), TOBN(0xedf5496f, 0x3ed9eb89),
+ TOBN(0x8932e5ff, 0x2d6d08ab), TOBN(0xf314874e, 0x25bd2731),
+ TOBN(0xefb26a75, 0x3f73f449), TOBN(0x1d1c94f8, 0x8d44fc79),
+ TOBN(0x49f0fbc5, 0x3bc0dc4d), TOBN(0xb747ea0b, 0x3698a0d0),
+ TOBN(0x5218c3fe, 0x228d291e), TOBN(0x35b804b5, 0x43c129d6),
+ TOBN(0xfac859b8, 0xd1acc516), TOBN(0x6c10697d, 0x95d6e668),
+ TOBN(0xc38e438f, 0x0876fd4e), TOBN(0x45f0c307, 0x83d2f383),
+ TOBN(0x203cc2ec, 0xb10934cb), TOBN(0x6a8f2439, 0x2c9d46ee),
+ TOBN(0xf16b431b, 0x65ccde7b), TOBN(0x41e2cd18, 0x27e76a6f),
+ TOBN(0xb9c8cf8f, 0x4e3484d7), TOBN(0x64426efd, 0x8315244a),
+ TOBN(0x1c0a8e44, 0xfc94dea3), TOBN(0x34c8cdbf, 0xdad6a0b0),
+ TOBN(0x919c3840, 0x04113cef), TOBN(0xfd32fba4, 0x15490ffa),
+ TOBN(0x58d190f6, 0x795dcfb7), TOBN(0xfef01b03, 0x83588baf),
+ TOBN(0x9e6d1d63, 0xca1fc1c0), TOBN(0x53173f96, 0xf0a41ac9),
+ TOBN(0x2b1d402a, 0xba16f73b), TOBN(0x2fb31014, 0x8cf9b9fc),
+ TOBN(0x2d51e60e, 0x446ef7bf), TOBN(0xc731021b, 0xb91e1745),
+ TOBN(0x9d3b4724, 0x4fee99d4), TOBN(0x4bca48b6, 0xfac5c1ea),
+ TOBN(0x70f5f514, 0xbbea9af7), TOBN(0x751f55a5, 0x974c283a),
+ TOBN(0x6e30251a, 0xcb452fdb), TOBN(0x31ee6965, 0x50f30650),
+ TOBN(0xb0b3e508, 0x933548d9), TOBN(0xb8949a4f, 0xf4b0ef5b),
+ TOBN(0x208b8326, 0x3c88f3bd), TOBN(0xab147c30, 0xdb1d9989),
+ TOBN(0xed6515fd, 0x44d4df03), TOBN(0x17a12f75, 0xe72eb0c5),
+ TOBN(0x3b59796d, 0x36cf69db), TOBN(0x1219eee9, 0x56670c18),
+ TOBN(0xfe3341f7, 0x7a070d8e), TOBN(0x9b70130b, 0xa327f90c),
+ TOBN(0x36a32462, 0x0ae18e0e), TOBN(0x2021a623, 0x46c0a638),
+ TOBN(0x251b5817, 0xc62eb0d4), TOBN(0x87bfbcdf, 0x4c762293),
+ TOBN(0xf78ab505, 0xcdd61d64), TOBN(0x8c7a53fc, 0xc8c18857),
+ TOBN(0xa653ce6f, 0x16147515), TOBN(0x9c923aa5, 0xea7d52d5),
+ TOBN(0xc24709cb, 0x5c18871f), TOBN(0x7d53bec8, 0x73b3cc74),
+ TOBN(0x59264aff, 0xfdd1d4c4), TOBN(0x5555917e, 0x240da582),
+ TOBN(0xcae8bbda, 0x548f5a0e), TOBN(0x1910eaba, 0x3bbfbbe1),
+ TOBN(0xae579685, 0x7677afc3), TOBN(0x49ea61f1, 0x73ff0b5c),
+ TOBN(0x78655478, 0x4f7c3922), TOBN(0x95d337cd, 0x20c68eef),
+ TOBN(0x68f1e1e5, 0xdf779ab9), TOBN(0x14b491b0, 0xb5cf69a8),
+ TOBN(0x7a6cbbe0, 0x28e3fe89), TOBN(0xe7e1fee4, 0xc5aac0eb),
+ TOBN(0x7f47eda5, 0x697e5140), TOBN(0x4f450137, 0xb454921f),
+ TOBN(0xdb625f84, 0x95cd8185), TOBN(0x74be0ba1, 0xcdb2e583),
+ TOBN(0xaee4fd7c, 0xdd5e6de4), TOBN(0x4251437d, 0xe8101739),
+ TOBN(0x686d72a0, 0xac620366), TOBN(0x4be3fb9c, 0xb6d59344),
+ TOBN(0x6e8b44e7, 0xa1eb75b9), TOBN(0x84e39da3, 0x91a5c10c),
+ TOBN(0x37cc1490, 0xb38f0409), TOBN(0x02951943, 0x2c2ade82),
+ TOBN(0x9b688783, 0x1190a2d8), TOBN(0x25627d14, 0x231182ba),
+ TOBN(0x6eb550aa, 0x658a6d87), TOBN(0x1405aaa7, 0xcf9c7325),
+ TOBN(0xd147142e, 0x5c8748c9), TOBN(0x7f637e4f, 0x53ede0e0),
+ TOBN(0xf8ca2776, 0x14ffad2c), TOBN(0xe58fb1bd, 0xbafb6791),
+ TOBN(0x17158c23, 0xbf8f93fc), TOBN(0x7f15b373, 0x0a4a4655),
+ TOBN(0x39d4add2, 0xd842ca72), TOBN(0xa71e4391, 0x3ed96305),
+ TOBN(0x5bb09cbe, 0x6700be14), TOBN(0x68d69d54, 0xd8befcf6),
+ TOBN(0xa45f5367, 0x37183bcf), TOBN(0x7152b7bb, 0x3370dff7),
+ TOBN(0xcf887baa, 0xbf12525b), TOBN(0xe7ac7bdd, 0xd6d1e3cd),
+ TOBN(0x25914f78, 0x81fdad90), TOBN(0xcf638f56, 0x0d2cf6ab),
+ TOBN(0xb90bc03f, 0xcc054de5), TOBN(0x932811a7, 0x18b06350),
+ TOBN(0x2f00b330, 0x9bbd11ff), TOBN(0x76108a6f, 0xb4044974),
+ TOBN(0x801bb9e0, 0xa851d266), TOBN(0x0dd099be, 0xbf8990c1),
+ TOBN(0x58c5aaaa, 0xabe32986), TOBN(0x0fe9dd2a, 0x50d59c27),
+ TOBN(0x84951ff4, 0x8d307305), TOBN(0x6c23f829, 0x86529b78),
+ TOBN(0x50bb2218, 0x0b136a79), TOBN(0x7e2174de, 0x77a20996),
+ TOBN(0x6f00a4b9, 0xc0bb4da6), TOBN(0x89a25a17, 0xefdde8da),
+ TOBN(0xf728a27e, 0xc11ee01d), TOBN(0xf900553a, 0xe5f10dfb),
+ TOBN(0x189a83c8, 0x02ec893c), TOBN(0x3ca5bdc1, 0x23f66d77),
+ TOBN(0x98781537, 0x97eada9f), TOBN(0x59c50ab3, 0x10256230),
+ TOBN(0x346042d9, 0x323c69b3), TOBN(0x1b715a6d, 0x2c460449),
+ TOBN(0xa41dd476, 0x6ae06e0b), TOBN(0xcdd7888e, 0x9d42e25f),
+ TOBN(0x0f395f74, 0x56b25a20), TOBN(0xeadfe0ae, 0x8700e27e),
+ TOBN(0xb09d52a9, 0x69950093), TOBN(0x3525d9cb, 0x327f8d40),
+ TOBN(0xb8235a94, 0x67df886a), TOBN(0x77e4b0dd, 0x035faec2),
+ TOBN(0x115eb20a, 0x517d7061), TOBN(0x77fe3433, 0x6c2df683),
+ TOBN(0x6870ddc7, 0xcdc6fc67), TOBN(0xb1610588, 0x0b87de83),
+ TOBN(0x343584ca, 0xd9c4ddbe), TOBN(0xb3164f1c, 0x3d754be2),
+ TOBN(0x0731ed3a, 0xc1e6c894), TOBN(0x26327dec, 0x4f6b904c),
+ TOBN(0x9d49c6de, 0x97b5cd32), TOBN(0x40835dae, 0xb5eceecd),
+ TOBN(0xc66350ed, 0xd9ded7fe), TOBN(0x8aeebb5c, 0x7a678804),
+ TOBN(0x51d42fb7, 0x5b8ee9ec), TOBN(0xd7a17bdd, 0x8e3ca118),
+ TOBN(0x40d7511a, 0x2ef4400e), TOBN(0xc48990ac, 0x875a66f4),
+ TOBN(0x8de07d2a, 0x2199e347), TOBN(0xbee75556, 0x2a39e051),
+ TOBN(0x56918786, 0x916e51dc), TOBN(0xeb191313, 0x4a2d89ec),
+ TOBN(0x6679610d, 0x37d341ed), TOBN(0x434fbb41, 0x56d51c2b),
+ TOBN(0xe54b7ee7, 0xd7492dba), TOBN(0xaa33a79a, 0x59021493),
+ TOBN(0x49fc5054, 0xe4bd6d3d), TOBN(0x09540f04, 0x5ab551d0),
+ TOBN(0x8acc9085, 0x4942d3a6), TOBN(0x231af02f, 0x2d28323b),
+ TOBN(0x93458cac, 0x0992c163), TOBN(0x1fef8e71, 0x888e3bb4),
+ TOBN(0x27578da5, 0xbe8c268c), TOBN(0xcc8be792, 0xe805ec00),
+ TOBN(0x29267bae, 0xc61c3855), TOBN(0xebff429d, 0x58c1fd3b),
+ TOBN(0x22d886c0, 0x8c0b93b8), TOBN(0xca5e00b2, 0x2ddb8953),
+ TOBN(0xcf330117, 0xc3fed8b7), TOBN(0xd49ac6fa, 0x819c01f6),
+ TOBN(0x6ddaa6bd, 0x3c0fbd54), TOBN(0x91743068, 0x8049a2cf),
+ TOBN(0xd67f981e, 0xaff2ef81), TOBN(0xc3654d35, 0x2818ae80),
+ TOBN(0x81d05044, 0x1b2aa892), TOBN(0x2db067bf, 0x3d099328),
+ TOBN(0xe7c79e86, 0x703dcc97), TOBN(0xe66f9b37, 0xe133e215),
+ TOBN(0xcdf119a6, 0xe39a7a5c), TOBN(0x47c60de3, 0x876f1b61),
+ TOBN(0x6e405939, 0xd860f1b2), TOBN(0x3e9a1dbc, 0xf5ed4d4a),
+ TOBN(0x3f23619e, 0xc9b6bcbd), TOBN(0x5ee790cf, 0x734e4497),
+ TOBN(0xf0a834b1, 0x5bdaf9bb), TOBN(0x02cedda7, 0x4ca295f0),
+ TOBN(0x4619aa2b, 0xcb8e378c), TOBN(0xe5613244, 0xcc987ea4),
+ TOBN(0x0bc022cc, 0x76b23a50), TOBN(0x4a2793ad, 0x0a6c21ce),
+ TOBN(0x38328780, 0x89cac3f5), TOBN(0x29176f1b, 0xcba26d56),
+ TOBN(0x06296187, 0x4f6f59eb), TOBN(0x86e9bca9, 0x8bdc658e),
+ TOBN(0x2ca9c4d3, 0x57e30402), TOBN(0x5438b216, 0x516a09bb),
+ TOBN(0x0a6a063c, 0x7672765a), TOBN(0x37a3ce64, 0x0547b9bf),
+ TOBN(0x42c099c8, 0x98b1a633), TOBN(0xb5ab800d, 0x05ee6961),
+ TOBN(0xf1963f59, 0x11a5acd6), TOBN(0xbaee6157, 0x46201063),
+ TOBN(0x36d9a649, 0xa596210a), TOBN(0xaed04363, 0x1ba7138c),
+ TOBN(0xcf817d1c, 0xa4a82b76), TOBN(0x5586960e, 0xf3806be9),
+ TOBN(0x7ab67c89, 0x09dc6bb5), TOBN(0x52ace7a0, 0x114fe7eb),
+ TOBN(0xcd987618, 0xcbbc9b70), TOBN(0x4f06fd5a, 0x604ca5e1),
+ TOBN(0x90af14ca, 0x6dbde133), TOBN(0x1afe4322, 0x948a3264),
+ TOBN(0xa70d2ca6, 0xc44b2c6c), TOBN(0xab726799, 0x0ef87dfe),
+ TOBN(0x310f64dc, 0x2e696377), TOBN(0x49b42e68, 0x4c8126a0),
+ TOBN(0x0ea444c3, 0xcea0b176), TOBN(0x53a8ddf7, 0xcb269182),
+ TOBN(0xf3e674eb, 0xbbba9dcb), TOBN(0x0d2878a8, 0xd8669d33),
+ TOBN(0x04b935d5, 0xd019b6a3), TOBN(0xbb5cf88e, 0x406f1e46),
+ TOBN(0xa1912d16, 0x5b57c111), TOBN(0x9803fc21, 0x19ebfd78),
+ TOBN(0x4f231c9e, 0xc07764a9), TOBN(0xd93286ee, 0xb75bd055),
+ TOBN(0x83a9457d, 0x8ee6c9de), TOBN(0x04695915, 0x6087ec90),
+ TOBN(0x14c6dd8a, 0x58d6cd46), TOBN(0x9cb633b5, 0x8e6634d2),
+ TOBN(0xc1305047, 0xf81bc328), TOBN(0x12ede0e2, 0x26a177e5),
+ TOBN(0x332cca62, 0x065a6f4f), TOBN(0xc3a47ecd, 0x67be487b),
+ TOBN(0x741eb187, 0x0f47ed1c), TOBN(0x99e66e58, 0xe7598b14),
+ TOBN(0x6f0544ca, 0x63d0ff12), TOBN(0xe5efc784, 0xb610a05f),
+ TOBN(0xf72917b1, 0x7cad7b47), TOBN(0x3ff6ea20, 0xf2cac0c0),
+ TOBN(0xcc23791b, 0xf21db8b7), TOBN(0x7dac70b1, 0xd7d93565),
+ TOBN(0x682cda1d, 0x694bdaad), TOBN(0xeb88bb8c, 0x1023516d),
+ TOBN(0xc4c634b4, 0xdfdbeb1b), TOBN(0x22f5ca72, 0xb4ee4dea),
+ TOBN(0x1045a368, 0xe6524821), TOBN(0xed9e8a3f, 0x052b18b2),
+ TOBN(0x9b7f2cb1, 0xb961f49a), TOBN(0x7fee2ec1, 0x7b009670),
+ TOBN(0x350d8754, 0x22507a6d), TOBN(0x561bd711, 0x4db55f1d),
+ TOBN(0x4c189ccc, 0x320bbcaf), TOBN(0x568434cf, 0xdf1de48c),
+ TOBN(0x6af1b00e, 0x0fa8f128), TOBN(0xf0ba9d02, 0x8907583c),
+ TOBN(0x735a4004, 0x32ff9f60), TOBN(0x3dd8e4b6, 0xc25dcf33),
+ TOBN(0xf2230f16, 0x42c74cef), TOBN(0xd8117623, 0x013fa8ad),
+ TOBN(0x36822876, 0xf51fe76e), TOBN(0x8a6811cc, 0x11d62589),
+ TOBN(0xc3fc7e65, 0x46225718), TOBN(0xb7df2c9f, 0xc82fdbcd),
+ TOBN(0x3b1d4e52, 0xdd7b205b), TOBN(0xb6959478, 0x47a2e414),
+ TOBN(0x05e4d793, 0xefa91148), TOBN(0xb47ed446, 0xfd2e9675),
+ TOBN(0x1a7098b9, 0x04c9d9bf), TOBN(0x661e2881, 0x1b793048),
+ TOBN(0xb1a16966, 0xb01ee461), TOBN(0xbc521308, 0x2954746f),
+ TOBN(0xc909a0fc, 0x2477de50), TOBN(0xd80bb41c, 0x7dbd51ef),
+ TOBN(0xa85be7ec, 0x53294905), TOBN(0x6d465b18, 0x83958f97),
+ TOBN(0x16f6f330, 0xfb6840fd), TOBN(0xfaaeb214, 0x3401e6c8),
+ TOBN(0xaf83d30f, 0xccb5b4f8), TOBN(0x22885739, 0x266dec4b),
+ TOBN(0x51b4367c, 0x7bc467df), TOBN(0x926562e3, 0xd842d27a),
+ TOBN(0xdfcb6614, 0x0fea14a6), TOBN(0xeb394dae, 0xf2734cd9),
+ TOBN(0x3eeae5d2, 0x11c0be98), TOBN(0xb1e6ed11, 0x814e8165),
+ TOBN(0x191086bc, 0xe52bce1c), TOBN(0x14b74cc6, 0xa75a04da),
+ TOBN(0x63cf1186, 0x8c060985), TOBN(0x071047de, 0x2dbd7f7c),
+ TOBN(0x4e433b8b, 0xce0942ca), TOBN(0xecbac447, 0xd8fec61d),
+ TOBN(0x8f0ed0e2, 0xebf3232f), TOBN(0xfff80f9e, 0xc52a2edd),
+ TOBN(0xad9ab433, 0x75b55fdb), TOBN(0x73ca7820, 0xe42e0c11),
+ TOBN(0x6dace0a0, 0xe6251b46), TOBN(0x89bc6b5c, 0x4c0d932d),
+ TOBN(0x3438cd77, 0x095da19a), TOBN(0x2f24a939, 0x8d48bdfb),
+ TOBN(0x99b47e46, 0x766561b7), TOBN(0x736600e6, 0x0ed0322a),
+ TOBN(0x06a47cb1, 0x638e1865), TOBN(0x927c1c2d, 0xcb136000),
+ TOBN(0x29542337, 0x0cc5df69), TOBN(0x99b37c02, 0x09d649a9),
+ TOBN(0xc5f0043c, 0x6aefdb27), TOBN(0x6cdd9987, 0x1be95c27),
+ TOBN(0x69850931, 0x390420d2), TOBN(0x299c40ac, 0x0983efa4),
+ TOBN(0x3a05e778, 0xaf39aead), TOBN(0x84274408, 0x43a45193),
+ TOBN(0x6bcd0fb9, 0x91a711a0), TOBN(0x461592c8, 0x9f52ab17),
+ TOBN(0xb49302b4, 0xda3c6ed6), TOBN(0xc51fddc7, 0x330d7067),
+ TOBN(0x94babeb6, 0xda50d531), TOBN(0x521b840d, 0xa6a7b9da),
+ TOBN(0x5305151e, 0x404bdc89), TOBN(0x1bcde201, 0xd0d07449),
+ TOBN(0xf427a78b, 0x3b76a59a), TOBN(0xf84841ce, 0x07791a1b),
+ TOBN(0xebd314be, 0xbf91ed1c), TOBN(0x8e61d34c, 0xbf172943),
+ TOBN(0x1d5dc451, 0x5541b892), TOBN(0xb186ee41, 0xfc9d9e54),
+ TOBN(0x9d9f345e, 0xd5bf610d), TOBN(0x3e7ba65d, 0xf6acca9f),
+ TOBN(0x9dda787a, 0xa8369486), TOBN(0x09f9dab7, 0x8eb5ba53),
+ TOBN(0x5afb2033, 0xd6481bc3), TOBN(0x76f4ce30, 0xafa62104),
+ TOBN(0xa8fa00cf, 0xf4f066b5), TOBN(0x89ab5143, 0x461dafc2),
+ TOBN(0x44339ed7, 0xa3389998), TOBN(0x2ff862f1, 0xbc214903),
+ TOBN(0x2c88f985, 0xb05556e3), TOBN(0xcd96058e, 0x3467081e),
+ TOBN(0x7d6a4176, 0xedc637ea), TOBN(0xe1743d09, 0x36a5acdc),
+ TOBN(0x66fd72e2, 0x7eb37726), TOBN(0xf7fa264e, 0x1481a037),
+ TOBN(0x9fbd3bde, 0x45f4aa79), TOBN(0xed1e0147, 0x767c3e22),
+ TOBN(0x7621f979, 0x82e7abe2), TOBN(0x19eedc72, 0x45f633f8),
+ TOBN(0xe69b155e, 0x6137bf3a), TOBN(0xa0ad13ce, 0x414ee94e),
+ TOBN(0x93e3d524, 0x1c0e651a), TOBN(0xab1a6e2a, 0x02ce227e),
+ TOBN(0xe7af1797, 0x4ab27eca), TOBN(0x245446de, 0xbd444f39),
+ TOBN(0x59e22a21, 0x56c07613), TOBN(0x43deafce, 0xf4275498),
+ TOBN(0x10834ccb, 0x67fd0946), TOBN(0xa75841e5, 0x47406edf),
+ TOBN(0xebd6a677, 0x7b0ac93d), TOBN(0xa6e37b0d, 0x78f5e0d7),
+ TOBN(0x2516c096, 0x76f5492b), TOBN(0x1e4bf888, 0x9ac05f3a),
+ TOBN(0xcdb42ce0, 0x4df0ba2b), TOBN(0x935d5cfd, 0x5062341b),
+ TOBN(0x8a303333, 0x82acac20), TOBN(0x429438c4, 0x5198b00e),
+ TOBN(0x1d083bc9, 0x049d33fa), TOBN(0x58b82dda, 0x946f67ff),
+ TOBN(0xac3e2db8, 0x67a1d6a3), TOBN(0x62e6bead, 0x1798aac8),
+ TOBN(0xfc85980f, 0xde46c58c), TOBN(0xa7f69379, 0x69c8d7be),
+ TOBN(0x23557927, 0x837b35ec), TOBN(0x06a933d8, 0xe0790c0c),
+ TOBN(0x827c0e9b, 0x077ff55d), TOBN(0x53977798, 0xbb26e680),
+ TOBN(0x59530874, 0x1d9cb54f), TOBN(0xcca3f449, 0x4aac53ef),
+ TOBN(0x11dc5c87, 0xa07eda0f), TOBN(0xc138bccf, 0xfd6400c8),
+ TOBN(0x549680d3, 0x13e5da72), TOBN(0xc93eed82, 0x4540617e),
+ TOBN(0xfd3db157, 0x4d0b75c0), TOBN(0x9716eb42, 0x6386075b),
+ TOBN(0x0639605c, 0x817b2c16), TOBN(0x09915109, 0xf1e4f201),
+ TOBN(0x35c9a928, 0x5cca6c3b), TOBN(0xb25f7d1a, 0x3505c900),
+ TOBN(0xeb9f7d20, 0x630480c4), TOBN(0xc3c7b8c6, 0x2a1a501c),
+ TOBN(0x3f99183c, 0x5a1f8e24), TOBN(0xfdb118fa, 0x9dd255f0),
+ TOBN(0xb9b18b90, 0xc27f62a6), TOBN(0xe8f732f7, 0x396ec191),
+ TOBN(0x524a2d91, 0x0be786ab), TOBN(0x5d32adef, 0x0ac5a0f5),
+ TOBN(0x9b53d4d6, 0x9725f694), TOBN(0x032a76c6, 0x0510ba89),
+ TOBN(0x840391a3, 0xebeb1544), TOBN(0x44b7b88c, 0x3ed73ac3),
+ TOBN(0xd24bae7a, 0x256cb8b3), TOBN(0x7ceb151a, 0xe394cb12),
+ TOBN(0xbd6b66d0, 0x5bc1e6a8), TOBN(0xec70cecb, 0x090f07bf),
+ TOBN(0x270644ed, 0x7d937589), TOBN(0xee9e1a3d, 0x5f1dccfe),
+ TOBN(0xb0d40a84, 0x745b98d2), TOBN(0xda429a21, 0x2556ed40),
+ TOBN(0xf676eced, 0x85148cb9), TOBN(0x5a22d40c, 0xded18936),
+ TOBN(0x3bc4b9e5, 0x70e8a4ce), TOBN(0xbfd1445b, 0x9eae0379),
+ TOBN(0xf23f2c0c, 0x1a0bd47e), TOBN(0xa9c0bb31, 0xe1845531),
+ TOBN(0x9ddc4d60, 0x0a4c3f6b), TOBN(0xbdfaad79, 0x2c15ef44),
+ TOBN(0xce55a236, 0x7f484acc), TOBN(0x08653ca7, 0x055b1f15),
+ TOBN(0x2efa8724, 0x538873a3), TOBN(0x09299e5d, 0xace1c7e7),
+ TOBN(0x07afab66, 0xade332ba), TOBN(0x9be1fdf6, 0x92dd71b7),
+ TOBN(0xa49b5d59, 0x5758b11c), TOBN(0x0b852893, 0xc8654f40),
+ TOBN(0xb63ef6f4, 0x52379447), TOBN(0xd4957d29, 0x105e690c),
+ TOBN(0x7d484363, 0x646559b0), TOBN(0xf4a8273c, 0x49788a8e),
+ TOBN(0xee406cb8, 0x34ce54a9), TOBN(0x1e1c260f, 0xf86fda9b),
+ TOBN(0xe150e228, 0xcf6a4a81), TOBN(0x1fa3b6a3, 0x1b488772),
+ TOBN(0x1e6ff110, 0xc5a9c15b), TOBN(0xc6133b91, 0x8ad6aa47),
+ TOBN(0x8ac5d55c, 0x9dffa978), TOBN(0xba1d1c1d, 0x5f3965f2),
+ TOBN(0xf969f4e0, 0x7732b52f), TOBN(0xfceecdb5, 0xa5172a07),
+ TOBN(0xb0120a5f, 0x10f2b8f5), TOBN(0xc83a6cdf, 0x5c4c2f63),
+ TOBN(0x4d47a491, 0xf8f9c213), TOBN(0xd9e1cce5, 0xd3f1bbd5),
+ TOBN(0x0d91bc7c, 0xaba7e372), TOBN(0xfcdc74c8, 0xdfd1a2db),
+ TOBN(0x05efa800, 0x374618e5), TOBN(0x11216969, 0x15a7925e),
+ TOBN(0xd4c89823, 0xf6021c5d), TOBN(0x880d5e84, 0xeff14423),
+ TOBN(0x6523bc5a, 0x6dcd1396), TOBN(0xd1acfdfc, 0x113c978b),
+ TOBN(0xb0c164e8, 0xbbb66840), TOBN(0xf7f4301e, 0x72b58459),
+ TOBN(0xc29ad4a6, 0xa638e8ec), TOBN(0xf5ab8961, 0x46b78699),
+ TOBN(0x9dbd7974, 0x0e954750), TOBN(0x0121de88, 0x64f9d2c6),
+ TOBN(0x2e597b42, 0xd985232e), TOBN(0x55b6c3c5, 0x53451777),
+ TOBN(0xbb53e547, 0x519cb9fb), TOBN(0xf134019f, 0x8428600d),
+ TOBN(0x5a473176, 0xe081791a), TOBN(0x2f3e2263, 0x35fb0c08),
+ TOBN(0xb28c3017, 0x73d273b0), TOBN(0xccd21076, 0x7721ef9a),
+ TOBN(0x054cc292, 0xb650dc39), TOBN(0x662246de, 0x6188045e),
+ TOBN(0x904b52fa, 0x6b83c0d1), TOBN(0xa72df267, 0x97e9cd46),
+ TOBN(0x886b43cd, 0x899725e4), TOBN(0x2b651688, 0xd849ff22),
+ TOBN(0x60479b79, 0x02f34533), TOBN(0x5e354c14, 0x0c77c148),
+ TOBN(0xb4bb7581, 0xa8537c78), TOBN(0x188043d7, 0xefe1495f),
+ TOBN(0x9ba12f42, 0x8c1d5026), TOBN(0x2e0c8a26, 0x93d4aaab),
+ TOBN(0xbdba7b8b, 0xaa57c450), TOBN(0x140c9ad6, 0x9bbdafef),
+ TOBN(0x2067aa42, 0x25ac0f18), TOBN(0xf7b1295b, 0x04d1fbf3),
+ TOBN(0x14829111, 0xa4b04824), TOBN(0x2ce3f192, 0x33bd5e91),
+ TOBN(0x9c7a1d55, 0x8f2e1b72), TOBN(0xfe932286, 0x302aa243),
+ TOBN(0x497ca7b4, 0xd4be9554), TOBN(0xb8e821b8, 0xe0547a6e),
+ TOBN(0xfb2838be, 0x67e573e0), TOBN(0x05891db9, 0x4084c44b),
+ TOBN(0x91311373, 0x96c1c2c5), TOBN(0x6aebfa3f, 0xd958444b),
+ TOBN(0xac9cdce9, 0xe56e55c1), TOBN(0x7148ced3, 0x2caa46d0),
+ TOBN(0x2e10c7ef, 0xb61fe8eb), TOBN(0x9fd835da, 0xff97cf4d),}
+ ,
+ {TOBN(0xa36da109, 0x081e9387), TOBN(0xfb9780d7, 0x8c935828),
+ TOBN(0xd5940332, 0xe540b015), TOBN(0xc9d7b51b, 0xe0f466fa),
+ TOBN(0xfaadcd41, 0xd6d9f671), TOBN(0xba6c1e28, 0xb1a2ac17),
+ TOBN(0x066a7833, 0xed201e5f), TOBN(0x19d99719, 0xf90f462b),
+ TOBN(0xf431f462, 0x060b5f61), TOBN(0xa56f46b4, 0x7bd057c2),
+ TOBN(0x348dca6c, 0x47e1bf65), TOBN(0x9a38783e, 0x41bcf1ff),
+ TOBN(0x7a5d33a9, 0xda710718), TOBN(0x5a779987, 0x2e0aeaf6),
+ TOBN(0xca87314d, 0x2d29d187), TOBN(0xfa0edc3e, 0xc687d733),
+ TOBN(0x9df33621, 0x6a31e09b), TOBN(0xde89e44d, 0xc1350e35),
+ TOBN(0x29214871, 0x4ca0cf52), TOBN(0xdf379672, 0x0b88a538),
+ TOBN(0xc92a510a, 0x2591d61b), TOBN(0x79aa87d7, 0x585b447b),
+ TOBN(0xf67db604, 0xe5287f77), TOBN(0x1697c8bf, 0x5efe7a80),
+ TOBN(0x1c894849, 0xcb198ac7), TOBN(0xa884a93d, 0x0f264665),
+ TOBN(0x2da964ef, 0x9b200678), TOBN(0x3c351b87, 0x009834e6),
+ TOBN(0xafb2ef9f, 0xe2c4b44b), TOBN(0x580f6c47, 0x3326790c),
+ TOBN(0xb8480521, 0x0b02264a), TOBN(0x8ba6f9e2, 0x42a194e2),
+ TOBN(0xfc87975f, 0x8fb54738), TOBN(0x35160788, 0x27c3ead3),
+ TOBN(0x834116d2, 0xb74a085a), TOBN(0x53c99a73, 0xa62fe996),
+ TOBN(0x87585be0, 0x5b81c51b), TOBN(0x925bafa8, 0xbe0852b7),
+ TOBN(0x76a4fafd, 0xa84d19a7), TOBN(0x39a45982, 0x585206d4),
+ TOBN(0x499b6ab6, 0x5eb03c0e), TOBN(0xf19b7954, 0x72bc3fde),
+ TOBN(0xa86b5b9c, 0x6e3a80d2), TOBN(0xe4377508, 0x6d42819f),
+ TOBN(0xc1663650, 0xbb3ee8a3), TOBN(0x75eb14fc, 0xb132075f),
+ TOBN(0xa8ccc906, 0x7ad834f6), TOBN(0xea6a2474, 0xe6e92ffd),
+ TOBN(0x9d72fd95, 0x0f8d6758), TOBN(0xcb84e101, 0x408c07dd),
+ TOBN(0xb9114bfd, 0xa5e23221), TOBN(0x358b5fe2, 0xe94e742c),
+ TOBN(0x1c0577ec, 0x95f40e75), TOBN(0xf0155451, 0x3d73f3d6),
+ TOBN(0x9d55cd67, 0xbd1b9b66), TOBN(0x63e86e78, 0xaf8d63c7),
+ TOBN(0x39d934ab, 0xd3c095f1), TOBN(0x04b261be, 0xe4b76d71),
+ TOBN(0x1d2e6970, 0xe73e6984), TOBN(0x879fb23b, 0x5e5fcb11),
+ TOBN(0x11506c72, 0xdfd75490), TOBN(0x3a97d085, 0x61bcf1c1),
+ TOBN(0x43201d82, 0xbf5e7007), TOBN(0x7f0ac52f, 0x798232a7),
+ TOBN(0x2715cbc4, 0x6eb564d4), TOBN(0x8d6c752c, 0x9e570e29),
+ TOBN(0xf80247c8, 0x9ef5fd5d), TOBN(0xc3c66b46, 0xd53eb514),
+ TOBN(0x9666b401, 0x0f87de56), TOBN(0xce62c06f, 0xc6c603b5),
+ TOBN(0xae7b4c60, 0x7e4fc942), TOBN(0x38ac0b77, 0x663a9c19),
+ TOBN(0xcb4d20ee, 0x4b049136), TOBN(0x8b63bf12, 0x356a4613),
+ TOBN(0x1221aef6, 0x70e08128), TOBN(0xe62d8c51, 0x4acb6b16),
+ TOBN(0x71f64a67, 0x379e7896), TOBN(0xb25237a2, 0xcafd7fa5),
+ TOBN(0xf077bd98, 0x3841ba6a), TOBN(0xc4ac0244, 0x3cd16e7e),
+ TOBN(0x548ba869, 0x21fea4ca), TOBN(0xd36d0817, 0xf3dfdac1),
+ TOBN(0x09d8d71f, 0xf4685faf), TOBN(0x8eff66be, 0xc52c459a),
+ TOBN(0x182faee7, 0x0b57235e), TOBN(0xee3c39b1, 0x0106712b),
+ TOBN(0x5107331f, 0xc0fcdcb0), TOBN(0x669fb9dc, 0xa51054ba),
+ TOBN(0xb25101fb, 0x319d7682), TOBN(0xb0293129, 0x0a982fee),
+ TOBN(0x51c1c9b9, 0x0261b344), TOBN(0x0e008c5b, 0xbfd371fa),
+ TOBN(0xd866dd1c, 0x0278ca33), TOBN(0x666f76a6, 0xe5aa53b1),
+ TOBN(0xe5cfb779, 0x6013a2cf), TOBN(0x1d3a1aad, 0xa3521836),
+ TOBN(0xcedd2531, 0x73faa485), TOBN(0xc8ee6c4f, 0xc0a76878),
+ TOBN(0xddbccfc9, 0x2a11667d), TOBN(0x1a418ea9, 0x1c2f695a),
+ TOBN(0xdb11bd92, 0x51f73971), TOBN(0x3e4b3c82, 0xda2ed89f),
+ TOBN(0x9a44f3f4, 0xe73e0319), TOBN(0xd1e3de0f, 0x303431af),
+ TOBN(0x3c5604ff, 0x50f75f9c), TOBN(0x1d8eddf3, 0x7e752b22),
+ TOBN(0x0ef074dd, 0x3c9a1118), TOBN(0xd0ffc172, 0xccb86d7b),
+ TOBN(0xabd1ece3, 0x037d90f2), TOBN(0xe3f307d6, 0x6055856c),
+ TOBN(0x422f9328, 0x7e4c6daf), TOBN(0x902aac66, 0x334879a0),
+ TOBN(0xb6a1e7bf, 0x94cdfade), TOBN(0x6c97e1ed, 0x7fc6d634),
+ TOBN(0x662ad24d, 0xa2fb63f8), TOBN(0xf81be1b9, 0xa5928405),
+ TOBN(0x86d765e4, 0xd14b4206), TOBN(0xbecc2e0e, 0x8fa0db65),
+ TOBN(0xa28838e0, 0xb17fc76c), TOBN(0xe49a602a, 0xe37cf24e),
+ TOBN(0x76b4131a, 0x567193ec), TOBN(0xaf3c305a, 0xe5f6e70b),
+ TOBN(0x9587bd39, 0x031eebdd), TOBN(0x5709def8, 0x71bbe831),
+ TOBN(0x57059983, 0x0eb2b669), TOBN(0x4d80ce1b, 0x875b7029),
+ TOBN(0x838a7da8, 0x0364ac16), TOBN(0x2f431d23, 0xbe1c83ab),
+ TOBN(0xe56812a6, 0xf9294dd3), TOBN(0xb448d01f, 0x9b4b0d77),
+ TOBN(0xf3ae6061, 0x04e8305c), TOBN(0x2bead645, 0x94d8c63e),
+ TOBN(0x0a85434d, 0x84fd8b07), TOBN(0x537b983f, 0xf7a9dee5),
+ TOBN(0xedcc5f18, 0xef55bd85), TOBN(0x2041af62, 0x21c6cf8b),
+ TOBN(0x8e52874c, 0xb940c71e), TOBN(0x211935a9, 0xdb5f4b3a),
+ TOBN(0x94350492, 0x301b1dc3), TOBN(0x33d2646d, 0x29958620),
+ TOBN(0x16b0d64b, 0xef911404), TOBN(0x9d1f25ea, 0x9a3c5ef4),
+ TOBN(0x20f200eb, 0x4a352c78), TOBN(0x43929f2c, 0x4bd0b428),
+ TOBN(0xa5656667, 0xc7196e29), TOBN(0x7992c2f0, 0x9391be48),
+ TOBN(0xaaa97cbd, 0x9ee0cd6e), TOBN(0x51b0310c, 0x3dc8c9bf),
+ TOBN(0x237f8acf, 0xdd9f22cb), TOBN(0xbb1d81a1, 0xb585d584),
+ TOBN(0x8d5d85f5, 0x8c416388), TOBN(0x0d6e5a5a, 0x42fe474f),
+ TOBN(0xe7812766, 0x38235d4e), TOBN(0x1c62bd67, 0x496e3298),
+ TOBN(0x8378660c, 0x3f175bc8), TOBN(0x4d04e189, 0x17afdd4d),
+ TOBN(0x32a81601, 0x85a8068c), TOBN(0xdb58e4e1, 0x92b29a85),
+ TOBN(0xe8a65b86, 0xc70d8a3b), TOBN(0x5f0e6f4e, 0x98a0403b),
+ TOBN(0x08129684, 0x69ed2370), TOBN(0x34dc30bd, 0x0871ee26),
+ TOBN(0x3a5ce948, 0x7c9c5b05), TOBN(0x7d487b80, 0x43a90c87),
+ TOBN(0x4089ba37, 0xdd0e7179), TOBN(0x45f80191, 0xb4041811),
+ TOBN(0x1c3e1058, 0x98747ba5), TOBN(0x98c4e13a, 0x6e1ae592),
+ TOBN(0xd44636e6, 0xe82c9f9e), TOBN(0x711db87c, 0xc33a1043),
+ TOBN(0x6f431263, 0xaa8aec05), TOBN(0x43ff120d, 0x2744a4aa),
+ TOBN(0xd3bd892f, 0xae77779b), TOBN(0xf0fe0cc9, 0x8cdc9f82),
+ TOBN(0xca5f7fe6, 0xf1c5b1bc), TOBN(0xcc63a682, 0x44929a72),
+ TOBN(0xc7eaba0c, 0x09dbe19a), TOBN(0x2f3585ad, 0x6b5c73c2),
+ TOBN(0x8ab8924b, 0x0ae50c30), TOBN(0x17fcd27a, 0x638b30ba),
+ TOBN(0xaf414d34, 0x10b3d5a5), TOBN(0x09c107d2, 0x2a9accf1),
+ TOBN(0x15dac49f, 0x946a6242), TOBN(0xaec3df2a, 0xd707d642),
+ TOBN(0x2c2492b7, 0x3f894ae0), TOBN(0xf59df3e5, 0xb75f18ce),
+ TOBN(0x7cb740d2, 0x8f53cad0), TOBN(0x3eb585fb, 0xc4f01294),
+ TOBN(0x17da0c86, 0x32c7f717), TOBN(0xeb8c795b, 0xaf943f4c),
+ TOBN(0x4ee23fb5, 0xf67c51d2), TOBN(0xef187575, 0x68889949),
+ TOBN(0xa6b4bdb2, 0x0389168b), TOBN(0xc4ecd258, 0xea577d03),
+ TOBN(0x3a63782b, 0x55743082), TOBN(0x6f678f4c, 0xc72f08cd),
+ TOBN(0x553511cf, 0x65e58dd8), TOBN(0xd53b4e3e, 0xd402c0cd),
+ TOBN(0x37de3e29, 0xa037c14c), TOBN(0x86b6c516, 0xc05712aa),
+ TOBN(0x2834da3e, 0xb38dff6f), TOBN(0xbe012c52, 0xea636be8),
+ TOBN(0x292d238c, 0x61dd37f8), TOBN(0x0e54523f, 0x8f8142db),
+ TOBN(0xe31eb436, 0x036a05d8), TOBN(0x83e3cdff, 0x1e93c0ff),
+ TOBN(0x3fd2fe0f, 0x50821ddf), TOBN(0xc8e19b0d, 0xff9eb33b),
+ TOBN(0xc8cc943f, 0xb569a5fe), TOBN(0xad0090d4, 0xd4342d75),
+ TOBN(0x82090b4b, 0xcaeca000), TOBN(0xca39687f, 0x1bd410eb),
+ TOBN(0xe7bb0df7, 0x65959d77), TOBN(0x39d78218, 0x9c964999),
+ TOBN(0xd87f62e8, 0xb2415451), TOBN(0xe5efb774, 0xbed76108),
+ TOBN(0x3ea011a4, 0xe822f0d0), TOBN(0xbc647ad1, 0x5a8704f8),
+ TOBN(0xbb315b35, 0x50c6820f), TOBN(0x863dec3d, 0xb7e76bec),
+ TOBN(0x01ff5d3a, 0xf017bfc7), TOBN(0x20054439, 0x976b8229),
+ TOBN(0x067fca37, 0x0bbd0d3b), TOBN(0xf63dde64, 0x7f5e3d0f),
+ TOBN(0x22dbefb3, 0x2a4c94e9), TOBN(0xafbff0fe, 0x96f8278a),
+ TOBN(0x80aea0b1, 0x3503793d), TOBN(0xb2238029, 0x5f06cd29),
+ TOBN(0x65703e57, 0x8ec3feca), TOBN(0x06c38314, 0x393e7053),
+ TOBN(0xa0b751eb, 0x7c6734c4), TOBN(0xd2e8a435, 0xc59f0f1e),
+ TOBN(0x147d9052, 0x5e9ca895), TOBN(0x2f4dd31e, 0x972072df),
+ TOBN(0xa16fda8e, 0xe6c6755c), TOBN(0xc66826ff, 0xcf196558),
+ TOBN(0x1f1a76a3, 0x0cf43895), TOBN(0xa9d604e0, 0x83c3097b),
+ TOBN(0xe1908309, 0x66390e0e), TOBN(0xa50bf753, 0xb3c85eff),
+ TOBN(0x0696bdde, 0xf6a70251), TOBN(0x548b801b, 0x3c6ab16a),
+ TOBN(0x37fcf704, 0xa4d08762), TOBN(0x090b3def, 0xdff76c4e),
+ TOBN(0x87e8cb89, 0x69cb9158), TOBN(0x44a90744, 0x995ece43),
+ TOBN(0xf85395f4, 0x0ad9fbf5), TOBN(0x49b0f6c5, 0x4fb0c82d),
+ TOBN(0x75d9bc15, 0xadf7cccf), TOBN(0x81a3e5d6, 0xdfa1e1b0),
+ TOBN(0x8c39e444, 0x249bc17e), TOBN(0xf37dccb2, 0x8ea7fd43),
+ TOBN(0xda654873, 0x907fba12), TOBN(0x35daa6da, 0x4a372904),
+ TOBN(0x0564cfc6, 0x6283a6c5), TOBN(0xd09fa4f6, 0x4a9395bf),
+ TOBN(0x688e9ec9, 0xaeb19a36), TOBN(0xd913f1ce, 0xc7bfbfb4),
+ TOBN(0x797b9a3c, 0x61c2faa6), TOBN(0x2f979bec, 0x6a0a9c12),
+ TOBN(0xb5969d0f, 0x359679ec), TOBN(0xebcf523d, 0x079b0460),
+ TOBN(0xfd6b0008, 0x10fab870), TOBN(0x3f2edcda, 0x9373a39c),
+ TOBN(0x0d64f9a7, 0x6f568431), TOBN(0xf848c27c, 0x02f8898c),
+ TOBN(0xf418ade1, 0x260b5bd5), TOBN(0xc1f3e323, 0x6973dee8),
+ TOBN(0x46e9319c, 0x26c185dd), TOBN(0x6d85b7d8, 0x546f0ac4),
+ TOBN(0x427965f2, 0x247f9d57), TOBN(0xb519b636, 0xb0035f48),
+ TOBN(0x6b6163a9, 0xab87d59c), TOBN(0xff9f58c3, 0x39caaa11),
+ TOBN(0x4ac39cde, 0x3177387b), TOBN(0x5f6557c2, 0x873e77f9),
+ TOBN(0x67504006, 0x36a83041), TOBN(0x9b1c96ca, 0x75ef196c),
+ TOBN(0xf34283de, 0xb08c7940), TOBN(0x7ea09644, 0x1128c316),
+ TOBN(0xb510b3b5, 0x6aa39dff), TOBN(0x59b43da2, 0x9f8e4d8c),
+ TOBN(0xa8ce31fd, 0x9e4c4b9f), TOBN(0x0e20be26, 0xc1303c01),
+ TOBN(0x18187182, 0xe8ee47c9), TOBN(0xd9687cdb, 0x7db98101),
+ TOBN(0x7a520e4d, 0xa1e14ff6), TOBN(0x429808ba, 0x8836d572),
+ TOBN(0xa37ca60d, 0x4944b663), TOBN(0xf901f7a9, 0xa3f91ae5),
+ TOBN(0xe4e3e76e, 0x9e36e3b1), TOBN(0x9aa219cf, 0x29d93250),
+ TOBN(0x347fe275, 0x056a2512), TOBN(0xa4d643d9, 0xde65d95c),
+ TOBN(0x9669d396, 0x699fc3ed), TOBN(0xb598dee2, 0xcf8c6bbe),
+ TOBN(0x682ac1e5, 0xdda9e5c6), TOBN(0x4e0d3c72, 0xcaa9fc95),
+ TOBN(0x17faaade, 0x772bea44), TOBN(0x5ef8428c, 0xab0009c8),
+ TOBN(0xcc4ce47a, 0x460ff016), TOBN(0xda6d12bf, 0x725281cb),
+ TOBN(0x44c67848, 0x0223aad2), TOBN(0x6e342afa, 0x36256e28),
+ TOBN(0x1400bb0b, 0x93a37c04), TOBN(0x62b1bc9b, 0xdd10bd96),
+ TOBN(0x7251adeb, 0x0dac46b7), TOBN(0x7d33b92e, 0x7be4ef51),
+ TOBN(0x28b2a94b, 0xe61fa29a), TOBN(0x4b2be13f, 0x06422233),
+ TOBN(0x36d6d062, 0x330d8d37), TOBN(0x5ef80e1e, 0xb28ca005),
+ TOBN(0x174d4699, 0x6d16768e), TOBN(0x9fc4ff6a, 0x628bf217),
+ TOBN(0x77705a94, 0x154e490d), TOBN(0x9d96dd28, 0x8d2d997a),
+ TOBN(0x77e2d9d8, 0xce5d72c4), TOBN(0x9d06c5a4, 0xc11c714f),
+ TOBN(0x02aa5136, 0x79e4a03e), TOBN(0x1386b3c2, 0x030ff28b),
+ TOBN(0xfe82e8a6, 0xfb283f61), TOBN(0x7df203e5, 0xf3abc3fb),
+ TOBN(0xeec7c351, 0x3a4d3622), TOBN(0xf7d17dbf, 0xdf762761),
+ TOBN(0xc3956e44, 0x522055f0), TOBN(0xde3012db, 0x8fa748db),
+ TOBN(0xca9fcb63, 0xbf1dcc14), TOBN(0xa56d9dcf, 0xbe4e2f3a),
+ TOBN(0xb86186b6, 0x8bcec9c2), TOBN(0x7cf24df9, 0x680b9f06),
+ TOBN(0xc46b45ea, 0xc0d29281), TOBN(0xfff42bc5, 0x07b10e12),
+ TOBN(0x12263c40, 0x4d289427), TOBN(0x3d5f1899, 0xb4848ec4),
+ TOBN(0x11f97010, 0xd040800c), TOBN(0xb4c5f529, 0x300feb20),
+ TOBN(0xcc543f8f, 0xde94fdcb), TOBN(0xe96af739, 0xc7c2f05e),
+ TOBN(0xaa5e0036, 0x882692e1), TOBN(0x09c75b68, 0x950d4ae9),
+ TOBN(0x62f63df2, 0xb5932a7a), TOBN(0x2658252e, 0xde0979ad),
+ TOBN(0x2a19343f, 0xb5e69631), TOBN(0x718c7501, 0x525b666b),
+ TOBN(0x26a42d69, 0xea40dc3a), TOBN(0xdc84ad22, 0xaecc018f),
+ TOBN(0x25c36c7b, 0x3270f04a), TOBN(0x46ba6d47, 0x50fa72ed),
+ TOBN(0x6c37d1c5, 0x93e58a8e), TOBN(0xa2394731, 0x120c088c),
+ TOBN(0xc3be4263, 0xcb6e86da), TOBN(0x2c417d36, 0x7126d038),
+ TOBN(0x5b70f9c5, 0x8b6f8efa), TOBN(0x671a2faa, 0x37718536),
+ TOBN(0xd3ced3c6, 0xb539c92b), TOBN(0xe56f1bd9, 0xa31203c2),
+ TOBN(0x8b096ec4, 0x9ff3c8eb), TOBN(0x2deae432, 0x43491cea),
+ TOBN(0x2465c6eb, 0x17943794), TOBN(0x5d267e66, 0x20586843),
+ TOBN(0x9d3d116d, 0xb07159d0), TOBN(0xae07a67f, 0xc1896210),
+ TOBN(0x8fc84d87, 0xbb961579), TOBN(0x30009e49, 0x1c1f8dd6),
+ TOBN(0x8a8caf22, 0xe3132819), TOBN(0xcffa197c, 0xf23ab4ff),
+ TOBN(0x58103a44, 0x205dd687), TOBN(0x57b796c3, 0x0ded67a2),
+ TOBN(0x0b9c3a6c, 0xa1779ad7), TOBN(0xa33cfe2e, 0x357c09c5),
+ TOBN(0x2ea29315, 0x3db4a57e), TOBN(0x91959695, 0x8ebeb52e),
+ TOBN(0x118db9a6, 0xe546c879), TOBN(0x8e996df4, 0x6295c8d6),
+ TOBN(0xdd990484, 0x55ec806b), TOBN(0x24f291ca, 0x165c1035),
+ TOBN(0xcca523bb, 0x440e2229), TOBN(0x324673a2, 0x73ef4d04),
+ TOBN(0xaf3adf34, 0x3e11ec39), TOBN(0x6136d7f1, 0xdc5968d3),
+ TOBN(0x7a7b2899, 0xb053a927), TOBN(0x3eaa2661, 0xae067ecd),
+ TOBN(0x8549b9c8, 0x02779cd9), TOBN(0x061d7940, 0xc53385ea),
+ TOBN(0x3e0ba883, 0xf06d18bd), TOBN(0x4ba6de53, 0xb2700843),
+ TOBN(0xb966b668, 0x591a9e4d), TOBN(0x93f67567, 0x7f4fa0ed),
+ TOBN(0x5a02711b, 0x4347237b), TOBN(0xbc041e2f, 0xe794608e),
+ TOBN(0x55af10f5, 0x70f73d8c), TOBN(0xd2d4d4f7, 0xbb7564f7),
+ TOBN(0xd7d27a89, 0xb3e93ce7), TOBN(0xf7b5a875, 0x5d3a2c1b),
+ TOBN(0xb29e68a0, 0x255b218a), TOBN(0xb533837e, 0x8af76754),
+ TOBN(0xd1b05a73, 0x579fab2e), TOBN(0xb41055a1, 0xecd74385),
+ TOBN(0xb2369274, 0x445e9115), TOBN(0x2972a7c4, 0xf520274e),
+ TOBN(0x6c08334e, 0xf678e68a), TOBN(0x4e4160f0, 0x99b057ed),
+ TOBN(0x3cfe11b8, 0x52ccb69a), TOBN(0x2fd1823a, 0x21c8f772),
+ TOBN(0xdf7f072f, 0x3298f055), TOBN(0x8c0566f9, 0xfec74a6e),
+ TOBN(0xe549e019, 0x5bb4d041), TOBN(0x7c3930ba, 0x9208d850),
+ TOBN(0xe07141fc, 0xaaa2902b), TOBN(0x539ad799, 0xe4f69ad3),
+ TOBN(0xa6453f94, 0x813f9ffd), TOBN(0xc58d3c48, 0x375bc2f7),
+ TOBN(0xb3326fad, 0x5dc64e96), TOBN(0x3aafcaa9, 0xb240e354),
+ TOBN(0x1d1b0903, 0xaca1e7a9), TOBN(0x4ceb9767, 0x1211b8a0),
+ TOBN(0xeca83e49, 0xe32a858e), TOBN(0x4c32892e, 0xae907bad),
+ TOBN(0xd5b42ab6, 0x2eb9b494), TOBN(0x7fde3ee2, 0x1eabae1b),
+ TOBN(0x13b5ab09, 0xcaf54957), TOBN(0xbfb028be, 0xe5f5d5d5),
+ TOBN(0x928a0650, 0x2003e2c0), TOBN(0x90793aac, 0x67476843),
+ TOBN(0x5e942e79, 0xc81710a0), TOBN(0x557e4a36, 0x27ccadd4),
+ TOBN(0x72a2bc56, 0x4bcf6d0c), TOBN(0x09ee5f43, 0x26d7b80c),
+ TOBN(0x6b70dbe9, 0xd4292f19), TOBN(0x56f74c26, 0x63f16b18),
+ TOBN(0xc23db0f7, 0x35fbb42a), TOBN(0xb606bdf6, 0x6ae10040),
+ TOBN(0x1eb15d4d, 0x044573ac), TOBN(0x7dc3cf86, 0x556b0ba4),
+ TOBN(0x97af9a33, 0xc60df6f7), TOBN(0x0b1ef85c, 0xa716ce8c),
+ TOBN(0x2922f884, 0xc96958be), TOBN(0x7c32fa94, 0x35690963),
+ TOBN(0x2d7f667c, 0xeaa00061), TOBN(0xeaaf7c17, 0x3547365c),
+ TOBN(0x1eb4de46, 0x87032d58), TOBN(0xc54f3d83, 0x5e2c79e0),
+ TOBN(0x07818df4, 0x5d04ef23), TOBN(0x55faa9c8, 0x673d41b4),
+ TOBN(0xced64f6f, 0x89b95355), TOBN(0x4860d2ea, 0xb7415c84),
+ TOBN(0x5fdb9bd2, 0x050ebad3), TOBN(0xdb53e0cc, 0x6685a5bf),
+ TOBN(0xb830c031, 0x9feb6593), TOBN(0xdd87f310, 0x6accff17),
+ TOBN(0x2303ebab, 0x9f555c10), TOBN(0x94603695, 0x287e7065),
+ TOBN(0xf88311c3, 0x2e83358c), TOBN(0x508dd9b4, 0xeefb0178),
+ TOBN(0x7ca23706, 0x2dba8652), TOBN(0x62aac5a3, 0x0047abe5),
+ TOBN(0x9a61d2a0, 0x8b1ea7b3), TOBN(0xd495ab63, 0xae8b1485),
+ TOBN(0x38740f84, 0x87052f99), TOBN(0x178ebe5b, 0xb2974eea),
+ TOBN(0x030bbcca, 0x5b36d17f), TOBN(0xb5e4cce3, 0xaaf86eea),
+ TOBN(0xb51a0220, 0x68f8e9e0), TOBN(0xa4348796, 0x09eb3e75),
+ TOBN(0xbe592309, 0xeef1a752), TOBN(0x5d7162d7, 0x6f2aa1ed),
+ TOBN(0xaebfb5ed, 0x0f007dd2), TOBN(0x255e14b2, 0xc89edd22),
+ TOBN(0xba85e072, 0x0303b697), TOBN(0xc5d17e25, 0xf05720ff),
+ TOBN(0x02b58d6e, 0x5128ebb6), TOBN(0x2c80242d, 0xd754e113),
+ TOBN(0x919fca5f, 0xabfae1ca), TOBN(0x937afaac, 0x1a21459b),
+ TOBN(0x9e0ca91c, 0x1f66a4d2), TOBN(0x194cc7f3, 0x23ec1331),
+ TOBN(0xad25143a, 0x8aa11690), TOBN(0xbe40ad8d, 0x09b59e08),
+ TOBN(0x37d60d9b, 0xe750860a), TOBN(0x6c53b008, 0xc6bf434c),
+ TOBN(0xb572415d, 0x1356eb80), TOBN(0xb8bf9da3, 0x9578ded8),
+ TOBN(0x22658e36, 0x5e8fb38b), TOBN(0x9b70ce22, 0x5af8cb22),
+ TOBN(0x7c00018a, 0x829a8180), TOBN(0x84329f93, 0xb81ed295),
+ TOBN(0x7c343ea2, 0x5f3cea83), TOBN(0x38f8655f, 0x67586536),
+ TOBN(0xa661a0d0, 0x1d3ec517), TOBN(0x98744652, 0x512321ae),
+ TOBN(0x084ca591, 0xeca92598), TOBN(0xa9bb9dc9, 0x1dcb3feb),
+ TOBN(0x14c54355, 0x78b4c240), TOBN(0x5ed62a3b, 0x610cafdc),
+ TOBN(0x07512f37, 0x1b38846b), TOBN(0x571bb70a, 0xb0e38161),
+ TOBN(0xb556b95b, 0x2da705d2), TOBN(0x3ef8ada6, 0xb1a08f98),
+ TOBN(0x85302ca7, 0xddecfbe5), TOBN(0x0e530573, 0x943105cd),
+ TOBN(0x60554d55, 0x21a9255d), TOBN(0x63a32fa1, 0xf2f3802a),
+ TOBN(0x35c8c5b0, 0xcd477875), TOBN(0x97f458ea, 0x6ad42da1),
+ TOBN(0x832d7080, 0xeb6b242d), TOBN(0xd30bd023, 0x3b71e246),
+ TOBN(0x7027991b, 0xbe31139d), TOBN(0x68797e91, 0x462e4e53),
+ TOBN(0x423fe20a, 0x6b4e185a), TOBN(0x82f2c67e, 0x42d9b707),
+ TOBN(0x25c81768, 0x4cf7811b), TOBN(0xbd53005e, 0x045bb95d),}
+ ,
+ {TOBN(0xe5f649be, 0x9d8e68fd), TOBN(0xdb0f0533, 0x1b044320),
+ TOBN(0xf6fde9b3, 0xe0c33398), TOBN(0x92f4209b, 0x66c8cfae),
+ TOBN(0xe9d1afcc, 0x1a739d4b), TOBN(0x09aea75f, 0xa28ab8de),
+ TOBN(0x14375fb5, 0xeac6f1d0), TOBN(0x6420b560, 0x708f7aa5),
+ TOBN(0x9eae499c, 0x6254dc41), TOBN(0x7e293924, 0x7a837e7e),
+ TOBN(0x74aec08c, 0x090524a7), TOBN(0xf82b9219, 0x8d6f55f2),
+ TOBN(0x493c962e, 0x1402cec5), TOBN(0x9f17ca17, 0xfa2f30e7),
+ TOBN(0xbcd783e8, 0xe9b879cb), TOBN(0xea3d8c14, 0x5a6f145f),
+ TOBN(0xdede15e7, 0x5e0dee6e), TOBN(0x74f24872, 0xdc628aa2),
+ TOBN(0xd3e9c4fe, 0x7861bb93), TOBN(0x56d4822a, 0x6187b2e0),
+ TOBN(0xb66417cf, 0xc59826f9), TOBN(0xca260969, 0x2408169e),
+ TOBN(0xedf69d06, 0xc79ef885), TOBN(0x00031f8a, 0xdc7d138f),
+ TOBN(0x103c46e6, 0x0ebcf726), TOBN(0x4482b831, 0x6231470e),
+ TOBN(0x6f6dfaca, 0x487c2109), TOBN(0x2e0ace97, 0x62e666ef),
+ TOBN(0x3246a9d3, 0x1f8d1f42), TOBN(0x1b1e83f1, 0x574944d2),
+ TOBN(0x13dfa63a, 0xa57f334b), TOBN(0x0cf8daed, 0x9f025d81),
+ TOBN(0x30d78ea8, 0x00ee11c1), TOBN(0xeb053cd4, 0xb5e3dd75),
+ TOBN(0x9b65b13e, 0xd58c43c5), TOBN(0xc3ad49bd, 0xbd151663),
+ TOBN(0x99fd8e41, 0xb6427990), TOBN(0x12cf15bd, 0x707eae1e),
+ TOBN(0x29ad4f1b, 0x1aabb71e), TOBN(0x5143e74d, 0x07545d0e),
+ TOBN(0x30266336, 0xc88bdee1), TOBN(0x25f29306, 0x5876767c),
+ TOBN(0x9c078571, 0xc6731996), TOBN(0xc88690b2, 0xed552951),
+ TOBN(0x274f2c2d, 0x852705b4), TOBN(0xb0bf8d44, 0x4e09552d),
+ TOBN(0x7628beeb, 0x986575d1), TOBN(0x407be238, 0x7f864651),
+ TOBN(0x0e5e3049, 0xa639fc6b), TOBN(0xe75c35d9, 0x86003625),
+ TOBN(0x0cf35bd8, 0x5dcc1646), TOBN(0x8bcaced2, 0x6c26273a),
+ TOBN(0xe22ecf1d, 0xb5536742), TOBN(0x013dd897, 0x1a9e068b),
+ TOBN(0x17f411cb, 0x8a7909c5), TOBN(0x5757ac98, 0x861dd506),
+ TOBN(0x85de1f0d, 0x1e935abb), TOBN(0xdefd10b4, 0x154de37a),
+ TOBN(0xb8d9e392, 0x369cebb5), TOBN(0x54d5ef9b, 0x761324be),
+ TOBN(0x4d6341ba, 0x74f17e26), TOBN(0xc0a0e3c8, 0x78c1dde4),
+ TOBN(0xa6d77581, 0x87d918fd), TOBN(0x66876015, 0x02ca3a13),
+ TOBN(0xc7313e9c, 0xf36658f0), TOBN(0xc433ef1c, 0x71f8057e),
+ TOBN(0x85326246, 0x1b6a835a), TOBN(0xc8f05398, 0x7c86394c),
+ TOBN(0xff398cdf, 0xe983c4a1), TOBN(0xbf5e8162, 0x03b7b931),
+ TOBN(0x93193c46, 0xb7b9045b), TOBN(0x1e4ebf5d, 0xa4a6e46b),
+ TOBN(0xf9942a60, 0x43a24fe7), TOBN(0x29c1191e, 0xffb3492b),
+ TOBN(0x9f662449, 0x902fde05), TOBN(0xc792a7ac, 0x6713c32d),
+ TOBN(0x2fd88ad8, 0xb737982c), TOBN(0x7e3a0319, 0xa21e60e3),
+ TOBN(0x09b0de44, 0x7383591a), TOBN(0x6df141ee, 0x8310a456),
+ TOBN(0xaec1a039, 0xe6d6f471), TOBN(0x14b2ba0f, 0x1198d12e),
+ TOBN(0xebc1a160, 0x3aeee5ac), TOBN(0x401f4836, 0xe0b964ce),
+ TOBN(0x2ee43796, 0x4fd03f66), TOBN(0x3fdb4e49, 0xdd8f3f12),
+ TOBN(0x6ef267f6, 0x29380f18), TOBN(0x3e8e9670, 0x8da64d16),
+ TOBN(0xbc19180c, 0x207674f1), TOBN(0x112e09a7, 0x33ae8fdb),
+ TOBN(0x99667554, 0x6aaeb71e), TOBN(0x79432af1, 0xe101b1c7),
+ TOBN(0xd5eb558f, 0xde2ddec6), TOBN(0x81392d1f, 0x5357753f),
+ TOBN(0xa7a76b97, 0x3ae1158a), TOBN(0x416fbbff, 0x4a899991),
+ TOBN(0x9e65fdfd, 0x0d4a9dcf), TOBN(0x7bc29e48, 0x944ddf12),
+ TOBN(0xbc1a92d9, 0x3c856866), TOBN(0x273c6905, 0x6e98dfe2),
+ TOBN(0x69fce418, 0xcdfaa6b8), TOBN(0x606bd823, 0x5061c69f),
+ TOBN(0x42d495a0, 0x6af75e27), TOBN(0x8ed3d505, 0x6d873a1f),
+ TOBN(0xaf552841, 0x6ab25b6a), TOBN(0xc6c0ffc7, 0x2b1a4523),
+ TOBN(0xab18827b, 0x21c99e03), TOBN(0x060e8648, 0x9034691b),
+ TOBN(0x5207f90f, 0x93c7f398), TOBN(0x9f4a96cb, 0x82f8d10b),
+ TOBN(0xdd71cd79, 0x3ad0f9e3), TOBN(0x84f435d2, 0xfc3a54f5),
+ TOBN(0x4b03c55b, 0x8e33787f), TOBN(0xef42f975, 0xa6384673),
+ TOBN(0xff7304f7, 0x5051b9f0), TOBN(0x18aca1dc, 0x741c87c2),
+ TOBN(0x56f120a7, 0x2d4bfe80), TOBN(0xfd823b3d, 0x053e732c),
+ TOBN(0x11bccfe4, 0x7537ca16), TOBN(0xdf6c9c74, 0x1b5a996b),
+ TOBN(0xee7332c7, 0x904fc3fa), TOBN(0x14a23f45, 0xc7e3636a),
+ TOBN(0xc38659c3, 0xf091d9aa), TOBN(0x4a995e5d, 0xb12d8540),
+ TOBN(0x20a53bec, 0xf3a5598a), TOBN(0x56534b17, 0xb1eaa995),
+ TOBN(0x9ed3dca4, 0xbf04e03c), TOBN(0x716c563a, 0xd8d56268),
+ TOBN(0x27ba77a4, 0x1d6178e7), TOBN(0xe4c80c40, 0x68a1ff8e),
+ TOBN(0x75011099, 0x0a13f63d), TOBN(0x7bf33521, 0xa61d46f3),
+ TOBN(0x0aff218e, 0x10b365bb), TOBN(0x81021804, 0x0fd7ea75),
+ TOBN(0x05a3fd8a, 0xa4b3a925), TOBN(0xb829e75f, 0x9b3db4e6),
+ TOBN(0x6bdc75a5, 0x4d53e5fb), TOBN(0x04a5dc02, 0xd52717e3),
+ TOBN(0x86af502f, 0xe9a42ec2), TOBN(0x8867e8fb, 0x2630e382),
+ TOBN(0xbf845c6e, 0xbec9889b), TOBN(0x54f491f2, 0xcb47c98d),
+ TOBN(0xa3091fba, 0x790c2a12), TOBN(0xd7f6fd78, 0xc20f708b),
+ TOBN(0xa569ac30, 0xacde5e17), TOBN(0xd0f996d0, 0x6852b4d7),
+ TOBN(0xe51d4bb5, 0x4609ae54), TOBN(0x3fa37d17, 0x0daed061),
+ TOBN(0x62a88684, 0x34b8fb41), TOBN(0x99a2acbd, 0x9efb64f1),
+ TOBN(0xb75c1a5e, 0x6448e1f2), TOBN(0xfa99951a, 0x42b5a069),
+ TOBN(0x6d956e89, 0x2f3b26e7), TOBN(0xf4709860, 0xda875247),
+ TOBN(0x3ad15179, 0x2482dda3), TOBN(0xd64110e3, 0x017d82f0),
+ TOBN(0x14928d2c, 0xfad414e4), TOBN(0x2b155f58, 0x2ed02b24),
+ TOBN(0x481a141b, 0xcb821bf1), TOBN(0x12e3c770, 0x4f81f5da),
+ TOBN(0xe49c5de5, 0x9fff8381), TOBN(0x11053232, 0x5bbec894),
+ TOBN(0xa0d051cc, 0x454d88c4), TOBN(0x4f6db89c, 0x1f8e531b),
+ TOBN(0x34fe3fd6, 0xca563a44), TOBN(0x7f5c2215, 0x58da8ab9),
+ TOBN(0x8445016d, 0x9474f0a1), TOBN(0x17d34d61, 0xcb7d8a0a),
+ TOBN(0x8e9d3910, 0x1c474019), TOBN(0xcaff2629, 0xd52ceefb),
+ TOBN(0xf9cf3e32, 0xc1622c2b), TOBN(0xd4b95e3c, 0xe9071a05),
+ TOBN(0xfbbca61f, 0x1594438c), TOBN(0x1eb6e6a6, 0x04aadedf),
+ TOBN(0x853027f4, 0x68e14940), TOBN(0x221d322a, 0xdfabda9c),
+ TOBN(0xed8ea9f6, 0xb7cb179a), TOBN(0xdc7b764d, 0xb7934dcc),
+ TOBN(0xfcb13940, 0x5e09180d), TOBN(0x6629a6bf, 0xb47dc2dd),
+ TOBN(0xbfc55e4e, 0x9f5a915e), TOBN(0xb1db9d37, 0x6204441e),
+ TOBN(0xf82d68cf, 0x930c5f53), TOBN(0x17d3a142, 0xcbb605b1),
+ TOBN(0xdd5944ea, 0x308780f2), TOBN(0xdc8de761, 0x3845f5e4),
+ TOBN(0x6beaba7d, 0x7624d7a3), TOBN(0x1e709afd, 0x304df11e),
+ TOBN(0x95364376, 0x02170456), TOBN(0xbf204b3a, 0xc8f94b64),
+ TOBN(0x4e53af7c, 0x5680ca68), TOBN(0x0526074a, 0xe0c67574),
+ TOBN(0x95d8cef8, 0xecd92af6), TOBN(0xe6b9fa7a, 0x6cd1745a),
+ TOBN(0x3d546d3d, 0xa325c3e4), TOBN(0x1f57691d, 0x9ae93aae),
+ TOBN(0xe891f3fe, 0x9d2e1a33), TOBN(0xd430093f, 0xac063d35),
+ TOBN(0xeda59b12, 0x5513a327), TOBN(0xdc2134f3, 0x5536f18f),
+ TOBN(0xaa51fe2c, 0x5c210286), TOBN(0x3f68aaee, 0x1cab658c),
+ TOBN(0x5a23a00b, 0xf9357292), TOBN(0x9a626f39, 0x7efdabed),
+ TOBN(0xfe2b3bf3, 0x199d78e3), TOBN(0xb7a2af77, 0x71bbc345),
+ TOBN(0x3d19827a, 0x1e59802c), TOBN(0x823bbc15, 0xb487a51c),
+ TOBN(0x856139f2, 0x99d0a422), TOBN(0x9ac3df65, 0xf456c6fb),
+ TOBN(0xaddf65c6, 0x701f8bd6), TOBN(0x149f321e, 0x3758df87),
+ TOBN(0xb1ecf714, 0x721b7eba), TOBN(0xe17df098, 0x31a3312a),
+ TOBN(0xdb2fd6ec, 0xd5c4d581), TOBN(0xfd02996f, 0x8fcea1b3),
+ TOBN(0xe29fa63e, 0x7882f14f), TOBN(0xc9f6dc35, 0x07c6cadc),
+ TOBN(0x46f22d6f, 0xb882bed0), TOBN(0x1a45755b, 0xd118e52c),
+ TOBN(0x9f2c7c27, 0x7c4608cf), TOBN(0x7ccbdf32, 0x568012c2),
+ TOBN(0xfcb0aedd, 0x61729b0e), TOBN(0x7ca2ca9e, 0xf7d75dbf),
+ TOBN(0xf58fecb1, 0x6f640f62), TOBN(0xe274b92b, 0x39f51946),
+ TOBN(0x7f4dfc04, 0x6288af44), TOBN(0x0a91f32a, 0xeac329e5),
+ TOBN(0x43ad274b, 0xd6aaba31), TOBN(0x719a1640, 0x0f6884f9),
+ TOBN(0x685d29f6, 0xdaf91e20), TOBN(0x5ec1cc33, 0x27e49d52),
+ TOBN(0x38f4de96, 0x3b54a059), TOBN(0x0e0015e5, 0xefbcfdb3),
+ TOBN(0x177d23d9, 0x4dbb8da6), TOBN(0x98724aa2, 0x97a617ad),
+ TOBN(0x30f0885b, 0xfdb6558e), TOBN(0xf9f7a28a, 0xc7899a96),
+ TOBN(0xd2ae8ac8, 0x872dc112), TOBN(0xfa0642ca, 0x73c3c459),
+ TOBN(0x15296981, 0xe7dfc8d6), TOBN(0x67cd4450, 0x1fb5b94a),
+ TOBN(0x0ec71cf1, 0x0eddfd37), TOBN(0xc7e5eeb3, 0x9a8eddc7),
+ TOBN(0x02ac8e3d, 0x81d95028), TOBN(0x0088f172, 0x70b0e35d),
+ TOBN(0xec041fab, 0xe1881fe3), TOBN(0x62cf71b8, 0xd99e7faa),
+ TOBN(0x5043dea7, 0xe0f222c2), TOBN(0x309d42ac, 0x72e65142),
+ TOBN(0x94fe9ddd, 0x9216cd30), TOBN(0xd6539c7d, 0x0f87feec),
+ TOBN(0x03c5a57c, 0x432ac7d7), TOBN(0x72692cf0, 0x327fda10),
+ TOBN(0xec28c85f, 0x280698de), TOBN(0x2331fb46, 0x7ec283b1),
+ TOBN(0xd34bfa32, 0x2867e633), TOBN(0x78709a82, 0x0a9cc815),
+ TOBN(0xb7fe6964, 0x875e2fa5), TOBN(0x25cc064f, 0x9e98bfb5),
+ TOBN(0x9eb0151c, 0x493a65c5), TOBN(0x5fb5d941, 0x53182464),
+ TOBN(0x69e6f130, 0xf04618e2), TOBN(0xa8ecec22, 0xf89c8ab6),
+ TOBN(0xcd6ac88b, 0xb96209bd), TOBN(0x65fa8cdb, 0xb3e1c9e0),
+ TOBN(0xa47d22f5, 0x4a8d8eac), TOBN(0x83895cdf, 0x8d33f963),
+ TOBN(0xa8adca59, 0xb56cd3d1), TOBN(0x10c8350b, 0xdaf38232),
+ TOBN(0x2b161fb3, 0xa5080a9f), TOBN(0xbe7f5c64, 0x3af65b3a),
+ TOBN(0x2c754039, 0x97403a11), TOBN(0x94626cf7, 0x121b96af),
+ TOBN(0x431de7c4, 0x6a983ec2), TOBN(0x3780dd3a, 0x52cc3df7),
+ TOBN(0xe28a0e46, 0x2baf8e3b), TOBN(0xabe68aad, 0x51d299ae),
+ TOBN(0x603eb8f9, 0x647a2408), TOBN(0x14c61ed6, 0x5c750981),
+ TOBN(0x88b34414, 0xc53352e7), TOBN(0x5a34889c, 0x1337d46e),
+ TOBN(0x612c1560, 0xf95f2bc8), TOBN(0x8a3f8441, 0xd4807a3a),
+ TOBN(0x680d9e97, 0x5224da68), TOBN(0x60cd6e88, 0xc3eb00e9),
+ TOBN(0x3875a98e, 0x9a6bc375), TOBN(0xdc80f924, 0x4fd554c2),
+ TOBN(0x6c4b3415, 0x6ac77407), TOBN(0xa1e5ea8f, 0x25420681),
+ TOBN(0x541bfa14, 0x4607a458), TOBN(0x5dbc7e7a, 0x96d7fbf9),
+ TOBN(0x646a851b, 0x31590a47), TOBN(0x039e85ba, 0x15ee6df8),
+ TOBN(0xd19fa231, 0xd7b43fc0), TOBN(0x84bc8be8, 0x299a0e04),
+ TOBN(0x2b9d2936, 0xf20df03a), TOBN(0x24054382, 0x8608d472),
+ TOBN(0x76b6ba04, 0x9149202a), TOBN(0xb21c3831, 0x3670e7b7),
+ TOBN(0xddd93059, 0xd6fdee10), TOBN(0x9da47ad3, 0x78488e71),
+ TOBN(0x99cc1dfd, 0xa0fcfb25), TOBN(0x42abde10, 0x64696954),
+ TOBN(0x14cc15fc, 0x17eab9fe), TOBN(0xd6e863e4, 0xd3e70972),
+ TOBN(0x29a7765c, 0x6432112c), TOBN(0x88660001, 0x5b0774d8),
+ TOBN(0x3729175a, 0x2c088eae), TOBN(0x13afbcae, 0x8230b8d4),
+ TOBN(0x44768151, 0x915f4379), TOBN(0xf086431a, 0xd8d22812),
+ TOBN(0x37461955, 0xc298b974), TOBN(0x905fb5f0, 0xf8711e04),
+ TOBN(0x787abf3a, 0xfe969d18), TOBN(0x392167c2, 0x6f6a494e),
+ TOBN(0xfc7a0d2d, 0x28c511da), TOBN(0xf127c7dc, 0xb66a262d),
+ TOBN(0xf9c4bb95, 0xfd63fdf0), TOBN(0x90016589, 0x3913ef46),
+ TOBN(0x74d2a73c, 0x11aa600d), TOBN(0x2f5379bd, 0x9fb5ab52),
+ TOBN(0xe49e53a4, 0x7fb70068), TOBN(0x68dd39e5, 0x404aa9a7),
+ TOBN(0xb9b0cf57, 0x2ecaa9c3), TOBN(0xba0e103b, 0xe824826b),
+ TOBN(0x60c2198b, 0x4631a3c4), TOBN(0xc5ff84ab, 0xfa8966a2),
+ TOBN(0x2d6ebe22, 0xac95aff8), TOBN(0x1c9bb6db, 0xb5a46d09),
+ TOBN(0x419062da, 0x53ee4f8d), TOBN(0x7b9042d0, 0xbb97efef),
+ TOBN(0x0f87f080, 0x830cf6bd), TOBN(0x4861d19a, 0x6ec8a6c6),
+ TOBN(0xd3a0daa1, 0x202f01aa), TOBN(0xb0111674, 0xf25afbd5),
+ TOBN(0x6d00d6cf, 0x1afb20d9), TOBN(0x13695000, 0x40671bc5),
+ TOBN(0x913ab0dc, 0x2485ea9b), TOBN(0x1f2bed06, 0x9eef61ac),
+ TOBN(0x850c8217, 0x6d799e20), TOBN(0x93415f37, 0x3271c2de),
+ TOBN(0x5afb06e9, 0x6c4f5910), TOBN(0x688a52df, 0xc4e9e421),
+ TOBN(0x30495ba3, 0xe2a9a6db), TOBN(0x4601303d, 0x58f9268b),
+ TOBN(0xbe3b0dad, 0x7eb0f04f), TOBN(0x4ea47250, 0x4456936d),
+ TOBN(0x8caf8798, 0xd33fd3e7), TOBN(0x1ccd8a89, 0xeb433708),
+ TOBN(0x9effe3e8, 0x87fd50ad), TOBN(0xbe240a56, 0x6b29c4df),
+ TOBN(0xec4ffd98, 0xca0e7ebd), TOBN(0xf586783a, 0xe748616e),
+ TOBN(0xa5b00d8f, 0xc77baa99), TOBN(0x0acada29, 0xb4f34c9c),
+ TOBN(0x36dad67d, 0x0fe723ac), TOBN(0x1d8e53a5, 0x39c36c1e),
+ TOBN(0xe4dd342d, 0x1f4bea41), TOBN(0x64fd5e35, 0xebc9e4e0),
+ TOBN(0x96f01f90, 0x57908805), TOBN(0xb5b9ea3d, 0x5ed480dd),
+ TOBN(0x366c5dc2, 0x3efd2dd0), TOBN(0xed2fe305, 0x6e9dfa27),
+ TOBN(0x4575e892, 0x6e9197e2), TOBN(0x11719c09, 0xab502a5d),
+ TOBN(0x264c7bec, 0xe81f213f), TOBN(0x741b9241, 0x55f5c457),
+ TOBN(0x78ac7b68, 0x49a5f4f4), TOBN(0xf91d70a2, 0x9fc45b7d),
+ TOBN(0x39b05544, 0xb0f5f355), TOBN(0x11f06bce, 0xeef930d9),
+ TOBN(0xdb84d25d, 0x038d05e1), TOBN(0x04838ee5, 0xbacc1d51),
+ TOBN(0x9da3ce86, 0x9e8ee00b), TOBN(0xc3412057, 0xc36eda1f),
+ TOBN(0xae80b913, 0x64d9c2f4), TOBN(0x7468bac3, 0xa010a8ff),
+ TOBN(0xdfd20037, 0x37359d41), TOBN(0x1a0f5ab8, 0x15efeacc),
+ TOBN(0x7c25ad2f, 0x659d0ce0), TOBN(0x4011bcbb, 0x6785cff1),
+ TOBN(0x128b9912, 0x7e2192c7), TOBN(0xa549d8e1, 0x13ccb0e8),
+ TOBN(0x805588d8, 0xc85438b1), TOBN(0x5680332d, 0xbc25cb27),
+ TOBN(0xdcd1bc96, 0x1a4bfdf4), TOBN(0x779ff428, 0x706f6566),
+ TOBN(0x8bbee998, 0xf059987a), TOBN(0xf6ce8cf2, 0xcc686de7),
+ TOBN(0xf8ad3c4a, 0x953cfdb2), TOBN(0xd1d426d9, 0x2205da36),
+ TOBN(0xb3c0f13f, 0xc781a241), TOBN(0x3e89360e, 0xd75362a8),
+ TOBN(0xccd05863, 0xc8a91184), TOBN(0x9bd0c9b7, 0xefa8a7f4),
+ TOBN(0x97ee4d53, 0x8a912a4b), TOBN(0xde5e15f8, 0xbcf518fd),
+ TOBN(0x6a055bf8, 0xc467e1e0), TOBN(0x10be4b4b, 0x1587e256),
+ TOBN(0xd90c14f2, 0x668621c9), TOBN(0xd5518f51, 0xab9c92c1),
+ TOBN(0x8e6a0100, 0xd6d47b3c), TOBN(0xcbe980dd, 0x66716175),
+ TOBN(0x500d3f10, 0xddd83683), TOBN(0x3b6cb35d, 0x99cac73c),
+ TOBN(0x53730c8b, 0x6083d550), TOBN(0xcf159767, 0xdf0a1987),
+ TOBN(0x84bfcf53, 0x43ad73b3), TOBN(0x1b528c20, 0x4f035a94),
+ TOBN(0x4294edf7, 0x33eeac69), TOBN(0xb6283e83, 0x817f3240),
+ TOBN(0xc3fdc959, 0x0a5f25b1), TOBN(0xefaf8aa5, 0x5844ee22),
+ TOBN(0xde269ba5, 0xdbdde4de), TOBN(0xe3347160, 0xc56133bf),
+ TOBN(0xc1184219, 0x8d9ea9f8), TOBN(0x090de5db, 0xf3fc1ab5),
+ TOBN(0x404c37b1, 0x0bf22cda), TOBN(0x7de20ec8, 0xf5618894),
+ TOBN(0x754c588e, 0xecdaecab), TOBN(0x6ca4b0ed, 0x88342743),
+ TOBN(0x76f08bdd, 0xf4a938ec), TOBN(0xd182de89, 0x91493ccb),
+ TOBN(0xd652c53e, 0xc8a4186a), TOBN(0xb3e878db, 0x946d8e33),
+ TOBN(0x088453c0, 0x5f37663c), TOBN(0x5cd9daaa, 0xb407748b),
+ TOBN(0xa1f5197f, 0x586d5e72), TOBN(0x47500be8, 0xc443ca59),
+ TOBN(0x78ef35b2, 0xe2652424), TOBN(0x09c5d26f, 0x6dd7767d),
+ TOBN(0x7175a79a, 0xa74d3f7b), TOBN(0x0428fd8d, 0xcf5ea459),
+ TOBN(0x511cb97c, 0xa5d1746d), TOBN(0x36363939, 0xe71d1278),
+ TOBN(0xcf2df955, 0x10350bf4), TOBN(0xb3817439, 0x60aae782),
+ TOBN(0xa748c0e4, 0x3e688809), TOBN(0x98021fbf, 0xd7a5a006),
+ TOBN(0x9076a70c, 0x0e367a98), TOBN(0xbea1bc15, 0x0f62b7c2),
+ TOBN(0x2645a68c, 0x30fe0343), TOBN(0xacaffa78, 0x699dc14f),
+ TOBN(0xf4469964, 0x457bf9c4), TOBN(0x0db6407b, 0x0d2ead83),
+ TOBN(0x68d56cad, 0xb2c6f3eb), TOBN(0x3b512e73, 0xf376356c),
+ TOBN(0xe43b0e1f, 0xfce10408), TOBN(0x89ddc003, 0x5a5e257d),
+ TOBN(0xb0ae0d12, 0x0362e5b3), TOBN(0x07f983c7, 0xb0519161),
+ TOBN(0xc2e94d15, 0x5d5231e7), TOBN(0xcff22aed, 0x0b4f9513),
+ TOBN(0xb02588dd, 0x6ad0b0b5), TOBN(0xb967d1ac, 0x11d0dcd5),
+ TOBN(0x8dac6bc6, 0xcf777b6c), TOBN(0x0062bdbd, 0x4c6d1959),
+ TOBN(0x53da71b5, 0x0ef5cc85), TOBN(0x07012c7d, 0x4006f14f),
+ TOBN(0x4617f962, 0xac47800d), TOBN(0x53365f2b, 0xc102ed75),
+ TOBN(0xb422efcb, 0x4ab8c9d3), TOBN(0x195cb26b, 0x34af31c9),
+ TOBN(0x3a926e29, 0x05f2c4ce), TOBN(0xbd2bdecb, 0x9856966c),
+ TOBN(0x5d16ab3a, 0x85527015), TOBN(0x9f81609e, 0x4486c231),
+ TOBN(0xd8b96b2c, 0xda350002), TOBN(0xbd054690, 0xfa1b7d36),
+ TOBN(0xdc90ebf5, 0xe71d79bc), TOBN(0xf241b6f9, 0x08964e4e),
+ TOBN(0x7c838643, 0x2fe3cd4c), TOBN(0xe0f33acb, 0xb4bc633c),
+ TOBN(0xb4a9ecec, 0x3d139f1f), TOBN(0x05ce69cd, 0xdc4a1f49),
+ TOBN(0xa19d1b16, 0xf5f98aaf), TOBN(0x45bb71d6, 0x6f23e0ef),
+ TOBN(0x33789fcd, 0x46cdfdd3), TOBN(0x9b8e2978, 0xcee040ca),
+ TOBN(0x9c69b246, 0xae0a6828), TOBN(0xba533d24, 0x7078d5aa),
+ TOBN(0x7a2e42c0, 0x7bb4fbdb), TOBN(0xcfb4879a, 0x7035385c),
+ TOBN(0x8c3dd30b, 0x3281705b), TOBN(0x7e361c6c, 0x404fe081),
+ TOBN(0x7b21649c, 0x3f604edf), TOBN(0x5dbf6a3f, 0xe52ffe47),
+ TOBN(0xc41b7c23, 0x4b54d9bf), TOBN(0x1374e681, 0x3511c3d9),
+ TOBN(0x1863bf16, 0xc1b2b758), TOBN(0x90e78507, 0x1e9e6a96),
+ TOBN(0xab4bf98d, 0x5d86f174), TOBN(0xd74e0bd3, 0x85e96fe4),
+ TOBN(0x8afde39f, 0xcac5d344), TOBN(0x90946dbc, 0xbd91b847),
+ TOBN(0xf5b42358, 0xfe1a838c), TOBN(0x05aae6c5, 0x620ac9d8),
+ TOBN(0x8e193bd8, 0xa1ce5a0b), TOBN(0x8f710571, 0x4dabfd72),
+ TOBN(0x8d8fdd48, 0x182caaac), TOBN(0x8c4aeefa, 0x040745cf),
+ TOBN(0x73c6c30a, 0xf3b93e6d), TOBN(0x991241f3, 0x16f42011),
+ TOBN(0xa0158eea, 0xe457a477), TOBN(0xd19857db, 0xee6ddc05),
+ TOBN(0xb3265224, 0x18c41671), TOBN(0x3ffdfc7e, 0x3c2c0d58),
+ TOBN(0x3a3a5254, 0x26ee7cda), TOBN(0x341b0869, 0xdf02c3a8),
+ TOBN(0xa023bf42, 0x723bbfc8), TOBN(0x3d15002a, 0x14452691),}
+ ,
+ {TOBN(0x5ef7324c, 0x85edfa30), TOBN(0x25976554, 0x87d4f3da),
+ TOBN(0x352f5bc0, 0xdcb50c86), TOBN(0x8f6927b0, 0x4832a96c),
+ TOBN(0xd08ee1ba, 0x55f2f94c), TOBN(0x6a996f99, 0x344b45fa),
+ TOBN(0xe133cb8d, 0xa8aa455d), TOBN(0x5d0721ec, 0x758dc1f7),
+ TOBN(0x6ba7a920, 0x79e5fb67), TOBN(0xe1331feb, 0x70aa725e),
+ TOBN(0x5080ccf5, 0x7df5d837), TOBN(0xe4cae01d, 0x7ff72e21),
+ TOBN(0xd9243ee6, 0x0412a77d), TOBN(0x06ff7cac, 0xdf449025),
+ TOBN(0xbe75f7cd, 0x23ef5a31), TOBN(0xbc957822, 0x0ddef7a8),
+ TOBN(0x8cf7230c, 0xb0ce1c55), TOBN(0x5b534d05, 0x0bbfb607),
+ TOBN(0xee1ef113, 0x0e16363b), TOBN(0x27e0aa7a, 0xb4999e82),
+ TOBN(0xce1dac2d, 0x79362c41), TOBN(0x67920c90, 0x91bb6cb0),
+ TOBN(0x1e648d63, 0x2223df24), TOBN(0x0f7d9eef, 0xe32e8f28),
+ TOBN(0x6943f39a, 0xfa833834), TOBN(0x22951722, 0xa6328562),
+ TOBN(0x81d63dd5, 0x4170fc10), TOBN(0x9f5fa58f, 0xaecc2e6d),
+ TOBN(0xb66c8725, 0xe77d9a3b), TOBN(0x11235cea, 0x6384ebe0),
+ TOBN(0x06a8c118, 0x5845e24a), TOBN(0x0137b286, 0xebd093b1),
+ TOBN(0xc589e1ce, 0x44ace150), TOBN(0xe0f8d3d9, 0x4381e97c),
+ TOBN(0x59e99b11, 0x62c5a4b8), TOBN(0x90d262f7, 0xfd0ec9f9),
+ TOBN(0xfbc854c9, 0x283e13c9), TOBN(0x2d04fde7, 0xaedc7085),
+ TOBN(0x057d7765, 0x47dcbecb), TOBN(0x8dbdf591, 0x9a76fa5f),
+ TOBN(0xd0150695, 0x0de1e578), TOBN(0x2e1463e7, 0xe9f72bc6),
+ TOBN(0xffa68441, 0x1b39eca5), TOBN(0x673c8530, 0x7c037f2f),
+ TOBN(0xd0d6a600, 0x747f91da), TOBN(0xb08d43e1, 0xc9cb78e9),
+ TOBN(0x0fc0c644, 0x27b5cef5), TOBN(0x5c1d160a, 0xa60a2fd6),
+ TOBN(0xf98cae53, 0x28c8e13b), TOBN(0x375f10c4, 0xb2eddcd1),
+ TOBN(0xd4eb8b7f, 0x5cce06ad), TOBN(0xb4669f45, 0x80a2e1ef),
+ TOBN(0xd593f9d0, 0x5bbd8699), TOBN(0x5528a4c9, 0xe7976d13),
+ TOBN(0x3923e095, 0x1c7e28d3), TOBN(0xb9293790, 0x3f6bb577),
+ TOBN(0xdb567d6a, 0xc42bd6d2), TOBN(0x6df86468, 0xbb1f96ae),
+ TOBN(0x0efe5b1a, 0x4843b28e), TOBN(0x961bbb05, 0x6379b240),
+ TOBN(0xb6caf5f0, 0x70a6a26b), TOBN(0x70686c0d, 0x328e6e39),
+ TOBN(0x80da06cf, 0x895fc8d3), TOBN(0x804d8810, 0xb363fdc9),
+ TOBN(0xbe22877b, 0x207f1670), TOBN(0x9b0dd188, 0x4e615291),
+ TOBN(0x625ae8dc, 0x97a3c2bf), TOBN(0x08584ef7, 0x439b86e8),
+ TOBN(0xde7190a5, 0xdcd898ff), TOBN(0x26286c40, 0x2058ee3d),
+ TOBN(0x3db0b217, 0x5f87b1c1), TOBN(0xcc334771, 0x102a6db5),
+ TOBN(0xd99de954, 0x2f770fb1), TOBN(0x97c1c620, 0x4cd7535e),
+ TOBN(0xd3b6c448, 0x3f09cefc), TOBN(0xd725af15, 0x5a63b4f8),
+ TOBN(0x0c95d24f, 0xc01e20ec), TOBN(0xdfd37494, 0x9ae7121f),
+ TOBN(0x7d6ddb72, 0xec77b7ec), TOBN(0xfe079d3b, 0x0353a4ae),
+ TOBN(0x3066e70a, 0x2e6ac8d2), TOBN(0x9c6b5a43, 0x106e5c05),
+ TOBN(0x52d3c6f5, 0xede59b8c), TOBN(0x30d6a5c3, 0xfccec9ae),
+ TOBN(0xedec7c22, 0x4fc0a9ef), TOBN(0x190ff083, 0x95c16ced),
+ TOBN(0xbe12ec8f, 0x94de0fde), TOBN(0x0d131ab8, 0x852d3433),
+ TOBN(0x42ace07e, 0x85701291), TOBN(0x94793ed9, 0x194061a8),
+ TOBN(0x30e83ed6, 0xd7f4a485), TOBN(0x9eec7269, 0xf9eeff4d),
+ TOBN(0x90acba59, 0x0c9d8005), TOBN(0x5feca458, 0x1e79b9d1),
+ TOBN(0x8fbe5427, 0x1d506a1e), TOBN(0xa32b2c8e, 0x2439cfa7),
+ TOBN(0x1671c173, 0x73dd0b4e), TOBN(0x37a28214, 0x44a054c6),
+ TOBN(0x81760a1b, 0x4e8b53f1), TOBN(0xa6c04224, 0xf9f93b9e),
+ TOBN(0x18784b34, 0xcf671e3c), TOBN(0x81bbecd2, 0xcda9b994),
+ TOBN(0x38831979, 0xb2ab3848), TOBN(0xef54feb7, 0xf2e03c2d),
+ TOBN(0xcf197ca7, 0xfb8088fa), TOBN(0x01427247, 0x4ddc96c5),
+ TOBN(0xa2d2550a, 0x30777176), TOBN(0x53469898, 0x4d0cf71d),
+ TOBN(0x6ce937b8, 0x3a2aaac6), TOBN(0xe9f91dc3, 0x5af38d9b),
+ TOBN(0x2598ad83, 0xc8bf2899), TOBN(0x8e706ac9, 0xb5536c16),
+ TOBN(0x40dc7495, 0xf688dc98), TOBN(0x26490cd7, 0x124c4afc),
+ TOBN(0xe651ec84, 0x1f18775c), TOBN(0x393ea6c3, 0xb4fdaf4a),
+ TOBN(0x1e1f3343, 0x7f338e0d), TOBN(0x39fb832b, 0x6053e7b5),
+ TOBN(0x46e702da, 0x619e14d5), TOBN(0x859cacd1, 0xcdeef6e0),
+ TOBN(0x63b99ce7, 0x4462007d), TOBN(0xb8ab48a5, 0x4cb5f5b7),
+ TOBN(0x9ec673d2, 0xf55edde7), TOBN(0xd1567f74, 0x8cfaefda),
+ TOBN(0x46381b6b, 0x0887bcec), TOBN(0x694497ce, 0xe178f3c2),
+ TOBN(0x5e6525e3, 0x1e6266cb), TOBN(0x5931de26, 0x697d6413),
+ TOBN(0x87f8df7c, 0x0e58d493), TOBN(0xb1ae5ed0, 0x58b73f12),
+ TOBN(0xc368f784, 0xdea0c34d), TOBN(0x9bd0a120, 0x859a91a0),
+ TOBN(0xb00d88b7, 0xcc863c68), TOBN(0x3a1cc11e, 0x3d1f4d65),
+ TOBN(0xea38e0e7, 0x0aa85593), TOBN(0x37f13e98, 0x7dc4aee8),
+ TOBN(0x10d38667, 0xbc947bad), TOBN(0x738e07ce, 0x2a36ee2e),
+ TOBN(0xc93470cd, 0xc577fcac), TOBN(0xdee1b616, 0x2782470d),
+ TOBN(0x36a25e67, 0x2e793d12), TOBN(0xd6aa6cae, 0xe0f186da),
+ TOBN(0x474d0fd9, 0x80e07af7), TOBN(0xf7cdc47d, 0xba8a5cd4),
+ TOBN(0x28af6d9d, 0xab15247f), TOBN(0x7c789c10, 0x493a537f),
+ TOBN(0x7ac9b110, 0x23a334e7), TOBN(0x0236ac09, 0x12c9c277),
+ TOBN(0xa7e5bd25, 0x1d7a5144), TOBN(0x098b9c2a, 0xf13ec4ec),
+ TOBN(0x3639daca, 0xd3f0abca), TOBN(0x642da81a, 0xa23960f9),
+ TOBN(0x7d2e5c05, 0x4f7269b1), TOBN(0xfcf30777, 0xe287c385),
+ TOBN(0x10edc84f, 0xf2a46f21), TOBN(0x35441757, 0x4f43fa36),
+ TOBN(0xf1327899, 0xfd703431), TOBN(0xa438d7a6, 0x16dd587a),
+ TOBN(0x65c34c57, 0xe9c8352d), TOBN(0xa728edab, 0x5cc5a24e),
+ TOBN(0xaed78abc, 0x42531689), TOBN(0x0a51a0e8, 0x010963ef),
+ TOBN(0x5776fa0a, 0xd717d9b3), TOBN(0xf356c239, 0x7dd3428b),
+ TOBN(0x29903fff, 0x8d3a3dac), TOBN(0x409597fa, 0x3d94491f),
+ TOBN(0x4cd7a5ff, 0xbf4a56a4), TOBN(0xe5096474, 0x8adab462),
+ TOBN(0xa97b5126, 0x5c3427b0), TOBN(0x6401405c, 0xd282c9bd),
+ TOBN(0x3629f8d7, 0x222c5c45), TOBN(0xb1c02c16, 0xe8d50aed),
+ TOBN(0xbea2ed75, 0xd9635bc9), TOBN(0x226790c7, 0x6e24552f),
+ TOBN(0x3c33f2a3, 0x65f1d066), TOBN(0x2a43463e, 0x6dfccc2e),
+ TOBN(0x8cc3453a, 0xdb483761), TOBN(0xe7cc6085, 0x65d5672b),
+ TOBN(0x277ed6cb, 0xde3efc87), TOBN(0x19f2f368, 0x69234eaf),
+ TOBN(0x9aaf4317, 0x5c0b800b), TOBN(0x1f1e7c89, 0x8b6da6e2),
+ TOBN(0x6cfb4715, 0xb94ec75e), TOBN(0xd590dd5f, 0x453118c2),
+ TOBN(0x14e49da1, 0x1f17a34c), TOBN(0x5420ab39, 0x235a1456),
+ TOBN(0xb7637241, 0x2f50363b), TOBN(0x7b15d623, 0xc3fabb6e),
+ TOBN(0xa0ef40b1, 0xe274e49c), TOBN(0x5cf50744, 0x96b1860a),
+ TOBN(0xd6583fbf, 0x66afe5a4), TOBN(0x44240510, 0xf47e3e9a),
+ TOBN(0x99254343, 0x11b2d595), TOBN(0xf1367499, 0xeec8df57),
+ TOBN(0x3cb12c61, 0x3e73dd05), TOBN(0xd248c033, 0x7dac102a),
+ TOBN(0xcf154f13, 0xa77739f5), TOBN(0xbf4288cb, 0x23d2af42),
+ TOBN(0xaa64c9b6, 0x32e4a1cf), TOBN(0xee8c07a8, 0xc8a208f3),
+ TOBN(0xe10d4999, 0x6fe8393f), TOBN(0x0f809a3f, 0xe91f3a32),
+ TOBN(0x61096d1c, 0x802f63c8), TOBN(0x289e1462, 0x57750d3d),
+ TOBN(0xed06167e, 0x9889feea), TOBN(0xd5c9c0e2, 0xe0993909),
+ TOBN(0x46fca0d8, 0x56508ac6), TOBN(0x91826047, 0x4f1b8e83),
+ TOBN(0x4f2c877a, 0x9a4a2751), TOBN(0x71bd0072, 0xcae6fead),
+ TOBN(0x38df8dcc, 0x06aa1941), TOBN(0x5a074b4c, 0x63beeaa8),
+ TOBN(0xd6d65934, 0xc1cec8ed), TOBN(0xa6ecb49e, 0xaabc03bd),
+ TOBN(0xaade91c2, 0xde8a8415), TOBN(0xcfb0efdf, 0x691136e0),
+ TOBN(0x11af45ee, 0x23ab3495), TOBN(0xa132df88, 0x0b77463d),
+ TOBN(0x8923c15c, 0x815d06f4), TOBN(0xc3ceb3f5, 0x0d61a436),
+ TOBN(0xaf52291d, 0xe88fb1da), TOBN(0xea057974, 0x1da12179),
+ TOBN(0xb0d7218c, 0xd2fef720), TOBN(0x6c0899c9, 0x8e1d8845),
+ TOBN(0x98157504, 0x752ddad7), TOBN(0xd60bd74f, 0xa1a68a97),
+ TOBN(0x7047a3a9, 0xf658fb99), TOBN(0x1f5d86d6, 0x5f8511e4),
+ TOBN(0xb8a4bc42, 0x4b5a6d88), TOBN(0x69eb2c33, 0x1abefa7d),
+ TOBN(0x95bf39e8, 0x13c9c510), TOBN(0xf571960a, 0xd48aab43),
+ TOBN(0x7e8cfbcf, 0x704e23c6), TOBN(0xc71b7d22, 0x28aaa65b),
+ TOBN(0xa041b2bd, 0x245e3c83), TOBN(0x69b98834, 0xd21854ff),
+ TOBN(0x89d227a3, 0x963bfeec), TOBN(0x99947aaa, 0xde7da7cb),
+ TOBN(0x1d9ee9db, 0xee68a9b1), TOBN(0x0a08f003, 0x698ec368),
+ TOBN(0xe9ea4094, 0x78ef2487), TOBN(0xc8d2d415, 0x02cfec26),
+ TOBN(0xc52f9a6e, 0xb7dcf328), TOBN(0x0ed489e3, 0x85b6a937),
+ TOBN(0x9b94986b, 0xbef3366e), TOBN(0x0de59c70, 0xedddddb8),
+ TOBN(0xffdb748c, 0xeadddbe2), TOBN(0x9b9784bb, 0x8266ea40),
+ TOBN(0x142b5502, 0x1a93507a), TOBN(0xb4cd1187, 0x8d3c06cf),
+ TOBN(0xdf70e76a, 0x91ec3f40), TOBN(0x484e81ad, 0x4e7553c2),
+ TOBN(0x830f87b5, 0x272e9d6e), TOBN(0xea1c93e5, 0xc6ff514a),
+ TOBN(0x67cc2adc, 0xc4192a8e), TOBN(0xc77e27e2, 0x42f4535a),
+ TOBN(0x9cdbab36, 0xd2b713c5), TOBN(0x86274ea0, 0xcf7b0cd3),
+ TOBN(0x784680f3, 0x09af826b), TOBN(0xbfcc837a, 0x0c72dea3),
+ TOBN(0xa8bdfe9d, 0xd6529b73), TOBN(0x708aa228, 0x63a88002),
+ TOBN(0x6c7a9a54, 0xc91d45b9), TOBN(0xdf1a38bb, 0xfd004f56),
+ TOBN(0x2e8c9a26, 0xb8bad853), TOBN(0x2d52cea3, 0x3723eae7),
+ TOBN(0x054d6d81, 0x56ca2830), TOBN(0xa3317d14, 0x9a8dc411),
+ TOBN(0xa08662fe, 0xfd4ddeda), TOBN(0xed2a153a, 0xb55d792b),
+ TOBN(0x7035c16a, 0xbfc6e944), TOBN(0xb6bc5834, 0x00171cf3),
+ TOBN(0xe27152b3, 0x83d102b6), TOBN(0xfe695a47, 0x0646b848),
+ TOBN(0xa5bb09d8, 0x916e6d37), TOBN(0xb4269d64, 0x0d17015e),
+ TOBN(0x8d8156a1, 0x0a1d2285), TOBN(0xfeef6c51, 0x46d26d72),
+ TOBN(0x9dac57c8, 0x4c5434a7), TOBN(0x0282e5be, 0x59d39e31),
+ TOBN(0xedfff181, 0x721c486d), TOBN(0x301baf10, 0xbc58824e),
+ TOBN(0x8136a6aa, 0x00570031), TOBN(0x55aaf78c, 0x1cddde68),
+ TOBN(0x26829371, 0x59c63952), TOBN(0x3a3bd274, 0x8bc25baf),
+ TOBN(0xecdf8657, 0xb7e52dc3), TOBN(0x2dd8c087, 0xfd78e6c8),
+ TOBN(0x20553274, 0xf5531461), TOBN(0x8b4a1281, 0x5d95499b),
+ TOBN(0xe2c8763a, 0x1a80f9d2), TOBN(0xd1dbe32b, 0x4ddec758),
+ TOBN(0xaf12210d, 0x30c34169), TOBN(0xba74a953, 0x78baa533),
+ TOBN(0x3d133c6e, 0xa438f254), TOBN(0xa431531a, 0x201bef5b),
+ TOBN(0x15295e22, 0xf669d7ec), TOBN(0xca374f64, 0x357fb515),
+ TOBN(0x8a8406ff, 0xeaa3fdb3), TOBN(0x106ae448, 0xdf3f2da8),
+ TOBN(0x8f9b0a90, 0x33c8e9a1), TOBN(0x234645e2, 0x71ad5885),
+ TOBN(0x3d083224, 0x1c0aed14), TOBN(0xf10a7d3e, 0x7a942d46),
+ TOBN(0x7c11deee, 0x40d5c9be), TOBN(0xb2bae7ff, 0xba84ed98),
+ TOBN(0x93e97139, 0xaad58ddd), TOBN(0x3d872796, 0x3f6d1fa3),
+ TOBN(0x483aca81, 0x8569ff13), TOBN(0x8b89a5fb, 0x9a600f72),
+ TOBN(0x4cbc27c3, 0xc06f2b86), TOBN(0x22130713, 0x63ad9c0b),
+ TOBN(0xb5358b1e, 0x48ac2840), TOBN(0x18311294, 0xecba9477),
+ TOBN(0xda58f990, 0xa6946b43), TOBN(0x3098baf9, 0x9ab41819),
+ TOBN(0x66c4c158, 0x4198da52), TOBN(0xab4fc17c, 0x146bfd1b),
+ TOBN(0x2f0a4c3c, 0xbf36a908), TOBN(0x2ae9e34b, 0x58cf7838),
+ TOBN(0xf411529e, 0x3fa11b1f), TOBN(0x21e43677, 0x974af2b4),
+ TOBN(0x7c20958e, 0xc230793b), TOBN(0x710ea885, 0x16e840f3),
+ TOBN(0xfc0b21fc, 0xc5dc67cf), TOBN(0x08d51647, 0x88405718),
+ TOBN(0xd955c21f, 0xcfe49eb7), TOBN(0x9722a5d5, 0x56dd4a1f),
+ TOBN(0xc9ef50e2, 0xc861baa5), TOBN(0xc0c21a5d, 0x9505ac3e),
+ TOBN(0xaf6b9a33, 0x8b7c063f), TOBN(0xc6370339, 0x2f4779c1),
+ TOBN(0x22df99c7, 0x638167c3), TOBN(0xfe6ffe76, 0x795db30c),
+ TOBN(0x2b822d33, 0xa4854989), TOBN(0xfef031dd, 0x30563aa5),
+ TOBN(0x16b09f82, 0xd57c667f), TOBN(0xc70312ce, 0xcc0b76f1),
+ TOBN(0xbf04a9e6, 0xc9118aec), TOBN(0x82fcb419, 0x3409d133),
+ TOBN(0x1a8ab385, 0xab45d44d), TOBN(0xfba07222, 0x617b83a3),
+ TOBN(0xb05f50dd, 0x58e81b52), TOBN(0x1d8db553, 0x21ce5aff),
+ TOBN(0x3097b8d4, 0xe344a873), TOBN(0x7d8d116d, 0xfe36d53e),
+ TOBN(0x6db22f58, 0x7875e750), TOBN(0x2dc5e373, 0x43e144ea),
+ TOBN(0xc05f32e6, 0xe799eb95), TOBN(0xe9e5f4df, 0x6899e6ec),
+ TOBN(0xbdc3bd68, 0x1fab23d5), TOBN(0xb72b8ab7, 0x73af60e6),
+ TOBN(0x8db27ae0, 0x2cecc84a), TOBN(0x600016d8, 0x7bdb871c),
+ TOBN(0x42a44b13, 0xd7c46f58), TOBN(0xb8919727, 0xc3a77d39),
+ TOBN(0xcfc6bbbd, 0xdafd6088), TOBN(0x1a740146, 0x6bd20d39),
+ TOBN(0x8c747abd, 0x98c41072), TOBN(0x4c91e765, 0xbdf68ea1),
+ TOBN(0x7c95e5ca, 0x08819a78), TOBN(0xcf48b729, 0xc9587921),
+ TOBN(0x091c7c5f, 0xdebbcc7d), TOBN(0x6f287404, 0xf0e05149),
+ TOBN(0xf83b5ac2, 0x26cd44ec), TOBN(0x88ae32a6, 0xcfea250e),
+ TOBN(0x6ac5047a, 0x1d06ebc5), TOBN(0xc7e550b4, 0xd434f781),
+ TOBN(0x61ab1cf2, 0x5c727bd2), TOBN(0x2e4badb1, 0x1cf915b0),
+ TOBN(0x1b4dadec, 0xf69d3920), TOBN(0xe61b1ca6, 0xf14c1dfe),
+ TOBN(0x90b479cc, 0xbd6bd51f), TOBN(0x8024e401, 0x8045ec30),
+ TOBN(0xcab29ca3, 0x25ef0e62), TOBN(0x4f2e9416, 0x49e4ebc0),
+ TOBN(0x45eb40ec, 0x0ccced58), TOBN(0x25cd4b9c, 0x0da44f98),
+ TOBN(0x43e06458, 0x871812c6), TOBN(0x99f80d55, 0x16cef651),
+ TOBN(0x571340c9, 0xce6dc153), TOBN(0x138d5117, 0xd8665521),
+ TOBN(0xacdb45bc, 0x4e07014d), TOBN(0x2f34bb38, 0x84b60b91),
+ TOBN(0xf44a4fd2, 0x2ae8921e), TOBN(0xb039288e, 0x892ba1e2),
+ TOBN(0x9da50174, 0xb1c180b2), TOBN(0x6b70ab66, 0x1693dc87),
+ TOBN(0x7e9babc9, 0xe7057481), TOBN(0x4581ddef, 0x9c80dc41),
+ TOBN(0x0c890da9, 0x51294682), TOBN(0x0b5629d3, 0x3f4736e5),
+ TOBN(0x2340c79e, 0xb06f5b41), TOBN(0xa42e84ce, 0x4e243469),
+ TOBN(0xf9a20135, 0x045a71a9), TOBN(0xefbfb415, 0xd27b6fb6),
+ TOBN(0x25ebea23, 0x9d33cd6f), TOBN(0x9caedb88, 0xaa6c0af8),
+ TOBN(0x53dc7e9a, 0xd9ce6f96), TOBN(0x3897f9fd, 0x51e0b15a),
+ TOBN(0xf51cb1f8, 0x8e5d788e), TOBN(0x1aec7ba8, 0xe1d490ee),
+ TOBN(0x265991e0, 0xcc58cb3c), TOBN(0x9f306e8c, 0x9fc3ad31),
+ TOBN(0x5fed006e, 0x5040a0ac), TOBN(0xca9d5043, 0xfb476f2e),
+ TOBN(0xa19c06e8, 0xbeea7a23), TOBN(0xd2865801, 0x0edabb63),
+ TOBN(0xdb92293f, 0x6967469a), TOBN(0x2894d839, 0x8d8a8ed8),
+ TOBN(0x87c9e406, 0xbbc77122), TOBN(0x8671c6f1, 0x2ea3a26a),
+ TOBN(0xe42df8d6, 0xd7de9853), TOBN(0x2e3ce346, 0xb1f2bcc7),
+ TOBN(0xda601dfc, 0x899d50cf), TOBN(0xbfc913de, 0xfb1b598f),
+ TOBN(0x81c4909f, 0xe61f7908), TOBN(0x192e304f, 0x9bbc7b29),
+ TOBN(0xc3ed8738, 0xc104b338), TOBN(0xedbe9e47, 0x783f5d61),
+ TOBN(0x0c06e9be, 0x2db30660), TOBN(0xda3e613f, 0xc0eb7d8e),
+ TOBN(0xd8fa3e97, 0x322e096e), TOBN(0xfebd91e8, 0xd336e247),
+ TOBN(0x8f13ccc4, 0xdf655a49), TOBN(0xa9e00dfc, 0x5eb20210),
+ TOBN(0x84631d0f, 0xc656b6ea), TOBN(0x93a058cd, 0xd8c0d947),
+ TOBN(0x6846904a, 0x67bd3448), TOBN(0x4a3d4e1a, 0xf394fd5c),
+ TOBN(0xc102c1a5, 0xdb225f52), TOBN(0xe3455bba, 0xfc4f5e9a),
+ TOBN(0x6b36985b, 0x4b9ad1ce), TOBN(0xa9818536, 0x5bb7f793),
+ TOBN(0x6c25e1d0, 0x48b1a416), TOBN(0x1381dd53, 0x3c81bee7),
+ TOBN(0xd2a30d61, 0x7a4a7620), TOBN(0xc8412926, 0x39b8944c),
+ TOBN(0x3c1c6fbe, 0x7a97c33a), TOBN(0x941e541d, 0x938664e7),
+ TOBN(0x417499e8, 0x4a34f239), TOBN(0x15fdb83c, 0xb90402d5),
+ TOBN(0xb75f46bf, 0x433aa832), TOBN(0xb61e15af, 0x63215db1),
+ TOBN(0xaabe59d4, 0xa127f89a), TOBN(0x5d541e0c, 0x07e816da),
+ TOBN(0xaaba0659, 0xa618b692), TOBN(0x55327733, 0x17266026),
+ TOBN(0xaf53a0fc, 0x95f57552), TOBN(0x32947650, 0x6cacb0c9),
+ TOBN(0x253ff58d, 0xc821be01), TOBN(0xb0309531, 0xa06f1146),
+ TOBN(0x59bbbdf5, 0x05c2e54d), TOBN(0x158f27ad, 0x26e8dd22),
+ TOBN(0xcc5b7ffb, 0x397e1e53), TOBN(0xae03f65b, 0x7fc1e50d),
+ TOBN(0xa9784ebd, 0x9c95f0f9), TOBN(0x5ed9deb2, 0x24640771),
+ TOBN(0x31244af7, 0x035561c4), TOBN(0x87332f3a, 0x7ee857de),
+ TOBN(0x09e16e9e, 0x2b9e0d88), TOBN(0x52d910f4, 0x56a06049),
+ TOBN(0x507ed477, 0xa9592f48), TOBN(0x85cb917b, 0x2365d678),
+ TOBN(0xf8511c93, 0x4c8998d1), TOBN(0x2186a3f1, 0x730ea58f),
+ TOBN(0x50189626, 0xb2029db0), TOBN(0x9137a6d9, 0x02ceb75a),
+ TOBN(0x2fe17f37, 0x748bc82c), TOBN(0x87c2e931, 0x80469f8c),
+ TOBN(0x850f71cd, 0xbf891aa2), TOBN(0x0ca1b89b, 0x75ec3d8d),
+ TOBN(0x516c43aa, 0x5e1cd3cd), TOBN(0x89397808, 0x9a887c28),
+ TOBN(0x0059c699, 0xddea1f9f), TOBN(0x7737d6fa, 0x8e6868f7),
+ TOBN(0x6d93746a, 0x60f1524b), TOBN(0x36985e55, 0xba052aa7),
+ TOBN(0x41b1d322, 0xed923ea5), TOBN(0x3429759f, 0x25852a11),
+ TOBN(0xbeca6ec3, 0x092e9f41), TOBN(0x3a238c66, 0x62256bbd),
+ TOBN(0xd82958ea, 0x70ad487d), TOBN(0x4ac8aaf9, 0x65610d93),
+ TOBN(0x3fa101b1, 0x5e4ccab0), TOBN(0x9bf430f2, 0x9de14bfb),
+ TOBN(0xa10f5cc6, 0x6531899d), TOBN(0x590005fb, 0xea8ce17d),
+ TOBN(0xc437912f, 0x24544cb6), TOBN(0x9987b71a, 0xd79ac2e3),
+ TOBN(0x13e3d9dd, 0xc058a212), TOBN(0x00075aac, 0xd2de9606),
+ TOBN(0x80ab508b, 0x6cac8369), TOBN(0x87842be7, 0xf54f6c89),
+ TOBN(0xa7ad663d, 0x6bc532a4), TOBN(0x67813de7, 0x78a91bc8),
+ TOBN(0x5dcb61ce, 0xc3427239), TOBN(0x5f3c7cf0, 0xc56934d9),
+ TOBN(0xc079e0fb, 0xe3191591), TOBN(0xe40896bd, 0xb01aada7),
+ TOBN(0x8d466791, 0x0492d25f), TOBN(0x8aeb30c9, 0xe7408276),
+ TOBN(0xe9437495, 0x9287aacc), TOBN(0x23d4708d, 0x79fe03d4),
+ TOBN(0x8cda9cf2, 0xd0c05199), TOBN(0x502fbc22, 0xfae78454),
+ TOBN(0xc0bda9df, 0xf572a182), TOBN(0x5f9b71b8, 0x6158b372),
+ TOBN(0xe0f33a59, 0x2b82dd07), TOBN(0x76302735, 0x9523032e),
+ TOBN(0x7fe1a721, 0xc4505a32), TOBN(0x7b6e3e82, 0xf796409f),}
+ ,
+ {TOBN(0xe3417bc0, 0x35d0b34a), TOBN(0x440b386b, 0x8327c0a7),
+ TOBN(0x8fb7262d, 0xac0362d1), TOBN(0x2c41114c, 0xe0cdf943),
+ TOBN(0x2ba5cef1, 0xad95a0b1), TOBN(0xc09b37a8, 0x67d54362),
+ TOBN(0x26d6cdd2, 0x01e486c9), TOBN(0x20477abf, 0x42ff9297),
+ TOBN(0xa004dcb3, 0x292a9287), TOBN(0xddc15cf6, 0x77b092c7),
+ TOBN(0x083a8464, 0x806c0605), TOBN(0x4a68df70, 0x3db997b0),
+ TOBN(0x9c134e45, 0x05bf7dd0), TOBN(0xa4e63d39, 0x8ccf7f8c),
+ TOBN(0xa6e6517f, 0x41b5f8af), TOBN(0xaa8b9342, 0xad7bc1cc),
+ TOBN(0x126f35b5, 0x1e706ad9), TOBN(0xb99cebb4, 0xc3a9ebdf),
+ TOBN(0xa75389af, 0xbf608d90), TOBN(0x76113c4f, 0xc6c89858),
+ TOBN(0x80de8eb0, 0x97e2b5aa), TOBN(0x7e1022cc, 0x63b91304),
+ TOBN(0x3bdab605, 0x6ccc066c), TOBN(0x33cbb144, 0xb2edf900),
+ TOBN(0xc4176471, 0x7af715d2), TOBN(0xe2f7f594, 0xd0134a96),
+ TOBN(0x2c1873ef, 0xa41ec956), TOBN(0xe4e7b4f6, 0x77821304),
+ TOBN(0xe5c8ff97, 0x88d5374a), TOBN(0x2b915e63, 0x80823d5b),
+ TOBN(0xea6bc755, 0xb2ee8fe2), TOBN(0x6657624c, 0xe7112651),
+ TOBN(0x157af101, 0xdace5aca), TOBN(0xc4fdbcf2, 0x11a6a267),
+ TOBN(0xdaddf340, 0xc49c8609), TOBN(0x97e49f52, 0xe9604a65),
+ TOBN(0x9be8e790, 0x937e2ad5), TOBN(0x846e2508, 0x326e17f1),
+ TOBN(0x3f38007a, 0x0bbbc0dc), TOBN(0xcf03603f, 0xb11e16d6),
+ TOBN(0xd6f800e0, 0x7442f1d5), TOBN(0x475607d1, 0x66e0e3ab),
+ TOBN(0x82807f16, 0xb7c64047), TOBN(0x8858e1e3, 0xa749883d),
+ TOBN(0x5859120b, 0x8231ee10), TOBN(0x1b80e7eb, 0x638a1ece),
+ TOBN(0xcb72525a, 0xc6aa73a4), TOBN(0xa7cdea3d, 0x844423ac),
+ TOBN(0x5ed0c007, 0xf8ae7c38), TOBN(0x6db07a5c, 0x3d740192),
+ TOBN(0xbe5e9c2a, 0x5fe36db3), TOBN(0xd5b9d57a, 0x76e95046),
+ TOBN(0x54ac32e7, 0x8eba20f2), TOBN(0xef11ca8f, 0x71b9a352),
+ TOBN(0x305e373e, 0xff98a658), TOBN(0xffe5a100, 0x823eb667),
+ TOBN(0x57477b11, 0xe51732d2), TOBN(0xdfd6eb28, 0x2538fc0e),
+ TOBN(0x5c43b0cc, 0x3b39eec5), TOBN(0x6af12778, 0xcb36cc57),
+ TOBN(0x70b0852d, 0x06c425ae), TOBN(0x6df92f8c, 0x5c221b9b),
+ TOBN(0x6c8d4f9e, 0xce826d9c), TOBN(0xf59aba7b, 0xb49359c3),
+ TOBN(0x5c8ed8d5, 0xda64309d), TOBN(0x61a6de56, 0x91b30704),
+ TOBN(0xd6b52f6a, 0x2f9b5808), TOBN(0x0eee4194, 0x98c958a7),
+ TOBN(0xcddd9aab, 0x771e4caa), TOBN(0x83965dfd, 0x78bc21be),
+ TOBN(0x02affce3, 0xb3b504f5), TOBN(0x30847a21, 0x561c8291),
+ TOBN(0xd2eb2cf1, 0x52bfda05), TOBN(0xe0e4c4e9, 0x6197b98c),
+ TOBN(0x1d35076c, 0xf8a1726f), TOBN(0x6c06085b, 0x2db11e3d),
+ TOBN(0x15c0c4d7, 0x4463ba14), TOBN(0x9d292f83, 0x0030238c),
+ TOBN(0x1311ee8b, 0x3727536d), TOBN(0xfeea86ef, 0xbeaedc1e),
+ TOBN(0xb9d18cd3, 0x66131e2e), TOBN(0xf31d974f, 0x80fe2682),
+ TOBN(0xb6e49e0f, 0xe4160289), TOBN(0x7c48ec0b, 0x08e92799),
+ TOBN(0x818111d8, 0xd1989aa7), TOBN(0xb34fa0aa, 0xebf926f9),
+ TOBN(0xdb5fe2f5, 0xa245474a), TOBN(0xf80a6ebb, 0x3c7ca756),
+ TOBN(0xa7f96054, 0xafa05dd8), TOBN(0x26dfcf21, 0xfcaf119e),
+ TOBN(0xe20ef2e3, 0x0564bb59), TOBN(0xef4dca50, 0x61cb02b8),
+ TOBN(0xcda7838a, 0x65d30672), TOBN(0x8b08d534, 0xfd657e86),
+ TOBN(0x4c5b4395, 0x46d595c8), TOBN(0x39b58725, 0x425cb836),
+ TOBN(0x8ea61059, 0x3de9abe3), TOBN(0x40434881, 0x9cdc03be),
+ TOBN(0x9b261245, 0xcfedce8c), TOBN(0x78c318b4, 0xcf5234a1),
+ TOBN(0x510bcf16, 0xfde24c99), TOBN(0x2a77cb75, 0xa2c2ff5d),
+ TOBN(0x9c895c2b, 0x27960fb4), TOBN(0xd30ce975, 0xb0eda42b),
+ TOBN(0xfda85393, 0x1a62cc26), TOBN(0x23c69b96, 0x50c0e052),
+ TOBN(0xa227df15, 0xbfc633f3), TOBN(0x2ac78848, 0x1bae7d48),
+ TOBN(0x487878f9, 0x187d073d), TOBN(0x6c2be919, 0x967f807d),
+ TOBN(0x765861d8, 0x336e6d8f), TOBN(0x88b8974c, 0xce528a43),
+ TOBN(0x09521177, 0xff57d051), TOBN(0x2ff38037, 0xfb6a1961),
+ TOBN(0xfc0aba74, 0xa3d76ad4), TOBN(0x7c764803, 0x25a7ec17),
+ TOBN(0x7532d75f, 0x48879bc8), TOBN(0xea7eacc0, 0x58ce6bc1),
+ TOBN(0xc82176b4, 0x8e896c16), TOBN(0x9a30e0b2, 0x2c750fed),
+ TOBN(0xc37e2c2e, 0x421d3aa4), TOBN(0xf926407c, 0xe84fa840),
+ TOBN(0x18abc03d, 0x1454e41c), TOBN(0x26605ecd, 0x3f7af644),
+ TOBN(0x242341a6, 0xd6a5eabf), TOBN(0x1edb84f4, 0x216b668e),
+ TOBN(0xd836edb8, 0x04010102), TOBN(0x5b337ce7, 0x945e1d8c),
+ TOBN(0xd2075c77, 0xc055dc14), TOBN(0x2a0ffa25, 0x81d89cdf),
+ TOBN(0x8ce815ea, 0x6ffdcbaf), TOBN(0xa3428878, 0xfb648867),
+ TOBN(0x277699cf, 0x884655fb), TOBN(0xfa5b5bd6, 0x364d3e41),
+ TOBN(0x01f680c6, 0x441e1cb7), TOBN(0x3fd61e66, 0xb70a7d67),
+ TOBN(0x666ba2dc, 0xcc78cf66), TOBN(0xb3018174, 0x6fdbff77),
+ TOBN(0x8d4dd0db, 0x168d4668), TOBN(0x259455d0, 0x1dab3a2a),
+ TOBN(0xf58564c5, 0xcde3acec), TOBN(0x77141925, 0x13adb276),
+ TOBN(0x527d725d, 0x8a303f65), TOBN(0x55deb6c9, 0xe6f38f7b),
+ TOBN(0xfd5bb657, 0xb1fa70fb), TOBN(0xfa07f50f, 0xd8073a00),
+ TOBN(0xf72e3aa7, 0xbca02500), TOBN(0xf68f895d, 0x9975740d),
+ TOBN(0x30112060, 0x5cae2a6a), TOBN(0x01bd7218, 0x02874842),
+ TOBN(0x3d423891, 0x7ce47bd3), TOBN(0xa66663c1, 0x789544f6),
+ TOBN(0x864d05d7, 0x3272d838), TOBN(0xe22924f9, 0xfa6295c5),
+ TOBN(0x8189593f, 0x6c2fda32), TOBN(0x330d7189, 0xb184b544),
+ TOBN(0x79efa62c, 0xbde1f714), TOBN(0x35771c94, 0xe5cb1a63),
+ TOBN(0x2f4826b8, 0x641c8332), TOBN(0x00a894fb, 0xc8cee854),
+ TOBN(0xb4b9a39b, 0x36194d40), TOBN(0xe857a7c5, 0x77612601),
+ TOBN(0xf4209dd2, 0x4ecf2f58), TOBN(0x82b9e66d, 0x5a033487),
+ TOBN(0xc1e36934, 0xe4e8b9dd), TOBN(0xd2372c9d, 0xa42377d7),
+ TOBN(0x51dc94c7, 0x0e3ae43b), TOBN(0x4c57761e, 0x04474f6f),
+ TOBN(0xdcdacd0a, 0x1058a318), TOBN(0x369cf3f5, 0x78053a9a),
+ TOBN(0xc6c3de50, 0x31c68de2), TOBN(0x4653a576, 0x3c4b6d9f),
+ TOBN(0x1688dd5a, 0xaa4e5c97), TOBN(0x5be80aa1, 0xb7ab3c74),
+ TOBN(0x70cefe7c, 0xbc65c283), TOBN(0x57f95f13, 0x06867091),
+ TOBN(0xa39114e2, 0x4415503b), TOBN(0xc08ff7c6, 0x4cbb17e9),
+ TOBN(0x1eff674d, 0xd7dec966), TOBN(0x6d4690af, 0x53376f63),
+ TOBN(0xff6fe32e, 0xea74237b), TOBN(0xc436d17e, 0xcd57508e),
+ TOBN(0x15aa28e1, 0xedcc40fe), TOBN(0x0d769c04, 0x581bbb44),
+ TOBN(0xc240b6de, 0x34eaacda), TOBN(0xd9e116e8, 0x2ba0f1de),
+ TOBN(0xcbe45ec7, 0x79438e55), TOBN(0x91787c9d, 0x96f752d7),
+ TOBN(0x897f532b, 0xf129ac2f), TOBN(0xd307b7c8, 0x5a36e22c),
+ TOBN(0x91940675, 0x749fb8f3), TOBN(0xd14f95d0, 0x157fdb28),
+ TOBN(0xfe51d029, 0x6ae55043), TOBN(0x8931e98f, 0x44a87de1),
+ TOBN(0xe57f1cc6, 0x09e4fee2), TOBN(0x0d063b67, 0x4e072d92),
+ TOBN(0x70a998b9, 0xed0e4316), TOBN(0xe74a736b, 0x306aca46),
+ TOBN(0xecf0fbf2, 0x4fda97c7), TOBN(0xa40f65cb, 0x3e178d93),
+ TOBN(0x16253604, 0x16df4285), TOBN(0xb0c9babb, 0xd0c56ae2),
+ TOBN(0x73032b19, 0xcfc5cfc3), TOBN(0xe497e5c3, 0x09752056),
+ TOBN(0x12096bb4, 0x164bda96), TOBN(0x1ee42419, 0xa0b74da1),
+ TOBN(0x8fc36243, 0x403826ba), TOBN(0x0c8f0069, 0xdc09e660),
+ TOBN(0x8667e981, 0xc27253c9), TOBN(0x05a6aefb, 0x92b36a45),
+ TOBN(0xa62c4b36, 0x9cb7bb46), TOBN(0x8394f375, 0x11f7027b),
+ TOBN(0x747bc79c, 0x5f109d0f), TOBN(0xcad88a76, 0x5b8cc60a),
+ TOBN(0x80c5a66b, 0x58f09e68), TOBN(0xe753d451, 0xf6127eac),
+ TOBN(0xc44b74a1, 0x5b0ec6f5), TOBN(0x47989fe4, 0x5289b2b8),
+ TOBN(0x745f8484, 0x58d6fc73), TOBN(0xec362a6f, 0xf61c70ab),
+ TOBN(0x070c98a7, 0xb3a8ad41), TOBN(0x73a20fc0, 0x7b63db51),
+ TOBN(0xed2c2173, 0xf44c35f4), TOBN(0x8a56149d, 0x9acc9dca),
+ TOBN(0x98f17881, 0x9ac6e0f4), TOBN(0x360fdeaf, 0xa413b5ed),
+ TOBN(0x0625b8f4, 0xa300b0fd), TOBN(0xf1f4d76a, 0x5b3222d3),
+ TOBN(0x9d6f5109, 0x587f76b8), TOBN(0x8b4ee08d, 0x2317fdb5),
+ TOBN(0x88089bb7, 0x8c68b095), TOBN(0x95570e9a, 0x5808d9b9),
+ TOBN(0xa395c36f, 0x35d33ae7), TOBN(0x200ea123, 0x50bb5a94),
+ TOBN(0x20c789bd, 0x0bafe84b), TOBN(0x243ef52d, 0x0919276a),
+ TOBN(0x3934c577, 0xe23ae233), TOBN(0xb93807af, 0xa460d1ec),
+ TOBN(0xb72a53b1, 0xf8fa76a4), TOBN(0xd8914cb0, 0xc3ca4491),
+ TOBN(0x2e128494, 0x3fb42622), TOBN(0x3b2700ac, 0x500907d5),
+ TOBN(0xf370fb09, 0x1a95ec63), TOBN(0xf8f30be2, 0x31b6dfbd),
+ TOBN(0xf2b2f8d2, 0x69e55f15), TOBN(0x1fead851, 0xcc1323e9),
+ TOBN(0xfa366010, 0xd9e5eef6), TOBN(0x64d487b0, 0xe316107e),
+ TOBN(0x4c076b86, 0xd23ddc82), TOBN(0x03fd344c, 0x7e0143f0),
+ TOBN(0xa95362ff, 0x317af2c5), TOBN(0x0add3db7, 0xe18b7a4f),
+ TOBN(0x9c673e3f, 0x8260e01b), TOBN(0xfbeb49e5, 0x54a1cc91),
+ TOBN(0x91351bf2, 0x92f2e433), TOBN(0xc755e7ec, 0x851141eb),
+ TOBN(0xc9a95139, 0x29607745), TOBN(0x0ca07420, 0xa26f2b28),
+ TOBN(0xcb2790e7, 0x4bc6f9dd), TOBN(0x345bbb58, 0xadcaffc0),
+ TOBN(0xc65ea38c, 0xbe0f27a2), TOBN(0x67c24d7c, 0x641fcb56),
+ TOBN(0x2c25f0a7, 0xa9e2c757), TOBN(0x93f5cdb0, 0x16f16c49),
+ TOBN(0x2ca5a9d7, 0xc5ee30a1), TOBN(0xd1593635, 0xb909b729),
+ TOBN(0x804ce9f3, 0xdadeff48), TOBN(0xec464751, 0xb07c30c3),
+ TOBN(0x89d65ff3, 0x9e49af6a), TOBN(0xf2d6238a, 0x6f3d01bc),
+ TOBN(0x1095561e, 0x0bced843), TOBN(0x51789e12, 0xc8a13fd8),
+ TOBN(0xd633f929, 0x763231df), TOBN(0x46df9f7d, 0xe7cbddef),
+ TOBN(0x01c889c0, 0xcb265da8), TOBN(0xfce1ad10, 0xaf4336d2),
+ TOBN(0x8d110df6, 0xfc6a0a7e), TOBN(0xdd431b98, 0x6da425dc),
+ TOBN(0xcdc4aeab, 0x1834aabe), TOBN(0x84deb124, 0x8439b7fc),
+ TOBN(0x8796f169, 0x3c2a5998), TOBN(0x9b9247b4, 0x7947190d),
+ TOBN(0x55b9d9a5, 0x11597014), TOBN(0x7e9dd70d, 0x7b1566ee),
+ TOBN(0x94ad78f7, 0xcbcd5e64), TOBN(0x0359ac17, 0x9bd4c032),
+ TOBN(0x3b11baaf, 0x7cc222ae), TOBN(0xa6a6e284, 0xba78e812),
+ TOBN(0x8392053f, 0x24cea1a0), TOBN(0xc97bce4a, 0x33621491),
+ TOBN(0x7eb1db34, 0x35399ee9), TOBN(0x473f78ef, 0xece81ad1),
+ TOBN(0x41d72fe0, 0xf63d3d0d), TOBN(0xe620b880, 0xafab62fc),
+ TOBN(0x92096bc9, 0x93158383), TOBN(0x41a21357, 0x8f896f6c),
+ TOBN(0x1b5ee2fa, 0xc7dcfcab), TOBN(0x650acfde, 0x9546e007),
+ TOBN(0xc081b749, 0xb1b02e07), TOBN(0xda9e41a0, 0xf9eca03d),
+ TOBN(0x013ba727, 0x175a54ab), TOBN(0xca0cd190, 0xea5d8d10),
+ TOBN(0x85ea52c0, 0x95fd96a9), TOBN(0x2c591b9f, 0xbc5c3940),
+ TOBN(0x6fb4d4e4, 0x2bad4d5f), TOBN(0xfa4c3590, 0xfef0059b),
+ TOBN(0x6a10218a, 0xf5122294), TOBN(0x9a78a81a, 0xa85751d1),
+ TOBN(0x04f20579, 0xa98e84e7), TOBN(0xfe1242c0, 0x4997e5b5),
+ TOBN(0xe77a273b, 0xca21e1e4), TOBN(0xfcc8b1ef, 0x9411939d),
+ TOBN(0xe20ea302, 0x92d0487a), TOBN(0x1442dbec, 0x294b91fe),
+ TOBN(0x1f7a4afe, 0xbb6b0e8f), TOBN(0x1700ef74, 0x6889c318),
+ TOBN(0xf5bbffc3, 0x70f1fc62), TOBN(0x3b31d4b6, 0x69c79cca),
+ TOBN(0xe8bc2aab, 0xa7f6340d), TOBN(0xb0b08ab4, 0xa725e10a),
+ TOBN(0x44f05701, 0xae340050), TOBN(0xba4b3016, 0x1cf0c569),
+ TOBN(0x5aa29f83, 0xfbe19a51), TOBN(0x1b9ed428, 0xb71d752e),
+ TOBN(0x1666e54e, 0xeb4819f5), TOBN(0x616cdfed, 0x9e18b75b),
+ TOBN(0x112ed5be, 0x3ee27b0b), TOBN(0xfbf28319, 0x44c7de4d),
+ TOBN(0xd685ec85, 0xe0e60d84), TOBN(0x68037e30, 0x1db7ee78),
+ TOBN(0x5b65bdcd, 0x003c4d6e), TOBN(0x33e7363a, 0x93e29a6a),
+ TOBN(0x995b3a61, 0x08d0756c), TOBN(0xd727f85c, 0x2faf134b),
+ TOBN(0xfac6edf7, 0x1d337823), TOBN(0x99b9aa50, 0x0439b8b4),
+ TOBN(0x722eb104, 0xe2b4e075), TOBN(0x49987295, 0x437c4926),
+ TOBN(0xb1e4c0e4, 0x46a9b82d), TOBN(0xd0cb3197, 0x57a006f5),
+ TOBN(0xf3de0f7d, 0xd7808c56), TOBN(0xb5c54d8f, 0x51f89772),
+ TOBN(0x500a114a, 0xadbd31aa), TOBN(0x9afaaaa6, 0x295f6cab),
+ TOBN(0x94705e21, 0x04cf667a), TOBN(0xfc2a811b, 0x9d3935d7),
+ TOBN(0x560b0280, 0x6d09267c), TOBN(0xf19ed119, 0xf780e53b),
+ TOBN(0xf0227c09, 0x067b6269), TOBN(0x967b8533, 0x5caef599),
+ TOBN(0x155b9243, 0x68efeebc), TOBN(0xcd6d34f5, 0xc497bae6),
+ TOBN(0x1dd8d5d3, 0x6cceb370), TOBN(0x2aeac579, 0xa78d7bf9),
+ TOBN(0x5d65017d, 0x70b67a62), TOBN(0x70c8e44f, 0x17c53f67),
+ TOBN(0xd1fc0950, 0x86a34d09), TOBN(0xe0fca256, 0xe7134907),
+ TOBN(0xe24fa29c, 0x80fdd315), TOBN(0x2c4acd03, 0xd87499ad),
+ TOBN(0xbaaf7517, 0x3b5a9ba6), TOBN(0xb9cbe1f6, 0x12e51a51),
+ TOBN(0xd88edae3, 0x5e154897), TOBN(0xe4309c3c, 0x77b66ca0),
+ TOBN(0xf5555805, 0xf67f3746), TOBN(0x85fc37ba, 0xa36401ff),
+ TOBN(0xdf86e2ca, 0xd9499a53), TOBN(0x6270b2a3, 0xecbc955b),
+ TOBN(0xafae64f5, 0x974ad33b), TOBN(0x04d85977, 0xfe7b2df1),
+ TOBN(0x2a3db3ff, 0x4ab03f73), TOBN(0x0b87878a, 0x8702740a),
+ TOBN(0x6d263f01, 0x5a061732), TOBN(0xc25430ce, 0xa32a1901),
+ TOBN(0xf7ebab3d, 0xdb155018), TOBN(0x3a86f693, 0x63a9b78e),
+ TOBN(0x349ae368, 0xda9f3804), TOBN(0x470f07fe, 0xa164349c),
+ TOBN(0xd52f4cc9, 0x8562baa5), TOBN(0xc74a9e86, 0x2b290df3),
+ TOBN(0xd3a1aa35, 0x43471a24), TOBN(0x239446be, 0xb8194511),
+ TOBN(0xbec2dd00, 0x81dcd44d), TOBN(0xca3d7f0f, 0xc42ac82d),
+ TOBN(0x1f3db085, 0xfdaf4520), TOBN(0xbb6d3e80, 0x4549daf2),
+ TOBN(0xf5969d8a, 0x19ad5c42), TOBN(0x7052b13d, 0xdbfd1511),
+ TOBN(0x11890d1b, 0x682b9060), TOBN(0xa71d3883, 0xac34452c),
+ TOBN(0xa438055b, 0x783805b4), TOBN(0x43241277, 0x4725b23e),
+ TOBN(0xf20cf96e, 0x4901bbed), TOBN(0x6419c710, 0xf432a2bb),
+ TOBN(0x57a0fbb9, 0xdfa9cd7d), TOBN(0x589111e4, 0x00daa249),
+ TOBN(0x19809a33, 0x7b60554e), TOBN(0xea5f8887, 0xede283a4),
+ TOBN(0x2d713802, 0x503bfd35), TOBN(0x151bb0af, 0x585d2a53),
+ TOBN(0x40b08f74, 0x43b30ca8), TOBN(0xe10b5bba, 0xd9934583),
+ TOBN(0xe8a546d6, 0xb51110ad), TOBN(0x1dd50e66, 0x28e0b6c5),
+ TOBN(0x292e9d54, 0xcff2b821), TOBN(0x3882555d, 0x47281760),
+ TOBN(0x134838f8, 0x3724d6e3), TOBN(0xf2c679e0, 0x22ddcda1),
+ TOBN(0x40ee8815, 0x6d2a5768), TOBN(0x7f227bd2, 0x1c1e7e2d),
+ TOBN(0x487ba134, 0xd04ff443), TOBN(0x76e2ff3d, 0xc614e54b),
+ TOBN(0x36b88d6f, 0xa3177ec7), TOBN(0xbf731d51, 0x2328fff5),
+ TOBN(0x758caea2, 0x49ba158e), TOBN(0x5ab8ff4c, 0x02938188),
+ TOBN(0x33e16056, 0x35edc56d), TOBN(0x5a69d349, 0x7e940d79),
+ TOBN(0x6c4fd001, 0x03866dcb), TOBN(0x20a38f57, 0x4893cdef),
+ TOBN(0xfbf3e790, 0xfac3a15b), TOBN(0x6ed7ea2e, 0x7a4f8e6b),
+ TOBN(0xa663eb4f, 0xbc3aca86), TOBN(0x22061ea5, 0x080d53f7),
+ TOBN(0x2480dfe6, 0xf546783f), TOBN(0xd38bc6da, 0x5a0a641e),
+ TOBN(0xfb093cd1, 0x2ede8965), TOBN(0x89654db4, 0xacb455cf),
+ TOBN(0x413cbf9a, 0x26e1adee), TOBN(0x291f3764, 0x373294d4),
+ TOBN(0x00797257, 0x648083fe), TOBN(0x25f504d3, 0x208cc341),
+ TOBN(0x635a8e5e, 0xc3a0ee43), TOBN(0x70aaebca, 0x679898ff),
+ TOBN(0x9ee9f547, 0x5dc63d56), TOBN(0xce987966, 0xffb34d00),
+ TOBN(0xf9f86b19, 0x5e26310a), TOBN(0x9e435484, 0x382a8ca8),
+ TOBN(0x253bcb81, 0xc2352fe4), TOBN(0xa4eac8b0, 0x4474b571),
+ TOBN(0xc1b97512, 0xc1ad8cf8), TOBN(0x193b4e9e, 0x99e0b697),
+ TOBN(0x939d2716, 0x01e85df0), TOBN(0x4fb265b3, 0xcd44eafd),
+ TOBN(0x321e7dcd, 0xe51e1ae2), TOBN(0x8e3a8ca6, 0xe3d8b096),
+ TOBN(0x8de46cb0, 0x52604998), TOBN(0x91099ad8, 0x39072aa7),
+ TOBN(0x2617f91c, 0x93aa96b8), TOBN(0x0fc8716b, 0x7fca2e13),
+ TOBN(0xa7106f5e, 0x95328723), TOBN(0xd1c9c40b, 0x262e6522),
+ TOBN(0xb9bafe86, 0x42b7c094), TOBN(0x1873439d, 0x1543c021),
+ TOBN(0xe1baa5de, 0x5cbefd5d), TOBN(0xa363fc5e, 0x521e8aff),
+ TOBN(0xefe6320d, 0xf862eaac), TOBN(0x14419c63, 0x22c647dc),
+ TOBN(0x0e06707c, 0x4e46d428), TOBN(0xcb6c834f, 0x4a178f8f),
+ TOBN(0x0f993a45, 0xd30f917c), TOBN(0xd4c4b049, 0x9879afee),
+ TOBN(0xb6142a1e, 0x70500063), TOBN(0x7c9b41c3, 0xa5d9d605),
+ TOBN(0xbc00fc2f, 0x2f8ba2c7), TOBN(0x0966eb2f, 0x7c67aa28),
+ TOBN(0x13f7b516, 0x5a786972), TOBN(0x3bfb7557, 0x8a2fbba0),
+ TOBN(0x131c4f23, 0x5a2b9620), TOBN(0xbff3ed27, 0x6faf46be),
+ TOBN(0x9b4473d1, 0x7e172323), TOBN(0x421e8878, 0x339f6246),
+ TOBN(0x0fa8587a, 0x25a41632), TOBN(0xc0814124, 0xa35b6c93),
+ TOBN(0x2b18a9f5, 0x59ebb8db), TOBN(0x264e3357, 0x76edb29c),
+ TOBN(0xaf245ccd, 0xc87c51e2), TOBN(0x16b3015b, 0x501e6214),
+ TOBN(0xbb31c560, 0x0a3882ce), TOBN(0x6961bb94, 0xfec11e04),
+ TOBN(0x3b825b8d, 0xeff7a3a0), TOBN(0xbec33738, 0xb1df7326),
+ TOBN(0x68ad747c, 0x99604a1f), TOBN(0xd154c934, 0x9a3bd499),
+ TOBN(0xac33506f, 0x1cc7a906), TOBN(0x73bb5392, 0x6c560e8f),
+ TOBN(0x6428fcbe, 0x263e3944), TOBN(0xc11828d5, 0x1c387434),
+ TOBN(0x3cd04be1, 0x3e4b12ff), TOBN(0xc3aad9f9, 0x2d88667c),
+ TOBN(0xc52ddcf8, 0x248120cf), TOBN(0x985a892e, 0x2a389532),
+ TOBN(0xfbb4b21b, 0x3bb85fa0), TOBN(0xf95375e0, 0x8dfc6269),
+ TOBN(0xfb4fb06c, 0x7ee2acea), TOBN(0x6785426e, 0x309c4d1f),
+ TOBN(0x659b17c8, 0xd8ceb147), TOBN(0x9b649eee, 0xb70a5554),
+ TOBN(0x6b7fa0b5, 0xac6bc634), TOBN(0xd99fe2c7, 0x1d6e732f),
+ TOBN(0x30e6e762, 0x8d3abba2), TOBN(0x18fee6e7, 0xa797b799),
+ TOBN(0x5c9d360d, 0xc696464d), TOBN(0xe3baeb48, 0x27bfde12),
+ TOBN(0x2bf5db47, 0xf23206d5), TOBN(0x2f6d3420, 0x1d260152),
+ TOBN(0x17b87653, 0x3f8ff89a), TOBN(0x5157c30c, 0x378fa458),
+ TOBN(0x7517c5c5, 0x2d4fb936), TOBN(0xef22f7ac, 0xe6518cdc),
+ TOBN(0xdeb483e6, 0xbf847a64), TOBN(0xf5084558, 0x92e0fa89),}
+ ,
+ {TOBN(0xab9659d8, 0xdf7304d4), TOBN(0xb71bcf1b, 0xff210e8e),
+ TOBN(0xa9a2438b, 0xd73fbd60), TOBN(0x4595cd1f, 0x5d11b4de),
+ TOBN(0x9c0d329a, 0x4835859d), TOBN(0x4a0f0d2d, 0x7dbb6e56),
+ TOBN(0xc6038e5e, 0xdf928a4e), TOBN(0xc9429621, 0x8f5ad154),
+ TOBN(0x91213462, 0xf23f2d92), TOBN(0x6cab71bd, 0x60b94078),
+ TOBN(0x6bdd0a63, 0x176cde20), TOBN(0x54c9b20c, 0xee4d54bc),
+ TOBN(0x3cd2d8aa, 0x9f2ac02f), TOBN(0x03f8e617, 0x206eedb0),
+ TOBN(0xc7f68e16, 0x93086434), TOBN(0x831469c5, 0x92dd3db9),
+ TOBN(0x8521df24, 0x8f981354), TOBN(0x587e23ec, 0x3588a259),
+ TOBN(0xcbedf281, 0xd7a0992c), TOBN(0x06930a55, 0x38961407),
+ TOBN(0x09320deb, 0xbe5bbe21), TOBN(0xa7ffa5b5, 0x2491817f),
+ TOBN(0xe6c8b4d9, 0x09065160), TOBN(0xac4f3992, 0xfff6d2a9),
+ TOBN(0x7aa7a158, 0x3ae9c1bd), TOBN(0xe0af6d98, 0xe37ce240),
+ TOBN(0xe54342d9, 0x28ab38b4), TOBN(0xe8b75007, 0x0a1c98ca),
+ TOBN(0xefce86af, 0xe02358f2), TOBN(0x31b8b856, 0xea921228),
+ TOBN(0x052a1912, 0x0a1c67fc), TOBN(0xb4069ea4, 0xe3aead59),
+ TOBN(0x3232d6e2, 0x7fa03cb3), TOBN(0xdb938e5b, 0x0fdd7d88),
+ TOBN(0x04c1d2cd, 0x2ccbfc5d), TOBN(0xd2f45c12, 0xaf3a580f),
+ TOBN(0x592620b5, 0x7883e614), TOBN(0x5fd27e68, 0xbe7c5f26),
+ TOBN(0x139e45a9, 0x1567e1e3), TOBN(0x2cc71d2d, 0x44d8aaaf),
+ TOBN(0x4a9090cd, 0xe36d0757), TOBN(0xf722d7b1, 0xd9a29382),
+ TOBN(0xfb7fb04c, 0x04b48ddf), TOBN(0x628ad2a7, 0xebe16f43),
+ TOBN(0xcd3fbfb5, 0x20226040), TOBN(0x6c34ecb1, 0x5104b6c4),
+ TOBN(0x30c0754e, 0xc903c188), TOBN(0xec336b08, 0x2d23cab0),
+ TOBN(0x473d62a2, 0x1e206ee5), TOBN(0xf1e27480, 0x8c49a633),
+ TOBN(0x87ab956c, 0xe9f6b2c3), TOBN(0x61830b48, 0x62b606ea),
+ TOBN(0x67cd6846, 0xe78e815f), TOBN(0xfe40139f, 0x4c02082a),
+ TOBN(0x52bbbfcb, 0x952ec365), TOBN(0x74c11642, 0x6b9836ab),
+ TOBN(0x9f51439e, 0x558df019), TOBN(0x230da4ba, 0xac712b27),
+ TOBN(0x518919e3, 0x55185a24), TOBN(0x4dcefcdd, 0x84b78f50),
+ TOBN(0xa7d90fb2, 0xa47d4c5a), TOBN(0x55ac9abf, 0xb30e009e),
+ TOBN(0xfd2fc359, 0x74eed273), TOBN(0xb72d824c, 0xdbea8faf),
+ TOBN(0xce721a74, 0x4513e2ca), TOBN(0x0b418612, 0x38240b2c),
+ TOBN(0x05199968, 0xd5baa450), TOBN(0xeb1757ed, 0x2b0e8c25),
+ TOBN(0x6ebc3e28, 0x3dfac6d5), TOBN(0xb2431e2e, 0x48a237f5),
+ TOBN(0x2acb5e23, 0x52f61499), TOBN(0x5558a2a7, 0xe06c936b),
+ TOBN(0xd213f923, 0xcbb13d1b), TOBN(0x98799f42, 0x5bfb9bfe),
+ TOBN(0x1ae8ddc9, 0x701144a9), TOBN(0x0b8b3bb6, 0x4c5595ee),
+ TOBN(0x0ea9ef2e, 0x3ecebb21), TOBN(0x17cb6c4b, 0x3671f9a7),
+ TOBN(0x47ef464f, 0x726f1d1f), TOBN(0x171b9484, 0x6943a276),
+ TOBN(0x51a4ae2d, 0x7ef0329c), TOBN(0x08509222, 0x91c4402a),
+ TOBN(0x64a61d35, 0xafd45bbc), TOBN(0x38f096fe, 0x3035a851),
+ TOBN(0xc7468b74, 0xa1dec027), TOBN(0xe8cf10e7, 0x4fc7dcba),
+ TOBN(0xea35ff40, 0xf4a06353), TOBN(0x0b4c0dfa, 0x8b77dd66),
+ TOBN(0x779b8552, 0xde7e5c19), TOBN(0xfab28609, 0xc1c0256c),
+ TOBN(0x64f58eee, 0xabd4743d), TOBN(0x4e8ef838, 0x7b6cc93b),
+ TOBN(0xee650d26, 0x4cb1bf3d), TOBN(0x4c1f9d09, 0x73dedf61),
+ TOBN(0xaef7c9d7, 0xbfb70ced), TOBN(0x1ec0507e, 0x1641de1e),
+ TOBN(0xcd7e5cc7, 0xcde45079), TOBN(0xde173c9a, 0x516ac9e4),
+ TOBN(0x517a8494, 0xc170315c), TOBN(0x438fd905, 0x91d8e8fb),
+ TOBN(0x5145c506, 0xc7d9630b), TOBN(0x6457a87b, 0xf47d4d75),
+ TOBN(0xd31646bf, 0x0d9a80e8), TOBN(0x453add2b, 0xcef3aabe),
+ TOBN(0xc9941109, 0xa607419d), TOBN(0xfaa71e62, 0xbb6bca80),
+ TOBN(0x34158c13, 0x07c431f3), TOBN(0x594abebc, 0x992bc47a),
+ TOBN(0x6dfea691, 0xeb78399f), TOBN(0x48aafb35, 0x3f42cba4),
+ TOBN(0xedcd65af, 0x077c04f0), TOBN(0x1a29a366, 0xe884491a),
+ TOBN(0x023a40e5, 0x1c21f2bf), TOBN(0xf99a513c, 0xa5057aee),
+ TOBN(0xa3fe7e25, 0xbcab072e), TOBN(0x8568d2e1, 0x40e32bcf),
+ TOBN(0x904594eb, 0xd3f69d9f), TOBN(0x181a9733, 0x07affab1),
+ TOBN(0xe4d68d76, 0xb6e330f4), TOBN(0x87a6dafb, 0xc75a7fc1),
+ TOBN(0x549db2b5, 0xef7d9289), TOBN(0x2480d4a8, 0x197f015a),
+ TOBN(0x61d5590b, 0xc40493b6), TOBN(0x3a55b52e, 0x6f780331),
+ TOBN(0x40eb8115, 0x309eadb0), TOBN(0xdea7de5a, 0x92e5c625),
+ TOBN(0x64d631f0, 0xcc6a3d5a), TOBN(0x9d5e9d7c, 0x93e8dd61),
+ TOBN(0xf297bef5, 0x206d3ffc), TOBN(0x23d5e033, 0x7d808bd4),
+ TOBN(0x4a4f6912, 0xd24cf5ba), TOBN(0xe4d8163b, 0x09cdaa8a),
+ TOBN(0x0e0de9ef, 0xd3082e8e), TOBN(0x4fe1246c, 0x0192f360),
+ TOBN(0x1f900150, 0x4b8eee0a), TOBN(0x5219da81, 0xf1da391b),
+ TOBN(0x7bf6a5c1, 0xf7ea25aa), TOBN(0xd165e6bf, 0xfbb07d5f),
+ TOBN(0xe3539361, 0x89e78671), TOBN(0xa3fcac89, 0x2bac4219),
+ TOBN(0xdfab6fd4, 0xf0baa8ab), TOBN(0x5a4adac1, 0xe2c1c2e5),
+ TOBN(0x6cd75e31, 0x40d85849), TOBN(0xce263fea, 0x19b39181),
+ TOBN(0xcb6803d3, 0x07032c72), TOBN(0x7f40d5ce, 0x790968c8),
+ TOBN(0xa6de86bd, 0xdce978f0), TOBN(0x25547c4f, 0x368f751c),
+ TOBN(0xb1e685fd, 0x65fb2a9e), TOBN(0xce69336f, 0x1eb9179c),
+ TOBN(0xb15d1c27, 0x12504442), TOBN(0xb7df465c, 0xb911a06b),
+ TOBN(0xb8d804a3, 0x315980cd), TOBN(0x693bc492, 0xfa3bebf7),
+ TOBN(0x3578aeee, 0x2253c504), TOBN(0x158de498, 0xcd2474a2),
+ TOBN(0x1331f5c7, 0xcfda8368), TOBN(0xd2d7bbb3, 0x78d7177e),
+ TOBN(0xdf61133a, 0xf3c1e46e), TOBN(0x5836ce7d, 0xd30e7be8),
+ TOBN(0x83084f19, 0x94f834cb), TOBN(0xd35653d4, 0x429ed782),
+ TOBN(0xa542f16f, 0x59e58243), TOBN(0xc2b52f65, 0x0470a22d),
+ TOBN(0xe3b6221b, 0x18f23d96), TOBN(0xcb05abac, 0x3f5252b4),
+ TOBN(0xca00938b, 0x87d61402), TOBN(0x2f186cdd, 0x411933e4),
+ TOBN(0xe042ece5, 0x9a29a5c5), TOBN(0xb19b3c07, 0x3b6c8402),
+ TOBN(0xc97667c7, 0x19d92684), TOBN(0xb5624622, 0xebc66372),
+ TOBN(0x0cb96e65, 0x3c04fa02), TOBN(0x83a7176c, 0x8eaa39aa),
+ TOBN(0x2033561d, 0xeaa1633f), TOBN(0x45a9d086, 0x4533df73),
+ TOBN(0xe0542c1d, 0x3dc090bc), TOBN(0x82c996ef, 0xaa59c167),
+ TOBN(0xe3f735e8, 0x0ee7fc4d), TOBN(0x7b179393, 0x7c35db79),
+ TOBN(0xb6419e25, 0xf8c5dbfd), TOBN(0x4d9d7a1e, 0x1f327b04),
+ TOBN(0x979f6f9b, 0x298dfca8), TOBN(0xc7c5dff1, 0x8de9366a),
+ TOBN(0x1b7a588d, 0x04c82bdd), TOBN(0x68005534, 0xf8319dfd),
+ TOBN(0xde8a55b5, 0xd8eb9580), TOBN(0x5ea886da, 0x8d5bca81),
+ TOBN(0xe8530a01, 0x252a0b4d), TOBN(0x1bffb4fe, 0x35eaa0a1),
+ TOBN(0x2ad828b1, 0xd8e99563), TOBN(0x7de96ef5, 0x95f9cd87),
+ TOBN(0x4abb2d0c, 0xd77d970c), TOBN(0x03cfb933, 0xd33ef9cb),
+ TOBN(0xb0547c01, 0x8b211fe9), TOBN(0x2fe64809, 0xa56ed1c6),
+ TOBN(0xcb7d5624, 0xc2ac98cc), TOBN(0x2a1372c0, 0x1a393e33),
+ TOBN(0xc8d1ec1c, 0x29660521), TOBN(0xf3d31b04, 0xb37ac3e9),
+ TOBN(0xa29ae9df, 0x5ece6e7c), TOBN(0x0603ac8f, 0x0facfb55),
+ TOBN(0xcfe85b7a, 0xdda233a5), TOBN(0xe618919f, 0xbd75f0b8),
+ TOBN(0xf555a3d2, 0x99bf1603), TOBN(0x1f43afc9, 0xf184255a),
+ TOBN(0xdcdaf341, 0x319a3e02), TOBN(0xd3b117ef, 0x03903a39),
+ TOBN(0xe095da13, 0x65d1d131), TOBN(0x86f16367, 0xc37ad03e),
+ TOBN(0x5f37389e, 0x462cd8dd), TOBN(0xc103fa04, 0xd67a60e6),
+ TOBN(0x57c34344, 0xf4b478f0), TOBN(0xce91edd8, 0xe117c98d),
+ TOBN(0x001777b0, 0x231fc12e), TOBN(0x11ae47f2, 0xb207bccb),
+ TOBN(0xd983cf8d, 0x20f8a242), TOBN(0x7aff5b1d, 0xf22e1ad8),
+ TOBN(0x68fd11d0, 0x7fc4feb3), TOBN(0x5d53ae90, 0xb0f1c3e1),
+ TOBN(0x50fb7905, 0xec041803), TOBN(0x85e3c977, 0x14404888),
+ TOBN(0x0e67faed, 0xac628d8f), TOBN(0x2e865150, 0x6668532c),
+ TOBN(0x15acaaa4, 0x6a67a6b0), TOBN(0xf4cdee25, 0xb25cec41),
+ TOBN(0x49ee565a, 0xe4c6701e), TOBN(0x2a04ca66, 0xfc7d63d8),
+ TOBN(0xeb105018, 0xef0543fb), TOBN(0xf709a4f5, 0xd1b0d81d),
+ TOBN(0x5b906ee6, 0x2915d333), TOBN(0xf4a87412, 0x96f1f0ab),
+ TOBN(0xb6b82fa7, 0x4d82f4c2), TOBN(0x90725a60, 0x6804efb3),
+ TOBN(0xbc82ec46, 0xadc3425e), TOBN(0xb7b80581, 0x2787843e),
+ TOBN(0xdf46d91c, 0xdd1fc74c), TOBN(0xdc1c62cb, 0xe783a6c4),
+ TOBN(0x59d1b9f3, 0x1a04cbba), TOBN(0xd87f6f72, 0x95e40764),
+ TOBN(0x02b4cfc1, 0x317f4a76), TOBN(0x8d2703eb, 0x91036bce),
+ TOBN(0x98206cc6, 0xa5e72a56), TOBN(0x57be9ed1, 0xcf53fb0f),
+ TOBN(0x09374571, 0xef0b17ac), TOBN(0x74b2655e, 0xd9181b38),
+ TOBN(0xc8f80ea8, 0x89935d0e), TOBN(0xc0d9e942, 0x91529936),
+ TOBN(0x19686041, 0x1e84e0e5), TOBN(0xa5db84d3, 0xaea34c93),
+ TOBN(0xf9d5bb19, 0x7073a732), TOBN(0xb8d2fe56, 0x6bcfd7c0),
+ TOBN(0x45775f36, 0xf3eb82fa), TOBN(0x8cb20ccc, 0xfdff8b58),
+ TOBN(0x1659b65f, 0x8374c110), TOBN(0xb8b4a422, 0x330c789a),
+ TOBN(0x75e3c3ea, 0x6fe8208b), TOBN(0xbd74b9e4, 0x286e78fe),
+ TOBN(0x0be2e81b, 0xd7d93a1a), TOBN(0x7ed06e27, 0xdd0a5aae),
+ TOBN(0x721f5a58, 0x6be8b800), TOBN(0x428299d1, 0xd846db28),
+ TOBN(0x95cb8e6b, 0x5be88ed3), TOBN(0xc3186b23, 0x1c034e11),
+ TOBN(0xa6312c9e, 0x8977d99b), TOBN(0xbe944331, 0x83f531e7),
+ TOBN(0x8232c0c2, 0x18d3b1d4), TOBN(0x617aae8b, 0xe1247b73),
+ TOBN(0x40153fc4, 0x282aec3b), TOBN(0xc6063d2f, 0xf7b8f823),
+ TOBN(0x68f10e58, 0x3304f94c), TOBN(0x31efae74, 0xee676346),
+ TOBN(0xbadb6c6d, 0x40a9b97c), TOBN(0x14702c63, 0x4f666256),
+ TOBN(0xdeb954f1, 0x5184b2e3), TOBN(0x5184a526, 0x94b6ca40),
+ TOBN(0xfff05337, 0x003c32ea), TOBN(0x5aa374dd, 0x205974c7),
+ TOBN(0x9a763854, 0x4b0dd71a), TOBN(0x459cd27f, 0xdeb947ec),
+ TOBN(0xa6e28161, 0x459c2b92), TOBN(0x2f020fa8, 0x75ee8ef5),
+ TOBN(0xb132ec2d, 0x30b06310), TOBN(0xc3e15899, 0xbc6a4530),
+ TOBN(0xdc5f53fe, 0xaa3f451a), TOBN(0x3a3c7f23, 0xc2d9acac),
+ TOBN(0x2ec2f892, 0x6b27e58b), TOBN(0x68466ee7, 0xd742799f),
+ TOBN(0x98324dd4, 0x1fa26613), TOBN(0xa2dc6dab, 0xbdc29d63),
+ TOBN(0xf9675faa, 0xd712d657), TOBN(0x813994be, 0x21fd8d15),
+ TOBN(0x5ccbb722, 0xfd4f7553), TOBN(0x5135ff8b, 0xf3a36b20),
+ TOBN(0x44be28af, 0x69559df5), TOBN(0x40b65bed, 0x9d41bf30),
+ TOBN(0xd98bf2a4, 0x3734e520), TOBN(0x5e3abbe3, 0x209bdcba),
+ TOBN(0x77c76553, 0xbc945b35), TOBN(0x5331c093, 0xc6ef14aa),
+ TOBN(0x518ffe29, 0x76b60c80), TOBN(0x2285593b, 0x7ace16f8),
+ TOBN(0xab1f64cc, 0xbe2b9784), TOBN(0xe8f2c0d9, 0xab2421b6),
+ TOBN(0x617d7174, 0xc1df065c), TOBN(0xafeeb5ab, 0x5f6578fa),
+ TOBN(0x16ff1329, 0x263b54a8), TOBN(0x45c55808, 0xc990dce3),
+ TOBN(0x42eab6c0, 0xecc8c177), TOBN(0x799ea9b5, 0x5982ecaa),
+ TOBN(0xf65da244, 0xb607ef8e), TOBN(0x8ab226ce, 0x32a3fc2c),
+ TOBN(0x745741e5, 0x7ea973dc), TOBN(0x5c00ca70, 0x20888f2e),
+ TOBN(0x7cdce3cf, 0x45fd9cf1), TOBN(0x8a741ef1, 0x5507f872),
+ TOBN(0x47c51c2f, 0x196b4cec), TOBN(0x70d08e43, 0xc97ea618),
+ TOBN(0x930da15c, 0x15b18a2b), TOBN(0x33b6c678, 0x2f610514),
+ TOBN(0xc662e4f8, 0x07ac9794), TOBN(0x1eccf050, 0xba06cb79),
+ TOBN(0x1ff08623, 0xe7d954e5), TOBN(0x6ef2c5fb, 0x24cf71c3),
+ TOBN(0xb2c063d2, 0x67978453), TOBN(0xa0cf3796, 0x1d654af8),
+ TOBN(0x7cb242ea, 0x7ebdaa37), TOBN(0x206e0b10, 0xb86747e0),
+ TOBN(0x481dae5f, 0xd5ecfefc), TOBN(0x07084fd8, 0xc2bff8fc),
+ TOBN(0x8040a01a, 0xea324596), TOBN(0x4c646980, 0xd4de4036),
+ TOBN(0x9eb8ab4e, 0xd65abfc3), TOBN(0xe01cb91f, 0x13541ec7),
+ TOBN(0x8f029adb, 0xfd695012), TOBN(0x9ae28483, 0x3c7569ec),
+ TOBN(0xa5614c9e, 0xa66d80a1), TOBN(0x680a3e44, 0x75f5f911),
+ TOBN(0x0c07b14d, 0xceba4fc1), TOBN(0x891c285b, 0xa13071c1),
+ TOBN(0xcac67ceb, 0x799ece3c), TOBN(0x29b910a9, 0x41e07e27),
+ TOBN(0x66bdb409, 0xf2e43123), TOBN(0x06f8b137, 0x7ac9ecbe),
+ TOBN(0x5981fafd, 0x38547090), TOBN(0x19ab8b9f, 0x85e3415d),
+ TOBN(0xfc28c194, 0xc7e31b27), TOBN(0x843be0aa, 0x6fbcbb42),
+ TOBN(0xf3b1ed43, 0xa6db836c), TOBN(0x2a1330e4, 0x01a45c05),
+ TOBN(0x4f19f3c5, 0x95c1a377), TOBN(0xa85f39d0, 0x44b5ee33),
+ TOBN(0x3da18e6d, 0x4ae52834), TOBN(0x5a403b39, 0x7423dcb0),
+ TOBN(0xbb555e0a, 0xf2374aef), TOBN(0x2ad599c4, 0x1e8ca111),
+ TOBN(0x1b3a2fb9, 0x014b3bf8), TOBN(0x73092684, 0xf66d5007),
+ TOBN(0x079f1426, 0xc4340102), TOBN(0x1827cf81, 0x8fddf4de),
+ TOBN(0xc83605f6, 0xf10ff927), TOBN(0xd3871451, 0x23739fc6),
+ TOBN(0x6d163450, 0xcac1c2cc), TOBN(0x6b521296, 0xa2ec1ac5),
+ TOBN(0x0606c4f9, 0x6e3cb4a5), TOBN(0xe47d3f41, 0x778abff7),
+ TOBN(0x425a8d5e, 0xbe8e3a45), TOBN(0x53ea9e97, 0xa6102160),
+ TOBN(0x477a106e, 0x39cbb688), TOBN(0x532401d2, 0xf3386d32),
+ TOBN(0x8e564f64, 0xb1b9b421), TOBN(0xca9b8388, 0x81dad33f),
+ TOBN(0xb1422b4e, 0x2093913e), TOBN(0x533d2f92, 0x69bc8112),
+ TOBN(0x3fa017be, 0xebe7b2c7), TOBN(0xb2767c4a, 0xcaf197c6),
+ TOBN(0xc925ff87, 0xaedbae9f), TOBN(0x7daf0eb9, 0x36880a54),
+ TOBN(0x9284ddf5, 0x9c4d0e71), TOBN(0x1581cf93, 0x316f8cf5),
+ TOBN(0x3eeca887, 0x3ac1f452), TOBN(0xb417fce9, 0xfb6aeffe),
+ TOBN(0xa5918046, 0xeefb8dc3), TOBN(0x73d318ac, 0x02209400),
+ TOBN(0xe800400f, 0x728693e5), TOBN(0xe87d814b, 0x339927ed),
+ TOBN(0x93e94d3b, 0x57ea9910), TOBN(0xff8a35b6, 0x2245fb69),
+ TOBN(0x043853d7, 0x7f200d34), TOBN(0x470f1e68, 0x0f653ce1),
+ TOBN(0x81ac05bd, 0x59a06379), TOBN(0xa14052c2, 0x03930c29),
+ TOBN(0x6b72fab5, 0x26bc2797), TOBN(0x13670d16, 0x99f16771),
+ TOBN(0x00170052, 0x1e3e48d1), TOBN(0x978fe401, 0xb7adf678),
+ TOBN(0x55ecfb92, 0xd41c5dd4), TOBN(0x5ff8e247, 0xc7b27da5),
+ TOBN(0xe7518272, 0x013fb606), TOBN(0x5768d7e5, 0x2f547a3c),
+ TOBN(0xbb24eaa3, 0x60017a5f), TOBN(0x6b18e6e4, 0x9c64ce9b),
+ TOBN(0xc225c655, 0x103dde07), TOBN(0xfc3672ae, 0x7592f7ea),
+ TOBN(0x9606ad77, 0xd06283a1), TOBN(0x542fc650, 0xe4d59d99),
+ TOBN(0xabb57c49, 0x2a40e7c2), TOBN(0xac948f13, 0xa8db9f55),
+ TOBN(0x6d4c9682, 0xb04465c3), TOBN(0xe3d062fa, 0x6468bd15),
+ TOBN(0xa51729ac, 0x5f318d7e), TOBN(0x1fc87df6, 0x9eb6fc95),
+ TOBN(0x63d146a8, 0x0591f652), TOBN(0xa861b8f7, 0x589621aa),
+ TOBN(0x59f5f15a, 0xce31348c), TOBN(0x8f663391, 0x440da6da),
+ TOBN(0xcfa778ac, 0xb591ffa3), TOBN(0x027ca9c5, 0x4cdfebce),
+ TOBN(0xbe8e05a5, 0x444ea6b3), TOBN(0x8aab4e69, 0xa78d8254),
+ TOBN(0x2437f04f, 0xb474d6b8), TOBN(0x6597ffd4, 0x045b3855),
+ TOBN(0xbb0aea4e, 0xca47ecaa), TOBN(0x568aae83, 0x85c7ebfc),
+ TOBN(0x0e966e64, 0xc73b2383), TOBN(0x49eb3447, 0xd17d8762),
+ TOBN(0xde107821, 0x8da05dab), TOBN(0x443d8baa, 0x016b7236),
+ TOBN(0x163b63a5, 0xea7610d6), TOBN(0xe47e4185, 0xce1ca979),
+ TOBN(0xae648b65, 0x80baa132), TOBN(0xebf53de2, 0x0e0d5b64),
+ TOBN(0x8d3bfcb4, 0xd3c8c1ca), TOBN(0x0d914ef3, 0x5d04b309),
+ TOBN(0x55ef6415, 0x3de7d395), TOBN(0xbde1666f, 0x26b850e8),
+ TOBN(0xdbe1ca6e, 0xd449ab19), TOBN(0x8902b322, 0xe89a2672),
+ TOBN(0xb1674b7e, 0xdacb7a53), TOBN(0x8e9faf6e, 0xf52523ff),
+ TOBN(0x6ba535da, 0x9a85788b), TOBN(0xd21f03ae, 0xbd0626d4),
+ TOBN(0x099f8c47, 0xe873dc64), TOBN(0xcda8564d, 0x018ec97e),
+ TOBN(0x3e8d7a5c, 0xde92c68c), TOBN(0x78e035a1, 0x73323cc4),
+ TOBN(0x3ef26275, 0xf880ff7c), TOBN(0xa4ee3dff, 0x273eedaa),
+ TOBN(0x58823507, 0xaf4e18f8), TOBN(0x967ec9b5, 0x0672f328),
+ TOBN(0x9ded19d9, 0x559d3186), TOBN(0x5e2ab3de, 0x6cdce39c),
+ TOBN(0xabad6e4d, 0x11c226df), TOBN(0xf9783f43, 0x87723014),
+ TOBN(0x9a49a0cf, 0x1a885719), TOBN(0xfc0c1a5a, 0x90da9dbf),
+ TOBN(0x8bbaec49, 0x571d92ac), TOBN(0x569e85fe, 0x4692517f),
+ TOBN(0x8333b014, 0xa14ea4af), TOBN(0x32f2a62f, 0x12e5c5ad),
+ TOBN(0x98c2ce3a, 0x06d89b85), TOBN(0xb90741aa, 0x2ff77a08),
+ TOBN(0x2530defc, 0x01f795a2), TOBN(0xd6e5ba0b, 0x84b3c199),
+ TOBN(0x7d8e8451, 0x12e4c936), TOBN(0xae419f7d, 0xbd0be17b),
+ TOBN(0xa583fc8c, 0x22262bc9), TOBN(0x6b842ac7, 0x91bfe2bd),
+ TOBN(0x33cef4e9, 0x440d6827), TOBN(0x5f69f4de, 0xef81fb14),
+ TOBN(0xf16cf6f6, 0x234fbb92), TOBN(0x76ae3fc3, 0xd9e7e158),
+ TOBN(0x4e89f6c2, 0xe9740b33), TOBN(0x677bc85d, 0x4962d6a1),
+ TOBN(0x6c6d8a7f, 0x68d10d15), TOBN(0x5f9a7224, 0x0257b1cd),
+ TOBN(0x7096b916, 0x4ad85961), TOBN(0x5f8c47f7, 0xe657ab4a),
+ TOBN(0xde57d7d0, 0xf7461d7e), TOBN(0x7eb6094d, 0x80ce5ee2),
+ TOBN(0x0b1e1dfd, 0x34190547), TOBN(0x8a394f43, 0xf05dd150),
+ TOBN(0x0a9eb24d, 0x97df44e6), TOBN(0x78ca06bf, 0x87675719),
+ TOBN(0x6f0b3462, 0x6ffeec22), TOBN(0x9d91bcea, 0x36cdd8fb),
+ TOBN(0xac83363c, 0xa105be47), TOBN(0x81ba76c1, 0x069710e3),
+ TOBN(0x3d1b24cb, 0x28c682c6), TOBN(0x27f25228, 0x8612575b),
+ TOBN(0xb587c779, 0xe8e66e98), TOBN(0x7b0c03e9, 0x405eb1fe),
+ TOBN(0xfdf0d030, 0x15b548e7), TOBN(0xa8be76e0, 0x38b36af7),
+ TOBN(0x4cdab04a, 0x4f310c40), TOBN(0x6287223e, 0xf47ecaec),
+ TOBN(0x678e6055, 0x8b399320), TOBN(0x61fe3fa6, 0xc01e4646),
+ TOBN(0xc482866b, 0x03261a5e), TOBN(0xdfcf45b8, 0x5c2f244a),
+ TOBN(0x8fab9a51, 0x2f684b43), TOBN(0xf796c654, 0xc7220a66),
+ TOBN(0x1d90707e, 0xf5afa58f), TOBN(0x2c421d97, 0x4fdbe0de),
+ TOBN(0xc4f4cda3, 0xaf2ebc2f), TOBN(0xa0af843d, 0xcb4efe24),
+ TOBN(0x53b857c1, 0x9ccd10b1), TOBN(0xddc9d1eb, 0x914d3e04),
+ TOBN(0x7bdec8bb, 0x62771deb), TOBN(0x829277aa, 0x91c5aa81),
+ TOBN(0x7af18dd6, 0x832391ae), TOBN(0x1740f316, 0xc71a84ca),}
+ ,
+ {TOBN(0x8928e99a, 0xeeaf8c49), TOBN(0xee7aa73d, 0x6e24d728),
+ TOBN(0x4c5007c2, 0xe72b156c), TOBN(0x5fcf57c5, 0xed408a1d),
+ TOBN(0x9f719e39, 0xb6057604), TOBN(0x7d343c01, 0xc2868bbf),
+ TOBN(0x2cca254b, 0x7e103e2d), TOBN(0xe6eb38a9, 0xf131bea2),
+ TOBN(0xb33e624f, 0x8be762b4), TOBN(0x2a9ee4d1, 0x058e3413),
+ TOBN(0x968e6369, 0x67d805fa), TOBN(0x9848949b, 0x7db8bfd7),
+ TOBN(0x5308d7e5, 0xd23a8417), TOBN(0x892f3b1d, 0xf3e29da5),
+ TOBN(0xc95c139e, 0x3dee471f), TOBN(0x8631594d, 0xd757e089),
+ TOBN(0xe0c82a3c, 0xde918dcc), TOBN(0x2e7b5994, 0x26fdcf4b),
+ TOBN(0x82c50249, 0x32cb1b2d), TOBN(0xea613a9d, 0x7657ae07),
+ TOBN(0xc2eb5f6c, 0xf1fdc9f7), TOBN(0xb6eae8b8, 0x879fe682),
+ TOBN(0x253dfee0, 0x591cbc7f), TOBN(0x000da713, 0x3e1290e6),
+ TOBN(0x1083e2ea, 0x1f095615), TOBN(0x0a28ad77, 0x14e68c33),
+ TOBN(0x6bfc0252, 0x3d8818be), TOBN(0xb585113a, 0xf35850cd),
+ TOBN(0x7d935f0b, 0x30df8aa1), TOBN(0xaddda07c, 0x4ab7e3ac),
+ TOBN(0x92c34299, 0x552f00cb), TOBN(0xc33ed1de, 0x2909df6c),
+ TOBN(0x22c2195d, 0x80e87766), TOBN(0x9e99e6d8, 0x9ddf4ac0),
+ TOBN(0x09642e4e, 0x65e74934), TOBN(0x2610ffa2, 0xff1ff241),
+ TOBN(0x4d1d47d4, 0x751c8159), TOBN(0x697b4985, 0xaf3a9363),
+ TOBN(0x0318ca46, 0x87477c33), TOBN(0xa90cb565, 0x9441eff3),
+ TOBN(0x58bb3848, 0x36f024cb), TOBN(0x85be1f77, 0x36016168),
+ TOBN(0x6c59587c, 0xdc7e07f1), TOBN(0x191be071, 0xaf1d8f02),
+ TOBN(0xbf169fa5, 0xcca5e55c), TOBN(0x3864ba3c, 0xf7d04eac),
+ TOBN(0x915e367f, 0x8d7d05db), TOBN(0xb48a876d, 0xa6549e5d),
+ TOBN(0xef89c656, 0x580e40a2), TOBN(0xf194ed8c, 0x728068bc),
+ TOBN(0x74528045, 0xa47990c9), TOBN(0xf53fc7d7, 0x5e1a4649),
+ TOBN(0xbec5ae9b, 0x78593e7d), TOBN(0x2cac4ee3, 0x41db65d7),
+ TOBN(0xa8c1eb24, 0x04a3d39b), TOBN(0x53b7d634, 0x03f8f3ef),
+ TOBN(0x2dc40d48, 0x3e07113c), TOBN(0x6e4a5d39, 0x7d8b63ae),
+ TOBN(0x5582a94b, 0x79684c2b), TOBN(0x932b33d4, 0x622da26c),
+ TOBN(0xf534f651, 0x0dbbf08d), TOBN(0x211d07c9, 0x64c23a52),
+ TOBN(0x0eeece0f, 0xee5bdc9b), TOBN(0xdf178168, 0xf7015558),
+ TOBN(0xd4294635, 0x0a712229), TOBN(0x93cbe448, 0x09273f8c),
+ TOBN(0x00b095ef, 0x8f13bc83), TOBN(0xbb741972, 0x8798978c),
+ TOBN(0x9d7309a2, 0x56dbe6e7), TOBN(0xe578ec56, 0x5a5d39ec),
+ TOBN(0x3961151b, 0x851f9a31), TOBN(0x2da7715d, 0xe5709eb4),
+ TOBN(0x867f3017, 0x53dfabf0), TOBN(0x728d2078, 0xb8e39259),
+ TOBN(0x5c75a0cd, 0x815d9958), TOBN(0xf84867a6, 0x16603be1),
+ TOBN(0xc865b13d, 0x70e35b1c), TOBN(0x02414468, 0x19b03e2c),
+ TOBN(0xe46041da, 0xac1f3121), TOBN(0x7c9017ad, 0x6f028a7c),
+ TOBN(0xabc96de9, 0x0a482873), TOBN(0x4265d6b1, 0xb77e54d4),
+ TOBN(0x68c38e79, 0xa57d88e7), TOBN(0xd461d766, 0x9ce82de3),
+ TOBN(0x817a9ec5, 0x64a7e489), TOBN(0xcc5675cd, 0xa0def5f2),
+ TOBN(0x9a00e785, 0x985d494e), TOBN(0xc626833f, 0x1b03514a),
+ TOBN(0xabe7905a, 0x83cdd60e), TOBN(0x50602fb5, 0xa1170184),
+ TOBN(0x689886cd, 0xb023642a), TOBN(0xd568d090, 0xa6e1fb00),
+ TOBN(0x5b1922c7, 0x0259217f), TOBN(0x93831cd9, 0xc43141e4),
+ TOBN(0xdfca3587, 0x0c95f86e), TOBN(0xdec2057a, 0x568ae828),
+ TOBN(0xc44ea599, 0xf98a759a), TOBN(0x55a0a7a2, 0xf7c23c1d),
+ TOBN(0xd5ffb6e6, 0x94c4f687), TOBN(0x3563cce2, 0x12848478),
+ TOBN(0x812b3517, 0xe7b1fbe1), TOBN(0x8a7dc979, 0x4f7338e0),
+ TOBN(0x211ecee9, 0x52d048db), TOBN(0x2eea4056, 0xc86ea3b8),
+ TOBN(0xd8cb68a7, 0xba772b34), TOBN(0xe16ed341, 0x5f4e2541),
+ TOBN(0x9b32f6a6, 0x0fec14db), TOBN(0xeee376f7, 0x391698be),
+ TOBN(0xe9a7aa17, 0x83674c02), TOBN(0x65832f97, 0x5843022a),
+ TOBN(0x29f3a8da, 0x5ba4990f), TOBN(0x79a59c3a, 0xfb8e3216),
+ TOBN(0x9cdc4d2e, 0xbd19bb16), TOBN(0xc6c7cfd0, 0xb3262d86),
+ TOBN(0xd4ce14d0, 0x969c0b47), TOBN(0x1fa352b7, 0x13e56128),
+ TOBN(0x383d55b8, 0x973db6d3), TOBN(0x71836850, 0xe8e5b7bf),
+ TOBN(0xc7714596, 0xe6bb571f), TOBN(0x259df31f, 0x2d5b2dd2),
+ TOBN(0x568f8925, 0x913cc16d), TOBN(0x18bc5b6d, 0xe1a26f5a),
+ TOBN(0xdfa413be, 0xf5f499ae), TOBN(0xf8835dec, 0xc3f0ae84),
+ TOBN(0xb6e60bd8, 0x65a40ab0), TOBN(0x65596439, 0x194b377e),
+ TOBN(0xbcd85625, 0x92084a69), TOBN(0x5ce433b9, 0x4f23ede0),
+ TOBN(0xe8e8f04f, 0x6ad65143), TOBN(0x11511827, 0xd6e14af6),
+ TOBN(0x3d390a10, 0x8295c0c7), TOBN(0x71e29ee4, 0x621eba16),
+ TOBN(0xa588fc09, 0x63717b46), TOBN(0x02be02fe, 0xe06ad4a2),
+ TOBN(0x931558c6, 0x04c22b22), TOBN(0xbb4d4bd6, 0x12f3c849),
+ TOBN(0x54a4f496, 0x20efd662), TOBN(0x92ba6d20, 0xc5952d14),
+ TOBN(0x2db8ea1e, 0xcc9784c2), TOBN(0x81cc10ca, 0x4b353644),
+ TOBN(0x40b570ad, 0x4b4d7f6c), TOBN(0x5c9f1d96, 0x84a1dcd2),
+ TOBN(0x01379f81, 0x3147e797), TOBN(0xe5c6097b, 0x2bd499f5),
+ TOBN(0x40dcafa6, 0x328e5e20), TOBN(0xf7b5244a, 0x54815550),
+ TOBN(0xb9a4f118, 0x47bfc978), TOBN(0x0ea0e79f, 0xd25825b1),
+ TOBN(0xa50f96eb, 0x646c7ecf), TOBN(0xeb811493, 0x446dea9d),
+ TOBN(0x2af04677, 0xdfabcf69), TOBN(0xbe3a068f, 0xc713f6e8),
+ TOBN(0x860d523d, 0x42e06189), TOBN(0xbf077941, 0x4e3aff13),
+ TOBN(0x0b616dca, 0xc1b20650), TOBN(0xe66dd6d1, 0x2131300d),
+ TOBN(0xd4a0fd67, 0xff99abde), TOBN(0xc9903550, 0xc7aac50d),
+ TOBN(0x022ecf8b, 0x7c46b2d7), TOBN(0x3333b1e8, 0x3abf92af),
+ TOBN(0x11cc113c, 0x6c491c14), TOBN(0x05976688, 0x80dd3f88),
+ TOBN(0xf5b4d9e7, 0x29d932ed), TOBN(0xe982aad8, 0xa2c38b6d),
+ TOBN(0x6f925347, 0x8be0dcf0), TOBN(0x700080ae, 0x65ca53f2),
+ TOBN(0xd8131156, 0x443ca77f), TOBN(0xe92d6942, 0xec51f984),
+ TOBN(0xd2a08af8, 0x85dfe9ae), TOBN(0xd825d9a5, 0x4d2a86ca),
+ TOBN(0x2c53988d, 0x39dff020), TOBN(0xf38b135a, 0x430cdc40),
+ TOBN(0x0c918ae0, 0x62a7150b), TOBN(0xf31fd8de, 0x0c340e9b),
+ TOBN(0xafa0e7ae, 0x4dbbf02e), TOBN(0x5847fb2a, 0x5eba6239),
+ TOBN(0x6b1647dc, 0xdccbac8b), TOBN(0xb642aa78, 0x06f485c8),
+ TOBN(0x873f3765, 0x7038ecdf), TOBN(0x2ce5e865, 0xfa49d3fe),
+ TOBN(0xea223788, 0xc98c4400), TOBN(0x8104a8cd, 0xf1fa5279),
+ TOBN(0xbcf7cc7a, 0x06becfd7), TOBN(0x49424316, 0xc8f974ae),
+ TOBN(0xc0da65e7, 0x84d6365d), TOBN(0xbcb7443f, 0x8f759fb8),
+ TOBN(0x35c712b1, 0x7ae81930), TOBN(0x80428dff, 0x4c6e08ab),
+ TOBN(0xf19dafef, 0xa4faf843), TOBN(0xced8538d, 0xffa9855f),
+ TOBN(0x20ac409c, 0xbe3ac7ce), TOBN(0x358c1fb6, 0x882da71e),
+ TOBN(0xafa9c0e5, 0xfd349961), TOBN(0x2b2cfa51, 0x8421c2fc),
+ TOBN(0x2a80db17, 0xf3a28d38), TOBN(0xa8aba539, 0x5d138e7e),
+ TOBN(0x52012d1d, 0x6e96eb8d), TOBN(0x65d8dea0, 0xcbaf9622),
+ TOBN(0x57735447, 0xb264f56c), TOBN(0xbeebef3f, 0x1b6c8da2),
+ TOBN(0xfc346d98, 0xce785254), TOBN(0xd50e8d72, 0xbb64a161),
+ TOBN(0xc03567c7, 0x49794add), TOBN(0x15a76065, 0x752c7ef6),
+ TOBN(0x59f3a222, 0x961f23d6), TOBN(0x378e4438, 0x73ecc0b0),
+ TOBN(0xc74be434, 0x5a82fde4), TOBN(0xae509af2, 0xd8b9cf34),
+ TOBN(0x4a61ee46, 0x577f44a1), TOBN(0xe09b748c, 0xb611deeb),
+ TOBN(0xc0481b2c, 0xf5f7b884), TOBN(0x35626678, 0x61acfa6b),
+ TOBN(0x37f4c518, 0xbf8d21e6), TOBN(0x22d96531, 0xb205a76d),
+ TOBN(0x37fb85e1, 0x954073c0), TOBN(0xbceafe4f, 0x65b3a567),
+ TOBN(0xefecdef7, 0xbe42a582), TOBN(0xd3fc6080, 0x65046be6),
+ TOBN(0xc9af13c8, 0x09e8dba9), TOBN(0x1e6c9847, 0x641491ff),
+ TOBN(0x3b574925, 0xd30c31f7), TOBN(0xb7eb72ba, 0xac2a2122),
+ TOBN(0x776a0dac, 0xef0859e7), TOBN(0x06fec314, 0x21900942),
+ TOBN(0x2464bc10, 0xf8c22049), TOBN(0x9bfbcce7, 0x875ebf69),
+ TOBN(0xd7a88e2a, 0x4336326b), TOBN(0xda05261c, 0x5bc2acfa),
+ TOBN(0xc29f5bdc, 0xeba7efc8), TOBN(0x471237ca, 0x25dbbf2e),
+ TOBN(0xa72773f2, 0x2975f127), TOBN(0xdc744e8e, 0x04d0b326),
+ TOBN(0x38a7ed16, 0xa56edb73), TOBN(0x64357e37, 0x2c007e70),
+ TOBN(0xa167d15b, 0x5080b400), TOBN(0x07b41164, 0x23de4be1),
+ TOBN(0xb2d91e32, 0x74c89883), TOBN(0x3c162821, 0x2882e7ed),
+ TOBN(0xad6b36ba, 0x7503e482), TOBN(0x48434e8e, 0x0ea34331),
+ TOBN(0x79f4f24f, 0x2c7ae0b9), TOBN(0xc46fbf81, 0x1939b44a),
+ TOBN(0x76fefae8, 0x56595eb1), TOBN(0x417b66ab, 0xcd5f29c7),
+ TOBN(0x5f2332b2, 0xc5ceec20), TOBN(0xd69661ff, 0xe1a1cae2),
+ TOBN(0x5ede7e52, 0x9b0286e6), TOBN(0x9d062529, 0xe276b993),
+ TOBN(0x324794b0, 0x7e50122b), TOBN(0xdd744f8b, 0x4af07ca5),
+ TOBN(0x30a12f08, 0xd63fc97b), TOBN(0x39650f1a, 0x76626d9d),
+ TOBN(0x101b47f7, 0x1fa38477), TOBN(0x3d815f19, 0xd4dc124f),
+ TOBN(0x1569ae95, 0xb26eb58a), TOBN(0xc3cde188, 0x95fb1887),
+ TOBN(0x54e9f37b, 0xf9539a48), TOBN(0xb0100e06, 0x7408c1a5),
+ TOBN(0x821d9811, 0xea580cbb), TOBN(0x8af52d35, 0x86e50c56),
+ TOBN(0xdfbd9d47, 0xdbbf698b), TOBN(0x2961a1ea, 0x03dc1c73),
+ TOBN(0x203d38f8, 0xe76a5df8), TOBN(0x08a53a68, 0x6def707a),
+ TOBN(0x26eefb48, 0x1bee45d4), TOBN(0xb3cee346, 0x3c688036),
+ TOBN(0x463c5315, 0xc42f2469), TOBN(0x19d84d2e, 0x81378162),
+ TOBN(0x22d7c3c5, 0x1c4d349f), TOBN(0x65965844, 0x163d59c5),
+ TOBN(0xcf198c56, 0xb8abceae), TOBN(0x6fb1fb1b, 0x628559d5),
+ TOBN(0x8bbffd06, 0x07bf8fe3), TOBN(0x46259c58, 0x3467734b),
+ TOBN(0xd8953cea, 0x35f7f0d3), TOBN(0x1f0bece2, 0xd65b0ff1),
+ TOBN(0xf7d5b4b3, 0xf3c72914), TOBN(0x29e8ea95, 0x3cb53389),
+ TOBN(0x4a365626, 0x836b6d46), TOBN(0xe849f910, 0xea174fde),
+ TOBN(0x7ec62fbb, 0xf4737f21), TOBN(0xd8dba5ab, 0x6209f5ac),
+ TOBN(0x24b5d7a9, 0xa5f9adbe), TOBN(0x707d28f7, 0xa61dc768),
+ TOBN(0x7711460b, 0xcaa999ea), TOBN(0xba7b174d, 0x1c92e4cc),
+ TOBN(0x3c4bab66, 0x18d4bf2d), TOBN(0xb8f0c980, 0xeb8bd279),
+ TOBN(0x024bea9a, 0x324b4737), TOBN(0xfba9e423, 0x32a83bca),
+ TOBN(0x6e635643, 0xa232dced), TOBN(0x99619367, 0x2571c8ba),
+ TOBN(0xe8c9f357, 0x54b7032b), TOBN(0xf936b3ba, 0x2442d54a),
+ TOBN(0x2263f0f0, 0x8290c65a), TOBN(0x48989780, 0xee2c7fdb),
+ TOBN(0xadc5d55a, 0x13d4f95e), TOBN(0x737cff85, 0xad9b8500),
+ TOBN(0x271c557b, 0x8a73f43d), TOBN(0xbed617a4, 0xe18bc476),
+ TOBN(0x66245401, 0x7dfd8ab2), TOBN(0xae7b89ae, 0x3a2870aa),
+ TOBN(0x1b555f53, 0x23a7e545), TOBN(0x6791e247, 0xbe057e4c),
+ TOBN(0x860136ad, 0x324fa34d), TOBN(0xea111447, 0x4cbeae28),
+ TOBN(0x023a4270, 0xbedd3299), TOBN(0x3d5c3a7f, 0xc1c35c34),
+ TOBN(0xb0f6db67, 0x8d0412d2), TOBN(0xd92625e2, 0xfcdc6b9a),
+ TOBN(0x92ae5ccc, 0x4e28a982), TOBN(0xea251c36, 0x47a3ce7e),
+ TOBN(0x9d658932, 0x790691bf), TOBN(0xed610589, 0x06b736ae),
+ TOBN(0x712c2f04, 0xc0d63b6e), TOBN(0x5cf06fd5, 0xc63d488f),
+ TOBN(0x97363fac, 0xd9588e41), TOBN(0x1f9bf762, 0x2b93257e),
+ TOBN(0xa9d1ffc4, 0x667acace), TOBN(0x1cf4a1aa, 0x0a061ecf),
+ TOBN(0x40e48a49, 0xdc1818d0), TOBN(0x0643ff39, 0xa3621ab0),
+ TOBN(0x5768640c, 0xe39ef639), TOBN(0x1fc099ea, 0x04d86854),
+ TOBN(0x9130b9c3, 0xeccd28fd), TOBN(0xd743cbd2, 0x7eec54ab),
+ TOBN(0x052b146f, 0xe5b475b6), TOBN(0x058d9a82, 0x900a7d1f),
+ TOBN(0x65e02292, 0x91262b72), TOBN(0x96f924f9, 0xbb0edf03),
+ TOBN(0x5cfa59c8, 0xfe206842), TOBN(0xf6037004, 0x5eafa720),
+ TOBN(0x5f30699e, 0x18d7dd96), TOBN(0x381e8782, 0xcbab2495),
+ TOBN(0x91669b46, 0xdd8be949), TOBN(0xb40606f5, 0x26aae8ef),
+ TOBN(0x2812b839, 0xfc6751a4), TOBN(0x16196214, 0xfba800ef),
+ TOBN(0x4398d5ca, 0x4c1a2875), TOBN(0x720c00ee, 0x653d8349),
+ TOBN(0xc2699eb0, 0xd820007c), TOBN(0x880ee660, 0xa39b5825),
+ TOBN(0x70694694, 0x471f6984), TOBN(0xf7d16ea8, 0xe3dda99a),
+ TOBN(0x28d675b2, 0xc0519a23), TOBN(0x9ebf94fe, 0x4f6952e3),
+ TOBN(0xf28bb767, 0xa2294a8a), TOBN(0x85512b4d, 0xfe0af3f5),
+ TOBN(0x18958ba8, 0x99b16a0d), TOBN(0x95c2430c, 0xba7548a7),
+ TOBN(0xb30d1b10, 0xa16be615), TOBN(0xe3ebbb97, 0x85bfb74c),
+ TOBN(0xa3273cfe, 0x18549fdb), TOBN(0xf6e200bf, 0x4fcdb792),
+ TOBN(0x54a76e18, 0x83aba56c), TOBN(0x73ec66f6, 0x89ef6aa2),
+ TOBN(0x8d17add7, 0xd1b9a305), TOBN(0xa959c5b9, 0xb7ae1b9d),
+ TOBN(0x88643522, 0x6bcc094a), TOBN(0xcc5616c4, 0xd7d429b9),
+ TOBN(0xa6dada01, 0xe6a33f7c), TOBN(0xc6217a07, 0x9d4e70ad),
+ TOBN(0xd619a818, 0x09c15b7c), TOBN(0xea06b329, 0x0e80c854),
+ TOBN(0x174811ce, 0xa5f5e7b9), TOBN(0x66dfc310, 0x787c65f4),
+ TOBN(0x4ea7bd69, 0x3316ab54), TOBN(0xc12c4acb, 0x1dcc0f70),
+ TOBN(0xe4308d1a, 0x1e407dd9), TOBN(0xe8a3587c, 0x91afa997),
+ TOBN(0xea296c12, 0xab77b7a5), TOBN(0xb5ad49e4, 0x673c0d52),
+ TOBN(0x40f9b2b2, 0x7006085a), TOBN(0xa88ff340, 0x87bf6ec2),
+ TOBN(0x978603b1, 0x4e3066a6), TOBN(0xb3f99fc2, 0xb5e486e2),
+ TOBN(0x07b53f5e, 0xb2e63645), TOBN(0xbe57e547, 0x84c84232),
+ TOBN(0xd779c216, 0x7214d5cf), TOBN(0x617969cd, 0x029a3aca),
+ TOBN(0xd17668cd, 0x8a7017a0), TOBN(0x77b4d19a, 0xbe9b7ee8),
+ TOBN(0x58fd0e93, 0x9c161776), TOBN(0xa8c4f4ef, 0xd5968a72),
+ TOBN(0x296071cc, 0x67b3de77), TOBN(0xae3c0b8e, 0x634f7905),
+ TOBN(0x67e440c2, 0x8a7100c9), TOBN(0xbb8c3c1b, 0xeb4b9b42),
+ TOBN(0x6d71e8ea, 0xc51b3583), TOBN(0x7591f5af, 0x9525e642),
+ TOBN(0xf73a2f7b, 0x13f509f3), TOBN(0x618487aa, 0x5619ac9b),
+ TOBN(0x3a72e5f7, 0x9d61718a), TOBN(0x00413bcc, 0x7592d28c),
+ TOBN(0x7d9b11d3, 0x963c35cf), TOBN(0x77623bcf, 0xb90a46ed),
+ TOBN(0xdeef273b, 0xdcdd2a50), TOBN(0x4a741f9b, 0x0601846e),
+ TOBN(0x33b89e51, 0x0ec6e929), TOBN(0xcb02319f, 0x8b7f22cd),
+ TOBN(0xbbe1500d, 0x084bae24), TOBN(0x2f0ae8d7, 0x343d2693),
+ TOBN(0xacffb5f2, 0x7cdef811), TOBN(0xaa0c030a, 0x263fb94f),
+ TOBN(0x6eef0d61, 0xa0f442de), TOBN(0xf92e1817, 0x27b139d3),
+ TOBN(0x1ae6deb7, 0x0ad8bc28), TOBN(0xa89e38dc, 0xc0514130),
+ TOBN(0x81eeb865, 0xd2fdca23), TOBN(0x5a15ee08, 0xcc8ef895),
+ TOBN(0x768fa10a, 0x01905614), TOBN(0xeff5b8ef, 0x880ee19b),
+ TOBN(0xf0c0cabb, 0xcb1c8a0e), TOBN(0x2e1ee9cd, 0xb8c838f9),
+ TOBN(0x0587d8b8, 0x8a4a14c0), TOBN(0xf6f27896, 0x2ff698e5),
+ TOBN(0xed38ef1c, 0x89ee6256), TOBN(0xf44ee1fe, 0x6b353b45),
+ TOBN(0x9115c0c7, 0x70e903b3), TOBN(0xc78ec0a1, 0x818f31df),
+ TOBN(0x6c003324, 0xb7dccbc6), TOBN(0xd96dd1f3, 0x163bbc25),
+ TOBN(0x33aa82dd, 0x5cedd805), TOBN(0x123aae4f, 0x7f7eb2f1),
+ TOBN(0x1723fcf5, 0xa26262cd), TOBN(0x1f7f4d5d, 0x0060ebd5),
+ TOBN(0xf19c5c01, 0xb2eaa3af), TOBN(0x2ccb9b14, 0x9790accf),
+ TOBN(0x1f9c1cad, 0x52324aa6), TOBN(0x63200526, 0x7247df54),
+ TOBN(0x5732fe42, 0xbac96f82), TOBN(0x52fe771f, 0x01a1c384),
+ TOBN(0x546ca13d, 0xb1001684), TOBN(0xb56b4eee, 0xa1709f75),
+ TOBN(0x266545a9, 0xd5db8672), TOBN(0xed971c90, 0x1e8f3cfb),
+ TOBN(0x4e7d8691, 0xe3a07b29), TOBN(0x7570d9ec, 0xe4b696b9),
+ TOBN(0xdc5fa067, 0x7bc7e9ae), TOBN(0x68b44caf, 0xc82c4844),
+ TOBN(0x519d34b3, 0xbf44da80), TOBN(0x283834f9, 0x5ab32e66),
+ TOBN(0x6e608797, 0x6278a000), TOBN(0x1e62960e, 0x627312f6),
+ TOBN(0x9b87b27b, 0xe6901c55), TOBN(0x80e78538, 0x24fdbc1f),
+ TOBN(0xbbbc0951, 0x2facc27d), TOBN(0x06394239, 0xac143b5a),
+ TOBN(0x35bb4a40, 0x376c1944), TOBN(0x7cb62694, 0x63da1511),
+ TOBN(0xafd29161, 0xb7148a3b), TOBN(0xa6f9d9ed, 0x4e2ea2ee),
+ TOBN(0x15dc2ca2, 0x880dd212), TOBN(0x903c3813, 0xa61139a9),
+ TOBN(0x2aa7b46d, 0x6c0f8785), TOBN(0x36ce2871, 0x901c60ff),
+ TOBN(0xc683b028, 0xe10d9c12), TOBN(0x7573baa2, 0x032f33d3),
+ TOBN(0x87a9b1f6, 0x67a31b58), TOBN(0xfd3ed11a, 0xf4ffae12),
+ TOBN(0x83dcaa9a, 0x0cb2748e), TOBN(0x8239f018, 0x5d6fdf16),
+ TOBN(0xba67b49c, 0x72753941), TOBN(0x2beec455, 0xc321cb36),
+ TOBN(0x88015606, 0x3f8b84ce), TOBN(0x76417083, 0x8d38c86f),
+ TOBN(0x054f1ca7, 0x598953dd), TOBN(0xc939e110, 0x4e8e7429),
+ TOBN(0x9b1ac2b3, 0x5a914f2f), TOBN(0x39e35ed3, 0xe74b8f9c),
+ TOBN(0xd0debdb2, 0x781b2fb0), TOBN(0x1585638f, 0x2d997ba2),
+ TOBN(0x9c4b646e, 0x9e2fce99), TOBN(0x68a21081, 0x1e80857f),
+ TOBN(0x06d54e44, 0x3643b52a), TOBN(0xde8d6d63, 0x0d8eb843),
+ TOBN(0x70321563, 0x42146a0a), TOBN(0x8ba826f2, 0x5eaa3622),
+ TOBN(0x227a58bd, 0x86138787), TOBN(0x43b6c03c, 0x10281d37),
+ TOBN(0x6326afbb, 0xb54dde39), TOBN(0x744e5e8a, 0xdb6f2d5f),
+ TOBN(0x48b2a99a, 0xcff158e1), TOBN(0xa93c8fa0, 0xef87918f),
+ TOBN(0x2182f956, 0xde058c5c), TOBN(0x216235d2, 0x936f9e7a),
+ TOBN(0xace0c0db, 0xd2e31e67), TOBN(0xc96449bf, 0xf23ac3e7),
+ TOBN(0x7e9a2874, 0x170693bd), TOBN(0xa28e14fd, 0xa45e6335),
+ TOBN(0x5757f6b3, 0x56427344), TOBN(0x822e4556, 0xacf8edf9),
+ TOBN(0x2b7a6ee2, 0xe6a285cd), TOBN(0x5866f211, 0xa9df3af0),
+ TOBN(0x40dde2dd, 0xf845b844), TOBN(0x986c3726, 0x110e5e49),
+ TOBN(0x73680c2a, 0xf7172277), TOBN(0x57b94f0f, 0x0cccb244),
+ TOBN(0xbdff7267, 0x2d438ca7), TOBN(0xbad1ce11, 0xcf4663fd),
+ TOBN(0x9813ed9d, 0xd8f71cae), TOBN(0xf43272a6, 0x961fdaa6),
+ TOBN(0xbeff0119, 0xbd6d1637), TOBN(0xfebc4f91, 0x30361978),
+ TOBN(0x02b37a95, 0x2f41deff), TOBN(0x0e44a59a, 0xe63b89b7),
+ TOBN(0x673257dc, 0x143ff951), TOBN(0x19c02205, 0xd752baf4),
+ TOBN(0x46c23069, 0xc4b7d692), TOBN(0x2e6392c3, 0xfd1502ac),
+ TOBN(0x6057b1a2, 0x1b220846), TOBN(0xe51ff946, 0x0c1b5b63),}
+ ,
+ {TOBN(0x6e85cb51, 0x566c5c43), TOBN(0xcff9c919, 0x3597f046),
+ TOBN(0x9354e90c, 0x4994d94a), TOBN(0xe0a39332, 0x2147927d),
+ TOBN(0x8427fac1, 0x0dc1eb2b), TOBN(0x88cfd8c2, 0x2ff319fa),
+ TOBN(0xe2d4e684, 0x01965274), TOBN(0xfa2e067d, 0x67aaa746),
+ TOBN(0xb6d92a7f, 0x3e5f9f11), TOBN(0x9afe153a, 0xd6cb3b8e),
+ TOBN(0x4d1a6dd7, 0xddf800bd), TOBN(0xf6c13cc0, 0xcaf17e19),
+ TOBN(0x15f6c58e, 0x325fc3ee), TOBN(0x71095400, 0xa31dc3b2),
+ TOBN(0x168e7c07, 0xafa3d3e7), TOBN(0x3f8417a1, 0x94c7ae2d),
+ TOBN(0xec234772, 0x813b230d), TOBN(0x634d0f5f, 0x17344427),
+ TOBN(0x11548ab1, 0xd77fc56a), TOBN(0x7fab1750, 0xce06af77),
+ TOBN(0xb62c10a7, 0x4f7c4f83), TOBN(0xa7d2edc4, 0x220a67d9),
+ TOBN(0x1c404170, 0x921209a0), TOBN(0x0b9815a0, 0xface59f0),
+ TOBN(0x2842589b, 0x319540c3), TOBN(0x18490f59, 0xa283d6f8),
+ TOBN(0xa2731f84, 0xdaae9fcb), TOBN(0x3db6d960, 0xc3683ba0),
+ TOBN(0xc85c63bb, 0x14611069), TOBN(0xb19436af, 0x0788bf05),
+ TOBN(0x905459df, 0x347460d2), TOBN(0x73f6e094, 0xe11a7db1),
+ TOBN(0xdc7f938e, 0xb6357f37), TOBN(0xc5d00f79, 0x2bd8aa62),
+ TOBN(0xc878dcb9, 0x2ca979fc), TOBN(0x37e83ed9, 0xeb023a99),
+ TOBN(0x6b23e273, 0x1560bf3d), TOBN(0x1086e459, 0x1d0fae61),
+ TOBN(0x78248316, 0x9a9414bd), TOBN(0x1b956bc0, 0xf0ea9ea1),
+ TOBN(0x7b85bb91, 0xc31b9c38), TOBN(0x0c5aa90b, 0x48ef57b5),
+ TOBN(0xdedeb169, 0xaf3bab6f), TOBN(0xe610ad73, 0x2d373685),
+ TOBN(0xf13870df, 0x02ba8e15), TOBN(0x0337edb6, 0x8ca7f771),
+ TOBN(0xe4acf747, 0xb62c036c), TOBN(0xd921d576, 0xb6b94e81),
+ TOBN(0xdbc86439, 0x2c422f7a), TOBN(0xfb635362, 0xed348898),
+ TOBN(0x83084668, 0xc45bfcd1), TOBN(0xc357c9e3, 0x2b315e11),
+ TOBN(0xb173b540, 0x5b2e5b8c), TOBN(0x7e946931, 0xe102b9a4),
+ TOBN(0x17c890eb, 0x7b0fb199), TOBN(0xec225a83, 0xd61b662b),
+ TOBN(0xf306a3c8, 0xee3c76cb), TOBN(0x3cf11623, 0xd32a1f6e),
+ TOBN(0xe6d5ab64, 0x6863e956), TOBN(0x3b8a4cbe, 0x5c005c26),
+ TOBN(0xdcd529a5, 0x9ce6bb27), TOBN(0xc4afaa52, 0x04d4b16f),
+ TOBN(0xb0624a26, 0x7923798d), TOBN(0x85e56df6, 0x6b307fab),
+ TOBN(0x0281893c, 0x2bf29698), TOBN(0x91fc19a4, 0xd7ce7603),
+ TOBN(0x75a5dca3, 0xad9a558f), TOBN(0x40ceb3fa, 0x4d50bf77),
+ TOBN(0x1baf6060, 0xbc9ba369), TOBN(0x927e1037, 0x597888c2),
+ TOBN(0xd936bf19, 0x86a34c07), TOBN(0xd4cf10c1, 0xc34ae980),
+ TOBN(0x3a3e5334, 0x859dd614), TOBN(0x9c475b5b, 0x18d0c8ee),
+ TOBN(0x63080d1f, 0x07cd51d5), TOBN(0xc9c0d0a6, 0xb88b4326),
+ TOBN(0x1ac98691, 0xc234296f), TOBN(0x2a0a83a4, 0x94887fb6),
+ TOBN(0x56511427, 0x0cea9cf2), TOBN(0x5230a6e8, 0xa24802f5),
+ TOBN(0xf7a2bf0f, 0x72e3d5c1), TOBN(0x37717446, 0x4f21439e),
+ TOBN(0xfedcbf25, 0x9ce30334), TOBN(0xe0030a78, 0x7ce202f9),
+ TOBN(0x6f2d9ebf, 0x1202e9ca), TOBN(0xe79dde6c, 0x75e6e591),
+ TOBN(0xf52072af, 0xf1dac4f8), TOBN(0x6c8d087e, 0xbb9b404d),
+ TOBN(0xad0fc73d, 0xbce913af), TOBN(0x909e587b, 0x458a07cb),
+ TOBN(0x1300da84, 0xd4f00c8a), TOBN(0x425cd048, 0xb54466ac),
+ TOBN(0xb59cb9be, 0x90e9d8bf), TOBN(0x991616db, 0x3e431b0e),
+ TOBN(0xd3aa117a, 0x531aecff), TOBN(0x91af92d3, 0x59f4dc3b),
+ TOBN(0x9b1ec292, 0xe93fda29), TOBN(0x76bb6c17, 0xe97d91bc),
+ TOBN(0x7509d95f, 0xaface1e6), TOBN(0x3653fe47, 0xbe855ae3),
+ TOBN(0x73180b28, 0x0f680e75), TOBN(0x75eefd1b, 0xeeb6c26c),
+ TOBN(0xa4cdf29f, 0xb66d4236), TOBN(0x2d70a997, 0x6b5821d8),
+ TOBN(0x7a3ee207, 0x20445c36), TOBN(0x71d1ac82, 0x59877174),
+ TOBN(0x0fc539f7, 0x949f73e9), TOBN(0xd05cf3d7, 0x982e3081),
+ TOBN(0x8758e20b, 0x7b1c7129), TOBN(0xffadcc20, 0x569e61f2),
+ TOBN(0xb05d3a2f, 0x59544c2d), TOBN(0xbe16f5c1, 0x9fff5e53),
+ TOBN(0x73cf65b8, 0xaad58135), TOBN(0x622c2119, 0x037aa5be),
+ TOBN(0x79373b3f, 0x646fd6a0), TOBN(0x0e029db5, 0x0d3978cf),
+ TOBN(0x8bdfc437, 0x94fba037), TOBN(0xaefbd687, 0x620797a6),
+ TOBN(0x3fa5382b, 0xbd30d38e), TOBN(0x7627cfbf, 0x585d7464),
+ TOBN(0xb2330fef, 0x4e4ca463), TOBN(0xbcef7287, 0x3566cc63),
+ TOBN(0xd161d2ca, 0xcf780900), TOBN(0x135dc539, 0x5b54827d),
+ TOBN(0x638f052e, 0x27bf1bc6), TOBN(0x10a224f0, 0x07dfa06c),
+ TOBN(0xe973586d, 0x6d3321da), TOBN(0x8b0c5738, 0x26152c8f),
+ TOBN(0x07ef4f2a, 0x34606074), TOBN(0x80fe7fe8, 0xa0f7047a),
+ TOBN(0x3d1a8152, 0xe1a0e306), TOBN(0x32cf43d8, 0x88da5222),
+ TOBN(0xbf89a95f, 0x5f02ffe6), TOBN(0x3d9eb9a4, 0x806ad3ea),
+ TOBN(0x012c17bb, 0x79c8e55e), TOBN(0xfdcd1a74, 0x99c81dac),
+ TOBN(0x7043178b, 0xb9556098), TOBN(0x4090a1df, 0x801c3886),
+ TOBN(0x759800ff, 0x9b67b912), TOBN(0x3e5c0304, 0x232620c8),
+ TOBN(0x4b9d3c4b, 0x70dceeca), TOBN(0xbb2d3c15, 0x181f648e),
+ TOBN(0xf981d837, 0x6e33345c), TOBN(0xb626289b, 0x0cf2297a),
+ TOBN(0x766ac659, 0x8baebdcf), TOBN(0x1a28ae09, 0x75df01e5),
+ TOBN(0xb71283da, 0x375876d8), TOBN(0x4865a96d, 0x607b9800),
+ TOBN(0x25dd1bcd, 0x237936b2), TOBN(0x332f4f4b, 0x60417494),
+ TOBN(0xd0923d68, 0x370a2147), TOBN(0x497f5dfb, 0xdc842203),
+ TOBN(0x9dc74cbd, 0x32be5e0f), TOBN(0x7475bcb7, 0x17a01375),
+ TOBN(0x438477c9, 0x50d872b1), TOBN(0xcec67879, 0xffe1d63d),
+ TOBN(0x9b006014, 0xd8578c70), TOBN(0xc9ad99a8, 0x78bb6b8b),
+ TOBN(0x6799008e, 0x11fb3806), TOBN(0xcfe81435, 0xcd44cab3),
+ TOBN(0xa2ee1582, 0x2f4fb344), TOBN(0xb8823450, 0x483fa6eb),
+ TOBN(0x622d323d, 0x652c7749), TOBN(0xd8474a98, 0xbeb0a15b),
+ TOBN(0xe43c154d, 0x5d1c00d0), TOBN(0x7fd581d9, 0x0e3e7aac),
+ TOBN(0x2b44c619, 0x2525ddf8), TOBN(0x67a033eb, 0xb8ae9739),
+ TOBN(0x113ffec1, 0x9ef2d2e4), TOBN(0x1bf6767e, 0xd5a0ea7f),
+ TOBN(0x57fff75e, 0x03714c0a), TOBN(0xa23c422e, 0x0a23e9ee),
+ TOBN(0xdd5f6b2d, 0x540f83af), TOBN(0xc2c2c27e, 0x55ea46a7),
+ TOBN(0xeb6b4246, 0x672a1208), TOBN(0xd13599f7, 0xae634f7a),
+ TOBN(0xcf914b5c, 0xd7b32c6e), TOBN(0x61a5a640, 0xeaf61814),
+ TOBN(0x8dc3df8b, 0x208a1bbb), TOBN(0xef627fd6, 0xb6d79aa5),
+ TOBN(0x44232ffc, 0xc4c86bc8), TOBN(0xe6f9231b, 0x061539fe),
+ TOBN(0x1d04f25a, 0x958b9533), TOBN(0x180cf934, 0x49e8c885),
+ TOBN(0x89689595, 0x9884aaf7), TOBN(0xb1959be3, 0x07b348a6),
+ TOBN(0x96250e57, 0x3c147c87), TOBN(0xae0efb3a, 0xdd0c61f8),
+ TOBN(0xed00745e, 0xca8c325e), TOBN(0x3c911696, 0xecff3f70),
+ TOBN(0x73acbc65, 0x319ad41d), TOBN(0x7b01a020, 0xf0b1c7ef),
+ TOBN(0xea32b293, 0x63a1483f), TOBN(0x89eabe71, 0x7a248f96),
+ TOBN(0x9c6231d3, 0x343157e5), TOBN(0x93a375e5, 0xdf3c546d),
+ TOBN(0xe76e9343, 0x6a2afe69), TOBN(0xc4f89100, 0xe166c88e),
+ TOBN(0x248efd0d, 0x4f872093), TOBN(0xae0eb3ea, 0x8fe0ea61),
+ TOBN(0xaf89790d, 0x9d79046e), TOBN(0x4d650f2d, 0x6cee0976),
+ TOBN(0xa3935d9a, 0x43071eca), TOBN(0x66fcd2c9, 0x283b0bfe),
+ TOBN(0x0e665eb5, 0x696605f1), TOBN(0xe77e5d07, 0xa54cd38d),
+ TOBN(0x90ee050a, 0x43d950cf), TOBN(0x86ddebda, 0xd32e69b5),
+ TOBN(0x6ad94a3d, 0xfddf7415), TOBN(0xf7fa1309, 0x3f6e8d5a),
+ TOBN(0xc4831d1d, 0xe9957f75), TOBN(0x7de28501, 0xd5817447),
+ TOBN(0x6f1d7078, 0x9e2aeb6b), TOBN(0xba2b9ff4, 0xf67a53c2),
+ TOBN(0x36963767, 0xdf9defc3), TOBN(0x479deed3, 0x0d38022c),
+ TOBN(0xd2edb89b, 0x3a8631e8), TOBN(0x8de855de, 0x7a213746),
+ TOBN(0xb2056cb7, 0xb00c5f11), TOBN(0xdeaefbd0, 0x2c9b85e4),
+ TOBN(0x03f39a8d, 0xd150892d), TOBN(0x37b84686, 0x218b7985),
+ TOBN(0x36296dd8, 0xb7375f1a), TOBN(0x472cd4b1, 0xb78e898e),
+ TOBN(0x15dff651, 0xe9f05de9), TOBN(0xd4045069, 0x2ce98ba9),
+ TOBN(0x8466a7ae, 0x9b38024c), TOBN(0xb910e700, 0xe5a6b5ef),
+ TOBN(0xae1c56ea, 0xb3aa8f0d), TOBN(0xbab2a507, 0x7eee74a6),
+ TOBN(0x0dca11e2, 0x4b4c4620), TOBN(0xfd896e2e, 0x4c47d1f4),
+ TOBN(0xeb45ae53, 0x308fbd93), TOBN(0x46cd5a2e, 0x02c36fda),
+ TOBN(0x6a3d4e90, 0xbaa48385), TOBN(0xdd55e62e, 0x9dbe9960),
+ TOBN(0xa1406aa0, 0x2a81ede7), TOBN(0x6860dd14, 0xf9274ea7),
+ TOBN(0xcfdcb0c2, 0x80414f86), TOBN(0xff410b10, 0x22f94327),
+ TOBN(0x5a33cc38, 0x49ad467b), TOBN(0xefb48b6c, 0x0a7335f1),
+ TOBN(0x14fb54a4, 0xb153a360), TOBN(0x604aa9d2, 0xb52469cc),
+ TOBN(0x5e9dc486, 0x754e48e9), TOBN(0x693cb455, 0x37471e8e),
+ TOBN(0xfb2fd7cd, 0x8d3b37b6), TOBN(0x63345e16, 0xcf09ff07),
+ TOBN(0x9910ba6b, 0x23a5d896), TOBN(0x1fe19e35, 0x7fe4364e),
+ TOBN(0x6e1da8c3, 0x9a33c677), TOBN(0x15b4488b, 0x29fd9fd0),
+ TOBN(0x1f439254, 0x1a1f22bf), TOBN(0x920a8a70, 0xab8163e8),
+ TOBN(0x3fd1b249, 0x07e5658e), TOBN(0xf2c4f79c, 0xb6ec839b),
+ TOBN(0x1abbc3d0, 0x4aa38d1b), TOBN(0x3b0db35c, 0xb5d9510e),
+ TOBN(0x1754ac78, 0x3e60dec0), TOBN(0x53272fd7, 0xea099b33),
+ TOBN(0x5fb0494f, 0x07a8e107), TOBN(0x4a89e137, 0x6a8191fa),
+ TOBN(0xa113b7f6, 0x3c4ad544), TOBN(0x88a2e909, 0x6cb9897b),
+ TOBN(0x17d55de3, 0xb44a3f84), TOBN(0xacb2f344, 0x17c6c690),
+ TOBN(0x32088168, 0x10232390), TOBN(0xf2e8a61f, 0x6c733bf7),
+ TOBN(0xa774aab6, 0x9c2d7652), TOBN(0xfb5307e3, 0xed95c5bc),
+ TOBN(0xa05c73c2, 0x4981f110), TOBN(0x1baae31c, 0xa39458c9),
+ TOBN(0x1def185b, 0xcbea62e7), TOBN(0xe8ac9eae, 0xeaf63059),
+ TOBN(0x098a8cfd, 0x9921851c), TOBN(0xd959c3f1, 0x3abe2f5b),
+ TOBN(0xa4f19525, 0x20e40ae5), TOBN(0x320789e3, 0x07a24aa1),
+ TOBN(0x259e6927, 0x7392b2bc), TOBN(0x58f6c667, 0x1918668b),
+ TOBN(0xce1db2bb, 0xc55d2d8b), TOBN(0x41d58bb7, 0xf4f6ca56),
+ TOBN(0x7650b680, 0x8f877614), TOBN(0x905e16ba, 0xf4c349ed),
+ TOBN(0xed415140, 0xf661acac), TOBN(0x3b8784f0, 0xcb2270af),
+ TOBN(0x3bc280ac, 0x8a402cba), TOBN(0xd53f7146, 0x0937921a),
+ TOBN(0xc03c8ee5, 0xe5681e83), TOBN(0x62126105, 0xf6ac9e4a),
+ TOBN(0x9503a53f, 0x936b1a38), TOBN(0x3d45e2d4, 0x782fecbd),
+ TOBN(0x69a5c439, 0x76e8ae98), TOBN(0xb53b2eeb, 0xbfb4b00e),
+ TOBN(0xf1674712, 0x72386c89), TOBN(0x30ca34a2, 0x4268bce4),
+ TOBN(0x7f1ed86c, 0x78341730), TOBN(0x8ef5beb8, 0xb525e248),
+ TOBN(0xbbc489fd, 0xb74fbf38), TOBN(0x38a92a0e, 0x91a0b382),
+ TOBN(0x7a77ba3f, 0x22433ccf), TOBN(0xde8362d6, 0xa29f05a9),
+ TOBN(0x7f6a30ea, 0x61189afc), TOBN(0x693b5505, 0x59ef114f),
+ TOBN(0x50266bc0, 0xcd1797a1), TOBN(0xea17b47e, 0xf4b7af2d),
+ TOBN(0xd6c4025c, 0x3df9483e), TOBN(0x8cbb9d9f, 0xa37b18c9),
+ TOBN(0x91cbfd9c, 0x4d8424cf), TOBN(0xdb7048f1, 0xab1c3506),
+ TOBN(0x9eaf641f, 0x028206a3), TOBN(0xf986f3f9, 0x25bdf6ce),
+ TOBN(0x262143b5, 0x224c08dc), TOBN(0x2bbb09b4, 0x81b50c91),
+ TOBN(0xc16ed709, 0xaca8c84f), TOBN(0xa6210d9d, 0xb2850ca8),
+ TOBN(0x6d8df67a, 0x09cb54d6), TOBN(0x91eef6e0, 0x500919a4),
+ TOBN(0x90f61381, 0x0f132857), TOBN(0x9acede47, 0xf8d5028b),
+ TOBN(0x844d1b71, 0x90b771c3), TOBN(0x563b71e4, 0xba6426be),
+ TOBN(0x2efa2e83, 0xbdb802ff), TOBN(0x3410cbab, 0xab5b4a41),
+ TOBN(0x555b2d26, 0x30da84dd), TOBN(0xd0711ae9, 0xee1cc29a),
+ TOBN(0xcf3e8c60, 0x2f547792), TOBN(0x03d7d5de, 0xdc678b35),
+ TOBN(0x071a2fa8, 0xced806b8), TOBN(0x222e6134, 0x697f1478),
+ TOBN(0xdc16fd5d, 0xabfcdbbf), TOBN(0x44912ebf, 0x121b53b8),
+ TOBN(0xac943674, 0x2496c27c), TOBN(0x8ea3176c, 0x1ffc26b0),
+ TOBN(0xb6e224ac, 0x13debf2c), TOBN(0x524cc235, 0xf372a832),
+ TOBN(0xd706e1d8, 0x9f6f1b18), TOBN(0x2552f005, 0x44cce35b),
+ TOBN(0x8c8326c2, 0xa88e31fc), TOBN(0xb5468b2c, 0xf9552047),
+ TOBN(0xce683e88, 0x3ff90f2b), TOBN(0x77947bdf, 0x2f0a5423),
+ TOBN(0xd0a1b28b, 0xed56e328), TOBN(0xaee35253, 0xc20134ac),
+ TOBN(0x7e98367d, 0x3567962f), TOBN(0x379ed61f, 0x8188bffb),
+ TOBN(0x73bba348, 0xfaf130a1), TOBN(0x6c1f75e1, 0x904ed734),
+ TOBN(0x18956642, 0x3b4a79fc), TOBN(0xf20bc83d, 0x54ef4493),
+ TOBN(0x836d425d, 0x9111eca1), TOBN(0xe5b5c318, 0x009a8dcf),
+ TOBN(0x3360b25d, 0x13221bc5), TOBN(0x707baad2, 0x6b3eeaf7),
+ TOBN(0xd7279ed8, 0x743a95a1), TOBN(0x7450a875, 0x969e809f),
+ TOBN(0x32b6bd53, 0xe5d0338f), TOBN(0x1e77f7af, 0x2b883bbc),
+ TOBN(0x90da12cc, 0x1063ecd0), TOBN(0xe2697b58, 0xc315be47),
+ TOBN(0x2771a5bd, 0xda85d534), TOBN(0x53e78c1f, 0xff980eea),
+ TOBN(0xadf1cf84, 0x900385e7), TOBN(0x7d3b14f6, 0xc9387b62),
+ TOBN(0x170e74b0, 0xcb8f2bd2), TOBN(0x2d50b486, 0x827fa993),
+ TOBN(0xcdbe8c9a, 0xf6f32bab), TOBN(0x55e906b0, 0xc3b93ab8),
+ TOBN(0x747f22fc, 0x8fe280d1), TOBN(0xcd8e0de5, 0xb2e114ab),
+ TOBN(0x5ab7dbeb, 0xe10b68b0), TOBN(0x9dc63a9c, 0xa480d4b2),
+ TOBN(0x78d4bc3b, 0x4be1495f), TOBN(0x25eb3db8, 0x9359122d),
+ TOBN(0x3f8ac05b, 0x0809cbdc), TOBN(0xbf4187bb, 0xd37c702f),
+ TOBN(0x84cea069, 0x1416a6a5), TOBN(0x8f860c79, 0x43ef881c),
+ TOBN(0x41311f8a, 0x38038a5d), TOBN(0xe78c2ec0, 0xfc612067),
+ TOBN(0x494d2e81, 0x5ad73581), TOBN(0xb4cc9e00, 0x59604097),
+ TOBN(0xff558aec, 0xf3612cba), TOBN(0x35beef7a, 0x9e36c39e),
+ TOBN(0x1845c7cf, 0xdbcf41b9), TOBN(0x5703662a, 0xaea997c0),
+ TOBN(0x8b925afe, 0xe402f6d8), TOBN(0xd0a1b1ae, 0x4dd72162),
+ TOBN(0x9f47b375, 0x03c41c4b), TOBN(0xa023829b, 0x0391d042),
+ TOBN(0x5f5045c3, 0x503b8b0a), TOBN(0x123c2688, 0x98c010e5),
+ TOBN(0x324ec0cc, 0x36ba06ee), TOBN(0xface3115, 0x3dd2cc0c),
+ TOBN(0xb364f3be, 0xf333e91f), TOBN(0xef8aff73, 0x28e832b0),
+ TOBN(0x1e9bad04, 0x2d05841b), TOBN(0x42f0e3df, 0x356a21e2),
+ TOBN(0xa3270bcb, 0x4add627e), TOBN(0xb09a8158, 0xd322e711),
+ TOBN(0x86e326a1, 0x0fee104a), TOBN(0xad7788f8, 0x3703f65d),
+ TOBN(0x7e765430, 0x47bc4833), TOBN(0x6cee582b, 0x2b9b893a),
+ TOBN(0x9cd2a167, 0xe8f55a7b), TOBN(0xefbee3c6, 0xd9e4190d),
+ TOBN(0x33ee7185, 0xd40c2e9d), TOBN(0x844cc9c5, 0xa380b548),
+ TOBN(0x323f8ecd, 0x66926e04), TOBN(0x0001e38f, 0x8110c1ba),
+ TOBN(0x8dbcac12, 0xfc6a7f07), TOBN(0xd65e1d58, 0x0cec0827),
+ TOBN(0xd2cd4141, 0xbe76ca2d), TOBN(0x7895cf5c, 0xe892f33a),
+ TOBN(0x956d230d, 0x367139d2), TOBN(0xa91abd3e, 0xd012c4c1),
+ TOBN(0x34fa4883, 0x87eb36bf), TOBN(0xc5f07102, 0x914b8fb4),
+ TOBN(0x90f0e579, 0xadb9c95f), TOBN(0xfe6ea8cb, 0x28888195),
+ TOBN(0x7b9b5065, 0xedfa9284), TOBN(0x6c510bd2, 0x2b8c8d65),
+ TOBN(0xd7b8ebef, 0xcbe8aafd), TOBN(0xedb3af98, 0x96b1da07),
+ TOBN(0x28ff779d, 0x6295d426), TOBN(0x0c4f6ac7, 0x3fa3ad7b),
+ TOBN(0xec44d054, 0x8b8e2604), TOBN(0x9b32a66d, 0x8b0050e1),
+ TOBN(0x1f943366, 0xf0476ce2), TOBN(0x7554d953, 0xa602c7b4),
+ TOBN(0xbe35aca6, 0x524f2809), TOBN(0xb6881229, 0xfd4edbea),
+ TOBN(0xe8cd0c8f, 0x508efb63), TOBN(0x9eb5b5c8, 0x6abcefc7),
+ TOBN(0xf5621f5f, 0xb441ab4f), TOBN(0x79e6c046, 0xb76a2b22),
+ TOBN(0x74a4792c, 0xe37a1f69), TOBN(0xcbd252cb, 0x03542b60),
+ TOBN(0x785f65d5, 0xb3c20bd3), TOBN(0x8dea6143, 0x4fabc60c),
+ TOBN(0x45e21446, 0xde673629), TOBN(0x57f7aa1e, 0x703c2d21),
+ TOBN(0xa0e99b7f, 0x98c868c7), TOBN(0x4e42f66d, 0x8b641676),
+ TOBN(0x602884dc, 0x91077896), TOBN(0xa0d690cf, 0xc2c9885b),
+ TOBN(0xfeb4da33, 0x3b9a5187), TOBN(0x5f789598, 0x153c87ee),
+ TOBN(0x2192dd47, 0x52b16dba), TOBN(0xdeefc0e6, 0x3524c1b1),
+ TOBN(0x465ea76e, 0xe4383693), TOBN(0x79401711, 0x361b8d98),
+ TOBN(0xa5f9ace9, 0xf21a15cb), TOBN(0x73d26163, 0xefee9aeb),
+ TOBN(0xcca844b3, 0xe677016c), TOBN(0x6c122b07, 0x57eaee06),
+ TOBN(0xb782dce7, 0x15f09690), TOBN(0x508b9b12, 0x2dfc0fc9),
+ TOBN(0x9015ab4b, 0x65d89fc6), TOBN(0x5e79dab7, 0xd6d5bb0f),
+ TOBN(0x64f021f0, 0x6c775aa2), TOBN(0xdf09d8cc, 0x37c7eca1),
+ TOBN(0x9a761367, 0xef2fa506), TOBN(0xed4ca476, 0x5b81eec6),
+ TOBN(0x262ede36, 0x10bbb8b5), TOBN(0x0737ce83, 0x0641ada3),
+ TOBN(0x4c94288a, 0xe9831ccc), TOBN(0x487fc1ce, 0x8065e635),
+ TOBN(0xb13d7ab3, 0xb8bb3659), TOBN(0xdea5df3e, 0x855e4120),
+ TOBN(0xb9a18573, 0x85eb0244), TOBN(0x1a1b8ea3, 0xa7cfe0a3),
+ TOBN(0x3b837119, 0x67b0867c), TOBN(0x8d5e0d08, 0x9d364520),
+ TOBN(0x52dccc1e, 0xd930f0e3), TOBN(0xefbbcec7, 0xbf20bbaf),
+ TOBN(0x99cffcab, 0x0263ad10), TOBN(0xd8199e6d, 0xfcd18f8a),
+ TOBN(0x64e2773f, 0xe9f10617), TOBN(0x0079e8e1, 0x08704848),
+ TOBN(0x1169989f, 0x8a342283), TOBN(0x8097799c, 0xa83012e6),
+ TOBN(0xece966cb, 0x8a6a9001), TOBN(0x93b3afef, 0x072ac7fc),
+ TOBN(0xe6893a2a, 0x2db3d5ba), TOBN(0x263dc462, 0x89bf4fdc),
+ TOBN(0x8852dfc9, 0xe0396673), TOBN(0x7ac70895, 0x3af362b6),
+ TOBN(0xbb9cce4d, 0x5c2f342b), TOBN(0xbf80907a, 0xb52d7aae),
+ TOBN(0x97f3d3cd, 0x2161bcd0), TOBN(0xb25b0834, 0x0962744d),
+ TOBN(0xc5b18ea5, 0x6c3a1dda), TOBN(0xfe4ec7eb, 0x06c92317),
+ TOBN(0xb787b890, 0xad1c4afe), TOBN(0xdccd9a92, 0x0ede801a),
+ TOBN(0x9ac6ddda, 0xdb58da1f), TOBN(0x22bbc12f, 0xb8cae6ee),
+ TOBN(0xc6f8bced, 0x815c4a43), TOBN(0x8105a92c, 0xf96480c7),
+ TOBN(0x0dc3dbf3, 0x7a859d51), TOBN(0xe3ec7ce6, 0x3041196b),
+ TOBN(0xd9f64b25, 0x0d1067c9), TOBN(0xf2321321, 0x3d1f8dd8),
+ TOBN(0x8b5c619c, 0x76497ee8), TOBN(0x5d2b0ac6, 0xc717370e),
+ TOBN(0x98204cb6, 0x4fcf68e1), TOBN(0x0bdec211, 0x62bc6792),
+ TOBN(0x6973ccef, 0xa63b1011), TOBN(0xf9e3fa97, 0xe0de1ac5),
+ TOBN(0x5efb693e, 0x3d0e0c8b), TOBN(0x037248e9, 0xd2d4fcb4),}
+ ,
+ {TOBN(0x80802dc9, 0x1ec34f9e), TOBN(0xd8772d35, 0x33810603),
+ TOBN(0x3f06d66c, 0x530cb4f3), TOBN(0x7be5ed0d, 0xc475c129),
+ TOBN(0xcb9e3c19, 0x31e82b10), TOBN(0xc63d2857, 0xc9ff6b4c),
+ TOBN(0xb92118c6, 0x92a1b45e), TOBN(0x0aec4414, 0x7285bbca),
+ TOBN(0xfc189ae7, 0x1e29a3ef), TOBN(0xcbe906f0, 0x4c93302e),
+ TOBN(0xd0107914, 0xceaae10e), TOBN(0xb7a23f34, 0xb68e19f8),
+ TOBN(0xe9d875c2, 0xefd2119d), TOBN(0x03198c6e, 0xfcadc9c8),
+ TOBN(0x65591bf6, 0x4da17113), TOBN(0x3cf0bbf8, 0x3d443038),
+ TOBN(0xae485bb7, 0x2b724759), TOBN(0x945353e1, 0xb2d4c63a),
+ TOBN(0x82159d07, 0xde7d6f2c), TOBN(0x389caef3, 0x4ec5b109),
+ TOBN(0x4a8ebb53, 0xdb65ef14), TOBN(0x2dc2cb7e, 0xdd99de43),
+ TOBN(0x816fa3ed, 0x83f2405f), TOBN(0x73429bb9, 0xc14208a3),
+ TOBN(0xb618d590, 0xb01e6e27), TOBN(0x047e2ccd, 0xe180b2dc),
+ TOBN(0xd1b299b5, 0x04aea4a9), TOBN(0x412c9e1e, 0x9fa403a4),
+ TOBN(0x88d28a36, 0x79407552), TOBN(0x49c50136, 0xf332b8e3),
+ TOBN(0x3a1b6fcc, 0xe668de19), TOBN(0x178851bc, 0x75122b97),
+ TOBN(0xb1e13752, 0xfb85fa4c), TOBN(0xd61257ce, 0x383c8ce9),
+ TOBN(0xd43da670, 0xd2f74dae), TOBN(0xa35aa23f, 0xbf846bbb),
+ TOBN(0x5e74235d, 0x4421fc83), TOBN(0xf6df8ee0, 0xc363473b),
+ TOBN(0x34d7f52a, 0x3c4aa158), TOBN(0x50d05aab, 0x9bc6d22e),
+ TOBN(0x8c56e735, 0xa64785f4), TOBN(0xbc56637b, 0x5f29cd07),
+ TOBN(0x53b2bb80, 0x3ee35067), TOBN(0x50235a0f, 0xdc919270),
+ TOBN(0x191ab6d8, 0xf2c4aa65), TOBN(0xc3475831, 0x8396023b),
+ TOBN(0x80400ba5, 0xf0f805ba), TOBN(0x8881065b, 0x5ec0f80f),
+ TOBN(0xc370e522, 0xcc1b5e83), TOBN(0xde2d4ad1, 0x860b8bfb),
+ TOBN(0xad364df0, 0x67b256df), TOBN(0x8f12502e, 0xe0138997),
+ TOBN(0x503fa0dc, 0x7783920a), TOBN(0xe80014ad, 0xc0bc866a),
+ TOBN(0x3f89b744, 0xd3064ba6), TOBN(0x03511dcd, 0xcba5dba5),
+ TOBN(0x197dd46d, 0x95a7b1a2), TOBN(0x9c4e7ad6, 0x3c6341fb),
+ TOBN(0x426eca29, 0x484c2ece), TOBN(0x9211e489, 0xde7f4f8a),
+ TOBN(0x14997f6e, 0xc78ef1f4), TOBN(0x2b2c0910, 0x06574586),
+ TOBN(0x17286a6e, 0x1c3eede8), TOBN(0x25f92e47, 0x0f60e018),
+ TOBN(0x805c5646, 0x31890a36), TOBN(0x703ef600, 0x57feea5b),
+ TOBN(0x389f747c, 0xaf3c3030), TOBN(0xe0e5daeb, 0x54dd3739),
+ TOBN(0xfe24a4c3, 0xc9c9f155), TOBN(0x7e4bf176, 0xb5393962),
+ TOBN(0x37183de2, 0xaf20bf29), TOBN(0x4a1bd7b5, 0xf95a8c3b),
+ TOBN(0xa83b9699, 0x46191d3d), TOBN(0x281fc8dd, 0x7b87f257),
+ TOBN(0xb18e2c13, 0x54107588), TOBN(0x6372def7, 0x9b2bafe8),
+ TOBN(0xdaf4bb48, 0x0d8972ca), TOBN(0x3f2dd4b7, 0x56167a3f),
+ TOBN(0x1eace32d, 0x84310cf4), TOBN(0xe3bcefaf, 0xe42700aa),
+ TOBN(0x5fe5691e, 0xd785e73d), TOBN(0xa5db5ab6, 0x2ea60467),
+ TOBN(0x02e23d41, 0xdfc6514a), TOBN(0x35e8048e, 0xe03c3665),
+ TOBN(0x3f8b118f, 0x1adaa0f8), TOBN(0x28ec3b45, 0x84ce1a5a),
+ TOBN(0xe8cacc6e, 0x2c6646b8), TOBN(0x1343d185, 0xdbd0e40f),
+ TOBN(0xe5d7f844, 0xcaaa358c), TOBN(0x1a1db7e4, 0x9924182a),
+ TOBN(0xd64cd42d, 0x9c875d9a), TOBN(0xb37b515f, 0x042eeec8),
+ TOBN(0x4d4dd409, 0x7b165fbe), TOBN(0xfc322ed9, 0xe206eff3),
+ TOBN(0x7dee4102, 0x59b7e17e), TOBN(0x55a481c0, 0x8236ca00),
+ TOBN(0x8c885312, 0xc23fc975), TOBN(0x15715806, 0x05d6297b),
+ TOBN(0xa078868e, 0xf78edd39), TOBN(0x956b31e0, 0x03c45e52),
+ TOBN(0x470275d5, 0xff7b33a6), TOBN(0xc8d5dc3a, 0x0c7e673f),
+ TOBN(0x419227b4, 0x7e2f2598), TOBN(0x8b37b634, 0x4c14a975),
+ TOBN(0xd0667ed6, 0x8b11888c), TOBN(0x5e0e8c3e, 0x803e25dc),
+ TOBN(0x34e5d0dc, 0xb987a24a), TOBN(0x9f40ac3b, 0xae920323),
+ TOBN(0x5463de95, 0x34e0f63a), TOBN(0xa128bf92, 0x6b6328f9),
+ TOBN(0x491ccd7c, 0xda64f1b7), TOBN(0x7ef1ec27, 0xc47bde35),
+ TOBN(0xa857240f, 0xa36a2737), TOBN(0x35dc1366, 0x63621bc1),
+ TOBN(0x7a3a6453, 0xd4fb6897), TOBN(0x80f1a439, 0xc929319d),
+ TOBN(0xfc18274b, 0xf8cb0ba0), TOBN(0xb0b53766, 0x8078c5eb),
+ TOBN(0xfb0d4924, 0x1e01d0ef), TOBN(0x50d7c67d, 0x372ab09c),
+ TOBN(0xb4e370af, 0x3aeac968), TOBN(0xe4f7fee9, 0xc4b63266),
+ TOBN(0xb4acd4c2, 0xe3ac5664), TOBN(0xf8910bd2, 0xceb38cbf),
+ TOBN(0x1c3ae50c, 0xc9c0726e), TOBN(0x15309569, 0xd97b40bf),
+ TOBN(0x70884b7f, 0xfd5a5a1b), TOBN(0x3890896a, 0xef8314cd),
+ TOBN(0x58e1515c, 0xa5618c93), TOBN(0xe665432b, 0x77d942d1),
+ TOBN(0xb32181bf, 0xb6f767a8), TOBN(0x753794e8, 0x3a604110),
+ TOBN(0x09afeb7c, 0xe8c0dbcc), TOBN(0x31e02613, 0x598673a3),
+ TOBN(0x5d98e557, 0x7d46db00), TOBN(0xfc21fb8c, 0x9d985b28),
+ TOBN(0xc9040116, 0xb0843e0b), TOBN(0x53b1b3a8, 0x69b04531),
+ TOBN(0xdd1649f0, 0x85d7d830), TOBN(0xbb3bcc87, 0xcb7427e8),
+ TOBN(0x77261100, 0xc93dce83), TOBN(0x7e79da61, 0xa1922a2a),
+ TOBN(0x587a2b02, 0xf3149ce8), TOBN(0x147e1384, 0xde92ec83),
+ TOBN(0x484c83d3, 0xaf077f30), TOBN(0xea78f844, 0x0658b53a),
+ TOBN(0x912076c2, 0x027aec53), TOBN(0xf34714e3, 0x93c8177d),
+ TOBN(0x37ef5d15, 0xc2376c84), TOBN(0x8315b659, 0x3d1aa783),
+ TOBN(0x3a75c484, 0xef852a90), TOBN(0x0ba0c58a, 0x16086bd4),
+ TOBN(0x29688d7a, 0x529a6d48), TOBN(0x9c7f250d, 0xc2f19203),
+ TOBN(0x123042fb, 0x682e2df9), TOBN(0x2b7587e7, 0xad8121bc),
+ TOBN(0x30fc0233, 0xe0182a65), TOBN(0xb82ecf87, 0xe3e1128a),
+ TOBN(0x71682861, 0x93fb098f), TOBN(0x043e21ae, 0x85e9e6a7),
+ TOBN(0xab5b49d6, 0x66c834ea), TOBN(0x3be43e18, 0x47414287),
+ TOBN(0xf40fb859, 0x219a2a47), TOBN(0x0e6559e9, 0xcc58df3c),
+ TOBN(0xfe1dfe8e, 0x0c6615b4), TOBN(0x14abc8fd, 0x56459d70),
+ TOBN(0x7be0fa8e, 0x05de0386), TOBN(0x8e63ef68, 0xe9035c7c),
+ TOBN(0x116401b4, 0x53b31e91), TOBN(0x0cba7ad4, 0x4436b4d8),
+ TOBN(0x9151f9a0, 0x107afd66), TOBN(0xafaca8d0, 0x1f0ee4c4),
+ TOBN(0x75fe5c1d, 0x9ee9761c), TOBN(0x3497a16b, 0xf0c0588f),
+ TOBN(0x3ee2bebd, 0x0304804c), TOBN(0xa8fb9a60, 0xc2c990b9),
+ TOBN(0xd14d32fe, 0x39251114), TOBN(0x36bf25bc, 0xcac73366),
+ TOBN(0xc9562c66, 0xdba7495c), TOBN(0x324d301b, 0x46ad348b),
+ TOBN(0x9f46620c, 0xd670407e), TOBN(0x0ea8d4f1, 0xe3733a01),
+ TOBN(0xd396d532, 0xb0c324e0), TOBN(0x5b211a0e, 0x03c317cd),
+ TOBN(0x090d7d20, 0x5ffe7b37), TOBN(0x3b7f3efb, 0x1747d2da),
+ TOBN(0xa2cb525f, 0xb54fc519), TOBN(0x6e220932, 0xf66a971e),
+ TOBN(0xddc160df, 0xb486d440), TOBN(0x7fcfec46, 0x3fe13465),
+ TOBN(0x83da7e4e, 0x76e4c151), TOBN(0xd6fa48a1, 0xd8d302b5),
+ TOBN(0xc6304f26, 0x5872cd88), TOBN(0x806c1d3c, 0x278b90a1),
+ TOBN(0x3553e725, 0xcaf0bc1c), TOBN(0xff59e603, 0xbb9d8d5c),
+ TOBN(0xa4550f32, 0x7a0b85dd), TOBN(0xdec5720a, 0x93ecc217),
+ TOBN(0x0b88b741, 0x69d62213), TOBN(0x7212f245, 0x5b365955),
+ TOBN(0x20764111, 0xb5cae787), TOBN(0x13cb7f58, 0x1dfd3124),
+ TOBN(0x2dca77da, 0x1175aefb), TOBN(0xeb75466b, 0xffaae775),
+ TOBN(0x74d76f3b, 0xdb6cff32), TOBN(0x7440f37a, 0x61fcda9a),
+ TOBN(0x1bb3ac92, 0xb525028b), TOBN(0x20fbf8f7, 0xa1975f29),
+ TOBN(0x982692e1, 0xdf83097f), TOBN(0x28738f6c, 0x554b0800),
+ TOBN(0xdc703717, 0xa2ce2f2f), TOBN(0x7913b93c, 0x40814194),
+ TOBN(0x04924593, 0x1fe89636), TOBN(0x7b98443f, 0xf78834a6),
+ TOBN(0x11c6ab01, 0x5114a5a1), TOBN(0x60deb383, 0xffba5f4c),
+ TOBN(0x4caa54c6, 0x01a982e6), TOBN(0x1dd35e11, 0x3491cd26),
+ TOBN(0x973c315f, 0x7cbd6b05), TOBN(0xcab00775, 0x52494724),
+ TOBN(0x04659b1f, 0x6565e15a), TOBN(0xbf30f529, 0x8c8fb026),
+ TOBN(0xfc21641b, 0xa8a0de37), TOBN(0xe9c7a366, 0xfa5e5114),
+ TOBN(0xdb849ca5, 0x52f03ad8), TOBN(0xc7e8dbe9, 0x024e35c0),
+ TOBN(0xa1a2bbac, 0xcfc3c789), TOBN(0xbf733e7d, 0x9c26f262),
+ TOBN(0x882ffbf5, 0xb8444823), TOBN(0xb7224e88, 0x6bf8483b),
+ TOBN(0x53023b8b, 0x65bef640), TOBN(0xaabfec91, 0xd4d5f8cd),
+ TOBN(0xa40e1510, 0x079ea1bd), TOBN(0x1ad9addc, 0xd05d5d26),
+ TOBN(0xdb3f2eab, 0x13e68d4f), TOBN(0x1cff1ae2, 0x640f803f),
+ TOBN(0xe0e7b749, 0xd4cee117), TOBN(0x8e9f275b, 0x4036d909),
+ TOBN(0xce34e31d, 0x8f4d4c38), TOBN(0x22b37f69, 0xd75130fc),
+ TOBN(0x83e0f1fd, 0xb4014604), TOBN(0xa8ce9919, 0x89415078),
+ TOBN(0x82375b75, 0x41792efe), TOBN(0x4f59bf5c, 0x97d4515b),
+ TOBN(0xac4f324f, 0x923a277d), TOBN(0xd9bc9b7d, 0x650f3406),
+ TOBN(0xc6fa87d1, 0x8a39bc51), TOBN(0x82588530, 0x5ccc108f),
+ TOBN(0x5ced3c9f, 0x82e4c634), TOBN(0x8efb8314, 0x3a4464f8),
+ TOBN(0xe706381b, 0x7a1dca25), TOBN(0x6cd15a3c, 0x5a2a412b),
+ TOBN(0x9347a8fd, 0xbfcd8fb5), TOBN(0x31db2eef, 0x6e54cd22),
+ TOBN(0xc4aeb11e, 0xf8d8932f), TOBN(0x11e7c1ed, 0x344411af),
+ TOBN(0x2653050c, 0xdc9a151e), TOBN(0x9edbfc08, 0x3bb0a859),
+ TOBN(0x926c81c7, 0xfd5691e7), TOBN(0x9c1b2342, 0x6f39019a),
+ TOBN(0x64a81c8b, 0x7f8474b9), TOBN(0x90657c07, 0x01761819),
+ TOBN(0x390b3331, 0x55e0375a), TOBN(0xc676c626, 0xb6ebc47d),
+ TOBN(0x51623247, 0xb7d6dee8), TOBN(0x0948d927, 0x79659313),
+ TOBN(0x99700161, 0xe9ab35ed), TOBN(0x06cc32b4, 0x8ddde408),
+ TOBN(0x6f2fd664, 0x061ef338), TOBN(0x1606fa02, 0xc202e9ed),
+ TOBN(0x55388bc1, 0x929ba99b), TOBN(0xc4428c5e, 0x1e81df69),
+ TOBN(0xce2028ae, 0xf91b0b2a), TOBN(0xce870a23, 0xf03dfd3f),
+ TOBN(0x66ec2c87, 0x0affe8ed), TOBN(0xb205fb46, 0x284d0c00),
+ TOBN(0xbf5dffe7, 0x44cefa48), TOBN(0xb6fc37a8, 0xa19876d7),
+ TOBN(0xbecfa84c, 0x08b72863), TOBN(0xd7205ff5, 0x2576374f),
+ TOBN(0x80330d32, 0x8887de41), TOBN(0x5de0df0c, 0x869ea534),
+ TOBN(0x13f42753, 0x3c56ea17), TOBN(0xeb1f6069, 0x452b1a78),
+ TOBN(0x50474396, 0xe30ea15c), TOBN(0x575816a1, 0xc1494125),
+ TOBN(0xbe1ce55b, 0xfe6bb38f), TOBN(0xb901a948, 0x96ae30f7),
+ TOBN(0xe5af0f08, 0xd8fc3548), TOBN(0x5010b5d0, 0xd73bfd08),
+ TOBN(0x993d2880, 0x53fe655a), TOBN(0x99f2630b, 0x1c1309fd),
+ TOBN(0xd8677baf, 0xb4e3b76f), TOBN(0x14e51ddc, 0xb840784b),
+ TOBN(0x326c750c, 0xbf0092ce), TOBN(0xc83d306b, 0xf528320f),
+ TOBN(0xc4456715, 0x77d4715c), TOBN(0xd30019f9, 0x6b703235),
+ TOBN(0x207ccb2e, 0xd669e986), TOBN(0x57c824af, 0xf6dbfc28),
+ TOBN(0xf0eb532f, 0xd8f92a23), TOBN(0x4a557fd4, 0x9bb98fd2),
+ TOBN(0xa57acea7, 0xc1e6199a), TOBN(0x0c663820, 0x8b94b1ed),
+ TOBN(0x9b42be8f, 0xf83a9266), TOBN(0xc7741c97, 0x0101bd45),
+ TOBN(0x95770c11, 0x07bd9ceb), TOBN(0x1f50250a, 0x8b2e0744),
+ TOBN(0xf762eec8, 0x1477b654), TOBN(0xc65b900e, 0x15efe59a),
+ TOBN(0x88c96148, 0x9546a897), TOBN(0x7e8025b3, 0xc30b4d7c),
+ TOBN(0xae4065ef, 0x12045cf9), TOBN(0x6fcb2caf, 0x9ccce8bd),
+ TOBN(0x1fa0ba4e, 0xf2cf6525), TOBN(0xf683125d, 0xcb72c312),
+ TOBN(0xa01da4ea, 0xe312410e), TOBN(0x67e28677, 0x6cd8e830),
+ TOBN(0xabd95752, 0x98fb3f07), TOBN(0x05f11e11, 0xeef649a5),
+ TOBN(0xba47faef, 0x9d3472c2), TOBN(0x3adff697, 0xc77d1345),
+ TOBN(0x4761fa04, 0xdd15afee), TOBN(0x64f1f61a, 0xb9e69462),
+ TOBN(0xfa691fab, 0x9bfb9093), TOBN(0x3df8ae8f, 0xa1133dfe),
+ TOBN(0xcd5f8967, 0x58cc710d), TOBN(0xfbb88d50, 0x16c7fe79),
+ TOBN(0x8e011b4c, 0xe88c50d1), TOBN(0x7532e807, 0xa8771c4f),
+ TOBN(0x64c78a48, 0xe2278ee4), TOBN(0x0b283e83, 0x3845072a),
+ TOBN(0x98a6f291, 0x49e69274), TOBN(0xb96e9668, 0x1868b21c),
+ TOBN(0x38f0adc2, 0xb1a8908e), TOBN(0x90afcff7, 0x1feb829d),
+ TOBN(0x9915a383, 0x210b0856), TOBN(0xa5a80602, 0xdef04889),
+ TOBN(0x800e9af9, 0x7c64d509), TOBN(0x81382d0b, 0xb8996f6f),
+ TOBN(0x490eba53, 0x81927e27), TOBN(0x46c63b32, 0x4af50182),
+ TOBN(0x784c5fd9, 0xd3ad62ce), TOBN(0xe4fa1870, 0xf8ae8736),
+ TOBN(0x4ec9d0bc, 0xd7466b25), TOBN(0x84ddbe1a, 0xdb235c65),
+ TOBN(0x5e2645ee, 0x163c1688), TOBN(0x570bd00e, 0x00eba747),
+ TOBN(0xfa51b629, 0x128bfa0f), TOBN(0x92fce1bd, 0x6c1d3b68),
+ TOBN(0x3e7361dc, 0xb66778b1), TOBN(0x9c7d249d, 0x5561d2bb),
+ TOBN(0xa40b28bf, 0x0bbc6229), TOBN(0x1c83c05e, 0xdfd91497),
+ TOBN(0x5f9f5154, 0xf083df05), TOBN(0xbac38b3c, 0xeee66c9d),
+ TOBN(0xf71db7e3, 0xec0dfcfd), TOBN(0xf2ecda8e, 0x8b0a8416),
+ TOBN(0x52fddd86, 0x7812aa66), TOBN(0x2896ef10, 0x4e6f4272),
+ TOBN(0xff27186a, 0x0fe9a745), TOBN(0x08249fcd, 0x49ca70db),
+ TOBN(0x7425a2e6, 0x441cac49), TOBN(0xf4a0885a, 0xece5ff57),
+ TOBN(0x6e2cb731, 0x7d7ead58), TOBN(0xf96cf7d6, 0x1898d104),
+ TOBN(0xafe67c9d, 0x4f2c9a89), TOBN(0x89895a50, 0x1c7bf5bc),
+ TOBN(0xdc7cb8e5, 0x573cecfa), TOBN(0x66497eae, 0xd15f03e6),
+ TOBN(0x6bc0de69, 0x3f084420), TOBN(0x323b9b36, 0xacd532b0),
+ TOBN(0xcfed390a, 0x0115a3c1), TOBN(0x9414c40b, 0x2d65ca0e),
+ TOBN(0x641406bd, 0x2f530c78), TOBN(0x29369a44, 0x833438f2),
+ TOBN(0x996884f5, 0x903fa271), TOBN(0xe6da0fd2, 0xb9da921e),
+ TOBN(0xa6f2f269, 0x5db01e54), TOBN(0x1ee3e9bd, 0x6876214e),
+ TOBN(0xa26e181c, 0xe27a9497), TOBN(0x36d254e4, 0x8e215e04),
+ TOBN(0x42f32a6c, 0x252cabca), TOBN(0x99481487, 0x80b57614),
+ TOBN(0x4c4dfe69, 0x40d9cae1), TOBN(0x05869580, 0x11a10f09),
+ TOBN(0xca287b57, 0x3491b64b), TOBN(0x77862d5d, 0x3fd4a53b),
+ TOBN(0xbf94856e, 0x50349126), TOBN(0x2be30bd1, 0x71c5268f),
+ TOBN(0x10393f19, 0xcbb650a6), TOBN(0x639531fe, 0x778cf9fd),
+ TOBN(0x02556a11, 0xb2935359), TOBN(0xda38aa96, 0xaf8c126e),
+ TOBN(0x47dbe6c2, 0x0960167f), TOBN(0x37bbabb6, 0x501901cd),
+ TOBN(0xb6e979e0, 0x2c947778), TOBN(0xd69a5175, 0x7a1a1dc6),
+ TOBN(0xc3ed5095, 0x9d9faf0c), TOBN(0x4dd9c096, 0x1d5fa5f0),
+ TOBN(0xa0c4304d, 0x64f16ea8), TOBN(0x8b1cac16, 0x7e718623),
+ TOBN(0x0b576546, 0x7c67f03e), TOBN(0x559cf5ad, 0xcbd88c01),
+ TOBN(0x074877bb, 0x0e2af19a), TOBN(0x1f717ec1, 0xa1228c92),
+ TOBN(0x70bcb800, 0x326e8920), TOBN(0xec6e2c5c, 0x4f312804),
+ TOBN(0x426aea7d, 0x3fca4752), TOBN(0xf12c0949, 0x2211f62a),
+ TOBN(0x24beecd8, 0x7be7b6b5), TOBN(0xb77eaf4c, 0x36d7a27d),
+ TOBN(0x154c2781, 0xfda78fd3), TOBN(0x848a83b0, 0x264eeabe),
+ TOBN(0x81287ef0, 0x4ffe2bc4), TOBN(0x7b6d88c6, 0xb6b6fc2a),
+ TOBN(0x805fb947, 0xce417d99), TOBN(0x4b93dcc3, 0x8b916cc4),
+ TOBN(0x72e65bb3, 0x21273323), TOBN(0xbcc1badd, 0x6ea9886e),
+ TOBN(0x0e223011, 0x4bc5ee85), TOBN(0xa561be74, 0xc18ee1e4),
+ TOBN(0x762fd2d4, 0xa6bcf1f1), TOBN(0x50e6a5a4, 0x95231489),
+ TOBN(0xca96001f, 0xa00b500b), TOBN(0x5c098cfc, 0x5d7dcdf5),
+ TOBN(0xa64e2d2e, 0x8c446a85), TOBN(0xbae9bcf1, 0x971f3c62),
+ TOBN(0x4ec22683, 0x8435a2c5), TOBN(0x8ceaed6c, 0x4bad4643),
+ TOBN(0xe9f8fb47, 0xccccf4e3), TOBN(0xbd4f3fa4, 0x1ce3b21e),
+ TOBN(0xd79fb110, 0xa3db3292), TOBN(0xe28a37da, 0xb536c66a),
+ TOBN(0x279ce87b, 0x8e49e6a9), TOBN(0x70ccfe8d, 0xfdcec8e3),
+ TOBN(0x2193e4e0, 0x3ba464b2), TOBN(0x0f39d60e, 0xaca9a398),
+ TOBN(0x7d7932af, 0xf82c12ab), TOBN(0xd8ff50ed, 0x91e7e0f7),
+ TOBN(0xea961058, 0xfa28a7e0), TOBN(0xc726cf25, 0x0bf5ec74),
+ TOBN(0xe74d55c8, 0xdb229666), TOBN(0x0bd9abbf, 0xa57f5799),
+ TOBN(0x7479ef07, 0x4dfc47b3), TOBN(0xd9c65fc3, 0x0c52f91d),
+ TOBN(0x8e0283fe, 0x36a8bde2), TOBN(0xa32a8b5e, 0x7d4b7280),
+ TOBN(0x6a677c61, 0x12e83233), TOBN(0x0fbb3512, 0xdcc9bf28),
+ TOBN(0x562e8ea5, 0x0d780f61), TOBN(0x0db8b22b, 0x1dc4e89c),
+ TOBN(0x0a6fd1fb, 0x89be0144), TOBN(0x8c77d246, 0xca57113b),
+ TOBN(0x4639075d, 0xff09c91c), TOBN(0x5b47b17f, 0x5060824c),
+ TOBN(0x58aea2b0, 0x16287b52), TOBN(0xa1343520, 0xd0cd8eb0),
+ TOBN(0x6148b4d0, 0xc5d58573), TOBN(0xdd2b6170, 0x291c68ae),
+ TOBN(0xa61b3929, 0x1da3b3b7), TOBN(0x5f946d79, 0x08c4ac10),
+ TOBN(0x4105d4a5, 0x7217d583), TOBN(0x5061da3d, 0x25e6de5e),
+ TOBN(0x3113940d, 0xec1b4991), TOBN(0xf12195e1, 0x36f485ae),
+ TOBN(0xa7507fb2, 0x731a2ee0), TOBN(0x95057a8e, 0x6e9e196e),
+ TOBN(0xa3c2c911, 0x2e130136), TOBN(0x97dfbb36, 0x33c60d15),
+ TOBN(0xcaf3c581, 0xb300ee2b), TOBN(0x77f25d90, 0xf4bac8b8),
+ TOBN(0xdb1c4f98, 0x6d840cd6), TOBN(0x471d62c0, 0xe634288c),
+ TOBN(0x8ec2f85e, 0xcec8a161), TOBN(0x41f37cbc, 0xfa6f4ae2),
+ TOBN(0x6793a20f, 0x4b709985), TOBN(0x7a7bd33b, 0xefa8985b),
+ TOBN(0x2c6a3fbd, 0x938e6446), TOBN(0x19042619, 0x2a8d47c1),
+ TOBN(0x16848667, 0xcc36975f), TOBN(0x02acf168, 0x9d5f1dfb),
+ TOBN(0x62d41ad4, 0x613baa94), TOBN(0xb56fbb92, 0x9f684670),
+ TOBN(0xce610d0d, 0xe9e40569), TOBN(0x7b99c65f, 0x35489fef),
+ TOBN(0x0c88ad1b, 0x3df18b97), TOBN(0x81b7d9be, 0x5d0e9edb),
+ TOBN(0xd85218c0, 0xc716cc0a), TOBN(0xf4b5ff90, 0x85691c49),
+ TOBN(0xa4fd666b, 0xce356ac6), TOBN(0x17c72895, 0x4b327a7a),
+ TOBN(0xf93d5085, 0xda6be7de), TOBN(0xff71530e, 0x3301d34e),
+ TOBN(0x4cd96442, 0xd8f448e8), TOBN(0x9283d331, 0x2ed18ffa),
+ TOBN(0x4d33dd99, 0x2a849870), TOBN(0xa716964b, 0x41576335),
+ TOBN(0xff5e3a9b, 0x179be0e5), TOBN(0x5b9d6b1b, 0x83b13632),
+ TOBN(0x3b8bd7d4, 0xa52f313b), TOBN(0xc9dd95a0, 0x637a4660),
+ TOBN(0x30035962, 0x0b3e218f), TOBN(0xce1481a3, 0xc7b28a3c),
+ TOBN(0xab41b43a, 0x43228d83), TOBN(0x24ae1c30, 0x4ad63f99),
+ TOBN(0x8e525f1a, 0x46a51229), TOBN(0x14af860f, 0xcd26d2b4),
+ TOBN(0xd6baef61, 0x3f714aa1), TOBN(0xf51865ad, 0xeb78795e),
+ TOBN(0xd3e21fce, 0xe6a9d694), TOBN(0x82ceb1dd, 0x8a37b527)}
+};
diff --git a/crypto/ecdh/Makefile b/crypto/ecdh/Makefile
index f0766356a132..1b31ba1f0b3f 100644
--- a/crypto/ecdh/Makefile
+++ b/crypto/ecdh/Makefile
@@ -17,9 +17,9 @@ TEST=ecdhtest.c
APPS=
LIB=$(TOP)/libcrypto.a
-LIBSRC= ech_lib.c ech_ossl.c ech_key.c ech_err.c
+LIBSRC= ech_lib.c ech_ossl.c ech_key.c ech_err.c ech_kdf.c
-LIBOBJ= ech_lib.o ech_ossl.o ech_key.o ech_err.o
+LIBOBJ= ech_lib.o ech_ossl.o ech_key.o ech_err.o ech_kdf.o
SRC= $(LIBSRC)
@@ -85,6 +85,14 @@ ech_err.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
ech_err.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
ech_err.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
ech_err.o: ech_err.c
+ech_kdf.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
+ech_kdf.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ech_kdf.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
+ech_kdf.o: ../../include/openssl/evp.h ../../include/openssl/obj_mac.h
+ech_kdf.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
+ech_kdf.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ech_kdf.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ech_kdf.o: ../../include/openssl/symhacks.h ech_kdf.c
ech_key.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
ech_key.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
ech_key.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
diff --git a/crypto/ecdh/ecdh.h b/crypto/ecdh/ecdh.h
index a9b811abd050..25348b30fe7c 100644
--- a/crypto/ecdh/ecdh.h
+++ b/crypto/ecdh/ecdh.h
@@ -85,6 +85,8 @@
extern "C" {
#endif
+# define EC_FLAG_COFACTOR_ECDH 0x1000
+
const ECDH_METHOD *ECDH_OpenSSL(void);
void ECDH_set_default_method(const ECDH_METHOD *);
@@ -101,6 +103,11 @@ int ECDH_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new
int ECDH_set_ex_data(EC_KEY *d, int idx, void *arg);
void *ECDH_get_ex_data(EC_KEY *d, int idx);
+int ECDH_KDF_X9_62(unsigned char *out, size_t outlen,
+ const unsigned char *Z, size_t Zlen,
+ const unsigned char *sinfo, size_t sinfolen,
+ const EVP_MD *md);
+
/* BEGIN ERROR CODES */
/*
* The following lines are auto generated by the script mkerr.pl. Any changes
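
Two additions are exposed in this header: the EC_FLAG_COFACTOR_ECDH flag and the X9.62 KDF prototype. A hedged usage sketch of the KDF follows; the derive_key wrapper and the choice of SHA-256 are illustrative assumptions, not part of the patch:

#include <openssl/ecdh.h>
#include <openssl/evp.h>

/* Hypothetical wrapper: expand a raw ECDH shared secret Z into keylen
 * bytes of keying material via the X9.62 KDF declared above. Returns 1
 * on success, 0 on failure. A real protocol would supply its SharedInfo
 * octets in place of NULL/0. */
static int derive_key(unsigned char *key, size_t keylen,
                      const unsigned char *Z, size_t Zlen)
{
    return ECDH_KDF_X9_62(key, keylen, Z, Zlen, NULL, 0, EVP_sha256());
}
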
diff --git a/crypto/ecdh/ecdhtest.c b/crypto/ecdh/ecdhtest.c
index 996321d1ee43..2fe2c66443d0 100644
--- a/crypto/ecdh/ecdhtest.c
+++ b/crypto/ecdh/ecdhtest.c
@@ -312,6 +312,172 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
return (ret);
}
+/* Keys and shared secrets from RFC 7027 */
+
+static const unsigned char bp256_da[] = {
+ 0x81, 0xDB, 0x1E, 0xE1, 0x00, 0x15, 0x0F, 0xF2, 0xEA, 0x33, 0x8D, 0x70,
+ 0x82, 0x71, 0xBE, 0x38, 0x30, 0x0C, 0xB5, 0x42, 0x41, 0xD7, 0x99, 0x50,
+ 0xF7, 0x7B, 0x06, 0x30, 0x39, 0x80, 0x4F, 0x1D
+};
+
+static const unsigned char bp256_db[] = {
+ 0x55, 0xE4, 0x0B, 0xC4, 0x1E, 0x37, 0xE3, 0xE2, 0xAD, 0x25, 0xC3, 0xC6,
+ 0x65, 0x45, 0x11, 0xFF, 0xA8, 0x47, 0x4A, 0x91, 0xA0, 0x03, 0x20, 0x87,
+ 0x59, 0x38, 0x52, 0xD3, 0xE7, 0xD7, 0x6B, 0xD3
+};
+
+static const unsigned char bp256_Z[] = {
+ 0x89, 0xAF, 0xC3, 0x9D, 0x41, 0xD3, 0xB3, 0x27, 0x81, 0x4B, 0x80, 0x94,
+ 0x0B, 0x04, 0x25, 0x90, 0xF9, 0x65, 0x56, 0xEC, 0x91, 0xE6, 0xAE, 0x79,
+ 0x39, 0xBC, 0xE3, 0x1F, 0x3A, 0x18, 0xBF, 0x2B
+};
+
+static const unsigned char bp384_da[] = {
+ 0x1E, 0x20, 0xF5, 0xE0, 0x48, 0xA5, 0x88, 0x6F, 0x1F, 0x15, 0x7C, 0x74,
+ 0xE9, 0x1B, 0xDE, 0x2B, 0x98, 0xC8, 0xB5, 0x2D, 0x58, 0xE5, 0x00, 0x3D,
+ 0x57, 0x05, 0x3F, 0xC4, 0xB0, 0xBD, 0x65, 0xD6, 0xF1, 0x5E, 0xB5, 0xD1,
+ 0xEE, 0x16, 0x10, 0xDF, 0x87, 0x07, 0x95, 0x14, 0x36, 0x27, 0xD0, 0x42
+};
+
+static const unsigned char bp384_db[] = {
+ 0x03, 0x26, 0x40, 0xBC, 0x60, 0x03, 0xC5, 0x92, 0x60, 0xF7, 0x25, 0x0C,
+ 0x3D, 0xB5, 0x8C, 0xE6, 0x47, 0xF9, 0x8E, 0x12, 0x60, 0xAC, 0xCE, 0x4A,
+ 0xCD, 0xA3, 0xDD, 0x86, 0x9F, 0x74, 0xE0, 0x1F, 0x8B, 0xA5, 0xE0, 0x32,
+ 0x43, 0x09, 0xDB, 0x6A, 0x98, 0x31, 0x49, 0x7A, 0xBA, 0xC9, 0x66, 0x70
+};
+
+static const unsigned char bp384_Z[] = {
+ 0x0B, 0xD9, 0xD3, 0xA7, 0xEA, 0x0B, 0x3D, 0x51, 0x9D, 0x09, 0xD8, 0xE4,
+ 0x8D, 0x07, 0x85, 0xFB, 0x74, 0x4A, 0x6B, 0x35, 0x5E, 0x63, 0x04, 0xBC,
+ 0x51, 0xC2, 0x29, 0xFB, 0xBC, 0xE2, 0x39, 0xBB, 0xAD, 0xF6, 0x40, 0x37,
+ 0x15, 0xC3, 0x5D, 0x4F, 0xB2, 0xA5, 0x44, 0x4F, 0x57, 0x5D, 0x4F, 0x42
+};
+
+static const unsigned char bp512_da[] = {
+ 0x16, 0x30, 0x2F, 0xF0, 0xDB, 0xBB, 0x5A, 0x8D, 0x73, 0x3D, 0xAB, 0x71,
+ 0x41, 0xC1, 0xB4, 0x5A, 0xCB, 0xC8, 0x71, 0x59, 0x39, 0x67, 0x7F, 0x6A,
+ 0x56, 0x85, 0x0A, 0x38, 0xBD, 0x87, 0xBD, 0x59, 0xB0, 0x9E, 0x80, 0x27,
+ 0x96, 0x09, 0xFF, 0x33, 0x3E, 0xB9, 0xD4, 0xC0, 0x61, 0x23, 0x1F, 0xB2,
+ 0x6F, 0x92, 0xEE, 0xB0, 0x49, 0x82, 0xA5, 0xF1, 0xD1, 0x76, 0x4C, 0xAD,
+ 0x57, 0x66, 0x54, 0x22
+};
+
+static const unsigned char bp512_db[] = {
+ 0x23, 0x0E, 0x18, 0xE1, 0xBC, 0xC8, 0x8A, 0x36, 0x2F, 0xA5, 0x4E, 0x4E,
+ 0xA3, 0x90, 0x20, 0x09, 0x29, 0x2F, 0x7F, 0x80, 0x33, 0x62, 0x4F, 0xD4,
+ 0x71, 0xB5, 0xD8, 0xAC, 0xE4, 0x9D, 0x12, 0xCF, 0xAB, 0xBC, 0x19, 0x96,
+ 0x3D, 0xAB, 0x8E, 0x2F, 0x1E, 0xBA, 0x00, 0xBF, 0xFB, 0x29, 0xE4, 0xD7,
+ 0x2D, 0x13, 0xF2, 0x22, 0x45, 0x62, 0xF4, 0x05, 0xCB, 0x80, 0x50, 0x36,
+ 0x66, 0xB2, 0x54, 0x29
+};
+
+static const unsigned char bp512_Z[] = {
+ 0xA7, 0x92, 0x70, 0x98, 0x65, 0x5F, 0x1F, 0x99, 0x76, 0xFA, 0x50, 0xA9,
+ 0xD5, 0x66, 0x86, 0x5D, 0xC5, 0x30, 0x33, 0x18, 0x46, 0x38, 0x1C, 0x87,
+ 0x25, 0x6B, 0xAF, 0x32, 0x26, 0x24, 0x4B, 0x76, 0xD3, 0x64, 0x03, 0xC0,
+ 0x24, 0xD7, 0xBB, 0xF0, 0xAA, 0x08, 0x03, 0xEA, 0xFF, 0x40, 0x5D, 0x3D,
+ 0x24, 0xF1, 0x1A, 0x9B, 0x5C, 0x0B, 0xEF, 0x67, 0x9F, 0xE1, 0x45, 0x4B,
+ 0x21, 0xC4, 0xCD, 0x1F
+};
+
+/* Given a private value and NID, create an EC_KEY structure */
+
+static EC_KEY *mk_eckey(int nid, const unsigned char *p, size_t plen)
+{
+ int ok = 0;
+ EC_KEY *k = NULL;
+ BIGNUM *priv = NULL;
+ EC_POINT *pub = NULL;
+ const EC_GROUP *grp;
+ k = EC_KEY_new_by_curve_name(nid);
+ if (!k)
+ goto err;
+ priv = BN_bin2bn(p, plen, NULL);
+ if (!priv)
+ goto err;
+ if (!EC_KEY_set_private_key(k, priv))
+ goto err;
+ grp = EC_KEY_get0_group(k);
+ pub = EC_POINT_new(grp);
+ if (!pub)
+ goto err;
+ if (!EC_POINT_mul(grp, pub, priv, NULL, NULL, NULL))
+ goto err;
+ if (!EC_KEY_set_public_key(k, pub))
+ goto err;
+ ok = 1;
+ err:
+ if (priv)
+ BN_clear_free(priv);
+ if (pub)
+ EC_POINT_free(pub);
+ if (ok)
+ return k;
+ else if (k)
+ EC_KEY_free(k);
+ return NULL;
+}
+
+/*
+ * Known answer test: compute shared secret and check it matches expected
+ * value.
+ */
+
+static int ecdh_kat(BIO *out, const char *cname, int nid,
+ const unsigned char *k1, size_t k1_len,
+ const unsigned char *k2, size_t k2_len,
+ const unsigned char *Z, size_t Zlen)
+{
+ int rv = 0;
+ EC_KEY *key1 = NULL, *key2 = NULL;
+ unsigned char *Ztmp = NULL;
+ size_t Ztmplen;
+ BIO_puts(out, "Testing ECDH shared secret with ");
+ BIO_puts(out, cname);
+ key1 = mk_eckey(nid, k1, k1_len);
+ key2 = mk_eckey(nid, k2, k2_len);
+ if (!key1 || !key2)
+ goto err;
+ Ztmplen = (EC_GROUP_get_degree(EC_KEY_get0_group(key1)) + 7) / 8;
+ if (Ztmplen != Zlen)
+ goto err;
+ Ztmp = OPENSSL_malloc(Ztmplen);
+ if (!ECDH_compute_key(Ztmp, Ztmplen,
+ EC_KEY_get0_public_key(key2), key1, 0))
+ goto err;
+ if (memcmp(Ztmp, Z, Zlen))
+ goto err;
+ memset(Ztmp, 0, Zlen);
+ if (!ECDH_compute_key(Ztmp, Ztmplen,
+ EC_KEY_get0_public_key(key1), key2, 0))
+ goto err;
+ if (memcmp(Ztmp, Z, Zlen))
+ goto err;
+ rv = 1;
+ err:
+ if (key1)
+ EC_KEY_free(key1);
+ if (key2)
+ EC_KEY_free(key2);
+ if (Ztmp)
+ OPENSSL_free(Ztmp);
+ if (rv)
+ BIO_puts(out, " ok\n");
+ else {
+ fprintf(stderr, "Error in ECDH routines\n");
+ ERR_print_errors_fp(stderr);
+ }
+ return rv;
+}
+
+# define test_ecdh_kat(bio, curve, bits) \
+ ecdh_kat(bio, curve, NID_brainpoolP##bits##r1, \
+ bp##bits##_da, sizeof(bp##bits##_da), \
+ bp##bits##_db, sizeof(bp##bits##_db), \
+ bp##bits##_Z, sizeof(bp##bits##_Z))
+
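
For reference, the token-pasting macro above expands each invocation into a full KAT call; the 256-bit case used in main() below, for example, becomes:

    ecdh_kat(out, "Brainpool Prime-Curve brainpoolP256r1",
             NID_brainpoolP256r1,
             bp256_da, sizeof(bp256_da),
             bp256_db, sizeof(bp256_db),
             bp256_Z, sizeof(bp256_Z))
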
int main(int argc, char *argv[])
{
BN_CTX *ctx = NULL;
@@ -372,6 +536,12 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out))
goto err;
# endif
+ if (!test_ecdh_kat(out, "Brainpool Prime-Curve brainpoolP256r1", 256))
+ goto err;
+ if (!test_ecdh_kat(out, "Brainpool Prime-Curve brainpoolP384r1", 384))
+ goto err;
+ if (!test_ecdh_kat(out, "Brainpool Prime-Curve brainpoolP512r1", 512))
+ goto err;
ret = 0;
diff --git a/crypto/ecdh/ech_kdf.c b/crypto/ecdh/ech_kdf.c
new file mode 100644
index 000000000000..ac722ac9ee69
--- /dev/null
+++ b/crypto/ecdh/ech_kdf.c
@@ -0,0 +1,111 @@
+/* crypto/ecdh/ech_kdf.c */
+/*
+ * Written by Stephen Henson for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#define OPENSSL_FIPSAPI
+
+#include <string.h>
+#include <openssl/ecdh.h>
+#include <openssl/evp.h>
+
+/* Key derivation function from X9.62/SECG */
+/* Way more than we will ever need */
+#define ECDH_KDF_MAX (1 << 30)
+
+int ECDH_KDF_X9_62(unsigned char *out, size_t outlen,
+ const unsigned char *Z, size_t Zlen,
+ const unsigned char *sinfo, size_t sinfolen,
+ const EVP_MD *md)
+{
+ EVP_MD_CTX mctx;
+ int rv = 0;
+ unsigned int i;
+ size_t mdlen;
+ unsigned char ctr[4];
+ if (sinfolen > ECDH_KDF_MAX || outlen > ECDH_KDF_MAX
+ || Zlen > ECDH_KDF_MAX)
+ return 0;
+ mdlen = EVP_MD_size(md);
+ EVP_MD_CTX_init(&mctx);
+ for (i = 1;; i++) {
+ unsigned char mtmp[EVP_MAX_MD_SIZE];
+ EVP_DigestInit_ex(&mctx, md, NULL);
+ ctr[3] = i & 0xFF;
+ ctr[2] = (i >> 8) & 0xFF;
+ ctr[1] = (i >> 16) & 0xFF;
+ ctr[0] = (i >> 24) & 0xFF;
+ if (!EVP_DigestUpdate(&mctx, Z, Zlen))
+ goto err;
+ if (!EVP_DigestUpdate(&mctx, ctr, sizeof(ctr)))
+ goto err;
+ if (!EVP_DigestUpdate(&mctx, sinfo, sinfolen))
+ goto err;
+ if (outlen >= mdlen) {
+ if (!EVP_DigestFinal(&mctx, out, NULL))
+ goto err;
+ outlen -= mdlen;
+ if (outlen == 0)
+ break;
+ out += mdlen;
+ } else {
+ if (!EVP_DigestFinal(&mctx, mtmp, NULL))
+ goto err;
+ memcpy(out, mtmp, outlen);
+ OPENSSL_cleanse(mtmp, mdlen);
+ break;
+ }
+ }
+ rv = 1;
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv;
+}
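
A minimal usage sketch of the function above, not part of the patch: derive 32 bytes of keying material from an existing shared secret Z with SHA-256 and an empty SharedInfo. The helper name derive_key is hypothetical.

    #include <openssl/ecdh.h>
    #include <openssl/evp.h>

    /* Hypothetical helper: 32 bytes of keying material from Z using
     * SHA-256; NULL/0 SharedInfo is a zero-length digest update. */
    static int derive_key(const unsigned char *Z, size_t Zlen,
                          unsigned char *key)
    {
        return ECDH_KDF_X9_62(key, 32, Z, Zlen, NULL, 0, EVP_sha256());
    }
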
diff --git a/crypto/ecdh/ech_ossl.c b/crypto/ecdh/ech_ossl.c
index d448b19a528f..df115cc262e5 100644
--- a/crypto/ecdh/ech_ossl.c
+++ b/crypto/ecdh/ech_ossl.c
@@ -138,6 +138,16 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
}
group = EC_KEY_get0_group(ecdh);
+
+ if (EC_KEY_get_flags(ecdh) & EC_FLAG_COFACTOR_ECDH) {
+ if (!EC_GROUP_get_cofactor(group, x, ctx) ||
+ !BN_mul(x, x, priv_key, ctx)) {
+ ECDHerr(ECDH_F_ECDH_COMPUTE_KEY, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ priv_key = x;
+ }
+
if ((tmp = EC_POINT_new(group)) == NULL) {
ECDHerr(ECDH_F_ECDH_COMPUTE_KEY, ERR_R_MALLOC_FAILURE);
goto err;
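
A caller-side sketch for the new branch (assumed application code, not part of the patch): setting EC_FLAG_COFACTOR_ECDH on a key makes ecdh_compute_key() scale the private scalar by the curve cofactor before the point multiplication.

    #include <openssl/ec.h>

    /* Hypothetical helper: opt a key into cofactor ECDH before calling
     * ECDH_compute_key(). */
    static void enable_cofactor_ecdh(EC_KEY *key)
    {
        EC_KEY_set_flags(key, EC_FLAG_COFACTOR_ECDH);
    }
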
diff --git a/crypto/ecdsa/ecdsa.h b/crypto/ecdsa/ecdsa.h
index faf76b1104fc..c4016ac3e19b 100644
--- a/crypto/ecdsa/ecdsa.h
+++ b/crypto/ecdsa/ecdsa.h
@@ -228,6 +228,80 @@ int ECDSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new
int ECDSA_set_ex_data(EC_KEY *d, int idx, void *arg);
void *ECDSA_get_ex_data(EC_KEY *d, int idx);
+/** Allocates and initializes an ECDSA_METHOD structure
+ * \param ecdsa_method pointer to an ECDSA_METHOD to copy (may be NULL)
+ * \return pointer to an ECDSA_METHOD structure or NULL if an error occurred
+ */
+
+ECDSA_METHOD *ECDSA_METHOD_new(ECDSA_METHOD *ecdsa_method);
+
+/** Frees an ECDSA_METHOD structure
+ * \param ecdsa_method pointer to the ECDSA_METHOD structure
+ */
+void ECDSA_METHOD_free(ECDSA_METHOD *ecdsa_method);
+
+/** Sets application specific data in the ECDSA_METHOD
+ * \param ecdsa_method pointer to existing ECDSA_METHOD
+ * \param app application specific data to set
+ */
+
+void ECDSA_METHOD_set_app_data(ECDSA_METHOD *ecdsa_method, void *app);
+
+/** Returns application specific data from an ECDSA_METHOD structure
+ * \param ecdsa_method pointer to ECDSA_METHOD structure
+ * \return pointer to application specific data.
+ */
+
+void *ECDSA_METHOD_get_app_data(ECDSA_METHOD *ecdsa_method);
+
+/** Set the ECDSA_do_sign function in the ECDSA_METHOD
+ * \param ecdsa_method pointer to existing ECDSA_METHOD
+ * \param ecdsa_do_sign a function of type ECDSA_do_sign
+ */
+
+void ECDSA_METHOD_set_sign(ECDSA_METHOD *ecdsa_method,
+ ECDSA_SIG *(*ecdsa_do_sign) (const unsigned char
+ *dgst, int dgst_len,
+ const BIGNUM *inv,
+ const BIGNUM *rp,
+ EC_KEY *eckey));
+
+/** Set the ECDSA_sign_setup function in the ECDSA_METHOD
+ * \param ecdsa_method pointer to existing ECDSA_METHOD
+ * \param ecdsa_sign_setup a function of type ECDSA_sign_setup
+ */
+
+void ECDSA_METHOD_set_sign_setup(ECDSA_METHOD *ecdsa_method,
+ int (*ecdsa_sign_setup) (EC_KEY *eckey,
+ BN_CTX *ctx,
+ BIGNUM **kinv,
+ BIGNUM **r));
+
+/** Set the ECDSA_do_verify function in the ECDSA_METHOD
+ * \param ecdsa_method pointer to existing ECDSA_METHOD
+ * \param ecdsa_do_verify a function of type ECDSA_do_verify
+ */
+
+void ECDSA_METHOD_set_verify(ECDSA_METHOD *ecdsa_method,
+ int (*ecdsa_do_verify) (const unsigned char
+ *dgst, int dgst_len,
+ const ECDSA_SIG *sig,
+ EC_KEY *eckey));
+
+/** Set the flags field in the ECDSA_METHOD
+ * \param ecdsa_method pointer to existing ECDSA_METHOD
+ * \param flags flags value to set
+ */
+
+void ECDSA_METHOD_set_flags(ECDSA_METHOD *ecdsa_method, int flags);
+
+/** Set the name field in the ECDSA_METHOD
+ * \param ecdsa_method pointer to existing ECDSA_METHOD
+ * \param name name to set
+ */
+
+void ECDSA_METHOD_set_name(ECDSA_METHOD *ecdsa_method, char *name);
+
/* BEGIN ERROR CODES */
/*
* The following lines are auto generated by the script mkerr.pl. Any changes
@@ -242,6 +316,7 @@ void ERR_load_ECDSA_strings(void);
# define ECDSA_F_ECDSA_DATA_NEW_METHOD 100
# define ECDSA_F_ECDSA_DO_SIGN 101
# define ECDSA_F_ECDSA_DO_VERIFY 102
+# define ECDSA_F_ECDSA_METHOD_NEW 105
# define ECDSA_F_ECDSA_SIGN_SETUP 103
/* Reason codes. */
diff --git a/crypto/ecdsa/ecs_err.c b/crypto/ecdsa/ecs_err.c
index 6fc64a0077ac..f1fa7b55f977 100644
--- a/crypto/ecdsa/ecs_err.c
+++ b/crypto/ecdsa/ecs_err.c
@@ -74,6 +74,7 @@ static ERR_STRING_DATA ECDSA_str_functs[] = {
{ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"},
+ {ERR_FUNC(ECDSA_F_ECDSA_METHOD_NEW), "ECDSA_METHOD_new"},
{ERR_FUNC(ECDSA_F_ECDSA_SIGN_SETUP), "ECDSA_sign_setup"},
{0, NULL}
};
diff --git a/crypto/ecdsa/ecs_lib.c b/crypto/ecdsa/ecs_lib.c
index 0f2d3432f666..1c0231031850 100644
--- a/crypto/ecdsa/ecs_lib.c
+++ b/crypto/ecdsa/ecs_lib.c
@@ -275,3 +275,80 @@ void *ECDSA_get_ex_data(EC_KEY *d, int idx)
return NULL;
return (CRYPTO_get_ex_data(&ecdsa->ex_data, idx));
}
+
+ECDSA_METHOD *ECDSA_METHOD_new(ECDSA_METHOD *ecdsa_meth)
+{
+ ECDSA_METHOD *ret;
+
+ ret = OPENSSL_malloc(sizeof(ECDSA_METHOD));
+ if (ret == NULL) {
+ ECDSAerr(ECDSA_F_ECDSA_METHOD_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ if (ecdsa_meth)
+ *ret = *ecdsa_meth;
+ else {
+ ret->ecdsa_sign_setup = 0;
+ ret->ecdsa_do_sign = 0;
+ ret->ecdsa_do_verify = 0;
+ ret->name = NULL;
+ ret->flags = 0;
+ }
+ ret->flags |= ECDSA_METHOD_FLAG_ALLOCATED;
+ return ret;
+}
+
+void ECDSA_METHOD_set_sign(ECDSA_METHOD *ecdsa_method,
+ ECDSA_SIG *(*ecdsa_do_sign) (const unsigned char
+ *dgst, int dgst_len,
+ const BIGNUM *inv,
+ const BIGNUM *rp,
+ EC_KEY *eckey))
+{
+ ecdsa_method->ecdsa_do_sign = ecdsa_do_sign;
+}
+
+void ECDSA_METHOD_set_sign_setup(ECDSA_METHOD *ecdsa_method,
+ int (*ecdsa_sign_setup) (EC_KEY *eckey,
+ BN_CTX *ctx,
+ BIGNUM **kinv,
+ BIGNUM **r))
+{
+ ecdsa_method->ecdsa_sign_setup = ecdsa_sign_setup;
+}
+
+void ECDSA_METHOD_set_verify(ECDSA_METHOD *ecdsa_method,
+ int (*ecdsa_do_verify) (const unsigned char
+ *dgst, int dgst_len,
+ const ECDSA_SIG *sig,
+ EC_KEY *eckey))
+{
+ ecdsa_method->ecdsa_do_verify = ecdsa_do_verify;
+}
+
+void ECDSA_METHOD_set_flags(ECDSA_METHOD *ecdsa_method, int flags)
+{
+ ecdsa_method->flags = flags | ECDSA_METHOD_FLAG_ALLOCATED;
+}
+
+void ECDSA_METHOD_set_name(ECDSA_METHOD *ecdsa_method, char *name)
+{
+ ecdsa_method->name = name;
+}
+
+void ECDSA_METHOD_free(ECDSA_METHOD *ecdsa_method)
+{
+ if (ecdsa_method->flags & ECDSA_METHOD_FLAG_ALLOCATED)
+ OPENSSL_free(ecdsa_method);
+}
+
+void ECDSA_METHOD_set_app_data(ECDSA_METHOD *ecdsa_method, void *app)
+{
+ ecdsa_method->app_data = app;
+}
+
+void *ECDSA_METHOD_get_app_data(ECDSA_METHOD *ecdsa_method)
+{
+ return ecdsa_method->app_data;
+}
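
A sketch of how an engine might combine the new setters; my_do_sign is an assumed callback, not part of the patch.

    #include <openssl/ecdsa.h>

    /* Hypothetical signing callback supplied by an engine. */
    static ECDSA_SIG *my_do_sign(const unsigned char *dgst, int dgst_len,
                                 const BIGNUM *inv, const BIGNUM *rp,
                                 EC_KEY *eckey);

    static ECDSA_METHOD *make_custom_method(void)
    {
        ECDSA_METHOD *meth = ECDSA_METHOD_new(NULL); /* fresh, zeroed */
        if (meth == NULL)
            return NULL;
        ECDSA_METHOD_set_name(meth, "demo ECDSA method");
        ECDSA_METHOD_set_sign(meth, my_do_sign);
        return meth; /* release with ECDSA_METHOD_free() */
    }
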
diff --git a/crypto/ecdsa/ecs_locl.h b/crypto/ecdsa/ecs_locl.h
index 76b2caf1f4a9..d3a5efc54738 100644
--- a/crypto/ecdsa/ecs_locl.h
+++ b/crypto/ecdsa/ecs_locl.h
@@ -79,9 +79,13 @@ struct ecdsa_method {
int (*finish) (EC_KEY *eckey);
# endif
int flags;
- char *app_data;
+ void *app_data;
};
+/* The ECDSA_METHOD was allocated and can be freed */
+
+# define ECDSA_METHOD_FLAG_ALLOCATED 0x2
+
/*
* If this flag is set the ECDSA method is FIPS compliant and can be used in
* FIPS mode. This is set in the validated module method. If an application
diff --git a/crypto/ecdsa/ecs_ossl.c b/crypto/ecdsa/ecs_ossl.c
index 4c5fa6b926e4..dd769609be4c 100644
--- a/crypto/ecdsa/ecs_ossl.c
+++ b/crypto/ecdsa/ecs_ossl.c
@@ -179,10 +179,32 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
while (BN_is_zero(r));
/* compute the inverse of k */
- if (!BN_mod_inverse(k, k, order, ctx)) {
- ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
+ if (EC_GROUP_get_mont_data(group) != NULL) {
+ /*
+ * We want the inverse in constant time, so we exploit the fact that
+ * the order must be prime and use Fermat's Little Theorem instead.
+ */
+ if (!BN_set_word(X, 2)) {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ if (!BN_mod_sub(X, order, X, order, ctx)) {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ BN_set_flags(X, BN_FLG_CONSTTIME);
+ if (!BN_mod_exp_mont_consttime
+ (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ } else {
+ if (!BN_mod_inverse(k, k, order, ctx)) {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
}
+
/* clear old values if necessary */
if (*rp != NULL)
BN_clear_free(*rp);
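
The constant-time branch above rests on Fermat's Little Theorem: for a prime order n and k not a multiple of n, k^(n-2) ≡ k^(-1) (mod n). A standalone sketch of the identity, using plain BN_mod_exp rather than the Montgomery constant-time variant the patch uses; flt_inverse is a hypothetical name.

    #include <openssl/bn.h>

    /* Demonstration: r = k^(n-2) mod n equals k^-1 mod n for prime n. */
    static int flt_inverse(BIGNUM *r, const BIGNUM *k,
                           const BIGNUM *n, BN_CTX *ctx)
    {
        int ok = 0;
        BIGNUM *e = BN_new();
        if (e == NULL)
            return 0;
        if (BN_copy(e, n) != NULL
            && BN_sub_word(e, 2)              /* e = n - 2 */
            && BN_mod_exp(r, k, e, n, ctx))   /* r = k^e mod n */
            ok = 1;
        BN_free(e);
        return ok;
    }
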
diff --git a/crypto/engine/Makefile b/crypto/engine/Makefile
index 8ceb747fabdc..426388e9b174 100644
--- a/crypto/engine/Makefile
+++ b/crypto/engine/Makefile
@@ -22,13 +22,13 @@ LIBSRC= eng_err.c eng_lib.c eng_list.c eng_init.c eng_ctrl.c \
tb_rsa.c tb_dsa.c tb_ecdsa.c tb_dh.c tb_ecdh.c tb_rand.c tb_store.c \
tb_cipher.c tb_digest.c tb_pkmeth.c tb_asnmth.c \
eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c \
- eng_rsax.c eng_rdrand.c
+ eng_rdrand.c
LIBOBJ= eng_err.o eng_lib.o eng_list.o eng_init.o eng_ctrl.o \
eng_table.o eng_pkey.o eng_fat.o eng_all.o \
tb_rsa.o tb_dsa.o tb_ecdsa.o tb_dh.o tb_ecdh.o tb_rand.o tb_store.o \
tb_cipher.o tb_digest.o tb_pkmeth.o tb_asnmth.o \
eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o \
- eng_rsax.o eng_rdrand.o
+ eng_rdrand.o
SRC= $(LIBSRC)
@@ -267,20 +267,6 @@ eng_rdrand.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
eng_rdrand.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
eng_rdrand.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h
eng_rdrand.o: eng_rdrand.c
-eng_rsax.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
-eng_rsax.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h
-eng_rsax.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-eng_rsax.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
-eng_rsax.o: ../../include/openssl/ecdsa.h ../../include/openssl/engine.h
-eng_rsax.o: ../../include/openssl/err.h ../../include/openssl/evp.h
-eng_rsax.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
-eng_rsax.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
-eng_rsax.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-eng_rsax.o: ../../include/openssl/pkcs7.h ../../include/openssl/rsa.h
-eng_rsax.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
-eng_rsax.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-eng_rsax.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h
-eng_rsax.o: eng_rsax.c
eng_table.o: ../../e_os.h ../../include/openssl/asn1.h
eng_table.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
eng_table.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
diff --git a/crypto/engine/eng_all.c b/crypto/engine/eng_all.c
index 7edf12e7e174..195a3a95542a 100644
--- a/crypto/engine/eng_all.c
+++ b/crypto/engine/eng_all.c
@@ -76,9 +76,6 @@ void ENGINE_load_builtin_engines(void)
#if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
ENGINE_load_cryptodev();
#endif
-#ifndef OPENSSL_NO_RSAX
- ENGINE_load_rsax();
-#endif
#ifndef OPENSSL_NO_RDRAND
ENGINE_load_rdrand();
#endif
diff --git a/crypto/engine/eng_cryptodev.c b/crypto/engine/eng_cryptodev.c
index bcb936dfa754..926d95c0d7fc 100644
--- a/crypto/engine/eng_cryptodev.c
+++ b/crypto/engine/eng_cryptodev.c
@@ -54,10 +54,10 @@ void ENGINE_load_cryptodev(void)
# include <sys/types.h>
# include <crypto/cryptodev.h>
-# include <crypto/dh/dh.h>
-# include <crypto/dsa/dsa.h>
-# include <crypto/err/err.h>
-# include <crypto/rsa/rsa.h>
+# include <openssl/dh.h>
+# include <openssl/dsa.h>
+# include <openssl/err.h>
+# include <openssl/rsa.h>
# include <sys/ioctl.h>
# include <errno.h>
# include <stdio.h>
@@ -160,6 +160,17 @@ static struct {
{
CRYPTO_AES_CBC, NID_aes_256_cbc, 16, 32,
},
+# ifdef CRYPTO_AES_CTR
+ {
+ CRYPTO_AES_CTR, NID_aes_128_ctr, 14, 16,
+ },
+ {
+ CRYPTO_AES_CTR, NID_aes_192_ctr, 14, 24,
+ },
+ {
+ CRYPTO_AES_CTR, NID_aes_256_ctr, 14, 32,
+ },
+# endif
{
CRYPTO_BLF_CBC, NID_bf_cbc, 8, 16,
},
@@ -630,6 +641,46 @@ const EVP_CIPHER cryptodev_aes_256_cbc = {
NULL
};
+# ifdef CRYPTO_AES_CTR
+const EVP_CIPHER cryptodev_aes_ctr = {
+ NID_aes_128_ctr,
+ 16, 16, 14,
+ EVP_CIPH_CTR_MODE,
+ cryptodev_init_key,
+ cryptodev_cipher,
+ cryptodev_cleanup,
+ sizeof(struct dev_crypto_state),
+ EVP_CIPHER_set_asn1_iv,
+ EVP_CIPHER_get_asn1_iv,
+ NULL
+};
+
+const EVP_CIPHER cryptodev_aes_ctr_192 = {
+ NID_aes_192_ctr,
+ 16, 24, 14,
+ EVP_CIPH_CTR_MODE,
+ cryptodev_init_key,
+ cryptodev_cipher,
+ cryptodev_cleanup,
+ sizeof(struct dev_crypto_state),
+ EVP_CIPHER_set_asn1_iv,
+ EVP_CIPHER_get_asn1_iv,
+ NULL
+};
+
+const EVP_CIPHER cryptodev_aes_ctr_256 = {
+ NID_aes_256_ctr,
+ 16, 32, 14,
+ EVP_CIPH_CTR_MODE,
+ cryptodev_init_key,
+ cryptodev_cipher,
+ cryptodev_cleanup,
+ sizeof(struct dev_crypto_state),
+ EVP_CIPHER_set_asn1_iv,
+ EVP_CIPHER_get_asn1_iv,
+ NULL
+};
+# endif
/*
* Registered by the ENGINE when used to find out how to deal with
* a particular NID in the ENGINE. this says what we'll do at the
@@ -667,6 +718,17 @@ cryptodev_engine_ciphers(ENGINE *e, const EVP_CIPHER **cipher,
case NID_aes_256_cbc:
*cipher = &cryptodev_aes_256_cbc;
break;
+# ifdef CRYPTO_AES_CTR
+ case NID_aes_128_ctr:
+ *cipher = &cryptodev_aes_ctr;
+ break;
+ case NID_aes_192_ctr:
+ *cipher = &cryptodev_aes_ctr_192;
+ break;
+ case NID_aes_256_ctr:
+ *cipher = &cryptodev_aes_ctr_256;
+ break;
+# endif
default:
*cipher = NULL;
break;
diff --git a/crypto/engine/eng_rsax.c b/crypto/engine/eng_rsax.c
deleted file mode 100644
index 86ee9d893995..000000000000
--- a/crypto/engine/eng_rsax.c
+++ /dev/null
@@ -1,701 +0,0 @@
-/* crypto/engine/eng_rsax.c */
-/* Copyright (c) 2010-2010 Intel Corp.
- * Author: Vinodh.Gopal@intel.com
- * Jim Guilford
- * Erdinc.Ozturk@intel.com
- * Maxim.Perminov@intel.com
- * Ying.Huang@intel.com
- *
- * More information about algorithm used can be found at:
- * http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
- */
-/* ====================================================================
- * Copyright (c) 1999-2001 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com). This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com).
- */
-
-#include <openssl/opensslconf.h>
-
-#include <stdio.h>
-#include <string.h>
-#include <openssl/crypto.h>
-#include <openssl/buffer.h>
-#include <openssl/engine.h>
-#ifndef OPENSSL_NO_RSA
-# include <openssl/rsa.h>
-#endif
-#include <openssl/bn.h>
-#include <openssl/err.h>
-
-/* RSAX is available **ONLY* on x86_64 CPUs */
-#undef COMPILE_RSAX
-
-#if (defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_AMD64) || defined (_M_X64)) && !defined(OPENSSL_NO_ASM)
-# define COMPILE_RSAX
-static ENGINE *ENGINE_rsax(void);
-#endif
-
-void ENGINE_load_rsax(void)
-{
-/* On non-x86 CPUs it just returns. */
-#ifdef COMPILE_RSAX
- ENGINE *toadd = ENGINE_rsax();
- if (!toadd)
- return;
- ENGINE_add(toadd);
- ENGINE_free(toadd);
- ERR_clear_error();
-#endif
-}
-
-#ifdef COMPILE_RSAX
-# define E_RSAX_LIB_NAME "rsax engine"
-
-static int e_rsax_destroy(ENGINE *e);
-static int e_rsax_init(ENGINE *e);
-static int e_rsax_finish(ENGINE *e);
-static int e_rsax_ctrl(ENGINE *e, int cmd, long i, void *p, void (*f) (void));
-
-# ifndef OPENSSL_NO_RSA
-/* RSA stuff */
-static int e_rsax_rsa_mod_exp(BIGNUM *r, const BIGNUM *I, RSA *rsa,
- BN_CTX *ctx);
-static int e_rsax_rsa_finish(RSA *r);
-# endif
-
-static const ENGINE_CMD_DEFN e_rsax_cmd_defns[] = {
- {0, NULL, NULL, 0}
-};
-
-# ifndef OPENSSL_NO_RSA
-/* Our internal RSA_METHOD that we provide pointers to */
-static RSA_METHOD e_rsax_rsa = {
- "Intel RSA-X method",
- NULL,
- NULL,
- NULL,
- NULL,
- e_rsax_rsa_mod_exp,
- NULL,
- NULL,
- e_rsax_rsa_finish,
- RSA_FLAG_CACHE_PUBLIC | RSA_FLAG_CACHE_PRIVATE,
- NULL,
- NULL,
- NULL
-};
-# endif
-
-/* Constants used when creating the ENGINE */
-static const char *engine_e_rsax_id = "rsax";
-static const char *engine_e_rsax_name = "RSAX engine support";
-
-/* This internal function is used by ENGINE_rsax() */
-static int bind_helper(ENGINE *e)
-{
-# ifndef OPENSSL_NO_RSA
- const RSA_METHOD *meth1;
-# endif
- if (!ENGINE_set_id(e, engine_e_rsax_id) ||
- !ENGINE_set_name(e, engine_e_rsax_name) ||
-# ifndef OPENSSL_NO_RSA
- !ENGINE_set_RSA(e, &e_rsax_rsa) ||
-# endif
- !ENGINE_set_destroy_function(e, e_rsax_destroy) ||
- !ENGINE_set_init_function(e, e_rsax_init) ||
- !ENGINE_set_finish_function(e, e_rsax_finish) ||
- !ENGINE_set_ctrl_function(e, e_rsax_ctrl) ||
- !ENGINE_set_cmd_defns(e, e_rsax_cmd_defns))
- return 0;
-
-# ifndef OPENSSL_NO_RSA
- meth1 = RSA_PKCS1_SSLeay();
- e_rsax_rsa.rsa_pub_enc = meth1->rsa_pub_enc;
- e_rsax_rsa.rsa_pub_dec = meth1->rsa_pub_dec;
- e_rsax_rsa.rsa_priv_enc = meth1->rsa_priv_enc;
- e_rsax_rsa.rsa_priv_dec = meth1->rsa_priv_dec;
- e_rsax_rsa.bn_mod_exp = meth1->bn_mod_exp;
-# endif
- return 1;
-}
-
-static ENGINE *ENGINE_rsax(void)
-{
- ENGINE *ret = ENGINE_new();
- if (!ret)
- return NULL;
- if (!bind_helper(ret)) {
- ENGINE_free(ret);
- return NULL;
- }
- return ret;
-}
-
-# ifndef OPENSSL_NO_RSA
-/* Used to attach our own key-data to an RSA structure */
-static int rsax_ex_data_idx = -1;
-# endif
-
-static int e_rsax_destroy(ENGINE *e)
-{
- return 1;
-}
-
-/* (de)initialisation functions. */
-static int e_rsax_init(ENGINE *e)
-{
-# ifndef OPENSSL_NO_RSA
- if (rsax_ex_data_idx == -1)
- rsax_ex_data_idx = RSA_get_ex_new_index(0, NULL, NULL, NULL, NULL);
-# endif
- if (rsax_ex_data_idx == -1)
- return 0;
- return 1;
-}
-
-static int e_rsax_finish(ENGINE *e)
-{
- return 1;
-}
-
-static int e_rsax_ctrl(ENGINE *e, int cmd, long i, void *p, void (*f) (void))
-{
- int to_return = 1;
-
- switch (cmd) {
- /* The command isn't understood by this engine */
- default:
- to_return = 0;
- break;
- }
-
- return to_return;
-}
-
-# ifndef OPENSSL_NO_RSA
-
-# ifdef _WIN32
-typedef unsigned __int64 UINT64;
-# else
-typedef unsigned long long UINT64;
-# endif
-typedef unsigned short UINT16;
-
-/*
- * Table t is interleaved in the following manner: The order in memory is
- * t[0][0], t[0][1], ..., t[0][7], t[1][0], ... A particular 512-bit value is
- * stored in t[][index] rather than the more normal t[index][]; i.e. the
- * qwords of a particular entry in t are not adjacent in memory
- */
-
-/* Init BIGNUM b from the interleaved UINT64 array */
-static int interleaved_array_to_bn_512(BIGNUM *b, UINT64 *array);
-
-/*
- * Extract array elements from BIGNUM b To set the whole array from b, call
- * with n=8
- */
-static int bn_extract_to_array_512(const BIGNUM *b, unsigned int n,
- UINT64 *array);
-
-struct mod_ctx_512 {
- UINT64 t[8][8];
- UINT64 m[8];
- UINT64 m1[8]; /* 2^278 % m */
- UINT64 m2[8]; /* 2^640 % m */
- UINT64 k1[2]; /* (- 1/m) % 2^128 */
-};
-
-static int mod_exp_pre_compute_data_512(UINT64 *m, struct mod_ctx_512 *data);
-
-void mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
- UINT64 *g, /* 512 bits, 8 qwords */
- UINT64 *exp, /* 512 bits, 8 qwords */
- struct mod_ctx_512 *data);
-
-typedef struct st_e_rsax_mod_ctx {
- UINT64 type;
- union {
- struct mod_ctx_512 b512;
- } ctx;
-
-} E_RSAX_MOD_CTX;
-
-static E_RSAX_MOD_CTX *e_rsax_get_ctx(RSA *rsa, int idx, BIGNUM *m)
-{
- E_RSAX_MOD_CTX *hptr;
-
- if (idx < 0 || idx > 2)
- return NULL;
-
- hptr = RSA_get_ex_data(rsa, rsax_ex_data_idx);
- if (!hptr) {
- hptr = OPENSSL_malloc(3 * sizeof(E_RSAX_MOD_CTX));
- if (!hptr)
- return NULL;
- hptr[2].type = hptr[1].type = hptr[0].type = 0;
- RSA_set_ex_data(rsa, rsax_ex_data_idx, hptr);
- }
-
- if (hptr[idx].type == (UINT64)BN_num_bits(m))
- return hptr + idx;
-
- if (BN_num_bits(m) == 512) {
- UINT64 _m[8];
- bn_extract_to_array_512(m, 8, _m);
- memset(&hptr[idx].ctx.b512, 0, sizeof(struct mod_ctx_512));
- mod_exp_pre_compute_data_512(_m, &hptr[idx].ctx.b512);
- }
-
- hptr[idx].type = BN_num_bits(m);
- return hptr + idx;
-}
-
-static int e_rsax_rsa_finish(RSA *rsa)
-{
- E_RSAX_MOD_CTX *hptr = RSA_get_ex_data(rsa, rsax_ex_data_idx);
- if (hptr) {
- OPENSSL_free(hptr);
- RSA_set_ex_data(rsa, rsax_ex_data_idx, NULL);
- }
- if (rsa->_method_mod_n)
- BN_MONT_CTX_free(rsa->_method_mod_n);
- if (rsa->_method_mod_p)
- BN_MONT_CTX_free(rsa->_method_mod_p);
- if (rsa->_method_mod_q)
- BN_MONT_CTX_free(rsa->_method_mod_q);
- return 1;
-}
-
-static int e_rsax_bn_mod_exp(BIGNUM *r, const BIGNUM *g, const BIGNUM *e,
- const BIGNUM *m, BN_CTX *ctx,
- BN_MONT_CTX *in_mont,
- E_RSAX_MOD_CTX *rsax_mod_ctx)
-{
- if (rsax_mod_ctx && BN_get_flags(e, BN_FLG_CONSTTIME) != 0) {
- if (BN_num_bits(m) == 512) {
- UINT64 _r[8];
- UINT64 _g[8];
- UINT64 _e[8];
-
- /* Init the arrays from the BIGNUMs */
- bn_extract_to_array_512(g, 8, _g);
- bn_extract_to_array_512(e, 8, _e);
-
- mod_exp_512(_r, _g, _e, &rsax_mod_ctx->ctx.b512);
- /* Return the result in the BIGNUM */
- interleaved_array_to_bn_512(r, _r);
- return 1;
- }
- }
-
- return BN_mod_exp_mont(r, g, e, m, ctx, in_mont);
-}
-
-/*
- * Declares for the Intel CIAP 512-bit / CRT / 1024 bit RSA modular
- * exponentiation routine precalculations and a structure to hold the
- * necessary values. These files are meant to live in crypto/rsa/ in the
- * target openssl.
- */
-
-/*
- * Local method: extracts a piece from a BIGNUM, to fit it into
- * an array. Call with n=8 to extract an entire 512-bit BIGNUM
- */
-static int bn_extract_to_array_512(const BIGNUM *b, unsigned int n,
- UINT64 *array)
-{
- int i;
- UINT64 tmp;
- unsigned char bn_buff[64];
- memset(bn_buff, 0, 64);
- if (BN_num_bytes(b) > 64) {
- printf("Can't support this byte size\n");
- return 0;
- }
- if (BN_num_bytes(b) != 0) {
- if (!BN_bn2bin(b, bn_buff + (64 - BN_num_bytes(b)))) {
- printf("Error's in bn2bin\n");
- /* We have to error, here */
- return 0;
- }
- }
- while (n-- > 0) {
- array[n] = 0;
- for (i = 7; i >= 0; i--) {
- tmp = bn_buff[63 - (n * 8 + i)];
- array[n] |= tmp << (8 * i);
- }
- }
- return 1;
-}
-
-/* Init a 512-bit BIGNUM from the UINT64*_ (8 * 64) interleaved array */
-static int interleaved_array_to_bn_512(BIGNUM *b, UINT64 *array)
-{
- unsigned char tmp[64];
- int n = 8;
- int i;
- while (n-- > 0) {
- for (i = 7; i >= 0; i--) {
- tmp[63 - (n * 8 + i)] = (unsigned char)(array[n] >> (8 * i));
- }}
- BN_bin2bn(tmp, 64, b);
- return 0;
-}
-
-/* The main 512bit precompute call */
-static int mod_exp_pre_compute_data_512(UINT64 *m, struct mod_ctx_512 *data)
-{
- BIGNUM two_768, two_640, two_128, two_512, tmp, _m, tmp2;
-
- /* We need a BN_CTX for the modulo functions */
- BN_CTX *ctx;
- /* Some tmps */
- UINT64 _t[8];
- int i, j, ret = 0;
-
- /* Init _m with m */
- BN_init(&_m);
- interleaved_array_to_bn_512(&_m, m);
- memset(_t, 0, 64);
-
- /* Inits */
- BN_init(&two_768);
- BN_init(&two_640);
- BN_init(&two_128);
- BN_init(&two_512);
- BN_init(&tmp);
- BN_init(&tmp2);
-
- /* Create our context */
- if ((ctx = BN_CTX_new()) == NULL) {
- goto err;
- }
- BN_CTX_start(ctx);
-
- /*
- * For production, if you care, these only need to be set once,
- * and may be made constants.
- */
- BN_lshift(&two_768, BN_value_one(), 768);
- BN_lshift(&two_640, BN_value_one(), 640);
- BN_lshift(&two_128, BN_value_one(), 128);
- BN_lshift(&two_512, BN_value_one(), 512);
-
- if (0 == (m[7] & 0x8000000000000000)) {
- goto err;
- }
- if (0 == (m[0] & 0x1)) { /* Odd modulus required for Mont */
- goto err;
- }
-
- /* Precompute m1 */
- BN_mod(&tmp, &two_768, &_m, ctx);
- if (!bn_extract_to_array_512(&tmp, 8, &data->m1[0])) {
- goto err;
- }
-
- /* Precompute m2 */
- BN_mod(&tmp, &two_640, &_m, ctx);
- if (!bn_extract_to_array_512(&tmp, 8, &data->m2[0])) {
- goto err;
- }
-
- /*
- * Precompute k1, a 128b number = ((-1)* m-1 ) mod 2128; k1 should
- * be non-negative.
- */
- BN_mod_inverse(&tmp, &_m, &two_128, ctx);
- if (!BN_is_zero(&tmp)) {
- BN_sub(&tmp, &two_128, &tmp);
- }
- if (!bn_extract_to_array_512(&tmp, 2, &data->k1[0])) {
- goto err;
- }
-
- /* Precompute t */
- for (i = 0; i < 8; i++) {
- BN_zero(&tmp);
- if (i & 1) {
- BN_add(&tmp, &two_512, &tmp);
- }
- if (i & 2) {
- BN_add(&tmp, &two_512, &tmp);
- }
- if (i & 4) {
- BN_add(&tmp, &two_640, &tmp);
- }
-
- BN_nnmod(&tmp2, &tmp, &_m, ctx);
- if (!bn_extract_to_array_512(&tmp2, 8, _t)) {
- goto err;
- }
- for (j = 0; j < 8; j++)
- data->t[j][i] = _t[j];
- }
-
- /* Precompute m */
- for (i = 0; i < 8; i++) {
- data->m[i] = m[i];
- }
-
- ret = 1;
-
- err:
- /* Cleanup */
- if (ctx != NULL) {
- BN_CTX_end(ctx);
- BN_CTX_free(ctx);
- }
- BN_free(&two_768);
- BN_free(&two_640);
- BN_free(&two_128);
- BN_free(&two_512);
- BN_free(&tmp);
- BN_free(&tmp2);
- BN_free(&_m);
-
- return ret;
-}
-
-static int e_rsax_rsa_mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa,
- BN_CTX *ctx)
-{
- BIGNUM *r1, *m1, *vrfy;
- BIGNUM local_dmp1, local_dmq1, local_c, local_r1;
- BIGNUM *dmp1, *dmq1, *c, *pr1;
- int ret = 0;
-
- BN_CTX_start(ctx);
- r1 = BN_CTX_get(ctx);
- m1 = BN_CTX_get(ctx);
- vrfy = BN_CTX_get(ctx);
-
- {
- BIGNUM local_p, local_q;
- BIGNUM *p = NULL, *q = NULL;
- int error = 0;
-
- /*
- * Make sure BN_mod_inverse in Montgomery intialization uses the
- * BN_FLG_CONSTTIME flag (unless RSA_FLAG_NO_CONSTTIME is set)
- */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- BN_init(&local_p);
- p = &local_p;
- BN_with_flags(p, rsa->p, BN_FLG_CONSTTIME);
-
- BN_init(&local_q);
- q = &local_q;
- BN_with_flags(q, rsa->q, BN_FLG_CONSTTIME);
- } else {
- p = rsa->p;
- q = rsa->q;
- }
-
- if (rsa->flags & RSA_FLAG_CACHE_PRIVATE) {
- if (!BN_MONT_CTX_set_locked
- (&rsa->_method_mod_p, CRYPTO_LOCK_RSA, p, ctx))
- error = 1;
- if (!BN_MONT_CTX_set_locked
- (&rsa->_method_mod_q, CRYPTO_LOCK_RSA, q, ctx))
- error = 1;
- }
-
- /* clean up */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- BN_free(&local_p);
- BN_free(&local_q);
- }
- if (error)
- goto err;
- }
-
- if (rsa->flags & RSA_FLAG_CACHE_PUBLIC)
- if (!BN_MONT_CTX_set_locked
- (&rsa->_method_mod_n, CRYPTO_LOCK_RSA, rsa->n, ctx))
- goto err;
-
- /* compute I mod q */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- c = &local_c;
- BN_with_flags(c, I, BN_FLG_CONSTTIME);
- if (!BN_mod(r1, c, rsa->q, ctx))
- goto err;
- } else {
- if (!BN_mod(r1, I, rsa->q, ctx))
- goto err;
- }
-
- /* compute r1^dmq1 mod q */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- dmq1 = &local_dmq1;
- BN_with_flags(dmq1, rsa->dmq1, BN_FLG_CONSTTIME);
- } else
- dmq1 = rsa->dmq1;
-
- if (!e_rsax_bn_mod_exp(m1, r1, dmq1, rsa->q, ctx,
- rsa->_method_mod_q, e_rsax_get_ctx(rsa, 0,
- rsa->q)))
- goto err;
-
- /* compute I mod p */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- c = &local_c;
- BN_with_flags(c, I, BN_FLG_CONSTTIME);
- if (!BN_mod(r1, c, rsa->p, ctx))
- goto err;
- } else {
- if (!BN_mod(r1, I, rsa->p, ctx))
- goto err;
- }
-
- /* compute r1^dmp1 mod p */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- dmp1 = &local_dmp1;
- BN_with_flags(dmp1, rsa->dmp1, BN_FLG_CONSTTIME);
- } else
- dmp1 = rsa->dmp1;
-
- if (!e_rsax_bn_mod_exp(r0, r1, dmp1, rsa->p, ctx,
- rsa->_method_mod_p, e_rsax_get_ctx(rsa, 1,
- rsa->p)))
- goto err;
-
- if (!BN_sub(r0, r0, m1))
- goto err;
- /*
- * This will help stop the size of r0 increasing, which does affect the
- * multiply if it optimised for a power of 2 size
- */
- if (BN_is_negative(r0))
- if (!BN_add(r0, r0, rsa->p))
- goto err;
-
- if (!BN_mul(r1, r0, rsa->iqmp, ctx))
- goto err;
-
- /* Turn BN_FLG_CONSTTIME flag on before division operation */
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- pr1 = &local_r1;
- BN_with_flags(pr1, r1, BN_FLG_CONSTTIME);
- } else
- pr1 = r1;
- if (!BN_mod(r0, pr1, rsa->p, ctx))
- goto err;
-
- /*
- * If p < q it is occasionally possible for the correction of adding 'p'
- * if r0 is negative above to leave the result still negative. This can
- * break the private key operations: the following second correction
- * should *always* correct this rare occurrence. This will *never* happen
- * with OpenSSL generated keys because they ensure p > q [steve]
- */
- if (BN_is_negative(r0))
- if (!BN_add(r0, r0, rsa->p))
- goto err;
- if (!BN_mul(r1, r0, rsa->q, ctx))
- goto err;
- if (!BN_add(r0, r1, m1))
- goto err;
-
- if (rsa->e && rsa->n) {
- if (!e_rsax_bn_mod_exp
- (vrfy, r0, rsa->e, rsa->n, ctx, rsa->_method_mod_n,
- e_rsax_get_ctx(rsa, 2, rsa->n)))
- goto err;
-
- /*
- * If 'I' was greater than (or equal to) rsa->n, the operation will
- * be equivalent to using 'I mod n'. However, the result of the
- * verify will *always* be less than 'n' so we don't check for
- * absolute equality, just congruency.
- */
- if (!BN_sub(vrfy, vrfy, I))
- goto err;
- if (!BN_mod(vrfy, vrfy, rsa->n, ctx))
- goto err;
- if (BN_is_negative(vrfy))
- if (!BN_add(vrfy, vrfy, rsa->n))
- goto err;
- if (!BN_is_zero(vrfy)) {
- /*
- * 'I' and 'vrfy' aren't congruent mod n. Don't leak
- * miscalculated CRT output, just do a raw (slower) mod_exp and
- * return that instead.
- */
-
- BIGNUM local_d;
- BIGNUM *d = NULL;
-
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) {
- d = &local_d;
- BN_with_flags(d, rsa->d, BN_FLG_CONSTTIME);
- } else
- d = rsa->d;
- if (!e_rsax_bn_mod_exp(r0, I, d, rsa->n, ctx,
- rsa->_method_mod_n, e_rsax_get_ctx(rsa, 2,
- rsa->n)))
- goto err;
- }
- }
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
-
- return ret;
-}
-# endif /* !OPENSSL_NO_RSA */
-#endif /* !COMPILE_RSAX */
diff --git a/crypto/engine/engine.h b/crypto/engine/engine.h
index e81096ae02a6..bd7b591447dd 100644
--- a/crypto/engine/engine.h
+++ b/crypto/engine/engine.h
@@ -413,7 +413,6 @@ void ENGINE_load_gost(void);
# endif
# endif
void ENGINE_load_cryptodev(void);
-void ENGINE_load_rsax(void);
void ENGINE_load_rdrand(void);
void ENGINE_load_builtin_engines(void);
diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile
index ed1502d3a694..aaaad986e0e8 100644
--- a/crypto/evp/Makefile
+++ b/crypto/evp/Makefile
@@ -28,8 +28,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \
bio_md.c bio_b64.c bio_enc.c evp_err.c e_null.c \
c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \
evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \
- e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c evp_fips.c \
- e_aes_cbc_hmac_sha1.c e_rc4_hmac_md5.c
+ e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \
+ e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c
LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \
e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\
@@ -41,8 +41,8 @@ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \
bio_md.o bio_b64.o bio_enc.o evp_err.o e_null.o \
c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \
evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \
- e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o evp_fips.o \
- e_aes_cbc_hmac_sha1.o e_rc4_hmac_md5.o
+ e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \
+ e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o
SRC= $(LIBSRC)
@@ -204,16 +204,36 @@ e_aes_cbc_hmac_sha1.o: ../../include/openssl/bio.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/crypto.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/e_os2.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/evp.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/modes.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/obj_mac.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/objects.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/opensslconf.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/opensslv.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/ossl_typ.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/rand.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/safestack.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/sha.h
e_aes_cbc_hmac_sha1.o: ../../include/openssl/stack.h
-e_aes_cbc_hmac_sha1.o: ../../include/openssl/symhacks.h e_aes_cbc_hmac_sha1.c
-e_aes_cbc_hmac_sha1.o: evp_locl.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/symhacks.h ../modes/modes_lcl.h
+e_aes_cbc_hmac_sha1.o: e_aes_cbc_hmac_sha1.c
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/aes.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/asn1.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/bio.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/crypto.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/e_os2.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/evp.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/modes.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/obj_mac.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/objects.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/opensslconf.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/opensslv.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/ossl_typ.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/rand.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/safestack.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/sha.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/stack.h
+e_aes_cbc_hmac_sha256.o: ../../include/openssl/symhacks.h ../modes/modes_lcl.h
+e_aes_cbc_hmac_sha256.o: e_aes_cbc_hmac_sha256.c
e_bf.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
e_bf.o: ../../include/openssl/blowfish.h ../../include/openssl/buffer.h
e_bf.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
@@ -227,11 +247,13 @@ e_camellia.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
e_camellia.o: ../../include/openssl/camellia.h ../../include/openssl/crypto.h
e_camellia.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
e_camellia.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
-e_camellia.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+e_camellia.o: ../../include/openssl/modes.h ../../include/openssl/obj_mac.h
+e_camellia.o: ../../include/openssl/objects.h
e_camellia.o: ../../include/openssl/opensslconf.h
e_camellia.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
e_camellia.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-e_camellia.o: ../../include/openssl/symhacks.h e_camellia.c evp_locl.h
+e_camellia.o: ../../include/openssl/symhacks.h ../modes/modes_lcl.h
+e_camellia.o: e_camellia.c evp_locl.h
e_cast.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
e_cast.o: ../../include/openssl/buffer.h ../../include/openssl/cast.h
e_cast.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
@@ -260,9 +282,10 @@ e_des3.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
e_des3.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
e_des3.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
e_des3.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rand.h
-e_des3.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-e_des3.o: ../../include/openssl/symhacks.h ../../include/openssl/ui.h
-e_des3.o: ../../include/openssl/ui_compat.h ../cryptlib.h e_des3.c evp_locl.h
+e_des3.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
+e_des3.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+e_des3.o: ../../include/openssl/ui.h ../../include/openssl/ui_compat.h
+e_des3.o: ../cryptlib.h e_des3.c evp_locl.h
e_idea.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
e_idea.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
e_idea.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
@@ -394,13 +417,6 @@ evp_err.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
evp_err.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
evp_err.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
evp_err.o: ../../include/openssl/symhacks.h evp_err.c
-evp_fips.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
-evp_fips.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-evp_fips.o: ../../include/openssl/evp.h ../../include/openssl/obj_mac.h
-evp_fips.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
-evp_fips.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-evp_fips.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-evp_fips.o: ../../include/openssl/symhacks.h evp_fips.c
evp_key.o: ../../e_os.h ../../include/openssl/asn1.h
evp_key.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
evp_key.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
index 3097c2112efc..280e58408f2b 100644
--- a/crypto/evp/c_allc.c
+++ b/crypto/evp/c_allc.c
@@ -93,6 +93,7 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_des_ecb());
EVP_add_cipher(EVP_des_ede());
EVP_add_cipher(EVP_des_ede3());
+ EVP_add_cipher(EVP_des_ede3_wrap());
#endif
#ifndef OPENSSL_NO_RC4
@@ -172,6 +173,8 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_128_ctr());
EVP_add_cipher(EVP_aes_128_gcm());
EVP_add_cipher(EVP_aes_128_xts());
+ EVP_add_cipher(EVP_aes_128_ccm());
+ EVP_add_cipher(EVP_aes_128_wrap());
EVP_add_cipher_alias(SN_aes_128_cbc, "AES128");
EVP_add_cipher_alias(SN_aes_128_cbc, "aes128");
EVP_add_cipher(EVP_aes_192_ecb());
@@ -182,6 +185,8 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_192_ofb());
EVP_add_cipher(EVP_aes_192_ctr());
EVP_add_cipher(EVP_aes_192_gcm());
+ EVP_add_cipher(EVP_aes_192_ccm());
+ EVP_add_cipher(EVP_aes_192_wrap());
EVP_add_cipher_alias(SN_aes_192_cbc, "AES192");
EVP_add_cipher_alias(SN_aes_192_cbc, "aes192");
EVP_add_cipher(EVP_aes_256_ecb());
@@ -193,12 +198,18 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_256_ctr());
EVP_add_cipher(EVP_aes_256_gcm());
EVP_add_cipher(EVP_aes_256_xts());
+ EVP_add_cipher(EVP_aes_256_ccm());
+ EVP_add_cipher(EVP_aes_256_wrap());
EVP_add_cipher_alias(SN_aes_256_cbc, "AES256");
EVP_add_cipher_alias(SN_aes_256_cbc, "aes256");
# if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
# endif
+# if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
+ EVP_add_cipher(EVP_aes_128_cbc_hmac_sha256());
+ EVP_add_cipher(EVP_aes_256_cbc_hmac_sha256());
+# endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
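
Once registered as above, the new algorithms resolve like any other EVP cipher. A hypothetical lookup sketch, assuming NID_id_aes128_wrap is the object identifier behind EVP_aes_128_wrap():

    #include <openssl/evp.h>
    #include <openssl/objects.h>

    /* Fetch the newly registered AES-128 key-wrap cipher by NID after
     * the ciphers have been added. */
    static const EVP_CIPHER *get_aes128_wrap(void)
    {
        OpenSSL_add_all_ciphers();
        return EVP_get_cipherbynid(NID_id_aes128_wrap);
    }
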
diff --git a/crypto/evp/digest.c b/crypto/evp/digest.c
index 2e202c8fe031..f2643f32486a 100644
--- a/crypto/evp/digest.c
+++ b/crypto/evp/digest.c
@@ -119,6 +119,7 @@
#ifdef OPENSSL_FIPS
# include <openssl/fips.h>
+# include "evp_locl.h"
#endif
void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
@@ -145,6 +146,17 @@ int EVP_DigestInit(EVP_MD_CTX *ctx, const EVP_MD *type)
int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl)
{
EVP_MD_CTX_clear_flags(ctx, EVP_MD_CTX_FLAG_CLEANED);
+#ifdef OPENSSL_FIPS
+ /* If FIPS mode switch to approved implementation if possible */
+ if (FIPS_mode()) {
+ const EVP_MD *fipsmd;
+ if (type) {
+ fipsmd = evp_get_fips_md(type);
+ if (fipsmd)
+ type = fipsmd;
+ }
+ }
+#endif
#ifndef OPENSSL_NO_ENGINE
/*
* Whether it's nice or not, "Inits" can be used on "Final"'d contexts so
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index a4327fcb0554..1734a823c1e5 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -57,12 +57,17 @@
# include <assert.h>
# include <openssl/aes.h>
# include "evp_locl.h"
-# ifndef OPENSSL_FIPS
-# include "modes_lcl.h"
-# include <openssl/rand.h>
+# include "modes_lcl.h"
+# include <openssl/rand.h>
+
+# undef EVP_CIPH_FLAG_FIPS
+# define EVP_CIPH_FLAG_FIPS 0
typedef struct {
- AES_KEY ks;
+ union {
+ double align;
+ AES_KEY ks;
+ } ks;
block128_f block;
union {
cbc128_f cbc;
@@ -71,7 +76,10 @@ typedef struct {
} EVP_AES_KEY;
typedef struct {
- AES_KEY ks; /* AES key schedule to use */
+ union {
+ double align;
+ AES_KEY ks;
+ } ks; /* AES key schedule to use */
int key_set; /* Set if key initialised */
int iv_set; /* Set if an iv is set */
GCM128_CONTEXT gcm;
@@ -84,7 +92,10 @@ typedef struct {
} EVP_AES_GCM_CTX;
typedef struct {
- AES_KEY ks1, ks2; /* AES key schedules to use */
+ union {
+ double align;
+ AES_KEY ks;
+ } ks1, ks2; /* AES key schedules to use */
XTS128_CONTEXT xts;
void (*stream) (const unsigned char *in,
unsigned char *out, size_t length,
@@ -93,7 +104,10 @@ typedef struct {
} EVP_AES_XTS_CTX;
typedef struct {
- AES_KEY ks; /* AES key schedule to use */
+ union {
+ double align;
+ AES_KEY ks;
+ } ks; /* AES key schedule to use */
int key_set; /* Set if key initialised */
int iv_set; /* Set if an iv is set */
int tag_set; /* Set if tag is valid */
@@ -103,9 +117,9 @@ typedef struct {
ccm128_f str;
} EVP_AES_CCM_CTX;
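
The recurring union-with-double wrapper in these structures is an alignment idiom: a union is aligned at least as strictly as its most-aligned member, so the embedded AES_KEY inherits the alignment of double, which the assembly key-schedule and bulk-cipher paths expect. An illustrative reduction, not part of the patch:

    #include <openssl/aes.h>

    /* The union's alignment is at least that of double, so the embedded
     * key schedule is suitably aligned for the asm routines. */
    typedef union {
        double align;
        AES_KEY ks;
    } ALIGNED_AES_KEY;
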
-# define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+# define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
-# ifdef VPAES_ASM
+# ifdef VPAES_ASM
int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
AES_KEY *key);
int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
@@ -120,8 +134,8 @@ void vpaes_cbc_encrypt(const unsigned char *in,
unsigned char *out,
size_t length,
const AES_KEY *key, unsigned char *ivec, int enc);
-# endif
-# ifdef BSAES_ASM
+# endif
+# ifdef BSAES_ASM
void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char ivec[16], int enc);
@@ -134,40 +148,54 @@ void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
size_t len, const AES_KEY *key1,
const AES_KEY *key2, const unsigned char iv[16]);
-# endif
-# ifdef AES_CTR_ASM
+# endif
+# ifdef AES_CTR_ASM
void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
size_t blocks, const AES_KEY *key,
const unsigned char ivec[AES_BLOCK_SIZE]);
-# endif
-# ifdef AES_XTS_ASM
+# endif
+# ifdef AES_XTS_ASM
void AES_xts_encrypt(const char *inp, char *out, size_t len,
const AES_KEY *key1, const AES_KEY *key2,
const unsigned char iv[16]);
void AES_xts_decrypt(const char *inp, char *out, size_t len,
const AES_KEY *key1, const AES_KEY *key2,
const unsigned char iv[16]);
+# endif
+
+# if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+# include "ppc_arch.h"
+# ifdef VPAES_ASM
+# define VPAES_CAPABLE (OPENSSL_ppccap_P & PPC_ALTIVEC)
# endif
+# define HWAES_CAPABLE (OPENSSL_ppccap_P & PPC_CRYPTO207)
+# define HWAES_set_encrypt_key aes_p8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_p8_set_decrypt_key
+# define HWAES_encrypt aes_p8_encrypt
+# define HWAES_decrypt aes_p8_decrypt
+# define HWAES_cbc_encrypt aes_p8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
+# endif
-# if defined(AES_ASM) && !defined(I386_ONLY) && ( \
+# if defined(AES_ASM) && !defined(I386_ONLY) && ( \
((defined(__i386) || defined(__i386__) || \
defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_AMD64) || defined(_M_X64) || \
defined(__INTEL__) )
-extern unsigned int OPENSSL_ia32cap_P[2];
+extern unsigned int OPENSSL_ia32cap_P[];
-# ifdef VPAES_ASM
-# define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
-# endif
-# ifdef BSAES_ASM
-# define BSAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
-# endif
+# ifdef VPAES_ASM
+# define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+# endif
+# ifdef BSAES_ASM
+# define BSAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+# endif
/*
* AES-NI section
*/
-# define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+# define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
AES_KEY *key);
@@ -218,6 +246,26 @@ void aesni_ccm64_decrypt_blocks(const unsigned char *in,
const unsigned char ivec[16],
unsigned char cmac[16]);
+# if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+size_t aesni_gcm_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t len,
+ const void *key, unsigned char ivec[16], u64 *Xi);
+# define AES_gcm_encrypt aesni_gcm_encrypt
+size_t aesni_gcm_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t len,
+ const void *key, unsigned char ivec[16], u64 *Xi);
+# define AES_gcm_decrypt aesni_gcm_decrypt
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *in,
+ size_t len);
+# define AES_GCM_ASM(gctx) (gctx->ctr==aesni_ctr32_encrypt_blocks && \
+ gctx->gcm.ghash==gcm_ghash_avx)
+# define AES_GCM_ASM2(gctx) (gctx->gcm.block==(block128_f)aesni_encrypt && \
+ gctx->gcm.ghash==gcm_ghash_avx)
+# undef AES_GCM_ASM2 /* minor size optimization */
+# endif
+
static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
@@ -271,23 +319,23 @@ static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
return 1;
}
-# define aesni_ofb_cipher aes_ofb_cipher
+# define aesni_ofb_cipher aes_ofb_cipher
static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
-# define aesni_cfb_cipher aes_cfb_cipher
+# define aesni_cfb_cipher aes_cfb_cipher
static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
-# define aesni_cfb8_cipher aes_cfb8_cipher
+# define aesni_cfb8_cipher aes_cfb8_cipher
static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
-# define aesni_cfb1_cipher aes_cfb1_cipher
+# define aesni_cfb1_cipher aes_cfb1_cipher
static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
-# define aesni_ctr_cipher aes_ctr_cipher
+# define aesni_ctr_cipher aes_ctr_cipher
static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
@@ -298,7 +346,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (!iv && !key)
return 1;
if (key) {
- aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f) aesni_encrypt);
gctx->ctr = (ctr128_f) aesni_ctr32_encrypt_blocks;
/*
@@ -323,7 +371,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
-# define aesni_gcm_cipher aes_gcm_cipher
+# define aesni_gcm_cipher aes_gcm_cipher
static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
@@ -337,17 +385,17 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (key) {
/* key_len is two AES keys */
if (enc) {
- aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f) aesni_encrypt;
xctx->stream = aesni_xts_encrypt;
} else {
- aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f) aesni_decrypt;
xctx->stream = aesni_xts_decrypt;
}
aesni_set_encrypt_key(key + ctx->key_len / 2,
- ctx->key_len * 4, &xctx->ks2);
+ ctx->key_len * 4, &xctx->ks2.ks);
xctx->xts.block2 = (block128_f) aesni_encrypt;
xctx->xts.key1 = &xctx->ks1;
@@ -361,7 +409,7 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
-# define aesni_xts_cipher aes_xts_cipher
+# define aesni_xts_cipher aes_xts_cipher
static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
@@ -372,7 +420,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (!iv && !key)
return 1;
if (key) {
- aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
&cctx->ks, (block128_f) aesni_encrypt);
cctx->str = enc ? (ccm128_f) aesni_ccm64_encrypt_blocks :
@@ -386,11 +434,11 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
-# define aesni_ccm_cipher aes_ccm_cipher
+# define aesni_ccm_cipher aes_ccm_cipher
static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
-# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
static const EVP_CIPHER aesni_##keylen##_##mode = { \
nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
flags|EVP_CIPH_##MODE##_MODE, \
@@ -411,7 +459,7 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \
const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
-# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
static const EVP_CIPHER aesni_##keylen##_##mode = { \
nid##_##keylen##_##mode,blocksize, \
(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
@@ -433,9 +481,378 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \
const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+# elif defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+
+# include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+# define SPARC_AES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_AES)
+
+void aes_t4_set_encrypt_key(const unsigned char *key, int bits, AES_KEY *ks);
+void aes_t4_set_decrypt_key(const unsigned char *key, int bits, AES_KEY *ks);
+void aes_t4_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aes_t4_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+/*
+ * Key-length-specific subroutines were chosen for the following reason.
+ * Each SPARC T4 core can execute up to 8 threads which share the core's
+ * resources. Loading as much key material as possible into registers
+ * minimizes references to the shared memory interface, as well as the
+ * number of instructions in inner loops [much needed on T4]. But then
+ * non-key-length-specific routines would require conditional branches
+ * either in inner loops or on subroutine entry. The former is hardly
+ * acceptable, while the latter would grow the code to the size occupied
+ * by multiple key-length-specific subroutines anyway, so why fight?
+ */
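/*
 * [Editorial sketch, not part of the patch] For reference, the three key
 * lengths map to 10/12/14 rounds, so a single generic entry point would
 * have to branch on the round count at run time, roughly as below. The
 * glue code that follows instead performs this selection once, at key
 * setup, by caching a key-length-specific function pointer. The function
 * name is hypothetical, and the sketch assumes the key schedule records
 * the round count in AES_KEY.rounds, as the software implementation does;
 * it calls the key-length-specific entry points declared just below.
 */
static void aes_t4_cbc_encrypt_generic(const unsigned char *in,
                                       unsigned char *out, size_t len,
                                       const AES_KEY *key,
                                       unsigned char *ivec)
{
    switch (key->rounds) {      /* the per-call branch the comment avoids */
    case 10: aes128_t4_cbc_encrypt(in, out, len, key, ivec); break;
    case 12: aes192_t4_cbc_encrypt(in, out, len, key, ivec); break;
    case 14: aes256_t4_cbc_encrypt(in, out, len, key, ivec); break;
    }
}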
+void aes128_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes128_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes192_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes192_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes256_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes256_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes128_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ unsigned char *ivec);
+void aes192_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ unsigned char *ivec);
+void aes256_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ unsigned char *ivec);
+void aes128_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+void aes128_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+void aes256_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+void aes256_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+
+static int aes_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ int ret, mode, bits;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ bits = ctx->key_len * 8;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc) {
+ ret = 0;
+ aes_t4_set_decrypt_key(key, bits, ctx->cipher_data);
+ dat->block = (block128_f) aes_t4_decrypt;
+ switch (bits) {
+ case 128:
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) aes128_t4_cbc_decrypt : NULL;
+ break;
+ case 192:
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) aes192_t4_cbc_decrypt : NULL;
+ break;
+ case 256:
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) aes256_t4_cbc_decrypt : NULL;
+ break;
+ default:
+ ret = -1;
+ }
+ } else {
+ ret = 0;
+ aes_t4_set_encrypt_key(key, bits, ctx->cipher_data);
+ dat->block = (block128_f) aes_t4_encrypt;
+ switch (bits) {
+ case 128:
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) aes128_t4_cbc_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) aes128_t4_ctr32_encrypt;
+ else
+ dat->stream.cbc = NULL;
+ break;
+ case 192:
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) aes192_t4_cbc_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) aes192_t4_ctr32_encrypt;
+ else
+ dat->stream.cbc = NULL;
+ break;
+ case 256:
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) aes256_t4_cbc_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) aes256_t4_ctr32_encrypt;
+ else
+ dat->stream.cbc = NULL;
+ break;
+ default:
+ ret = -1;
+ }
+ }
+
+ if (ret < 0) {
+ EVPerr(EVP_F_AES_T4_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+}
+
+# define aes_t4_cbc_cipher aes_cbc_cipher
+static int aes_t4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_ecb_cipher aes_ecb_cipher
+static int aes_t4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_ofb_cipher aes_ofb_cipher
+static int aes_t4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_cfb_cipher aes_cfb_cipher
+static int aes_t4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_cfb8_cipher aes_cfb8_cipher
+static int aes_t4_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_cfb1_cipher aes_cfb1_cipher
+static int aes_t4_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_ctr_cipher aes_ctr_cipher
+static int aes_t4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aes_t4_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) {
+ int bits = ctx->key_len * 8;
+ aes_t4_set_encrypt_key(key, bits, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f) aes_t4_encrypt);
+ switch (bits) {
+ case 128:
+ gctx->ctr = (ctr128_f) aes128_t4_ctr32_encrypt;
+ break;
+ case 192:
+ gctx->ctr = (ctr128_f) aes192_t4_ctr32_encrypt;
+ break;
+ case 256:
+ gctx->ctr = (ctr128_f) aes256_t4_ctr32_encrypt;
+ break;
+ default:
+ return 0;
+ }
+ /*
+         * If we have an IV, we can set it directly; otherwise use the
+         * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv) {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ } else {
+        /* If the key is set, use the IV now; otherwise save a copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+}
+
+# define aes_t4_gcm_cipher aes_gcm_cipher
+static int aes_t4_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key) {
+ int bits = ctx->key_len * 4;
+ xctx->stream = NULL;
+ /* key_len is two AES keys */
+ if (enc) {
+ aes_t4_set_encrypt_key(key, bits, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) aes_t4_encrypt;
+ switch (bits) {
+ case 128:
+ xctx->stream = aes128_t4_xts_encrypt;
+ break;
+# if 0 /* not yet */
+ case 192:
+ xctx->stream = aes192_t4_xts_encrypt;
+ break;
+# endif
+ case 256:
+ xctx->stream = aes256_t4_xts_encrypt;
+ break;
+ default:
+ return 0;
+ }
+ } else {
+            aes_t4_set_decrypt_key(key, bits, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) aes_t4_decrypt;
+ switch (bits) {
+ case 128:
+ xctx->stream = aes128_t4_xts_decrypt;
+ break;
+# if 0 /* not yet */
+ case 192:
+ xctx->stream = aes192_t4_xts_decrypt;
+ break;
+# endif
+ case 256:
+ xctx->stream = aes256_t4_xts_decrypt;
+ break;
+ default:
+ return 0;
+ }
+ }
+
+ aes_t4_set_encrypt_key(key + ctx->key_len / 2,
+ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) aes_t4_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ }
+
+ if (iv) {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+}
+
+# define aes_t4_xts_cipher aes_xts_cipher
+static int aes_t4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aes_t4_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) {
+ int bits = ctx->key_len * 8;
+ aes_t4_set_encrypt_key(key, bits, &cctx->ks.ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f) aes_t4_encrypt);
+# if 0 /* not yet */
+ switch (bits) {
+ case 128:
+            cctx->str = enc ? (ccm128_f) aes128_t4_ccm64_encrypt :
+                                (ccm128_f) aes128_t4_ccm64_decrypt;
+ break;
+ case 192:
+            cctx->str = enc ? (ccm128_f) aes192_t4_ccm64_encrypt :
+                                (ccm128_f) aes192_t4_ccm64_decrypt;
+ break;
+ case 256:
+            cctx->str = enc ? (ccm128_f) aes256_t4_ccm64_encrypt :
+                                (ccm128_f) aes256_t4_ccm64_decrypt;
+ break;
+ default:
+ return 0;
+ }
# else
+ cctx->str = NULL;
+# endif
+ cctx->key_set = 1;
+ }
+ if (iv) {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+}
-# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+# define aes_t4_ccm_cipher aes_ccm_cipher
+static int aes_t4_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_t4_init_key, \
+ aes_t4_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_t4_##mode##_init_key, \
+ aes_t4_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+# else
+
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
static const EVP_CIPHER aes_##keylen##_##mode = { \
nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
flags|EVP_CIPH_##MODE##_MODE, \
@@ -447,7 +864,7 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \
const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
{ return &aes_##keylen##_##mode; }
-# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
static const EVP_CIPHER aes_##keylen##_##mode = { \
nid##_##keylen##_##mode,blocksize, \
(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
@@ -459,9 +876,42 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \
NULL,NULL,aes_##mode##_ctrl,NULL }; \
const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
{ return &aes_##keylen##_##mode; }
+# endif
+
+# if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+# include "arm_arch.h"
+# if __ARM_MAX_ARCH__>=7
+# if defined(BSAES_ASM)
+# define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
+# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
+# define HWAES_encrypt aes_v8_encrypt
+# define HWAES_decrypt aes_v8_decrypt
+# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
# endif
+# endif
+
+# if defined(HWAES_CAPABLE)
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, const int enc);
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+# endif
-# define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+# define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
@@ -479,51 +929,80 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
mode = ctx->cipher->flags & EVP_CIPH_MODE;
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
&& !enc)
-# ifdef BSAES_CAPABLE
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ ret = HWAES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) HWAES_decrypt;
+ dat->stream.cbc = NULL;
+# ifdef HWAES_cbc_encrypt
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) HWAES_cbc_encrypt;
+# endif
+ } else
+# endif
+# ifdef BSAES_CAPABLE
if (BSAES_CAPABLE && mode == EVP_CIPH_CBC_MODE) {
- ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks);
+ ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f) AES_decrypt;
dat->stream.cbc = (cbc128_f) bsaes_cbc_encrypt;
} else
-# endif
-# ifdef VPAES_CAPABLE
+# endif
+# ifdef VPAES_CAPABLE
if (VPAES_CAPABLE) {
- ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks);
+ ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f) vpaes_decrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
(cbc128_f) vpaes_cbc_encrypt : NULL;
} else
-# endif
+# endif
{
- ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks);
+ ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f) AES_decrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
(cbc128_f) AES_cbc_encrypt : NULL;
} else
-# ifdef BSAES_CAPABLE
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ ret = HWAES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) HWAES_encrypt;
+ dat->stream.cbc = NULL;
+# ifdef HWAES_cbc_encrypt
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) HWAES_cbc_encrypt;
+ else
+# endif
+# ifdef HWAES_ctr32_encrypt_blocks
+ if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) HWAES_ctr32_encrypt_blocks;
+ else
+# endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
+# endif
+# ifdef BSAES_CAPABLE
if (BSAES_CAPABLE && mode == EVP_CIPH_CTR_MODE) {
- ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks);
+ ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f) AES_encrypt;
dat->stream.ctr = (ctr128_f) bsaes_ctr32_encrypt_blocks;
} else
-# endif
-# ifdef VPAES_CAPABLE
+# endif
+# ifdef VPAES_CAPABLE
if (VPAES_CAPABLE) {
- ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks);
+ ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f) vpaes_encrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
(cbc128_f) vpaes_cbc_encrypt : NULL;
} else
-# endif
+# endif
{
- ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks);
+ ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f) AES_encrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
(cbc128_f) AES_cbc_encrypt : NULL;
-# ifdef AES_CTR_ASM
+# ifdef AES_CTR_ASM
if (mode == EVP_CIPH_CTR_MODE)
dat->stream.ctr = (ctr128_f) AES_ctr32_encrypt;
-# endif
+# endif
}
if (ret < 0) {
@@ -544,7 +1023,7 @@ static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
else if (ctx->encrypt)
CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
else
- CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
+ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
return 1;
}
@@ -680,11 +1159,6 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
case EVP_CTRL_GCM_SET_IVLEN:
if (arg <= 0)
return 0;
-# ifdef OPENSSL_FIPS
- if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
- && arg < 12)
- return 0;
-# endif
/* Allocate memory for IV if needed */
if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {
if (gctx->iv != c->iv)
@@ -805,34 +1279,47 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
if (key) {
do {
-# ifdef BSAES_CAPABLE
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ HWAES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f) HWAES_encrypt);
+# ifdef HWAES_ctr32_encrypt_blocks
+ gctx->ctr = (ctr128_f) HWAES_ctr32_encrypt_blocks;
+# else
+ gctx->ctr = NULL;
+# endif
+ break;
+ } else
+# endif
+# ifdef BSAES_CAPABLE
if (BSAES_CAPABLE) {
- AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
(block128_f) AES_encrypt);
gctx->ctr = (ctr128_f) bsaes_ctr32_encrypt_blocks;
break;
} else
-# endif
-# ifdef VPAES_CAPABLE
+# endif
+# ifdef VPAES_CAPABLE
if (VPAES_CAPABLE) {
- vpaes_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ vpaes_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
(block128_f) vpaes_encrypt);
gctx->ctr = NULL;
break;
} else
-# endif
+# endif
(void)0; /* terminate potentially open 'else' */
- AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
(block128_f) AES_encrypt);
-# ifdef AES_CTR_ASM
+# ifdef AES_CTR_ASM
gctx->ctr = (ctr128_f) AES_ctr32_encrypt;
-# else
+# else
gctx->ctr = NULL;
-# endif
+# endif
} while (0);
/*
@@ -891,11 +1378,38 @@ static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
if (ctx->encrypt) {
/* Encrypt payload */
if (gctx->ctr) {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+ if (len >= 32 && AES_GCM_ASM(gctx)) {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
+ return -1;
+
+ bulk = AES_gcm_encrypt(in, out, len,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ }
+# endif
if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
- in, out, len, gctx->ctr))
+ in + bulk,
+ out + bulk,
+ len - bulk, gctx->ctr))
goto err;
} else {
- if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+ if (len >= 32 && AES_GCM_ASM2(gctx)) {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
+ return -1;
+
+ bulk = AES_gcm_encrypt(in, out, len,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ }
+# endif
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm,
+ in + bulk, out + bulk, len - bulk))
goto err;
}
out += len;
@@ -905,11 +1419,38 @@ static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
} else {
/* Decrypt */
if (gctx->ctr) {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+ if (len >= 16 && AES_GCM_ASM(gctx)) {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
+ return -1;
+
+ bulk = AES_gcm_decrypt(in, out, len,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ }
+# endif
if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
- in, out, len, gctx->ctr))
+ in + bulk,
+ out + bulk,
+ len - bulk, gctx->ctr))
goto err;
} else {
- if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+ if (len >= 16 && AES_GCM_ASM2(gctx)) {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
+ return -1;
+
+ bulk = AES_gcm_decrypt(in, out, len,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ }
+# endif
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm,
+ in + bulk, out + bulk, len - bulk))
goto err;
}
/* Retrieve tag */
@@ -947,20 +1488,90 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
return -1;
} else if (ctx->encrypt) {
if (gctx->ctr) {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+ if (len >= 32 && AES_GCM_ASM(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_encrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key, gctx->gcm.Yi.c,
+ gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
- in, out, len, gctx->ctr))
+ in + bulk,
+ out + bulk,
+ len - bulk, gctx->ctr))
return -1;
} else {
- if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+ if (len >= 32 && AES_GCM_ASM2(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_encrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key, gctx->gcm.Yi.c,
+ gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm,
+ in + bulk, out + bulk, len - bulk))
return -1;
}
} else {
if (gctx->ctr) {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+ if (len >= 16 && AES_GCM_ASM(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_decrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
- in, out, len, gctx->ctr))
+ in + bulk,
+ out + bulk,
+ len - bulk, gctx->ctr))
return -1;
} else {
- if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+ if (len >= 16 && AES_GCM_ASM2(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_decrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm,
+ in + bulk, out + bulk, len - bulk))
return -1;
}
}
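/*
 * [Editorial note, not part of the patch] The bulk/res pattern above: the
 * stitched AES_gcm_encrypt/AES_gcm_decrypt assembly only processes whole
 * 16-byte blocks starting on a block boundary, so any partial block left
 * in the GCM context (gctx->gcm.mres) is first drained through the
 * generic path. E.g. with mres = 5, res = (16 - 5) % 16 = 11 bytes go
 * through CRYPTO_gcm128_encrypt, the stitched code then handles the
 * aligned middle portion and returns the byte count it consumed (bulk),
 * and the ctr32/generic call finishes the remaining len - bulk tail.
 */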
@@ -983,7 +1594,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
}
-# define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+# define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
| EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
| EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
| EVP_CIPH_CUSTOM_COPY)
@@ -1032,47 +1643,69 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (key)
do {
-# ifdef AES_XTS_ASM
+# ifdef AES_XTS_ASM
xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
-# else
+# else
xctx->stream = NULL;
-# endif
+# endif
/* key_len is two AES keys */
-# ifdef BSAES_CAPABLE
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ if (enc) {
+ HWAES_set_encrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWAES_encrypt;
+ } else {
+ HWAES_set_decrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWAES_decrypt;
+ }
+
+ HWAES_set_encrypt_key(key + ctx->key_len / 2,
+ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) HWAES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ } else
+# endif
+# ifdef BSAES_CAPABLE
if (BSAES_CAPABLE)
xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
else
-# endif
-# ifdef VPAES_CAPABLE
+# endif
+# ifdef VPAES_CAPABLE
if (VPAES_CAPABLE) {
if (enc) {
- vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ vpaes_set_encrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
xctx->xts.block1 = (block128_f) vpaes_encrypt;
} else {
- vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ vpaes_set_decrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
xctx->xts.block1 = (block128_f) vpaes_decrypt;
}
vpaes_set_encrypt_key(key + ctx->key_len / 2,
- ctx->key_len * 4, &xctx->ks2);
+ ctx->key_len * 4, &xctx->ks2.ks);
xctx->xts.block2 = (block128_f) vpaes_encrypt;
xctx->xts.key1 = &xctx->ks1;
break;
} else
-# endif
+# endif
(void)0; /* terminate potentially open 'else' */
if (enc) {
- AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f) AES_encrypt;
} else {
- AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f) AES_decrypt;
}
AES_set_encrypt_key(key + ctx->key_len / 2,
- ctx->key_len * 4, &xctx->ks2);
+ ctx->key_len * 4, &xctx->ks2.ks);
xctx->xts.block2 = (block128_f) AES_encrypt;
xctx->xts.key1 = &xctx->ks1;
@@ -1094,14 +1727,6 @@ static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
return 0;
if (!out || !in || len < AES_BLOCK_SIZE)
return 0;
-# ifdef OPENSSL_FIPS
- /* Requirement of SP800-38E */
- if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
- (len > (1UL << 20) * 16)) {
- EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
- return 0;
- }
-# endif
if (xctx->stream)
(*xctx->stream) (in, out, len,
xctx->xts.key1, xctx->xts.key2, ctx->iv);
@@ -1111,9 +1736,9 @@ static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
return 1;
}
-# define aes_xts_cleanup NULL
+# define aes_xts_cleanup NULL
-# define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+# define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
| EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
| EVP_CIPH_CUSTOM_COPY)
@@ -1191,17 +1816,28 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
if (key)
do {
-# ifdef VPAES_CAPABLE
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ HWAES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f) HWAES_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ break;
+ } else
+# endif
+# ifdef VPAES_CAPABLE
if (VPAES_CAPABLE) {
- vpaes_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ vpaes_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
&cctx->ks, (block128_f) vpaes_encrypt);
cctx->str = NULL;
cctx->key_set = 1;
break;
}
-# endif
- AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+# endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
&cctx->ks, (block128_f) AES_encrypt);
cctx->str = NULL;
@@ -1274,7 +1910,7 @@ static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
}
-# define aes_ccm_cleanup NULL
+# define aes_ccm_cleanup NULL
BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, ccm, CCM,
EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
@@ -1282,5 +1918,107 @@ BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, ccm, CCM,
EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
BLOCK_CIPHER_custom(NID_aes, 256, 1, 12, ccm, CCM,
EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
-# endif
#endif
+typedef struct {
+ union {
+ double align;
+ AES_KEY ks;
+ } ks;
+ /* Indicates if IV has been set */
+ unsigned char *iv;
+} EVP_AES_WRAP_CTX;
+
+static int aes_wrap_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_WRAP_CTX *wctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) {
+ if (ctx->encrypt)
+ AES_set_encrypt_key(key, ctx->key_len * 8, &wctx->ks.ks);
+ else
+ AES_set_decrypt_key(key, ctx->key_len * 8, &wctx->ks.ks);
+ if (!iv)
+ wctx->iv = NULL;
+ }
+ if (iv) {
+ memcpy(ctx->iv, iv, 8);
+ wctx->iv = ctx->iv;
+ }
+ return 1;
+}
+
+static int aes_wrap_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t inlen)
+{
+ EVP_AES_WRAP_CTX *wctx = ctx->cipher_data;
+ size_t rv;
+ if (!in)
+ return 0;
+ if (inlen % 8)
+ return -1;
+ if (ctx->encrypt && inlen < 8)
+ return -1;
+ if (!ctx->encrypt && inlen < 16)
+ return -1;
+ if (!out) {
+ if (ctx->encrypt)
+ return inlen + 8;
+ else
+ return inlen - 8;
+ }
+ if (ctx->encrypt)
+ rv = CRYPTO_128_wrap(&wctx->ks.ks, wctx->iv, out, in, inlen,
+ (block128_f) AES_encrypt);
+ else
+ rv = CRYPTO_128_unwrap(&wctx->ks.ks, wctx->iv, out, in, inlen,
+ (block128_f) AES_decrypt);
+ return rv ? (int)rv : -1;
+}
+
+#define WRAP_FLAGS (EVP_CIPH_WRAP_MODE \
+ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_FLAG_DEFAULT_ASN1)
+
+static const EVP_CIPHER aes_128_wrap = {
+ NID_id_aes128_wrap,
+ 8, 16, 8, WRAP_FLAGS,
+ aes_wrap_init_key, aes_wrap_cipher,
+ NULL,
+ sizeof(EVP_AES_WRAP_CTX),
+ NULL, NULL, NULL, NULL
+};
+
+const EVP_CIPHER *EVP_aes_128_wrap(void)
+{
+ return &aes_128_wrap;
+}
+
+static const EVP_CIPHER aes_192_wrap = {
+ NID_id_aes192_wrap,
+ 8, 24, 8, WRAP_FLAGS,
+ aes_wrap_init_key, aes_wrap_cipher,
+ NULL,
+ sizeof(EVP_AES_WRAP_CTX),
+ NULL, NULL, NULL, NULL
+};
+
+const EVP_CIPHER *EVP_aes_192_wrap(void)
+{
+ return &aes_192_wrap;
+}
+
+static const EVP_CIPHER aes_256_wrap = {
+ NID_id_aes256_wrap,
+ 8, 32, 8, WRAP_FLAGS,
+ aes_wrap_init_key, aes_wrap_cipher,
+ NULL,
+ sizeof(EVP_AES_WRAP_CTX),
+ NULL, NULL, NULL, NULL
+};
+
+const EVP_CIPHER *EVP_aes_256_wrap(void)
+{
+ return &aes_256_wrap;
+}
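/*
 * [Editorial example, not part of the patch] A minimal sketch of driving
 * the new wrap ciphers through EVP, assuming the 1.0.2-era interface.
 * Wrap-mode ciphers refuse to operate unless the caller opts in via
 * EVP_CIPHER_CTX_FLAG_WRAP_ALLOW, and the wrapped output is always 8
 * bytes longer than the input (which must be a multiple of 8). Passing a
 * NULL IV selects the default RFC 3394 IV. wrap_with_kek is a
 * hypothetical helper; error handling is trimmed for brevity.
 */
#include <openssl/evp.h>

static int wrap_with_kek(const unsigned char kek[16],
                         const unsigned char *key, int keylen,
                         unsigned char *out /* keylen + 8 bytes */)
{
    EVP_CIPHER_CTX ctx;
    int outlen, ok;

    EVP_CIPHER_CTX_init(&ctx);
    EVP_CIPHER_CTX_set_flags(&ctx, EVP_CIPHER_CTX_FLAG_WRAP_ALLOW);
    ok = EVP_EncryptInit_ex(&ctx, EVP_aes_128_wrap(), NULL, kek, NULL)
        && EVP_EncryptUpdate(&ctx, out, &outlen, key, keylen);
    EVP_CIPHER_CTX_cleanup(&ctx);
    return ok ? outlen : 0;     /* outlen == keylen + 8 on success */
}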
diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
index d1f5928f628a..8330964ee16b 100644
--- a/crypto/evp/e_aes_cbc_hmac_sha1.c
+++ b/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -58,7 +58,8 @@
# include <openssl/objects.h>
# include <openssl/aes.h>
# include <openssl/sha.h>
-# include "evp_locl.h"
+# include <openssl/rand.h>
+# include "modes_lcl.h"
# ifndef EVP_CIPH_FLAG_AEAD_CIPHER
# define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
@@ -70,6 +71,10 @@
# define EVP_CIPH_FLAG_DEFAULT_ASN1 0
# endif
+# if !defined(EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK)
+# define EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK 0
+# endif
+
# define TLS1_1_VERSION 0x0302
typedef struct {
@@ -89,11 +94,7 @@ typedef struct {
defined(_M_AMD64) || defined(_M_X64) || \
defined(__INTEL__) )
-# if defined(__GNUC__) && __GNUC__>=2 && !defined(PEDANTIC)
-# define BSWAP(x) ({ unsigned int r=(x); asm ("bswapl %0":"=r"(r):"0"(r)); r; })
-# endif
-
-extern unsigned int OPENSSL_ia32cap_P[2];
+extern unsigned int OPENSSL_ia32cap_P[];
# define AESNI_CAPABLE (1<<(57-32))
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
@@ -110,6 +111,10 @@ void aesni_cbc_sha1_enc(const void *inp, void *out, size_t blocks,
const AES_KEY *key, unsigned char iv[16],
SHA_CTX *ctx, const void *in0);
+void aesni256_cbc_sha1_dec(const void *inp, void *out, size_t blocks,
+ const AES_KEY *key, unsigned char iv[16],
+ SHA_CTX *ctx, const void *in0);
+
# define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
@@ -134,6 +139,7 @@ static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
}
# define STITCHED_CALL
+# undef STITCHED_DECRYPT_CALL
# if !defined(STITCHED_CALL)
# define aes_off 0
@@ -177,6 +183,275 @@ static void sha1_update(SHA_CTX *c, const void *data, size_t len)
# endif
# define SHA1_Update sha1_update
+# if !defined(OPENSSL_NO_MULTIBLOCK) && EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK
+
+typedef struct {
+ unsigned int A[8], B[8], C[8], D[8], E[8];
+} SHA1_MB_CTX;
+typedef struct {
+ const unsigned char *ptr;
+ int blocks;
+} HASH_DESC;
+
+void sha1_multi_block(SHA1_MB_CTX *, const HASH_DESC *, int);
+
+typedef struct {
+ const unsigned char *inp;
+ unsigned char *out;
+ int blocks;
+ u64 iv[2];
+} CIPH_DESC;
+
+void aesni_multi_cbc_encrypt(CIPH_DESC *, void *, int);
+
+static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
+ unsigned char *out,
+ const unsigned char *inp,
+ size_t inp_len, int n4x)
+{ /* n4x is 1 or 2 */
+ HASH_DESC hash_d[8], edges[8];
+ CIPH_DESC ciph_d[8];
+ unsigned char storage[sizeof(SHA1_MB_CTX) + 32];
+ union {
+ u64 q[16];
+ u32 d[32];
+ u8 c[128];
+ } blocks[8];
+ SHA1_MB_CTX *ctx;
+ unsigned int frag, last, packlen, i, x4 = 4 * n4x, minblocks, processed =
+ 0;
+ size_t ret = 0;
+ u8 *IVs;
+# if defined(BSWAP8)
+ u64 seqnum;
+# endif
+
+ /* ask for IVs in bulk */
+ if (RAND_bytes((IVs = blocks[0].c), 16 * x4) <= 0)
+ return 0;
+
+ ctx = (SHA1_MB_CTX *) (storage + 32 - ((size_t)storage % 32)); /* align */
+
+ frag = (unsigned int)inp_len >> (1 + n4x);
+ last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
+ if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
+ frag++;
+ last -= x4 - 1;
+ }
+
+ packlen = 5 + 16 + ((frag + 20 + 16) & -16);
+
+ /* populate descriptors with pointers and IVs */
+ hash_d[0].ptr = inp;
+ ciph_d[0].inp = inp;
+ /* 5+16 is place for header and explicit IV */
+ ciph_d[0].out = out + 5 + 16;
+ memcpy(ciph_d[0].out - 16, IVs, 16);
+ memcpy(ciph_d[0].iv, IVs, 16);
+ IVs += 16;
+
+ for (i = 1; i < x4; i++) {
+ ciph_d[i].inp = hash_d[i].ptr = hash_d[i - 1].ptr + frag;
+ ciph_d[i].out = ciph_d[i - 1].out + packlen;
+ memcpy(ciph_d[i].out - 16, IVs, 16);
+ memcpy(ciph_d[i].iv, IVs, 16);
+ IVs += 16;
+ }
+
+# if defined(BSWAP8)
+ memcpy(blocks[0].c, key->md.data, 8);
+ seqnum = BSWAP8(blocks[0].q[0]);
+# endif
+ for (i = 0; i < x4; i++) {
+ unsigned int len = (i == (x4 - 1) ? last : frag);
+# if !defined(BSWAP8)
+ unsigned int carry, j;
+# endif
+
+ ctx->A[i] = key->md.h0;
+ ctx->B[i] = key->md.h1;
+ ctx->C[i] = key->md.h2;
+ ctx->D[i] = key->md.h3;
+ ctx->E[i] = key->md.h4;
+
+ /* fix seqnum */
+# if defined(BSWAP8)
+ blocks[i].q[0] = BSWAP8(seqnum + i);
+# else
+ for (carry = i, j = 8; j--;) {
+ blocks[i].c[j] = ((u8 *)key->md.data)[j] + carry;
+ carry = (blocks[i].c[j] - carry) >> (sizeof(carry) * 8 - 1);
+ }
+# endif
+ blocks[i].c[8] = ((u8 *)key->md.data)[8];
+ blocks[i].c[9] = ((u8 *)key->md.data)[9];
+ blocks[i].c[10] = ((u8 *)key->md.data)[10];
+ /* fix length */
+ blocks[i].c[11] = (u8)(len >> 8);
+ blocks[i].c[12] = (u8)(len);
+
+ memcpy(blocks[i].c + 13, hash_d[i].ptr, 64 - 13);
+ hash_d[i].ptr += 64 - 13;
+ hash_d[i].blocks = (len - (64 - 13)) / 64;
+
+ edges[i].ptr = blocks[i].c;
+ edges[i].blocks = 1;
+ }
+
+ /* hash 13-byte headers and first 64-13 bytes of inputs */
+ sha1_multi_block(ctx, edges, n4x);
+ /* hash bulk inputs */
+# define MAXCHUNKSIZE 2048
+# if MAXCHUNKSIZE%64
+# error "MAXCHUNKSIZE is not divisible by 64"
+# elif MAXCHUNKSIZE
+ /*
+ * goal is to minimize pressure on L1 cache by moving in shorter steps,
+ * so that hashed data is still in the cache by the time we encrypt it
+ */
+ minblocks = ((frag <= last ? frag : last) - (64 - 13)) / 64;
+ if (minblocks > MAXCHUNKSIZE / 64) {
+ for (i = 0; i < x4; i++) {
+ edges[i].ptr = hash_d[i].ptr;
+ edges[i].blocks = MAXCHUNKSIZE / 64;
+ ciph_d[i].blocks = MAXCHUNKSIZE / 16;
+ }
+ do {
+ sha1_multi_block(ctx, edges, n4x);
+ aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);
+
+ for (i = 0; i < x4; i++) {
+ edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
+ hash_d[i].blocks -= MAXCHUNKSIZE / 64;
+ edges[i].blocks = MAXCHUNKSIZE / 64;
+ ciph_d[i].inp += MAXCHUNKSIZE;
+ ciph_d[i].out += MAXCHUNKSIZE;
+ ciph_d[i].blocks = MAXCHUNKSIZE / 16;
+ memcpy(ciph_d[i].iv, ciph_d[i].out - 16, 16);
+ }
+ processed += MAXCHUNKSIZE;
+ minblocks -= MAXCHUNKSIZE / 64;
+ } while (minblocks > MAXCHUNKSIZE / 64);
+ }
+# endif
+# undef MAXCHUNKSIZE
+ sha1_multi_block(ctx, hash_d, n4x);
+
+ memset(blocks, 0, sizeof(blocks));
+ for (i = 0; i < x4; i++) {
+ unsigned int len = (i == (x4 - 1) ? last : frag),
+ off = hash_d[i].blocks * 64;
+ const unsigned char *ptr = hash_d[i].ptr + off;
+
+ off = (len - processed) - (64 - 13) - off; /* remainder actually */
+ memcpy(blocks[i].c, ptr, off);
+ blocks[i].c[off] = 0x80;
+ len += 64 + 13; /* 64 is HMAC header */
+ len *= 8; /* convert to bits */
+ if (off < (64 - 8)) {
+# ifdef BSWAP4
+ blocks[i].d[15] = BSWAP4(len);
+# else
+ PUTU32(blocks[i].c + 60, len);
+# endif
+ edges[i].blocks = 1;
+ } else {
+# ifdef BSWAP4
+ blocks[i].d[31] = BSWAP4(len);
+# else
+ PUTU32(blocks[i].c + 124, len);
+# endif
+ edges[i].blocks = 2;
+ }
+ edges[i].ptr = blocks[i].c;
+ }
+
+ /* hash input tails and finalize */
+ sha1_multi_block(ctx, edges, n4x);
+
+ memset(blocks, 0, sizeof(blocks));
+ for (i = 0; i < x4; i++) {
+# ifdef BSWAP4
+ blocks[i].d[0] = BSWAP4(ctx->A[i]);
+ ctx->A[i] = key->tail.h0;
+ blocks[i].d[1] = BSWAP4(ctx->B[i]);
+ ctx->B[i] = key->tail.h1;
+ blocks[i].d[2] = BSWAP4(ctx->C[i]);
+ ctx->C[i] = key->tail.h2;
+ blocks[i].d[3] = BSWAP4(ctx->D[i]);
+ ctx->D[i] = key->tail.h3;
+ blocks[i].d[4] = BSWAP4(ctx->E[i]);
+ ctx->E[i] = key->tail.h4;
+ blocks[i].c[20] = 0x80;
+ blocks[i].d[15] = BSWAP4((64 + 20) * 8);
+# else
+ PUTU32(blocks[i].c + 0, ctx->A[i]);
+ ctx->A[i] = key->tail.h0;
+ PUTU32(blocks[i].c + 4, ctx->B[i]);
+ ctx->B[i] = key->tail.h1;
+ PUTU32(blocks[i].c + 8, ctx->C[i]);
+ ctx->C[i] = key->tail.h2;
+ PUTU32(blocks[i].c + 12, ctx->D[i]);
+ ctx->D[i] = key->tail.h3;
+ PUTU32(blocks[i].c + 16, ctx->E[i]);
+ ctx->E[i] = key->tail.h4;
+ blocks[i].c[20] = 0x80;
+ PUTU32(blocks[i].c + 60, (64 + 20) * 8);
+# endif
+ edges[i].ptr = blocks[i].c;
+ edges[i].blocks = 1;
+ }
+
+ /* finalize MACs */
+ sha1_multi_block(ctx, edges, n4x);
+
+ for (i = 0; i < x4; i++) {
+ unsigned int len = (i == (x4 - 1) ? last : frag), pad, j;
+ unsigned char *out0 = out;
+
+ memcpy(ciph_d[i].out, ciph_d[i].inp, len - processed);
+ ciph_d[i].inp = ciph_d[i].out;
+
+ out += 5 + 16 + len;
+
+ /* write MAC */
+ PUTU32(out + 0, ctx->A[i]);
+ PUTU32(out + 4, ctx->B[i]);
+ PUTU32(out + 8, ctx->C[i]);
+ PUTU32(out + 12, ctx->D[i]);
+ PUTU32(out + 16, ctx->E[i]);
+ out += 20;
+ len += 20;
+
+ /* pad */
+ pad = 15 - len % 16;
+ for (j = 0; j <= pad; j++)
+ *(out++) = pad;
+ len += pad + 1;
+
+ ciph_d[i].blocks = (len - processed) / 16;
+ len += 16; /* account for explicit iv */
+
+ /* arrange header */
+ out0[0] = ((u8 *)key->md.data)[8];
+ out0[1] = ((u8 *)key->md.data)[9];
+ out0[2] = ((u8 *)key->md.data)[10];
+ out0[3] = (u8)(len >> 8);
+ out0[4] = (u8)(len);
+
+ ret += len + 5;
+ inp += frag;
+ }
+
+ aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);
+
+ OPENSSL_cleanse(blocks, sizeof(blocks));
+ OPENSSL_cleanse(ctx, sizeof(*ctx));
+
+ return ret;
+}
+# endif
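/*
 * [Editorial note, not part of the patch] Worked example of the sizing
 * arithmetic above, assuming inp_len = 8192 bytes and n4x = 2 (the AVX2
 * path, x4 = 8 interleaved records):
 *
 *   frag    = 8192 >> 3                  = 1024 bytes per record
 *   last    = 8192 + 1024 - (1024 << 3)  = 1024 (no tail adjustment,
 *             since (1024 + 13 + 9) % 64 = 22 is not below x4 - 1 = 7)
 *   packlen = 5 + 16 + ((1024 + 20 + 16) & -16)
 *           = 5 + 16 + 1056              = 1077 bytes on the wire
 *
 * i.e. each record carries a 5-byte header, a 16-byte explicit IV, and
 * the payload plus 20-byte SHA-1 MAC padded up to a 16-byte AES boundary.
 * The MAXCHUNKSIZE = 2048 stepping then hashes/encrypts 32 SHA blocks
 * (128 AES blocks) per record per pass to keep data resident in L1.
 */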
+
static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len)
{
@@ -257,10 +532,7 @@ static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
/* arrange cache line alignment */
pmac = (void *)(((size_t)mac.c + 31) & ((size_t)0 - 32));
- /* decrypt HMAC|padding at once */
- aesni_cbc_encrypt(in, out, len, &key->ks, ctx->iv, 0);
-
- if (plen) { /* "TLS" mode of operation */
+ if (plen != NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */
size_t inp_len, mask, j, i;
unsigned int res, maxpad, pad, bitlen;
int ret = 1;
@@ -268,17 +540,37 @@ static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
unsigned int u[SHA_LBLOCK];
unsigned char c[SHA_CBLOCK];
} *data = (void *)key->md.data;
+# if defined(STITCHED_DECRYPT_CALL)
+ unsigned char tail_iv[AES_BLOCK_SIZE];
+ int stitch = 0;
+# endif
if ((key->aux.tls_aad[plen - 4] << 8 | key->aux.tls_aad[plen - 3])
- >= TLS1_1_VERSION)
- iv = AES_BLOCK_SIZE;
-
- if (len < (iv + SHA_DIGEST_LENGTH + 1))
+ >= TLS1_1_VERSION) {
+ if (len < (AES_BLOCK_SIZE + SHA_DIGEST_LENGTH + 1))
+ return 0;
+
+ /* omit explicit iv */
+ memcpy(ctx->iv, in, AES_BLOCK_SIZE);
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ len -= AES_BLOCK_SIZE;
+ } else if (len < (SHA_DIGEST_LENGTH + 1))
return 0;
- /* omit explicit iv */
- out += iv;
- len -= iv;
+# if defined(STITCHED_DECRYPT_CALL)
+ if (len >= 1024 && ctx->key_len == 32) {
+ /* decrypt last block */
+ memcpy(tail_iv, in + len - 2 * AES_BLOCK_SIZE,
+ AES_BLOCK_SIZE);
+ aesni_cbc_encrypt(in + len - AES_BLOCK_SIZE,
+ out + len - AES_BLOCK_SIZE, AES_BLOCK_SIZE,
+ &key->ks, tail_iv, 0);
+ stitch = 1;
+ } else
+# endif
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in, out, len, &key->ks, ctx->iv, 0);
/* figure out payload length */
pad = out[len - 1];
@@ -298,6 +590,29 @@ static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
key->md = key->head;
SHA1_Update(&key->md, key->aux.tls_aad, plen);
+# if defined(STITCHED_DECRYPT_CALL)
+ if (stitch) {
+ blocks = (len - (256 + 32 + SHA_CBLOCK)) / SHA_CBLOCK;
+ aes_off = len - AES_BLOCK_SIZE - blocks * SHA_CBLOCK;
+ sha_off = SHA_CBLOCK - plen;
+
+ aesni_cbc_encrypt(in, out, aes_off, &key->ks, ctx->iv, 0);
+
+ SHA1_Update(&key->md, out, sha_off);
+ aesni256_cbc_sha1_dec(in + aes_off,
+ out + aes_off, blocks, &key->ks,
+ ctx->iv, &key->md, out + sha_off);
+
+ sha_off += blocks *= SHA_CBLOCK;
+ out += sha_off;
+ len -= sha_off;
+ inp_len -= sha_off;
+
+ key->md.Nl += (blocks << 3); /* at most 18 bits */
+ memcpy(ctx->iv, tail_iv, AES_BLOCK_SIZE);
+ }
+# endif
+
# if 1
len -= SHA_DIGEST_LENGTH; /* amend mac */
if (len >= (256 + SHA_CBLOCK)) {
@@ -311,8 +626,8 @@ static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
/* but pretend as if we hashed padded payload */
bitlen = key->md.Nl + (inp_len << 3); /* at most 18 bits */
-# ifdef BSWAP
- bitlen = BSWAP(bitlen);
+# ifdef BSWAP4
+ bitlen = BSWAP4(bitlen);
# else
mac.c[0] = 0;
mac.c[1] = (unsigned char)(bitlen >> 16);
@@ -376,12 +691,12 @@ static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
pmac->u[3] |= key->md.h3 & mask;
pmac->u[4] |= key->md.h4 & mask;
-# ifdef BSWAP
- pmac->u[0] = BSWAP(pmac->u[0]);
- pmac->u[1] = BSWAP(pmac->u[1]);
- pmac->u[2] = BSWAP(pmac->u[2]);
- pmac->u[3] = BSWAP(pmac->u[3]);
- pmac->u[4] = BSWAP(pmac->u[4]);
+# ifdef BSWAP4
+ pmac->u[0] = BSWAP4(pmac->u[0]);
+ pmac->u[1] = BSWAP4(pmac->u[1]);
+ pmac->u[2] = BSWAP4(pmac->u[2]);
+ pmac->u[3] = BSWAP4(pmac->u[3]);
+ pmac->u[4] = BSWAP4(pmac->u[4]);
# else
for (i = 0; i < 5; i++) {
res = pmac->u[i];
@@ -458,6 +773,33 @@ static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
# endif
return ret;
} else {
+# if defined(STITCHED_DECRYPT_CALL)
+ if (len >= 1024 && ctx->key_len == 32) {
+ if (sha_off %= SHA_CBLOCK)
+ blocks = (len - 3 * SHA_CBLOCK) / SHA_CBLOCK;
+ else
+ blocks = (len - 2 * SHA_CBLOCK) / SHA_CBLOCK;
+ aes_off = len - blocks * SHA_CBLOCK;
+
+ aesni_cbc_encrypt(in, out, aes_off, &key->ks, ctx->iv, 0);
+ SHA1_Update(&key->md, out, sha_off);
+ aesni256_cbc_sha1_dec(in + aes_off,
+ out + aes_off, blocks, &key->ks,
+ ctx->iv, &key->md, out + sha_off);
+
+ sha_off += blocks *= SHA_CBLOCK;
+ out += sha_off;
+ len -= sha_off;
+
+ key->md.Nh += blocks >> 29;
+ key->md.Nl += blocks <<= 3;
+ if (key->md.Nl < (unsigned int)blocks)
+ key->md.Nh++;
+ } else
+# endif
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in, out, len, &key->ks, ctx->iv, 0);
+
SHA1_Update(&key->md, out, len);
}
}
@@ -531,6 +873,70 @@ static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg,
return SHA_DIGEST_LENGTH;
}
}
+# if !defined(OPENSSL_NO_MULTIBLOCK) && EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_MAX_BUFSIZE:
+ return (int)(5 + 16 + ((arg + 20 + 16) & -16));
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_AAD:
+ {
+ EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *param =
+ (EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *) ptr;
+ unsigned int n4x = 1, x4;
+ unsigned int frag, last, packlen, inp_len;
+
+ if (arg < (int)sizeof(EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM))
+ return -1;
+
+ inp_len = param->inp[11] << 8 | param->inp[12];
+
+ if (ctx->encrypt) {
+ if ((param->inp[9] << 8 | param->inp[10]) < TLS1_1_VERSION)
+ return -1;
+
+ if (inp_len) {
+ if (inp_len < 4096)
+ return 0; /* too short */
+
+ if (inp_len >= 8192 && OPENSSL_ia32cap_P[2] & (1 << 5))
+ n4x = 2; /* AVX2 */
+ } else if ((n4x = param->interleave / 4) && n4x <= 2)
+ inp_len = param->len;
+ else
+ return -1;
+
+ key->md = key->head;
+ SHA1_Update(&key->md, param->inp, 13);
+
+ x4 = 4 * n4x;
+ n4x += 1;
+
+ frag = inp_len >> n4x;
+ last = inp_len + frag - (frag << n4x);
+ if (last > frag && ((last + 13 + 9) % 64 < (x4 - 1))) {
+ frag++;
+ last -= x4 - 1;
+ }
+
+ packlen = 5 + 16 + ((frag + 20 + 16) & -16);
+ packlen = (packlen << n4x) - packlen;
+ packlen += 5 + 16 + ((last + 20 + 16) & -16);
+
+ param->interleave = x4;
+
+ return (int)packlen;
+ } else
+ return -1; /* not yet */
+ }
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_ENCRYPT:
+ {
+ EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *param =
+ (EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *) ptr;
+
+ return (int)tls1_1_multi_block_encrypt(key, param->out,
+ param->inp, param->len,
+ param->interleave / 4);
+ }
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_DECRYPT:
+# endif
default:
return -1;
}
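/*
 * [Editorial sketch, not part of the patch] How a TLS stack might drive
 * the new multiblock ctrls; multiblock_encrypt_sketch is a hypothetical
 * helper for illustration. The 13-byte AAD is the usual TLS MAC header:
 * sequence number, type, version (which must be >= TLS 1.1) and length;
 * writing a zero length field tells the AAD ctrl to size for mb.len
 * bytes split across mb.interleave records instead.
 */
#include <openssl/evp.h>

static int multiblock_encrypt_sketch(EVP_CIPHER_CTX *ctx,
                                     unsigned char aad[13],
                                     const unsigned char *plaintext,
                                     size_t plaintext_len,
                                     unsigned char *out)
{
    EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM mb;
    int packlen;

    aad[11] = aad[12] = 0;      /* zero length: use mb.len/mb.interleave */
    mb.inp = aad;
    mb.len = plaintext_len;
    mb.interleave = 8;          /* request the 8-record (AVX2) layout */
    packlen = EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_TLS1_1_MULTIBLOCK_AAD,
                                  sizeof(mb), &mb);
    if (packlen <= 0)
        return 0;

    mb.out = out;               /* caller guarantees packlen bytes */
    mb.inp = plaintext;
    mb.len = plaintext_len;
    return EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_TLS1_1_MULTIBLOCK_ENCRYPT,
                               sizeof(mb), &mb);
}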
@@ -544,7 +950,7 @@ static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher = {
# endif
16, 16, 16,
EVP_CIPH_CBC_MODE | EVP_CIPH_FLAG_DEFAULT_ASN1 |
- EVP_CIPH_FLAG_AEAD_CIPHER,
+ EVP_CIPH_FLAG_AEAD_CIPHER | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK,
aesni_cbc_hmac_sha1_init_key,
aesni_cbc_hmac_sha1_cipher,
NULL,
@@ -563,7 +969,7 @@ static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher = {
# endif
16, 32, 16,
EVP_CIPH_CBC_MODE | EVP_CIPH_FLAG_DEFAULT_ASN1 |
- EVP_CIPH_FLAG_AEAD_CIPHER,
+ EVP_CIPH_FLAG_AEAD_CIPHER | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK,
aesni_cbc_hmac_sha1_init_key,
aesni_cbc_hmac_sha1_cipher,
NULL,
diff --git a/crypto/evp/e_aes_cbc_hmac_sha256.c b/crypto/evp/e_aes_cbc_hmac_sha256.c
new file mode 100644
index 000000000000..b1c586e6fd96
--- /dev/null
+++ b/crypto/evp/e_aes_cbc_hmac_sha256.c
@@ -0,0 +1,973 @@
+/* ====================================================================
+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA256)
+
+# include <openssl/evp.h>
+# include <openssl/objects.h>
+# include <openssl/aes.h>
+# include <openssl/sha.h>
+# include <openssl/rand.h>
+# include "modes_lcl.h"
+
+# ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+# define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+# define EVP_CTRL_AEAD_TLS1_AAD 0x16
+# define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+# endif
+
+# if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+# define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+# endif
+
+# if !defined(EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK)
+# define EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK 0
+# endif
+
+# define TLS1_1_VERSION 0x0302
+
+typedef struct {
+ AES_KEY ks;
+ SHA256_CTX head, tail, md;
+ size_t payload_length; /* AAD length in decrypt case */
+ union {
+ unsigned int tls_ver;
+ unsigned char tls_aad[16]; /* 13 used */
+ } aux;
+} EVP_AES_HMAC_SHA256;
+
+# define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+# if defined(AES_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+extern unsigned int OPENSSL_ia32cap_P[];
+# define AESNI_CAPABLE (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key, unsigned char *ivec, int enc);
+
+int aesni_cbc_sha256_enc(const void *inp, void *out, size_t blocks,
+ const AES_KEY *key, unsigned char iv[16],
+ SHA256_CTX *ctx, const void *in0);
+
+# define data(ctx) ((EVP_AES_HMAC_SHA256 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha256_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_HMAC_SHA256 *key = data(ctx);
+ int ret;
+
+ if (enc)
+ memset(&key->ks, 0, sizeof(key->ks.rd_key)),
+ ret = aesni_set_encrypt_key(inkey, ctx->key_len * 8, &key->ks);
+ else
+ ret = aesni_set_decrypt_key(inkey, ctx->key_len * 8, &key->ks);
+
+ SHA256_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return ret < 0 ? 0 : 1;
+}
+
+# define STITCHED_CALL
+
+# if !defined(STITCHED_CALL)
+# define aes_off 0
+# endif
+
+void sha256_block_data_order(void *c, const void *p, size_t len);
+
+static void sha256_update(SHA256_CTX *c, const void *data, size_t len)
+{
+ const unsigned char *ptr = data;
+ size_t res;
+
+ if ((res = c->num)) {
+ res = SHA256_CBLOCK - res;
+ if (len < res)
+ res = len;
+ SHA256_Update(c, ptr, res);
+ ptr += res;
+ len -= res;
+ }
+
+ res = len % SHA256_CBLOCK;
+ len -= res;
+
+ if (len) {
+ sha256_block_data_order(c, ptr, len / SHA256_CBLOCK);
+
+ ptr += len;
+ c->Nh += len >> 29;
+ c->Nl += len <<= 3;
+ if (c->Nl < (unsigned int)len)
+ c->Nh++;
+ }
+
+ if (res)
+ SHA256_Update(c, ptr, res);
+}
+
+# ifdef SHA256_Update
+# undef SHA256_Update
+# endif
+# define SHA256_Update sha256_update
+
+# if !defined(OPENSSL_NO_MULTIBLOCK) && EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK
+
+typedef struct {
+ unsigned int A[8], B[8], C[8], D[8], E[8], F[8], G[8], H[8];
+} SHA256_MB_CTX;
+typedef struct {
+ const unsigned char *ptr;
+ int blocks;
+} HASH_DESC;
+
+void sha256_multi_block(SHA256_MB_CTX *, const HASH_DESC *, int);
+
+typedef struct {
+ const unsigned char *inp;
+ unsigned char *out;
+ int blocks;
+ u64 iv[2];
+} CIPH_DESC;
+
+void aesni_multi_cbc_encrypt(CIPH_DESC *, void *, int);
+
+static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
+ unsigned char *out,
+ const unsigned char *inp,
+ size_t inp_len, int n4x)
+{ /* n4x is 1 or 2 */
+ HASH_DESC hash_d[8], edges[8];
+ CIPH_DESC ciph_d[8];
+ unsigned char storage[sizeof(SHA256_MB_CTX) + 32];
+ union {
+ u64 q[16];
+ u32 d[32];
+ u8 c[128];
+ } blocks[8];
+ SHA256_MB_CTX *ctx;
+ unsigned int frag, last, packlen, i, x4 = 4 * n4x, minblocks, processed =
+ 0;
+ size_t ret = 0;
+ u8 *IVs;
+# if defined(BSWAP8)
+ u64 seqnum;
+# endif
+
+ /* ask for IVs in bulk */
+ if (RAND_bytes((IVs = blocks[0].c), 16 * x4) <= 0)
+ return 0;
+
+ /* align */
+ ctx = (SHA256_MB_CTX *) (storage + 32 - ((size_t)storage % 32));
+
+ frag = (unsigned int)inp_len >> (1 + n4x);
+ last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
+ if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
+ frag++;
+ last -= x4 - 1;
+ }
+
+ packlen = 5 + 16 + ((frag + 32 + 16) & -16);
+
+ /* populate descriptors with pointers and IVs */
+ hash_d[0].ptr = inp;
+ ciph_d[0].inp = inp;
+    /* 5+16 bytes are reserved for the record header and explicit IV */
+ ciph_d[0].out = out + 5 + 16;
+ memcpy(ciph_d[0].out - 16, IVs, 16);
+ memcpy(ciph_d[0].iv, IVs, 16);
+ IVs += 16;
+
+ for (i = 1; i < x4; i++) {
+ ciph_d[i].inp = hash_d[i].ptr = hash_d[i - 1].ptr + frag;
+ ciph_d[i].out = ciph_d[i - 1].out + packlen;
+ memcpy(ciph_d[i].out - 16, IVs, 16);
+ memcpy(ciph_d[i].iv, IVs, 16);
+ IVs += 16;
+ }
+
+# if defined(BSWAP8)
+ memcpy(blocks[0].c, key->md.data, 8);
+ seqnum = BSWAP8(blocks[0].q[0]);
+# endif
+ for (i = 0; i < x4; i++) {
+ unsigned int len = (i == (x4 - 1) ? last : frag);
+# if !defined(BSWAP8)
+ unsigned int carry, j;
+# endif
+
+ ctx->A[i] = key->md.h[0];
+ ctx->B[i] = key->md.h[1];
+ ctx->C[i] = key->md.h[2];
+ ctx->D[i] = key->md.h[3];
+ ctx->E[i] = key->md.h[4];
+ ctx->F[i] = key->md.h[5];
+ ctx->G[i] = key->md.h[6];
+ ctx->H[i] = key->md.h[7];
+
+ /* fix seqnum */
+# if defined(BSWAP8)
+ blocks[i].q[0] = BSWAP8(seqnum + i);
+# else
+ for (carry = i, j = 8; j--;) {
+ blocks[i].c[j] = ((u8 *)key->md.data)[j] + carry;
+ carry = (blocks[i].c[j] - carry) >> (sizeof(carry) * 8 - 1);
+ }
+# endif
+ blocks[i].c[8] = ((u8 *)key->md.data)[8];
+ blocks[i].c[9] = ((u8 *)key->md.data)[9];
+ blocks[i].c[10] = ((u8 *)key->md.data)[10];
+ /* fix length */
+ blocks[i].c[11] = (u8)(len >> 8);
+ blocks[i].c[12] = (u8)(len);
+
+ memcpy(blocks[i].c + 13, hash_d[i].ptr, 64 - 13);
+ hash_d[i].ptr += 64 - 13;
+ hash_d[i].blocks = (len - (64 - 13)) / 64;
+
+ edges[i].ptr = blocks[i].c;
+ edges[i].blocks = 1;
+ }
+
+ /* hash 13-byte headers and first 64-13 bytes of inputs */
+ sha256_multi_block(ctx, edges, n4x);
+ /* hash bulk inputs */
+# define MAXCHUNKSIZE 2048
+# if MAXCHUNKSIZE%64
+# error "MAXCHUNKSIZE is not divisible by 64"
+# elif MAXCHUNKSIZE
+ /*
+ * goal is to minimize pressure on L1 cache by moving in shorter steps,
+ * so that hashed data is still in the cache by the time we encrypt it
+ */
+ minblocks = ((frag <= last ? frag : last) - (64 - 13)) / 64;
+ if (minblocks > MAXCHUNKSIZE / 64) {
+ for (i = 0; i < x4; i++) {
+ edges[i].ptr = hash_d[i].ptr;
+ edges[i].blocks = MAXCHUNKSIZE / 64;
+ ciph_d[i].blocks = MAXCHUNKSIZE / 16;
+ }
+ do {
+ sha256_multi_block(ctx, edges, n4x);
+ aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);
+
+ for (i = 0; i < x4; i++) {
+ edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
+ hash_d[i].blocks -= MAXCHUNKSIZE / 64;
+ edges[i].blocks = MAXCHUNKSIZE / 64;
+ ciph_d[i].inp += MAXCHUNKSIZE;
+ ciph_d[i].out += MAXCHUNKSIZE;
+ ciph_d[i].blocks = MAXCHUNKSIZE / 16;
+ memcpy(ciph_d[i].iv, ciph_d[i].out - 16, 16);
+ }
+ processed += MAXCHUNKSIZE;
+ minblocks -= MAXCHUNKSIZE / 64;
+ } while (minblocks > MAXCHUNKSIZE / 64);
+ }
+# endif
+# undef MAXCHUNKSIZE
+ sha256_multi_block(ctx, hash_d, n4x);
+
+ memset(blocks, 0, sizeof(blocks));
+ for (i = 0; i < x4; i++) {
+ unsigned int len = (i == (x4 - 1) ? last : frag),
+ off = hash_d[i].blocks * 64;
+ const unsigned char *ptr = hash_d[i].ptr + off;
+
+ off = (len - processed) - (64 - 13) - off; /* remainder actually */
+ memcpy(blocks[i].c, ptr, off);
+ blocks[i].c[off] = 0x80;
+ len += 64 + 13; /* 64 for the hashed ipad block, 13 for the TLS header */
+ len *= 8; /* convert to bits */
+ if (off < (64 - 8)) {
+# ifdef BSWAP4
+ blocks[i].d[15] = BSWAP4(len);
+# else
+ PUTU32(blocks[i].c + 60, len);
+# endif
+ edges[i].blocks = 1;
+ } else {
+# ifdef BSWAP4
+ blocks[i].d[31] = BSWAP4(len);
+# else
+ PUTU32(blocks[i].c + 124, len);
+# endif
+ edges[i].blocks = 2;
+ }
+ edges[i].ptr = blocks[i].c;
+ }
+
+ /* hash input tails and finalize */
+ sha256_multi_block(ctx, edges, n4x);
+
+ memset(blocks, 0, sizeof(blocks));
+ for (i = 0; i < x4; i++) {
+# ifdef BSWAP4
+ blocks[i].d[0] = BSWAP4(ctx->A[i]);
+ ctx->A[i] = key->tail.h[0];
+ blocks[i].d[1] = BSWAP4(ctx->B[i]);
+ ctx->B[i] = key->tail.h[1];
+ blocks[i].d[2] = BSWAP4(ctx->C[i]);
+ ctx->C[i] = key->tail.h[2];
+ blocks[i].d[3] = BSWAP4(ctx->D[i]);
+ ctx->D[i] = key->tail.h[3];
+ blocks[i].d[4] = BSWAP4(ctx->E[i]);
+ ctx->E[i] = key->tail.h[4];
+ blocks[i].d[5] = BSWAP4(ctx->F[i]);
+ ctx->F[i] = key->tail.h[5];
+ blocks[i].d[6] = BSWAP4(ctx->G[i]);
+ ctx->G[i] = key->tail.h[6];
+ blocks[i].d[7] = BSWAP4(ctx->H[i]);
+ ctx->H[i] = key->tail.h[7];
+ blocks[i].c[32] = 0x80;
+ blocks[i].d[15] = BSWAP4((64 + 32) * 8);
+# else
+ PUTU32(blocks[i].c + 0, ctx->A[i]);
+ ctx->A[i] = key->tail.h[0];
+ PUTU32(blocks[i].c + 4, ctx->B[i]);
+ ctx->B[i] = key->tail.h[1];
+ PUTU32(blocks[i].c + 8, ctx->C[i]);
+ ctx->C[i] = key->tail.h[2];
+ PUTU32(blocks[i].c + 12, ctx->D[i]);
+ ctx->D[i] = key->tail.h[3];
+ PUTU32(blocks[i].c + 16, ctx->E[i]);
+ ctx->E[i] = key->tail.h[4];
+ PUTU32(blocks[i].c + 20, ctx->F[i]);
+ ctx->F[i] = key->tail.h[5];
+ PUTU32(blocks[i].c + 24, ctx->G[i]);
+ ctx->G[i] = key->tail.h[6];
+ PUTU32(blocks[i].c + 28, ctx->H[i]);
+ ctx->H[i] = key->tail.h[7];
+ blocks[i].c[32] = 0x80;
+ PUTU32(blocks[i].c + 60, (64 + 32) * 8);
+# endif
+ edges[i].ptr = blocks[i].c;
+ edges[i].blocks = 1;
+ }
+
+ /* finalize MACs */
+ sha256_multi_block(ctx, edges, n4x);
+
+ for (i = 0; i < x4; i++) {
+ unsigned int len = (i == (x4 - 1) ? last : frag), pad, j;
+ unsigned char *out0 = out;
+
+ memcpy(ciph_d[i].out, ciph_d[i].inp, len - processed);
+ ciph_d[i].inp = ciph_d[i].out;
+
+ out += 5 + 16 + len;
+
+ /* write MAC */
+ PUTU32(out + 0, ctx->A[i]);
+ PUTU32(out + 4, ctx->B[i]);
+ PUTU32(out + 8, ctx->C[i]);
+ PUTU32(out + 12, ctx->D[i]);
+ PUTU32(out + 16, ctx->E[i]);
+ PUTU32(out + 20, ctx->F[i]);
+ PUTU32(out + 24, ctx->G[i]);
+ PUTU32(out + 28, ctx->H[i]);
+ out += 32;
+ len += 32;
+
+ /* pad */
+ pad = 15 - len % 16;
+ for (j = 0; j <= pad; j++)
+ *(out++) = pad;
+ len += pad + 1;
+
+ ciph_d[i].blocks = (len - processed) / 16;
+ len += 16; /* account for explicit iv */
+
+ /* arrange header */
+ out0[0] = ((u8 *)key->md.data)[8];
+ out0[1] = ((u8 *)key->md.data)[9];
+ out0[2] = ((u8 *)key->md.data)[10];
+ out0[3] = (u8)(len >> 8);
+ out0[4] = (u8)(len);
+
+ ret += len + 5;
+ inp += frag;
+ }
+
+ aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);
+
+ OPENSSL_cleanse(blocks, sizeof(blocks));
+ OPENSSL_cleanse(ctx, sizeof(*ctx));
+
+ return ret;
+}
+# endif
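
/*
 * Illustrative sketch, not part of the patch: the fixed layout one TLS 1.1+
 * AES-CBC-HMAC-SHA256 record occupies in the multi-block output buffer,
 * mirroring "packlen = 5 + 16 + ((frag + 32 + 16) & -16)" above.  The
 * function and program names are made up for illustration.
 */
#include <stdio.h>

static unsigned int tls_record_len(unsigned int frag)
{
    /*
     * 5-byte header + 16-byte explicit IV, then payload || 32-byte MAC
     * padded (always by at least one byte) up to a 16-byte boundary.
     */
    return 5 + 16 + ((frag + 32 + 16) & -16);
}

int main(void)
{
    unsigned int frag;

    for (frag = 1024; frag <= 4096; frag += 1024)
        printf("fragment %4u -> record %4u bytes\n", frag,
               tls_record_len(frag));
    return 0;
}
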
+
+static int aesni_cbc_hmac_sha256_cipher(EVP_CIPHER_CTX *ctx,
+ unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_AES_HMAC_SHA256 *key = data(ctx);
+ unsigned int l;
+ size_t plen = key->payload_length;
+ size_t iv = 0; /* explicit IV in TLS 1.1 and later */
+ size_t sha_off = 0;
+# if defined(STITCHED_CALL)
+ size_t aes_off = 0, blocks;
+
+ sha_off = SHA256_CBLOCK - key->md.num;
+# endif
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ if (len % AES_BLOCK_SIZE)
+ return 0;
+
+ if (ctx->encrypt) {
+ if (plen == NO_PAYLOAD_LENGTH)
+ plen = len;
+ else if (len !=
+ ((plen + SHA256_DIGEST_LENGTH +
+ AES_BLOCK_SIZE) & -AES_BLOCK_SIZE))
+ return 0;
+ else if (key->aux.tls_ver >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+# if defined(STITCHED_CALL)
+ if (OPENSSL_ia32cap_P[1] & (1 << (60 - 32)) && /* AVX? */
+ plen > (sha_off + iv) &&
+ (blocks = (plen - (sha_off + iv)) / SHA256_CBLOCK)) {
+ SHA256_Update(&key->md, in + iv, sha_off);
+
+ (void)aesni_cbc_sha256_enc(in, out, blocks, &key->ks,
+ ctx->iv, &key->md, in + iv + sha_off);
+ blocks *= SHA256_CBLOCK;
+ aes_off += blocks;
+ sha_off += blocks;
+ key->md.Nh += blocks >> 29;
+ key->md.Nl += blocks <<= 3;
+ if (key->md.Nl < (unsigned int)blocks)
+ key->md.Nh++;
+ } else {
+ sha_off = 0;
+ }
+# endif
+ sha_off += iv;
+ SHA256_Update(&key->md, in + sha_off, plen - sha_off);
+
+ if (plen != len) { /* "TLS" mode of operation */
+ if (in != out)
+ memcpy(out + aes_off, in + aes_off, plen - aes_off);
+
+ /* calculate HMAC and append it to payload */
+ SHA256_Final(out + plen, &key->md);
+ key->md = key->tail;
+ SHA256_Update(&key->md, out + plen, SHA256_DIGEST_LENGTH);
+ SHA256_Final(out + plen, &key->md);
+
+ /* pad the payload|hmac */
+ plen += SHA256_DIGEST_LENGTH;
+ for (l = len - plen - 1; plen < len; plen++)
+ out[plen] = l;
+ /* encrypt HMAC|padding at once */
+ aesni_cbc_encrypt(out + aes_off, out + aes_off, len - aes_off,
+ &key->ks, ctx->iv, 1);
+ } else {
+ aesni_cbc_encrypt(in + aes_off, out + aes_off, len - aes_off,
+ &key->ks, ctx->iv, 1);
+ }
+ } else {
+ union {
+ unsigned int u[SHA256_DIGEST_LENGTH / sizeof(unsigned int)];
+ unsigned char c[64 + SHA256_DIGEST_LENGTH];
+ } mac, *pmac;
+
+ /* arrange cache line alignment */
+ pmac = (void *)(((size_t)mac.c + 63) & ((size_t)0 - 64));
+
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in, out, len, &key->ks, ctx->iv, 0);
+
+ if (plen != NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */
+ size_t inp_len, mask, j, i;
+ unsigned int res, maxpad, pad, bitlen;
+ int ret = 1;
+ union {
+ unsigned int u[SHA_LBLOCK];
+ unsigned char c[SHA256_CBLOCK];
+ } *data = (void *)key->md.data;
+
+ if ((key->aux.tls_aad[plen - 4] << 8 | key->aux.tls_aad[plen - 3])
+ >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+ if (len < (iv + SHA256_DIGEST_LENGTH + 1))
+ return 0;
+
+ /* omit explicit iv */
+ out += iv;
+ len -= iv;
+
+ /* figure out payload length */
+ pad = out[len - 1];
+ maxpad = len - (SHA256_DIGEST_LENGTH + 1);
+ maxpad |= (255 - maxpad) >> (sizeof(maxpad) * 8 - 8);
+ maxpad &= 255;
+
+ inp_len = len - (SHA256_DIGEST_LENGTH + pad + 1);
+ mask = (0 - ((inp_len - len) >> (sizeof(inp_len) * 8 - 1)));
+ inp_len &= mask;
+ ret &= (int)mask;
+
+ key->aux.tls_aad[plen - 2] = inp_len >> 8;
+ key->aux.tls_aad[plen - 1] = inp_len;
+
+ /* calculate HMAC */
+ key->md = key->head;
+ SHA256_Update(&key->md, key->aux.tls_aad, plen);
+
+# if 1
+ len -= SHA256_DIGEST_LENGTH; /* amend mac */
+ if (len >= (256 + SHA256_CBLOCK)) {
+ j = (len - (256 + SHA256_CBLOCK)) & (0 - SHA256_CBLOCK);
+ j += SHA256_CBLOCK - key->md.num;
+ SHA256_Update(&key->md, out, j);
+ out += j;
+ len -= j;
+ inp_len -= j;
+ }
+
+ /* but pretend as if we hashed padded payload */
+ bitlen = key->md.Nl + (inp_len << 3); /* at most 18 bits */
+# ifdef BSWAP4
+ bitlen = BSWAP4(bitlen);
+# else
+ mac.c[0] = 0;
+ mac.c[1] = (unsigned char)(bitlen >> 16);
+ mac.c[2] = (unsigned char)(bitlen >> 8);
+ mac.c[3] = (unsigned char)bitlen;
+ bitlen = mac.u[0];
+# endif
+
+ pmac->u[0] = 0;
+ pmac->u[1] = 0;
+ pmac->u[2] = 0;
+ pmac->u[3] = 0;
+ pmac->u[4] = 0;
+ pmac->u[5] = 0;
+ pmac->u[6] = 0;
+ pmac->u[7] = 0;
+
+ for (res = key->md.num, j = 0; j < len; j++) {
+ size_t c = out[j];
+ mask = (j - inp_len) >> (sizeof(j) * 8 - 8);
+ c &= mask;
+ c |= 0x80 & ~mask & ~((inp_len - j) >> (sizeof(j) * 8 - 8));
+ data->c[res++] = (unsigned char)c;
+
+ if (res != SHA256_CBLOCK)
+ continue;
+
+ /* j is not incremented yet */
+ mask = 0 - ((inp_len + 7 - j) >> (sizeof(j) * 8 - 1));
+ data->u[SHA_LBLOCK - 1] |= bitlen & mask;
+ sha256_block_data_order(&key->md, data, 1);
+ mask &= 0 - ((j - inp_len - 72) >> (sizeof(j) * 8 - 1));
+ pmac->u[0] |= key->md.h[0] & mask;
+ pmac->u[1] |= key->md.h[1] & mask;
+ pmac->u[2] |= key->md.h[2] & mask;
+ pmac->u[3] |= key->md.h[3] & mask;
+ pmac->u[4] |= key->md.h[4] & mask;
+ pmac->u[5] |= key->md.h[5] & mask;
+ pmac->u[6] |= key->md.h[6] & mask;
+ pmac->u[7] |= key->md.h[7] & mask;
+ res = 0;
+ }
+
+ for (i = res; i < SHA256_CBLOCK; i++, j++)
+ data->c[i] = 0;
+
+ if (res > SHA256_CBLOCK - 8) {
+ mask = 0 - ((inp_len + 8 - j) >> (sizeof(j) * 8 - 1));
+ data->u[SHA_LBLOCK - 1] |= bitlen & mask;
+ sha256_block_data_order(&key->md, data, 1);
+ mask &= 0 - ((j - inp_len - 73) >> (sizeof(j) * 8 - 1));
+ pmac->u[0] |= key->md.h[0] & mask;
+ pmac->u[1] |= key->md.h[1] & mask;
+ pmac->u[2] |= key->md.h[2] & mask;
+ pmac->u[3] |= key->md.h[3] & mask;
+ pmac->u[4] |= key->md.h[4] & mask;
+ pmac->u[5] |= key->md.h[5] & mask;
+ pmac->u[6] |= key->md.h[6] & mask;
+ pmac->u[7] |= key->md.h[7] & mask;
+
+ memset(data, 0, SHA256_CBLOCK);
+ j += 64;
+ }
+ data->u[SHA_LBLOCK - 1] = bitlen;
+ sha256_block_data_order(&key->md, data, 1);
+ mask = 0 - ((j - inp_len - 73) >> (sizeof(j) * 8 - 1));
+ pmac->u[0] |= key->md.h[0] & mask;
+ pmac->u[1] |= key->md.h[1] & mask;
+ pmac->u[2] |= key->md.h[2] & mask;
+ pmac->u[3] |= key->md.h[3] & mask;
+ pmac->u[4] |= key->md.h[4] & mask;
+ pmac->u[5] |= key->md.h[5] & mask;
+ pmac->u[6] |= key->md.h[6] & mask;
+ pmac->u[7] |= key->md.h[7] & mask;
+
+# ifdef BSWAP4
+ pmac->u[0] = BSWAP4(pmac->u[0]);
+ pmac->u[1] = BSWAP4(pmac->u[1]);
+ pmac->u[2] = BSWAP4(pmac->u[2]);
+ pmac->u[3] = BSWAP4(pmac->u[3]);
+ pmac->u[4] = BSWAP4(pmac->u[4]);
+ pmac->u[5] = BSWAP4(pmac->u[5]);
+ pmac->u[6] = BSWAP4(pmac->u[6]);
+ pmac->u[7] = BSWAP4(pmac->u[7]);
+# else
+ for (i = 0; i < 8; i++) {
+ res = pmac->u[i];
+ pmac->c[4 * i + 0] = (unsigned char)(res >> 24);
+ pmac->c[4 * i + 1] = (unsigned char)(res >> 16);
+ pmac->c[4 * i + 2] = (unsigned char)(res >> 8);
+ pmac->c[4 * i + 3] = (unsigned char)res;
+ }
+# endif
+ len += SHA256_DIGEST_LENGTH;
+# else
+ SHA256_Update(&key->md, out, inp_len);
+ res = key->md.num;
+ SHA256_Final(pmac->c, &key->md);
+
+ {
+ unsigned int inp_blocks, pad_blocks;
+
+ /* but pretend as if we hashed padded payload */
+ inp_blocks =
+ 1 + ((SHA256_CBLOCK - 9 - res) >> (sizeof(res) * 8 - 1));
+ res += (unsigned int)(len - inp_len);
+ pad_blocks = res / SHA256_CBLOCK;
+ res %= SHA256_CBLOCK;
+ pad_blocks +=
+ 1 + ((SHA256_CBLOCK - 9 - res) >> (sizeof(res) * 8 - 1));
+ for (; inp_blocks < pad_blocks; inp_blocks++)
+ sha256_block_data_order(&key->md, data, 1);
+ }
+# endif
+ key->md = key->tail;
+ SHA256_Update(&key->md, pmac->c, SHA256_DIGEST_LENGTH);
+ SHA256_Final(pmac->c, &key->md);
+
+ /* verify HMAC */
+ out += inp_len;
+ len -= inp_len;
+# if 1
+ {
+ unsigned char *p =
+ out + len - 1 - maxpad - SHA256_DIGEST_LENGTH;
+ size_t off = out - p;
+ unsigned int c, cmask;
+
+ maxpad += SHA256_DIGEST_LENGTH;
+ for (res = 0, i = 0, j = 0; j < maxpad; j++) {
+ c = p[j];
+ cmask =
+ ((int)(j - off - SHA256_DIGEST_LENGTH)) >>
+ (sizeof(int) * 8 - 1);
+ res |= (c ^ pad) & ~cmask; /* ... and padding */
+ cmask &= ((int)(off - 1 - j)) >> (sizeof(int) * 8 - 1);
+ res |= (c ^ pmac->c[i]) & cmask;
+ i += 1 & cmask;
+ }
+ maxpad -= SHA256_DIGEST_LENGTH;
+
+ res = 0 - ((0 - res) >> (sizeof(res) * 8 - 1));
+ ret &= (int)~res;
+ }
+# else
+ for (res = 0, i = 0; i < SHA256_DIGEST_LENGTH; i++)
+ res |= out[i] ^ pmac->c[i];
+ res = 0 - ((0 - res) >> (sizeof(res) * 8 - 1));
+ ret &= (int)~res;
+
+ /* verify padding */
+ pad = (pad & ~res) | (maxpad & res);
+ out = out + len - 1 - pad;
+ for (res = 0, i = 0; i < pad; i++)
+ res |= out[i] ^ pad;
+
+ res = (0 - res) >> (sizeof(res) * 8 - 1);
+ ret &= (int)~res;
+# endif
+ return ret;
+ } else {
+ SHA256_Update(&key->md, out, len);
+ }
+ }
+
+ return 1;
+}
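
/*
 * Illustrative sketch, not part of the patch: the branch-free comparison
 * idiom the decrypt path above builds its padding and MAC checks from.  A
 * "less-than" is turned into an all-ones/all-zero mask so the amount of
 * work done never depends on secret data.  Names here are illustrative.
 */
#include <stdio.h>
#include <stddef.h>

/* all-ones if a < b, else 0; the same shift trick as the masks above */
static size_t ct_lt_mask(size_t a, size_t b)
{
    return 0 - ((a - b) >> (sizeof(size_t) * 8 - 1));
}

/* constant-time select: returns x when mask is all-ones, else y */
static size_t ct_select(size_t mask, size_t x, size_t y)
{
    return (x & mask) | (y & ~mask);
}

int main(void)
{
    size_t m = ct_lt_mask(3, 7);            /* 3 < 7 -> all-ones */

    printf("%zu\n", ct_select(m, 11, 22));  /* prints 11 */
    return 0;
}
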
+
+static int aesni_cbc_hmac_sha256_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg,
+ void *ptr)
+{
+ EVP_AES_HMAC_SHA256 *key = data(ctx);
+
+ switch (type) {
+ case EVP_CTRL_AEAD_SET_MAC_KEY:
+ {
+ unsigned int i;
+ unsigned char hmac_key[64];
+
+ memset(hmac_key, 0, sizeof(hmac_key));
+
+ if (arg > (int)sizeof(hmac_key)) {
+ SHA256_Init(&key->head);
+ SHA256_Update(&key->head, ptr, arg);
+ SHA256_Final(hmac_key, &key->head);
+ } else {
+ memcpy(hmac_key, ptr, arg);
+ }
+
+ for (i = 0; i < sizeof(hmac_key); i++)
+ hmac_key[i] ^= 0x36; /* ipad */
+ SHA256_Init(&key->head);
+ SHA256_Update(&key->head, hmac_key, sizeof(hmac_key));
+
+ for (i = 0; i < sizeof(hmac_key); i++)
+ hmac_key[i] ^= 0x36 ^ 0x5c; /* opad */
+ SHA256_Init(&key->tail);
+ SHA256_Update(&key->tail, hmac_key, sizeof(hmac_key));
+
+ OPENSSL_cleanse(hmac_key, sizeof(hmac_key));
+
+ return 1;
+ }
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ {
+ unsigned char *p = ptr;
+ unsigned int len;
+
+ if (arg != EVP_AEAD_TLS1_AAD_LEN)
+ return -1;
+
+ len = p[arg - 2] << 8 | p[arg - 1];
+
+ if (ctx->encrypt) {
+ key->payload_length = len;
+ if ((key->aux.tls_ver =
+ p[arg - 4] << 8 | p[arg - 3]) >= TLS1_1_VERSION) {
+ len -= AES_BLOCK_SIZE;
+ p[arg - 2] = len >> 8;
+ p[arg - 1] = len;
+ }
+ key->md = key->head;
+ SHA256_Update(&key->md, p, arg);
+
+ return (int)(((len + SHA256_DIGEST_LENGTH +
+ AES_BLOCK_SIZE) & -AES_BLOCK_SIZE)
+ - len);
+ } else {
+ memcpy(key->aux.tls_aad, ptr, arg);
+ key->payload_length = arg;
+
+ return SHA256_DIGEST_LENGTH;
+ }
+ }
+# if !defined(OPENSSL_NO_MULTIBLOCK) && EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_MAX_BUFSIZE:
+ return (int)(5 + 16 + ((arg + 32 + 16) & -16));
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_AAD:
+ {
+ EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *param =
+ (EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *) ptr;
+ unsigned int n4x = 1, x4;
+ unsigned int frag, last, packlen, inp_len;
+
+ if (arg < (int)sizeof(EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM))
+ return -1;
+
+ inp_len = param->inp[11] << 8 | param->inp[12];
+
+ if (ctx->encrypt) {
+ if ((param->inp[9] << 8 | param->inp[10]) < TLS1_1_VERSION)
+ return -1;
+
+ if (inp_len) {
+ if (inp_len < 4096)
+ return 0; /* too short */
+
+ if (inp_len >= 8192 && OPENSSL_ia32cap_P[2] & (1 << 5))
+ n4x = 2; /* AVX2 */
+ } else if ((n4x = param->interleave / 4) && n4x <= 2)
+ inp_len = param->len;
+ else
+ return -1;
+
+ key->md = key->head;
+ SHA256_Update(&key->md, param->inp, 13);
+
+ x4 = 4 * n4x;
+ n4x += 1; /* now 1 << n4x == x4, the fragment count */
+
+ frag = inp_len >> n4x;
+ last = inp_len + frag - (frag << n4x);
+ if (last > frag && ((last + 13 + 9) % 64 < (x4 - 1))) {
+ frag++;
+ last -= x4 - 1;
+ }
+
+ packlen = 5 + 16 + ((frag + 32 + 16) & -16);
+ packlen = (packlen << n4x) - packlen;
+ packlen += 5 + 16 + ((last + 32 + 16) & -16);
+
+ param->interleave = x4;
+
+ return (int)packlen;
+ } else
+ return -1; /* not yet */
+ }
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_ENCRYPT:
+ {
+ EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *param =
+ (EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM *) ptr;
+
+ return (int)tls1_1_multi_block_encrypt(key, param->out,
+ param->inp, param->len,
+ param->interleave / 4);
+ }
+ case EVP_CTRL_TLS1_1_MULTIBLOCK_DECRYPT:
+# endif
+ default:
+ return -1;
+ }
+}
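
/*
 * Illustrative sketch, not part of the patch: the EVP_CTRL_AEAD_SET_MAC_KEY
 * case above restated as a standalone helper.  HMAC's two fixed 64-byte
 * pads are hashed once here so per-record MACs only pay for the data.  The
 * helper name is made up for illustration.
 */
#include <string.h>
#include <openssl/sha.h>

static void hmac_sha256_precompute(const unsigned char *key, size_t key_len,
                                   SHA256_CTX *head, SHA256_CTX *tail)
{
    unsigned char pad[64];
    size_t i;

    memset(pad, 0, sizeof(pad));
    if (key_len > sizeof(pad)) {
        /* over-long keys are first hashed down to 32 bytes */
        SHA256_Init(head);
        SHA256_Update(head, key, key_len);
        SHA256_Final(pad, head);
    } else {
        memcpy(pad, key, key_len);
    }

    for (i = 0; i < sizeof(pad); i++)
        pad[i] ^= 0x36;                     /* ipad */
    SHA256_Init(head);
    SHA256_Update(head, pad, sizeof(pad));

    for (i = 0; i < sizeof(pad); i++)
        pad[i] ^= 0x36 ^ 0x5c;              /* ipad -> opad */
    SHA256_Init(tail);
    SHA256_Update(tail, pad, sizeof(pad));
}
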
+
+static EVP_CIPHER aesni_128_cbc_hmac_sha256_cipher = {
+# ifdef NID_aes_128_cbc_hmac_sha256
+ NID_aes_128_cbc_hmac_sha256,
+# else
+ NID_undef,
+# endif
+ 16, 16, 16,
+ EVP_CIPH_CBC_MODE | EVP_CIPH_FLAG_DEFAULT_ASN1 |
+ EVP_CIPH_FLAG_AEAD_CIPHER | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK,
+ aesni_cbc_hmac_sha256_init_key,
+ aesni_cbc_hmac_sha256_cipher,
+ NULL,
+ sizeof(EVP_AES_HMAC_SHA256),
+ EVP_CIPH_FLAG_DEFAULT_ASN1 ? NULL : EVP_CIPHER_set_asn1_iv,
+ EVP_CIPH_FLAG_DEFAULT_ASN1 ? NULL : EVP_CIPHER_get_asn1_iv,
+ aesni_cbc_hmac_sha256_ctrl,
+ NULL
+};
+
+static EVP_CIPHER aesni_256_cbc_hmac_sha256_cipher = {
+# ifdef NID_aes_256_cbc_hmac_sha256
+ NID_aes_256_cbc_hmac_sha256,
+# else
+ NID_undef,
+# endif
+ 16, 32, 16,
+ EVP_CIPH_CBC_MODE | EVP_CIPH_FLAG_DEFAULT_ASN1 |
+ EVP_CIPH_FLAG_AEAD_CIPHER | EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK,
+ aesni_cbc_hmac_sha256_init_key,
+ aesni_cbc_hmac_sha256_cipher,
+ NULL,
+ sizeof(EVP_AES_HMAC_SHA256),
+ EVP_CIPH_FLAG_DEFAULT_ASN1 ? NULL : EVP_CIPHER_set_asn1_iv,
+ EVP_CIPH_FLAG_DEFAULT_ASN1 ? NULL : EVP_CIPHER_get_asn1_iv,
+ aesni_cbc_hmac_sha256_ctrl,
+ NULL
+};
+
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha256(void)
+{
+ return ((OPENSSL_ia32cap_P[1] & AESNI_CAPABLE) &&
+ aesni_cbc_sha256_enc(NULL, NULL, 0, NULL, NULL, NULL, NULL) ?
+ &aesni_128_cbc_hmac_sha256_cipher : NULL);
+}
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha256(void)
+{
+ return ((OPENSSL_ia32cap_P[1] & AESNI_CAPABLE) &&
+ aesni_cbc_sha256_enc(NULL, NULL, 0, NULL, NULL, NULL, NULL) ?
+ &aesni_256_cbc_hmac_sha256_cipher : NULL);
+}
+# else
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha256(void)
+{
+ return NULL;
+}
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha256(void)
+{
+ return NULL;
+}
+# endif
+#endif
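
/*
 * Illustrative usage sketch, not part of the patch: how a caller would pick
 * up the stitched cipher (NULL means the CPU lacks the required AES-NI
 * support) and hand it the MAC key.  Error handling is trimmed and
 * use_stitched() is a made-up name.
 */
#include <openssl/evp.h>

int use_stitched(const unsigned char key[16], const unsigned char iv[16],
                 unsigned char *mac_key, int mac_key_len)
{
    const EVP_CIPHER *c = EVP_aes_128_cbc_hmac_sha256();
    EVP_CIPHER_CTX ctx;
    int ok;

    if (c == NULL)
        return 0;
    EVP_CIPHER_CTX_init(&ctx);
    ok = EVP_EncryptInit_ex(&ctx, c, NULL, key, iv)
        /* precompute the ipad/opad states once per connection */
        && EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_AEAD_SET_MAC_KEY,
                               mac_key_len, mac_key) > 0;
    /* ... per record: EVP_CTRL_AEAD_TLS1_AAD, then EVP_Cipher() ... */
    EVP_CIPHER_CTX_cleanup(&ctx);
    return ok;
}
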
diff --git a/crypto/evp/e_camellia.c b/crypto/evp/e_camellia.c
index 27bc4892fff8..f9c84013675d 100644
--- a/crypto/evp/e_camellia.c
+++ b/crypto/evp/e_camellia.c
@@ -61,6 +61,7 @@
# include <assert.h>
# include <openssl/camellia.h>
# include "evp_locl.h"
+# include "modes_lcl.h"
static int camellia_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc);
@@ -68,48 +69,322 @@ static int camellia_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
/* Camellia subkey Structure */
typedef struct {
CAMELLIA_KEY ks;
+ block128_f block;
+ union {
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream;
} EVP_CAMELLIA_KEY;
+# define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+
/* Attribute operation for Camellia */
# define data(ctx) EVP_C_DATA(EVP_CAMELLIA_KEY,ctx)
-IMPLEMENT_BLOCK_CIPHER(camellia_128, ks, Camellia, EVP_CAMELLIA_KEY,
- NID_camellia_128, 16, 16, 16, 128,
- 0, camellia_init_key, NULL,
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, NULL)
- IMPLEMENT_BLOCK_CIPHER(camellia_192, ks, Camellia, EVP_CAMELLIA_KEY,
- NID_camellia_192, 16, 24, 16, 128,
- 0, camellia_init_key, NULL,
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, NULL)
- IMPLEMENT_BLOCK_CIPHER(camellia_256, ks, Camellia, EVP_CAMELLIA_KEY,
- NID_camellia_256, 16, 32, 16, 128,
- 0, camellia_init_key, NULL,
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, NULL)
-# define IMPLEMENT_CAMELLIA_CFBR(ksize,cbits) IMPLEMENT_CFBR(camellia,Camellia,EVP_CAMELLIA_KEY,ks,ksize,cbits,16)
- IMPLEMENT_CAMELLIA_CFBR(128, 1)
- IMPLEMENT_CAMELLIA_CFBR(192, 1)
- IMPLEMENT_CAMELLIA_CFBR(256, 1)
-
- IMPLEMENT_CAMELLIA_CFBR(128, 8)
- IMPLEMENT_CAMELLIA_CFBR(192, 8)
- IMPLEMENT_CAMELLIA_CFBR(256, 8)
+# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+/* ---------^^^ this is not a typo, just a way to detect that
+ * assembler support was in general requested... */
+# include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+# define SPARC_CMLL_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_CAMELLIA)
+
+void cmll_t4_set_key(const unsigned char *key, int bits, CAMELLIA_KEY *ks);
+void cmll_t4_encrypt(const unsigned char *in, unsigned char *out,
+ const CAMELLIA_KEY *key);
+void cmll_t4_decrypt(const unsigned char *in, unsigned char *out,
+ const CAMELLIA_KEY *key);
+
+void cmll128_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const CAMELLIA_KEY *key,
+ unsigned char *ivec);
+void cmll128_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const CAMELLIA_KEY *key,
+ unsigned char *ivec);
+void cmll256_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const CAMELLIA_KEY *key,
+ unsigned char *ivec);
+void cmll256_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const CAMELLIA_KEY *key,
+ unsigned char *ivec);
+void cmll128_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const CAMELLIA_KEY *key,
+ unsigned char *ivec);
+void cmll256_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const CAMELLIA_KEY *key,
+ unsigned char *ivec);
+
+static int cmll_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ int ret, mode, bits;
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ bits = ctx->key_len * 8;
+
+ cmll_t4_set_key(key, bits, &dat->ks);
+
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc) {
+ ret = 0;
+ dat->block = (block128_f) cmll_t4_decrypt;
+ switch (bits) {
+ case 128:
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) cmll128_t4_cbc_decrypt : NULL;
+ break;
+ case 192:
+ case 256:
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) cmll256_t4_cbc_decrypt : NULL;
+ break;
+ default:
+ ret = -1;
+ }
+ } else {
+ ret = 0;
+ dat->block = (block128_f) cmll_t4_encrypt;
+ switch (bits) {
+ case 128:
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) cmll128_t4_cbc_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) cmll128_t4_ctr32_encrypt;
+ else
+ dat->stream.cbc = NULL;
+ break;
+ case 192:
+ case 256:
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) cmll256_t4_cbc_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) cmll256_t4_ctr32_encrypt;
+ else
+ dat->stream.cbc = NULL;
+ break;
+ default:
+ ret = -1;
+ }
+ }
+
+ if (ret < 0) {
+ EVPerr(EVP_F_CMLL_T4_INIT_KEY, EVP_R_CAMELLIA_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+}
+
+# define cmll_t4_cbc_cipher camellia_cbc_cipher
+static int cmll_t4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define cmll_t4_ecb_cipher camellia_ecb_cipher
+static int cmll_t4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+# define cmll_t4_ofb_cipher camellia_ofb_cipher
+static int cmll_t4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define cmll_t4_cfb_cipher camellia_cfb_cipher
+static int cmll_t4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define cmll_t4_cfb8_cipher camellia_cfb8_cipher
+static int cmll_t4_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define cmll_t4_cfb1_cipher camellia_cfb1_cipher
+static int cmll_t4_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define cmll_t4_ctr_cipher camellia_ctr_cipher
+static int cmll_t4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER cmll_t4_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ cmll_t4_init_key, \
+ cmll_t4_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_CAMELLIA_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER camellia_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ camellia_init_key, \
+ camellia_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_CAMELLIA_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_camellia_##keylen##_##mode(void) \
+{ return SPARC_CMLL_CAPABLE?&cmll_t4_##keylen##_##mode:&camellia_##keylen##_##mode; }
+
+# else
+
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER camellia_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ camellia_init_key, \
+ camellia_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_CAMELLIA_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_camellia_##keylen##_##mode(void) \
+{ return &camellia_##keylen##_##mode; }
+
+# endif
+
+# define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags)
+# if 0 /* not yet, missing NID */
+BLOCK_CIPHER_generic(nid, keylen, 1, 16, ctr, ctr, CTR, flags)
+# endif
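
/*
 * Illustrative expansion, not part of the patch: for nid=NID_camellia,
 * keylen=128, mode=cbc the non-SPARC BLOCK_CIPHER_generic above emits
 * roughly:
 *
 *   static const EVP_CIPHER camellia_128_cbc = {
 *       NID_camellia_128_cbc, 16, 128 / 8, 16,
 *       0 | EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CBC_MODE,
 *       camellia_init_key, camellia_cbc_cipher, NULL,
 *       sizeof(EVP_CAMELLIA_KEY), NULL, NULL, NULL, NULL
 *   };
 *   const EVP_CIPHER *EVP_camellia_128_cbc(void)
 *   { return &camellia_128_cbc; }
 */
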
/* The subkey for Camellia is generated. */
static int camellia_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
- int ret;
-
- ret = Camellia_set_key(key, ctx->key_len * 8, ctx->cipher_data);
+ int ret, mode;
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+ ret = Camellia_set_key(key, ctx->key_len * 8, &dat->ks);
if (ret < 0) {
EVPerr(EVP_F_CAMELLIA_INIT_KEY, EVP_R_CAMELLIA_KEY_SETUP_FAILED);
return 0;
}
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc) {
+ dat->block = (block128_f) Camellia_decrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) Camellia_cbc_encrypt : NULL;
+ } else {
+ dat->block = (block128_f) Camellia_encrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) Camellia_cbc_encrypt : NULL;
+ }
+
return 1;
}
+static int camellia_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ if (dat->stream.cbc)
+ (*dat->stream.cbc) (in, out, len, &dat->ks, ctx->iv, ctx->encrypt);
+ else if (ctx->encrypt)
+ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
+ else
+ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
+
+ return 1;
+}
+
+static int camellia_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+ size_t i;
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ if (len < bl)
+ return 1;
+
+ for (i = 0, len -= bl; i <= len; i += bl)
+ (*dat->block) (in + i, out + i, &dat->ks);
+
+ return 1;
+}
+
+static int camellia_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
+ ctx->iv, &ctx->num, dat->block);
+ return 1;
+}
+
+static int camellia_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
+ ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+ return 1;
+}
+
+static int camellia_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ CRYPTO_cfb128_8_encrypt(in, out, len, &dat->ks,
+ ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+ return 1;
+}
+
+static int camellia_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ if (ctx->flags & EVP_CIPH_FLAG_LENGTH_BITS) {
+ CRYPTO_cfb128_1_encrypt(in, out, len, &dat->ks,
+ ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+ return 1;
+ }
+
+ while (len >= MAXBITCHUNK) {
+ CRYPTO_cfb128_1_encrypt(in, out, MAXBITCHUNK * 8, &dat->ks,
+ ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+ len -= MAXBITCHUNK;
+ }
+ if (len)
+ CRYPTO_cfb128_1_encrypt(in, out, len * 8, &dat->ks,
+ ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+
+ return 1;
+}
+
+# if 0 /* not yet, missing NID */
+static int camellia_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ unsigned int num = ctx->num;
+ EVP_CAMELLIA_KEY *dat = (EVP_CAMELLIA_KEY *) ctx->cipher_data;
+
+ if (dat->stream.ctr)
+ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
+ ctx->iv, ctx->buf, &num, dat->stream.ctr);
+ else
+ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
+ ctx->iv, ctx->buf, &num, dat->block);
+ ctx->num = (size_t)num;
+ return 1;
+}
+# endif
+
+BLOCK_CIPHER_generic_pack(NID_camellia, 128, 0)
+ BLOCK_CIPHER_generic_pack(NID_camellia, 192, 0)
+ BLOCK_CIPHER_generic_pack(NID_camellia, 256, 0)
#else
# ifdef PEDANTIC
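
/*
 * Illustrative sketch, not part of the patch: how a raw block function is
 * wired into the generic CBC helper, as camellia_cbc_cipher does above when
 * no T4 stream routine is available.  The demo function name is made up.
 */
#include <stddef.h>
#include <openssl/camellia.h>
#include <openssl/modes.h>

void camellia_cbc_demo(const unsigned char *in, unsigned char *out,
                       size_t len, const CAMELLIA_KEY *ks,
                       unsigned char ivec[16])
{
    /* Camellia_encrypt has the (in, out, key) shape block128_f expects */
    CRYPTO_cbc128_encrypt(in, out, len, ks, ivec,
                          (block128_f)Camellia_encrypt);
}
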
diff --git a/crypto/evp/e_des.c b/crypto/evp/e_des.c
index ea1a4c422879..aae13a675694 100644
--- a/crypto/evp/e_des.c
+++ b/crypto/evp/e_des.c
@@ -65,6 +65,32 @@
# include <openssl/des.h>
# include <openssl/rand.h>
+typedef struct {
+ union {
+ double align;
+ DES_key_schedule ks;
+ } ks;
+ union {
+ void (*cbc) (const void *, void *, size_t, const void *, void *);
+ } stream;
+} EVP_DES_KEY;
+
+# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+/* ---------^^^ this is not a typo, just a way to detect that
+ * assembler support was in general requested... */
+# include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+# define SPARC_DES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_DES)
+
+void des_t4_key_expand(const void *key, DES_key_schedule *ks);
+void des_t4_cbc_encrypt(const void *inp, void *out, size_t len,
+ DES_key_schedule *ks, unsigned char iv[8]);
+void des_t4_cbc_decrypt(const void *inp, void *out, size_t len,
+ DES_key_schedule *ks, unsigned char iv[8]);
+# endif
+
static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc);
static int des_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr);
@@ -102,6 +128,12 @@ static int des_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
+ EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
+
+ if (dat->stream.cbc) {
+ (*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv);
+ return 1;
+ }
while (inl >= EVP_MAXCHUNK) {
DES_ncbc_encrypt(in, out, (long)EVP_MAXCHUNK, ctx->cipher_data,
(DES_cblock *)ctx->iv, ctx->encrypt);
@@ -179,16 +211,15 @@ static int des_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
return 1;
}
-BLOCK_CIPHER_defs(des, DES_key_schedule, NID_des, 8, 8, 8, 64,
+BLOCK_CIPHER_defs(des, EVP_DES_KEY, NID_des, 8, 8, 8, 64,
EVP_CIPH_RAND_KEY, des_init_key, NULL,
EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, des_ctrl)
+ BLOCK_CIPHER_def_cfb(des, EVP_DES_KEY, NID_des, 8, 8, 1,
+ EVP_CIPH_RAND_KEY, des_init_key, NULL,
+ EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, des_ctrl)
-BLOCK_CIPHER_def_cfb(des, DES_key_schedule, NID_des, 8, 8, 1,
- EVP_CIPH_RAND_KEY, des_init_key, NULL,
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, des_ctrl)
-
-BLOCK_CIPHER_def_cfb(des, DES_key_schedule, NID_des, 8, 8, 8,
+ BLOCK_CIPHER_def_cfb(des, EVP_DES_KEY, NID_des, 8, 8, 8,
EVP_CIPH_RAND_KEY, des_init_key, NULL,
EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, des_ctrl)
@@ -196,8 +227,22 @@ static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
DES_cblock *deskey = (DES_cblock *)key;
+ EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
+
+ dat->stream.cbc = NULL;
+# if defined(SPARC_DES_CAPABLE)
+ if (SPARC_DES_CAPABLE) {
+ int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+
+ if (mode == EVP_CIPH_CBC_MODE) {
+ des_t4_key_expand(key, &dat->ks.ks);
+ dat->stream.cbc = enc ? des_t4_cbc_encrypt : des_t4_cbc_decrypt;
+ return 1;
+ }
+ }
+# endif
# ifdef EVP_CHECK_DES_KEY
- if (DES_set_key_checked(deskey, ctx->cipher_data) != 0)
+ if (DES_set_key_checked(deskey, &dat->ks.ks) != 0)
return 0;
# else
DES_set_key_unchecked(deskey, ctx->cipher_data);
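
/*
 * Illustrative sketch, not part of the patch: the capability-dispatch shape
 * des_init_key above follows -- probe the CPU once, stash a hardware fast
 * path in the per-context union, and leave NULL to mean "portable code".
 * All names here are made up for illustration.
 */
#include <stddef.h>

typedef void (*cbc_fn) (const void *, void *, size_t, const void *, void *);

struct demo_key {
    cbc_fn cbc;                 /* NULL selects the generic implementation */
};

static void demo_init(struct demo_key *k, int hw_capable, int enc,
                      cbc_fn hw_enc, cbc_fn hw_dec)
{
    k->cbc = NULL;
    if (hw_capable)
        k->cbc = enc ? hw_enc : hw_dec;
}
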
diff --git a/crypto/evp/e_des3.c b/crypto/evp/e_des3.c
index 07a5aca606de..96f272eb8046 100644
--- a/crypto/evp/e_des3.c
+++ b/crypto/evp/e_des3.c
@@ -65,7 +65,38 @@
# include <openssl/des.h>
# include <openssl/rand.h>
-# ifndef OPENSSL_FIPS
+/* Block use of implementations in FIPS mode */
+# undef EVP_CIPH_FLAG_FIPS
+# define EVP_CIPH_FLAG_FIPS 0
+
+typedef struct {
+ union {
+ double align;
+ DES_key_schedule ks[3];
+ } ks;
+ union {
+ void (*cbc) (const void *, void *, size_t, const void *, void *);
+ } stream;
+} DES_EDE_KEY;
+# define ks1 ks.ks[0]
+# define ks2 ks.ks[1]
+# define ks3 ks.ks[2]
+
+# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+/* ---------^^^ this is not a typo, just a way to detect that
+ * assembler support was in general requested... */
+# include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+# define SPARC_DES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_DES)
+
+void des_t4_key_expand(const void *key, DES_key_schedule *ks);
+void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
+ DES_key_schedule *ks, unsigned char iv[8]);
+void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
+ DES_key_schedule *ks, unsigned char iv[8]);
+# endif
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc);
@@ -75,13 +106,7 @@ static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
static int des3_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr);
-typedef struct {
- DES_key_schedule ks1; /* key schedule */
- DES_key_schedule ks2; /* key schedule (for ede) */
- DES_key_schedule ks3; /* key schedule (for ede3) */
-} DES_EDE_KEY;
-
-# define data(ctx) ((DES_EDE_KEY *)(ctx)->cipher_data)
+# define data(ctx) ((DES_EDE_KEY *)(ctx)->cipher_data)
/*
* Because of various casts and different args can't use
@@ -123,7 +148,9 @@ static int des_ede_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
-# ifdef KSSL_DEBUG
+ DES_EDE_KEY *dat = data(ctx);
+
+# ifdef KSSL_DEBUG
{
int i;
fprintf(stderr, "des_ede_cbc_cipher(ctx=%p, buflen=%d)\n", ctx,
@@ -133,21 +160,24 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
fprintf(stderr, "%02X", ctx->iv[i]);
fprintf(stderr, "\n");
}
-# endif /* KSSL_DEBUG */
+# endif /* KSSL_DEBUG */
+ if (dat->stream.cbc) {
+ (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv);
+ return 1;
+ }
+
while (inl >= EVP_MAXCHUNK) {
DES_ede3_cbc_encrypt(in, out, (long)EVP_MAXCHUNK,
- &data(ctx)->ks1, &data(ctx)->ks2,
- &data(ctx)->ks3, (DES_cblock *)ctx->iv,
- ctx->encrypt);
+ &dat->ks1, &dat->ks2, &dat->ks3,
+ (DES_cblock *)ctx->iv, ctx->encrypt);
inl -= EVP_MAXCHUNK;
in += EVP_MAXCHUNK;
out += EVP_MAXCHUNK;
}
if (inl)
DES_ede3_cbc_encrypt(in, out, (long)inl,
- &data(ctx)->ks1, &data(ctx)->ks2,
- &data(ctx)->ks3, (DES_cblock *)ctx->iv,
- ctx->encrypt);
+ &dat->ks1, &dat->ks2, &dat->ks3,
+ (DES_cblock *)ctx->iv, ctx->encrypt);
return 1;
}
@@ -215,39 +245,57 @@ static int des_ede3_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
}
BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY, NID_des_ede, 8, 16, 8, 64,
- EVP_CIPH_RAND_KEY, des_ede_init_key, NULL,
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, des3_ctrl)
-# define des_ede3_cfb64_cipher des_ede_cfb64_cipher
-# define des_ede3_ofb_cipher des_ede_ofb_cipher
-# define des_ede3_cbc_cipher des_ede_cbc_cipher
-# define des_ede3_ecb_cipher des_ede_ecb_cipher
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_DEFAULT_ASN1,
+ des_ede_init_key, NULL, NULL, NULL, des3_ctrl)
+# define des_ede3_cfb64_cipher des_ede_cfb64_cipher
+# define des_ede3_ofb_cipher des_ede_ofb_cipher
+# define des_ede3_cbc_cipher des_ede_cbc_cipher
+# define des_ede3_ecb_cipher des_ede_ecb_cipher
BLOCK_CIPHER_defs(des_ede3, DES_EDE_KEY, NID_des_ede3, 8, 24, 8, 64,
- EVP_CIPH_RAND_KEY, des_ede3_init_key, NULL,
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, des3_ctrl)
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+ EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL, NULL,
+ des3_ctrl)
BLOCK_CIPHER_def_cfb(des_ede3, DES_EDE_KEY, NID_des_ede3, 24, 8, 1,
- EVP_CIPH_RAND_KEY, des_ede3_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv, des3_ctrl)
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+ EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL,
+ NULL, des3_ctrl)
BLOCK_CIPHER_def_cfb(des_ede3, DES_EDE_KEY, NID_des_ede3, 24, 8, 8,
- EVP_CIPH_RAND_KEY, des_ede3_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv, des3_ctrl)
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+ EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL,
+ NULL, des3_ctrl)
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
DES_cblock *deskey = (DES_cblock *)key;
-# ifdef EVP_CHECK_DES_KEY
- if (DES_set_key_checked(&deskey[0], &data(ctx)->ks1)
- ! !DES_set_key_checked(&deskey[1], &data(ctx)->ks2))
+ DES_EDE_KEY *dat = data(ctx);
+
+ dat->stream.cbc = NULL;
+# if defined(SPARC_DES_CAPABLE)
+ if (SPARC_DES_CAPABLE) {
+ int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+
+ if (mode == EVP_CIPH_CBC_MODE) {
+ des_t4_key_expand(&deskey[0], &dat->ks1);
+ des_t4_key_expand(&deskey[1], &dat->ks2);
+ memcpy(&dat->ks3, &dat->ks1, sizeof(dat->ks1));
+ dat->stream.cbc = enc ? des_t4_ede3_cbc_encrypt :
+ des_t4_ede3_cbc_decrypt;
+ return 1;
+ }
+ }
+# endif
+# ifdef EVP_CHECK_DES_KEY
+ if (DES_set_key_checked(&deskey[0], &dat->ks1)
+ || DES_set_key_checked(&deskey[1], &dat->ks2))
return 0;
-# else
- DES_set_key_unchecked(&deskey[0], &data(ctx)->ks1);
- DES_set_key_unchecked(&deskey[1], &data(ctx)->ks2);
-# endif
- memcpy(&data(ctx)->ks3, &data(ctx)->ks1, sizeof(data(ctx)->ks1));
+# else
+ DES_set_key_unchecked(&deskey[0], &dat->ks1);
+ DES_set_key_unchecked(&deskey[1], &dat->ks2);
+# endif
+ memcpy(&dat->ks3, &dat->ks1, sizeof(dat->ks1));
return 1;
}
@@ -255,7 +303,9 @@ static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
DES_cblock *deskey = (DES_cblock *)key;
-# ifdef KSSL_DEBUG
+ DES_EDE_KEY *dat = data(ctx);
+
+# ifdef KSSL_DEBUG
{
int i;
fprintf(stderr, "des_ede3_init_key(ctx=%p)\n", ctx);
@@ -270,18 +320,33 @@ static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
fprintf(stderr, "\n");
}
}
-# endif /* KSSL_DEBUG */
+# endif /* KSSL_DEBUG */
+
+ dat->stream.cbc = NULL;
+# if defined(SPARC_DES_CAPABLE)
+ if (SPARC_DES_CAPABLE) {
+ int mode = ctx->cipher->flags & EVP_CIPH_MODE;
-# ifdef EVP_CHECK_DES_KEY
- if (DES_set_key_checked(&deskey[0], &data(ctx)->ks1)
- || DES_set_key_checked(&deskey[1], &data(ctx)->ks2)
- || DES_set_key_checked(&deskey[2], &data(ctx)->ks3))
+ if (mode == EVP_CIPH_CBC_MODE) {
+ des_t4_key_expand(&deskey[0], &dat->ks1);
+ des_t4_key_expand(&deskey[1], &dat->ks2);
+ des_t4_key_expand(&deskey[2], &dat->ks3);
+ dat->stream.cbc = enc ? des_t4_ede3_cbc_encrypt :
+ des_t4_ede3_cbc_decrypt;
+ return 1;
+ }
+ }
+# endif
+# ifdef EVP_CHECK_DES_KEY
+ if (DES_set_key_checked(&deskey[0], &dat->ks1)
+ || DES_set_key_checked(&deskey[1], &dat->ks2)
+ || DES_set_key_checked(&deskey[2], &dat->ks3))
return 0;
-# else
- DES_set_key_unchecked(&deskey[0], &data(ctx)->ks1);
- DES_set_key_unchecked(&deskey[1], &data(ctx)->ks2);
- DES_set_key_unchecked(&deskey[2], &data(ctx)->ks3);
-# endif
+# else
+ DES_set_key_unchecked(&deskey[0], &dat->ks1);
+ DES_set_key_unchecked(&deskey[1], &dat->ks2);
+ DES_set_key_unchecked(&deskey[2], &dat->ks3);
+# endif
return 1;
}
@@ -315,5 +380,115 @@ const EVP_CIPHER *EVP_des_ede3(void)
{
return &des_ede3_ecb;
}
+
+# ifndef OPENSSL_NO_SHA
+
+# include <openssl/sha.h>
+
+static const unsigned char wrap_iv[8] =
+ { 0x4a, 0xdd, 0xa2, 0x2c, 0x79, 0xe8, 0x21, 0x05 };
+
+static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t inl)
+{
+ unsigned char icv[8], iv[8], sha1tmp[SHA_DIGEST_LENGTH];
+ int rv = -1;
+ if (inl < 24)
+ return -1;
+ if (!out)
+ return inl - 16;
+ memcpy(ctx->iv, wrap_iv, 8);
+ /* Decrypt first block which will end up as icv */
+ des_ede_cbc_cipher(ctx, icv, in, 8);
+ /* Decrypt central blocks */
+ /*
+ * If decrypting in place, move the output along by one block so the
+ * next des_ede_cbc_cipher call still operates in place.
+ */
+ if (out == in) {
+ memmove(out, out + 8, inl - 8);
+ in -= 8;
+ }
+ des_ede_cbc_cipher(ctx, out, in + 8, inl - 16);
+ /* Decrypt final block which will be IV */
+ des_ede_cbc_cipher(ctx, iv, in + inl - 8, 8);
+ /* Reverse order of everything */
+ BUF_reverse(icv, NULL, 8);
+ BUF_reverse(out, NULL, inl - 16);
+ BUF_reverse(ctx->iv, iv, 8);
+ /* Decrypt again using new IV */
+ des_ede_cbc_cipher(ctx, out, out, inl - 16);
+ des_ede_cbc_cipher(ctx, icv, icv, 8);
+ /* Work out SHA1 hash of first portion */
+ SHA1(out, inl - 16, sha1tmp);
+
+ if (!CRYPTO_memcmp(sha1tmp, icv, 8))
+ rv = inl - 16;
+ OPENSSL_cleanse(icv, 8);
+ OPENSSL_cleanse(sha1tmp, SHA_DIGEST_LENGTH);
+ OPENSSL_cleanse(iv, 8);
+ OPENSSL_cleanse(ctx->iv, 8);
+ if (rv == -1)
+ OPENSSL_cleanse(out, inl - 16);
+
+ return rv;
+}
+
+static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t inl)
+{
+ unsigned char sha1tmp[SHA_DIGEST_LENGTH];
+ if (!out)
+ return inl + 16;
+ /* Copy input to output buffer + 8 so we have space for IV */
+ memmove(out + 8, in, inl);
+ /* Work out ICV */
+ SHA1(in, inl, sha1tmp);
+ memcpy(out + inl + 8, sha1tmp, 8);
+ OPENSSL_cleanse(sha1tmp, SHA_DIGEST_LENGTH);
+ /* Generate random IV */
+ if (RAND_bytes(ctx->iv, 8) <= 0)
+ return -1;
+ memcpy(out, ctx->iv, 8);
+ /* Encrypt everything after IV in place */
+ des_ede_cbc_cipher(ctx, out + 8, out + 8, inl + 8);
+ BUF_reverse(out, NULL, inl + 16);
+ memcpy(ctx->iv, wrap_iv, 8);
+ des_ede_cbc_cipher(ctx, out, out, inl + 16);
+ return inl + 16;
+}
+
+static int des_ede3_wrap_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t inl)
+{
+ /*
+ * Sanity check input length: we typically only wrap keys so EVP_MAXCHUNK
+ * is more than will ever be needed. Also the input length must be a
+ * multiple of the DES block size (8 bytes).
+ */
+ if (inl >= EVP_MAXCHUNK || inl % 8)
+ return -1;
+ if (ctx->encrypt)
+ return des_ede3_wrap(ctx, out, in, inl);
+ else
+ return des_ede3_unwrap(ctx, out, in, inl);
+}
+
+static const EVP_CIPHER des3_wrap = {
+ NID_id_smime_alg_CMS3DESwrap,
+ 8, 24, 0,
+ EVP_CIPH_WRAP_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER
+ | EVP_CIPH_FLAG_DEFAULT_ASN1,
+ des_ede3_init_key, des_ede3_wrap_cipher,
+ NULL,
+ sizeof(DES_EDE_KEY),
+ NULL, NULL, NULL, NULL
+};
+
+const EVP_CIPHER *EVP_des_ede3_wrap(void)
+{
+ return &des3_wrap;
+}
+
# endif
#endif
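
/*
 * Illustrative usage sketch, not part of the patch: wrap mode must be opted
 * into, otherwise EVP_CipherInit_ex() fails with WRAP_MODE_NOT_ALLOWED (see
 * the evp_enc.c hunk below).  wrap_key_demo() is a made-up name.
 */
#include <openssl/evp.h>

int wrap_key_demo(const unsigned char kek[24], const unsigned char *key_in,
                  unsigned int key_len, unsigned char *out)
{
    EVP_CIPHER_CTX ctx;
    int outl;

    EVP_CIPHER_CTX_init(&ctx);
    /* this flag survives init on purpose ("preserve wrap enable flag") */
    EVP_CIPHER_CTX_set_flags(&ctx, EVP_CIPHER_CTX_FLAG_WRAP_ALLOW);
    if (!EVP_EncryptInit_ex(&ctx, EVP_des_ede3_wrap(), NULL, kek, NULL)) {
        EVP_CIPHER_CTX_cleanup(&ctx);
        return -1;
    }
    /* wrap is a custom cipher: one call performs the whole transform */
    outl = EVP_Cipher(&ctx, out, key_in, key_len);
    EVP_CIPHER_CTX_cleanup(&ctx);
    return outl;                /* key_len + 16 on success */
}
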
diff --git a/crypto/evp/e_null.c b/crypto/evp/e_null.c
index af90ce326bd9..599fcb808d21 100644
--- a/crypto/evp/e_null.c
+++ b/crypto/evp/e_null.c
@@ -61,8 +61,6 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
-#ifndef OPENSSL_FIPS
-
static int null_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc);
static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
@@ -100,4 +98,3 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
memcpy((char *)out, (const char *)in, inl);
return 1;
}
-#endif
diff --git a/crypto/evp/encode.c b/crypto/evp/encode.c
index 5c5988fc45e1..c361d1f01269 100644
--- a/crypto/evp/encode.c
+++ b/crypto/evp/encode.c
@@ -248,7 +248,7 @@ int EVP_DecodeUpdate(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl,
/* We parse the input data */
for (i = 0; i < inl; i++) {
- /* If the current line is > 80 characters, scream alot */
+ /* If the current line is > 80 characters, scream a lot */
if (ln >= 80) {
rv = -1;
goto end;
diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h
index 6cf98acc0b79..39ab7937d256 100644
--- a/crypto/evp/evp.h
+++ b/crypto/evp/evp.h
@@ -113,6 +113,7 @@
# define EVP_PKEY_DSA3 NID_dsaWithSHA1
# define EVP_PKEY_DSA4 NID_dsaWithSHA1_2
# define EVP_PKEY_DH NID_dhKeyAgreement
+# define EVP_PKEY_DHX NID_dhpublicnumber
# define EVP_PKEY_EC NID_X9_62_id_ecPublicKey
# define EVP_PKEY_HMAC NID_hmac
# define EVP_PKEY_CMAC NID_cmac
@@ -345,6 +346,7 @@ struct evp_cipher_st {
# define EVP_CIPH_GCM_MODE 0x6
# define EVP_CIPH_CCM_MODE 0x7
# define EVP_CIPH_XTS_MODE 0x10001
+# define EVP_CIPH_WRAP_MODE 0x10002
# define EVP_CIPH_MODE 0xF0007
/* Set if variable length cipher */
# define EVP_CIPH_VARIABLE_LENGTH 0x8
@@ -375,6 +377,14 @@ struct evp_cipher_st {
*/
# define EVP_CIPH_FLAG_CUSTOM_CIPHER 0x100000
# define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+# define EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK 0x400000
+
+/*
+ * Cipher context flag to indicate we can handle wrap mode: if allowed in
+ * older applications it could overflow buffers.
+ */
+
+# define EVP_CIPHER_CTX_FLAG_WRAP_ALLOW 0x1
/* ctrl() values */
@@ -408,9 +418,21 @@ struct evp_cipher_st {
/* Set the GCM invocation field, decrypt only */
# define EVP_CTRL_GCM_SET_IV_INV 0x18
+# define EVP_CTRL_TLS1_1_MULTIBLOCK_AAD 0x19
+# define EVP_CTRL_TLS1_1_MULTIBLOCK_ENCRYPT 0x1a
+# define EVP_CTRL_TLS1_1_MULTIBLOCK_DECRYPT 0x1b
+# define EVP_CTRL_TLS1_1_MULTIBLOCK_MAX_BUFSIZE 0x1c
+
/* RFC 5246 defines additional data to be 13 bytes in length */
# define EVP_AEAD_TLS1_AAD_LEN 13
+typedef struct {
+ unsigned char *out;
+ const unsigned char *inp;
+ size_t len;
+ unsigned int interleave;
+} EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM;
+
/* GCM TLS constants */
/* Length of fixed part of IV derived from PRF */
# define EVP_GCM_TLS_FIXED_IV_LEN 4
@@ -639,7 +661,8 @@ int EVP_DigestSignFinal(EVP_MD_CTX *ctx,
int EVP_DigestVerifyInit(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx,
const EVP_MD *type, ENGINE *e, EVP_PKEY *pkey);
-int EVP_DigestVerifyFinal(EVP_MD_CTX *ctx, unsigned char *sig, size_t siglen);
+int EVP_DigestVerifyFinal(EVP_MD_CTX *ctx,
+ const unsigned char *sig, size_t siglen);
int EVP_OpenInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *type,
const unsigned char *ek, int ekl, const unsigned char *iv,
@@ -744,6 +767,7 @@ const EVP_CIPHER *EVP_des_cbc(void);
const EVP_CIPHER *EVP_des_ede_cbc(void);
const EVP_CIPHER *EVP_des_ede3_cbc(void);
const EVP_CIPHER *EVP_desx_cbc(void);
+const EVP_CIPHER *EVP_des_ede3_wrap(void);
/*
* This should now be supported through the dev_crypto ENGINE. But also, why
* are rc4 and md5 declarations made here inside a "NO_DES" precompiler
@@ -813,6 +837,7 @@ const EVP_CIPHER *EVP_aes_128_ctr(void);
const EVP_CIPHER *EVP_aes_128_ccm(void);
const EVP_CIPHER *EVP_aes_128_gcm(void);
const EVP_CIPHER *EVP_aes_128_xts(void);
+const EVP_CIPHER *EVP_aes_128_wrap(void);
const EVP_CIPHER *EVP_aes_192_ecb(void);
const EVP_CIPHER *EVP_aes_192_cbc(void);
const EVP_CIPHER *EVP_aes_192_cfb1(void);
@@ -823,6 +848,7 @@ const EVP_CIPHER *EVP_aes_192_ofb(void);
const EVP_CIPHER *EVP_aes_192_ctr(void);
const EVP_CIPHER *EVP_aes_192_ccm(void);
const EVP_CIPHER *EVP_aes_192_gcm(void);
+const EVP_CIPHER *EVP_aes_192_wrap(void);
const EVP_CIPHER *EVP_aes_256_ecb(void);
const EVP_CIPHER *EVP_aes_256_cbc(void);
const EVP_CIPHER *EVP_aes_256_cfb1(void);
@@ -834,10 +860,15 @@ const EVP_CIPHER *EVP_aes_256_ctr(void);
const EVP_CIPHER *EVP_aes_256_ccm(void);
const EVP_CIPHER *EVP_aes_256_gcm(void);
const EVP_CIPHER *EVP_aes_256_xts(void);
+const EVP_CIPHER *EVP_aes_256_wrap(void);
# if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
# endif
+# ifndef OPENSSL_NO_SHA256
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha256(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha256(void);
+# endif
# endif
# ifndef OPENSSL_NO_CAMELLIA
const EVP_CIPHER *EVP_camellia_128_ecb(void);
@@ -1028,6 +1059,7 @@ void EVP_PBE_cleanup(void);
# define ASN1_PKEY_CTRL_DEFAULT_MD_NID 0x3
# define ASN1_PKEY_CTRL_CMS_SIGN 0x5
# define ASN1_PKEY_CTRL_CMS_ENVELOPE 0x7
+# define ASN1_PKEY_CTRL_CMS_RI_TYPE 0x8
int EVP_PKEY_asn1_get_count(void);
const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_get0(int idx);
@@ -1091,6 +1123,19 @@ void EVP_PKEY_asn1_set_free(EVP_PKEY_ASN1_METHOD *ameth,
void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
int (*pkey_ctrl) (EVP_PKEY *pkey, int op,
long arg1, void *arg2));
+void EVP_PKEY_asn1_set_item(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*item_verify) (EVP_MD_CTX *ctx,
+ const ASN1_ITEM *it,
+ void *asn,
+ X509_ALGOR *a,
+ ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey),
+ int (*item_sign) (EVP_MD_CTX *ctx,
+ const ASN1_ITEM *it,
+ void *asn,
+ X509_ALGOR *alg1,
+ X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig));
# define EVP_PKEY_OP_UNDEFINED 0
# define EVP_PKEY_OP_PARAMGEN (1<<1)
@@ -1121,6 +1166,10 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, \
EVP_PKEY_CTRL_MD, 0, (void *)md)
+# define EVP_PKEY_CTX_get_signature_md(ctx, pmd) \
+ EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTRL_GET_MD, 0, (void *)pmd)
+
# define EVP_PKEY_CTRL_MD 1
# define EVP_PKEY_CTRL_PEER_KEY 2
@@ -1142,6 +1191,8 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
# define EVP_PKEY_CTRL_CIPHER 12
+# define EVP_PKEY_CTRL_GET_MD 13
+
# define EVP_PKEY_ALG_CTRL 0x1000
# define EVP_PKEY_FLAG_AUTOARGLEN 2
@@ -1327,11 +1378,13 @@ void ERR_load_EVP_strings(void);
# define EVP_F_AESNI_INIT_KEY 165
# define EVP_F_AESNI_XTS_CIPHER 176
# define EVP_F_AES_INIT_KEY 133
+# define EVP_F_AES_T4_INIT_KEY 178
# define EVP_F_AES_XTS 172
# define EVP_F_AES_XTS_CIPHER 175
# define EVP_F_ALG_MODULE_INIT 177
# define EVP_F_CAMELLIA_INIT_KEY 159
# define EVP_F_CMAC_INIT 173
+# define EVP_F_CMLL_T4_INIT_KEY 179
# define EVP_F_D2I_PKEY 100
# define EVP_F_DO_SIGVER_INIT 161
# define EVP_F_DSAPKEY2PKCS8 134
@@ -1471,6 +1524,7 @@ void ERR_load_EVP_strings(void);
# define EVP_R_UNSUPPORTED_PRF 125
# define EVP_R_UNSUPPORTED_PRIVATE_KEY_ALGORITHM 118
# define EVP_R_UNSUPPORTED_SALT_TYPE 126
+# define EVP_R_WRAP_MODE_NOT_ALLOWED 170
# define EVP_R_WRONG_FINAL_BLOCK_LENGTH 109
# define EVP_R_WRONG_PUBLIC_KEY_TYPE 110
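
/*
 * Illustrative sketch, not part of the patch: a plausible driver for the
 * new multi-block ctrls using the parameter block above.  aad13 is the
 * 13-byte TLS pseudo-header and multiblock_send() is a made-up name.
 */
#include <openssl/evp.h>

int multiblock_send(EVP_CIPHER_CTX *ctx, unsigned char *out,
                    const unsigned char *aad13,
                    const unsigned char *in, size_t len)
{
    EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM mb;
    int packlen;

    mb.inp = aad13;
    mb.len = len;
    mb.interleave = 4;          /* consulted when the AAD length field is 0 */
    packlen = EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_TLS1_1_MULTIBLOCK_AAD,
                                  sizeof(mb), &mb);
    if (packlen <= 0)
        return 0;               /* fall back to the single-record path */

    mb.out = out;               /* must hold packlen bytes */
    mb.inp = in;
    mb.len = len;               /* the AAD call set mb.interleave to x4 */
    return EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_TLS1_1_MULTIBLOCK_ENCRYPT,
                               sizeof(mb), &mb);
}
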
diff --git a/crypto/evp/evp_enc.c b/crypto/evp/evp_enc.c
index 4e983c4bda23..65f0e0244dce 100644
--- a/crypto/evp/evp_enc.c
+++ b/crypto/evp/evp_enc.c
@@ -169,8 +169,14 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
#endif
#ifdef OPENSSL_FIPS
- if (FIPS_mode())
+ if (FIPS_mode()) {
+ const EVP_CIPHER *fcipher = NULL;
+ if (cipher)
+ fcipher = evp_get_fips_cipher(cipher);
+ if (fcipher)
+ cipher = fcipher;
return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+ }
#endif
ctx->cipher = cipher;
if (ctx->cipher->ctx_size) {
@@ -183,7 +189,8 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
ctx->cipher_data = NULL;
}
ctx->key_len = cipher->key_len;
- ctx->flags = 0;
+ /* Preserve wrap enable flag, zero everything else */
+ ctx->flags &= EVP_CIPHER_CTX_FLAG_WRAP_ALLOW;
if (ctx->cipher->flags & EVP_CIPH_CTRL_INIT) {
if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_INIT, 0, NULL)) {
EVPerr(EVP_F_EVP_CIPHERINIT_EX, EVP_R_INITIALIZATION_ERROR);
@@ -206,6 +213,12 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
|| ctx->cipher->block_size == 8
|| ctx->cipher->block_size == 16);
+ if (!(ctx->flags & EVP_CIPHER_CTX_FLAG_WRAP_ALLOW)
+ && EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_WRAP_MODE) {
+ EVPerr(EVP_F_EVP_CIPHERINIT_EX, EVP_R_WRAP_MODE_NOT_ALLOWED);
+ return 0;
+ }
+
if (!(EVP_CIPHER_CTX_flags(ctx) & EVP_CIPH_CUSTOM_IV)) {
switch (EVP_CIPHER_CTX_mode(ctx)) {
diff --git a/crypto/evp/evp_err.c b/crypto/evp/evp_err.c
index 686a69958dd7..15cf5532b382 100644
--- a/crypto/evp/evp_err.c
+++ b/crypto/evp/evp_err.c
@@ -1,6 +1,6 @@
/* crypto/evp/evp_err.c */
/* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2013 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -73,11 +73,13 @@ static ERR_STRING_DATA EVP_str_functs[] = {
{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"},
{ERR_FUNC(EVP_F_AESNI_XTS_CIPHER), "AESNI_XTS_CIPHER"},
{ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"},
+ {ERR_FUNC(EVP_F_AES_T4_INIT_KEY), "AES_T4_INIT_KEY"},
{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
{ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
{ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"},
{ERR_FUNC(EVP_F_CMAC_INIT), "CMAC_INIT"},
+ {ERR_FUNC(EVP_F_CMLL_T4_INIT_KEY), "CMLL_T4_INIT_KEY"},
{ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"},
{ERR_FUNC(EVP_F_DO_SIGVER_INIT), "DO_SIGVER_INIT"},
{ERR_FUNC(EVP_F_DSAPKEY2PKCS8), "DSAPKEY2PKCS8"},
@@ -232,6 +234,7 @@ static ERR_STRING_DATA EVP_str_reasons[] = {
{ERR_REASON(EVP_R_UNSUPPORTED_PRIVATE_KEY_ALGORITHM),
"unsupported private key algorithm"},
{ERR_REASON(EVP_R_UNSUPPORTED_SALT_TYPE), "unsupported salt type"},
+ {ERR_REASON(EVP_R_WRAP_MODE_NOT_ALLOWED), "wrap mode not allowed"},
{ERR_REASON(EVP_R_WRONG_FINAL_BLOCK_LENGTH), "wrong final block length"},
{ERR_REASON(EVP_R_WRONG_PUBLIC_KEY_TYPE), "wrong public key type"},
{0, NULL}
diff --git a/crypto/evp/evp_extra_test.c b/crypto/evp/evp_extra_test.c
index 21688b0cd383..0f7b011ce85e 100644
--- a/crypto/evp/evp_extra_test.c
+++ b/crypto/evp/evp_extra_test.c
@@ -345,7 +345,7 @@ static int test_EVP_DigestVerifyInit(void)
if (pkey == NULL ||
!EVP_DigestVerifyInit(&md_ctx, NULL, EVP_sha256(), NULL, pkey) ||
!EVP_DigestVerifyUpdate(&md_ctx, kMsg, sizeof(kMsg)) ||
- !EVP_DigestVerifyFinal(&md_ctx, (unsigned char *)kSignature, sizeof(kSignature))) {
+ !EVP_DigestVerifyFinal(&md_ctx, kSignature, sizeof(kSignature))) {
goto out;
}
ret = 1;
diff --git a/crypto/evp/evp_fips.c b/crypto/evp/evp_fips.c
deleted file mode 100644
index 71a32fca8d2a..000000000000
--- a/crypto/evp/evp_fips.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/* crypto/evp/evp_fips.c */
-/*
- * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project.
- */
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#include <openssl/evp.h>
-
-#ifdef OPENSSL_FIPS
-# include <openssl/fips.h>
-
-const EVP_CIPHER *EVP_aes_128_cbc(void)
-{
- return FIPS_evp_aes_128_cbc();
-}
-
-const EVP_CIPHER *EVP_aes_128_ccm(void)
-{
- return FIPS_evp_aes_128_ccm();
-}
-
-const EVP_CIPHER *EVP_aes_128_cfb1(void)
-{
- return FIPS_evp_aes_128_cfb1();
-}
-
-const EVP_CIPHER *EVP_aes_128_cfb128(void)
-{
- return FIPS_evp_aes_128_cfb128();
-}
-
-const EVP_CIPHER *EVP_aes_128_cfb8(void)
-{
- return FIPS_evp_aes_128_cfb8();
-}
-
-const EVP_CIPHER *EVP_aes_128_ctr(void)
-{
- return FIPS_evp_aes_128_ctr();
-}
-
-const EVP_CIPHER *EVP_aes_128_ecb(void)
-{
- return FIPS_evp_aes_128_ecb();
-}
-
-const EVP_CIPHER *EVP_aes_128_gcm(void)
-{
- return FIPS_evp_aes_128_gcm();
-}
-
-const EVP_CIPHER *EVP_aes_128_ofb(void)
-{
- return FIPS_evp_aes_128_ofb();
-}
-
-const EVP_CIPHER *EVP_aes_128_xts(void)
-{
- return FIPS_evp_aes_128_xts();
-}
-
-const EVP_CIPHER *EVP_aes_192_cbc(void)
-{
- return FIPS_evp_aes_192_cbc();
-}
-
-const EVP_CIPHER *EVP_aes_192_ccm(void)
-{
- return FIPS_evp_aes_192_ccm();
-}
-
-const EVP_CIPHER *EVP_aes_192_cfb1(void)
-{
- return FIPS_evp_aes_192_cfb1();
-}
-
-const EVP_CIPHER *EVP_aes_192_cfb128(void)
-{
- return FIPS_evp_aes_192_cfb128();
-}
-
-const EVP_CIPHER *EVP_aes_192_cfb8(void)
-{
- return FIPS_evp_aes_192_cfb8();
-}
-
-const EVP_CIPHER *EVP_aes_192_ctr(void)
-{
- return FIPS_evp_aes_192_ctr();
-}
-
-const EVP_CIPHER *EVP_aes_192_ecb(void)
-{
- return FIPS_evp_aes_192_ecb();
-}
-
-const EVP_CIPHER *EVP_aes_192_gcm(void)
-{
- return FIPS_evp_aes_192_gcm();
-}
-
-const EVP_CIPHER *EVP_aes_192_ofb(void)
-{
- return FIPS_evp_aes_192_ofb();
-}
-
-const EVP_CIPHER *EVP_aes_256_cbc(void)
-{
- return FIPS_evp_aes_256_cbc();
-}
-
-const EVP_CIPHER *EVP_aes_256_ccm(void)
-{
- return FIPS_evp_aes_256_ccm();
-}
-
-const EVP_CIPHER *EVP_aes_256_cfb1(void)
-{
- return FIPS_evp_aes_256_cfb1();
-}
-
-const EVP_CIPHER *EVP_aes_256_cfb128(void)
-{
- return FIPS_evp_aes_256_cfb128();
-}
-
-const EVP_CIPHER *EVP_aes_256_cfb8(void)
-{
- return FIPS_evp_aes_256_cfb8();
-}
-
-const EVP_CIPHER *EVP_aes_256_ctr(void)
-{
- return FIPS_evp_aes_256_ctr();
-}
-
-const EVP_CIPHER *EVP_aes_256_ecb(void)
-{
- return FIPS_evp_aes_256_ecb();
-}
-
-const EVP_CIPHER *EVP_aes_256_gcm(void)
-{
- return FIPS_evp_aes_256_gcm();
-}
-
-const EVP_CIPHER *EVP_aes_256_ofb(void)
-{
- return FIPS_evp_aes_256_ofb();
-}
-
-const EVP_CIPHER *EVP_aes_256_xts(void)
-{
- return FIPS_evp_aes_256_xts();
-}
-
-const EVP_CIPHER *EVP_des_ede(void)
-{
- return FIPS_evp_des_ede();
-}
-
-const EVP_CIPHER *EVP_des_ede3(void)
-{
- return FIPS_evp_des_ede3();
-}
-
-const EVP_CIPHER *EVP_des_ede3_cbc(void)
-{
- return FIPS_evp_des_ede3_cbc();
-}
-
-const EVP_CIPHER *EVP_des_ede3_cfb1(void)
-{
- return FIPS_evp_des_ede3_cfb1();
-}
-
-const EVP_CIPHER *EVP_des_ede3_cfb64(void)
-{
- return FIPS_evp_des_ede3_cfb64();
-}
-
-const EVP_CIPHER *EVP_des_ede3_cfb8(void)
-{
- return FIPS_evp_des_ede3_cfb8();
-}
-
-const EVP_CIPHER *EVP_des_ede3_ecb(void)
-{
- return FIPS_evp_des_ede3_ecb();
-}
-
-const EVP_CIPHER *EVP_des_ede3_ofb(void)
-{
- return FIPS_evp_des_ede3_ofb();
-}
-
-const EVP_CIPHER *EVP_des_ede_cbc(void)
-{
- return FIPS_evp_des_ede_cbc();
-}
-
-const EVP_CIPHER *EVP_des_ede_cfb64(void)
-{
- return FIPS_evp_des_ede_cfb64();
-}
-
-const EVP_CIPHER *EVP_des_ede_ecb(void)
-{
- return FIPS_evp_des_ede_ecb();
-}
-
-const EVP_CIPHER *EVP_des_ede_ofb(void)
-{
- return FIPS_evp_des_ede_ofb();
-}
-
-const EVP_CIPHER *EVP_enc_null(void)
-{
- return FIPS_evp_enc_null();
-}
-
-const EVP_MD *EVP_sha1(void)
-{
- return FIPS_evp_sha1();
-}
-
-const EVP_MD *EVP_sha224(void)
-{
- return FIPS_evp_sha224();
-}
-
-const EVP_MD *EVP_sha256(void)
-{
- return FIPS_evp_sha256();
-}
-
-const EVP_MD *EVP_sha384(void)
-{
- return FIPS_evp_sha384();
-}
-
-const EVP_MD *EVP_sha512(void)
-{
- return FIPS_evp_sha512();
-}
-
-const EVP_MD *EVP_dss(void)
-{
- return FIPS_evp_dss();
-}
-
-const EVP_MD *EVP_dss1(void)
-{
- return FIPS_evp_dss1();
-}
-
-const EVP_MD *EVP_ecdsa(void)
-{
- return FIPS_evp_ecdsa();
-}
-
-#endif
diff --git a/crypto/evp/evp_lib.c b/crypto/evp/evp_lib.c
index d4d2b4b28081..a53a27ca0c92 100644
--- a/crypto/evp/evp_lib.c
+++ b/crypto/evp/evp_lib.c
@@ -60,6 +60,10 @@
#include "cryptlib.h"
#include <openssl/evp.h>
#include <openssl/objects.h>
+#ifdef OPENSSL_FIPS
+# include <openssl/fips.h>
+# include "evp_locl.h"
+#endif
int EVP_CIPHER_param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
{
@@ -67,9 +71,13 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
if (c->cipher->set_asn1_parameters != NULL)
ret = c->cipher->set_asn1_parameters(c, type);
- else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
- ret = EVP_CIPHER_set_asn1_iv(c, type);
- else
+ else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1) {
+ if (EVP_CIPHER_CTX_mode(c) == EVP_CIPH_WRAP_MODE) {
+ ASN1_TYPE_set(type, V_ASN1_NULL, NULL);
+ ret = 1;
+ } else
+ ret = EVP_CIPHER_set_asn1_iv(c, type);
+ } else
ret = -1;
return (ret);
}
@@ -80,9 +88,11 @@ int EVP_CIPHER_asn1_to_param(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
if (c->cipher->get_asn1_parameters != NULL)
ret = c->cipher->get_asn1_parameters(c, type);
- else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+ else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1) {
+ if (EVP_CIPHER_CTX_mode(c) == EVP_CIPH_WRAP_MODE)
+ return 1;
ret = EVP_CIPHER_get_asn1_iv(c, type);
- else
+ } else
ret = -1;
return (ret);
}
@@ -200,12 +210,22 @@ const EVP_CIPHER *EVP_CIPHER_CTX_cipher(const EVP_CIPHER_CTX *ctx)
unsigned long EVP_CIPHER_flags(const EVP_CIPHER *cipher)
{
+#ifdef OPENSSL_FIPS
+ const EVP_CIPHER *fcipher;
+ fcipher = evp_get_fips_cipher(cipher);
+ if (fcipher && fcipher->flags & EVP_CIPH_FLAG_FIPS)
+ return cipher->flags | EVP_CIPH_FLAG_FIPS;
+#endif
return cipher->flags;
}
unsigned long EVP_CIPHER_CTX_flags(const EVP_CIPHER_CTX *ctx)
{
+#ifdef OPENSSL_FIPS
+ return EVP_CIPHER_flags(ctx->cipher);
+#else
return ctx->cipher->flags;
+#endif
}
void *EVP_CIPHER_CTX_get_app_data(const EVP_CIPHER_CTX *ctx)
@@ -272,8 +292,40 @@ int EVP_MD_size(const EVP_MD *md)
return md->md_size;
}
+#ifdef OPENSSL_FIPS
+
+const EVP_MD *evp_get_fips_md(const EVP_MD *md)
+{
+ int nid = EVP_MD_type(md);
+ if (nid == NID_dsa)
+ return FIPS_evp_dss1();
+ else if (nid == NID_dsaWithSHA)
+ return FIPS_evp_dss();
+ else if (nid == NID_ecdsa_with_SHA1)
+ return FIPS_evp_ecdsa();
+ else
+ return FIPS_get_digestbynid(nid);
+}
+
+const EVP_CIPHER *evp_get_fips_cipher(const EVP_CIPHER *cipher)
+{
+ int nid = cipher->nid;
+ if (nid == NID_undef)
+ return FIPS_evp_enc_null();
+ else
+ return FIPS_get_cipherbynid(nid);
+}
+
+#endif
+
unsigned long EVP_MD_flags(const EVP_MD *md)
{
+#ifdef OPENSSL_FIPS
+ const EVP_MD *fmd;
+ fmd = evp_get_fips_md(md);
+ if (fmd && fmd->flags & EVP_MD_FLAG_FIPS)
+ return md->flags | EVP_MD_FLAG_FIPS;
+#endif
return md->flags;
}
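
[Editor's note: two behavioural points in the evp_lib.c hunks above: wrap-mode ciphers encode their AlgorithmIdentifier parameters as ASN.1 NULL rather than an IV, and in OPENSSL_FIPS builds the flag accessors now OR in EVP_CIPH_FLAG_FIPS / EVP_MD_FLAG_FIPS when the FIPS module provides the algorithm. A minimal sketch of gating on that flag; it is only meaningful in a FIPS-capable build:

#include <openssl/evp.h>

/* Nonzero if "cipher" is flagged as FIPS-approved.  Outside an
 * OPENSSL_FIPS build the flag is simply never set. */
static int cipher_is_fips_allowed(const EVP_CIPHER *cipher)
{
    return (EVP_CIPHER_flags(cipher) & EVP_CIPH_FLAG_FIPS) != 0;
}
]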
diff --git a/crypto/evp/evp_locl.h b/crypto/evp/evp_locl.h
index 980dadab23d4..2bb709a065d5 100644
--- a/crypto/evp/evp_locl.h
+++ b/crypto/evp/evp_locl.h
@@ -333,6 +333,9 @@ int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass,
const EVP_CIPHER *c, const EVP_MD *md,
int en_de);
+const EVP_MD *evp_get_fips_md(const EVP_MD *md);
+const EVP_CIPHER *evp_get_fips_cipher(const EVP_CIPHER *cipher);
+
#ifdef OPENSSL_FIPS
# ifdef OPENSSL_DOING_MAKEDEPEND
diff --git a/crypto/evp/evp_test.c b/crypto/evp/evp_test.c
index d06b4ee50914..d7441ec7b702 100644
--- a/crypto/evp/evp_test.c
+++ b/crypto/evp/evp_test.c
@@ -132,11 +132,13 @@ static int test1_exit(int ec)
static void test1(const EVP_CIPHER *c, const unsigned char *key, int kn,
const unsigned char *iv, int in,
const unsigned char *plaintext, int pn,
- const unsigned char *ciphertext, int cn, int encdec)
+ const unsigned char *ciphertext, int cn,
+ const unsigned char *aad, int an,
+ const unsigned char *tag, int tn, int encdec)
{
EVP_CIPHER_CTX ctx;
unsigned char out[4096];
- int outl, outl2;
+ int outl, outl2, mode;
printf("Testing cipher %s%s\n", EVP_CIPHER_name(c),
(encdec ==
@@ -147,15 +149,78 @@ static void test1(const EVP_CIPHER *c, const unsigned char *key, int kn,
hexdump(stdout, "IV", iv, in);
hexdump(stdout, "Plaintext", plaintext, pn);
hexdump(stdout, "Ciphertext", ciphertext, cn);
-
- if (kn != c->key_len) {
+ if (an)
+ hexdump(stdout, "AAD", aad, an);
+ if (tn)
+ hexdump(stdout, "Tag", tag, tn);
+ mode = EVP_CIPHER_mode(c);
+ if (kn != EVP_CIPHER_key_length(c)) {
fprintf(stderr, "Key length doesn't match, got %d expected %lu\n", kn,
- (unsigned long)c->key_len);
+ (unsigned long)EVP_CIPHER_key_length(c));
test1_exit(5);
}
EVP_CIPHER_CTX_init(&ctx);
+ EVP_CIPHER_CTX_set_flags(&ctx, EVP_CIPHER_CTX_FLAG_WRAP_ALLOW);
if (encdec != 0) {
- if (!EVP_EncryptInit_ex(&ctx, c, NULL, key, iv)) {
+ if (mode == EVP_CIPH_GCM_MODE) {
+ if (!EVP_EncryptInit_ex(&ctx, c, NULL, NULL, NULL)) {
+ fprintf(stderr, "EncryptInit failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(10);
+ }
+ if (!EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_SET_IVLEN, in, NULL)) {
+ fprintf(stderr, "IV length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(11);
+ }
+ if (!EVP_EncryptInit_ex(&ctx, NULL, NULL, key, iv)) {
+ fprintf(stderr, "Key/IV set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(12);
+ }
+ if (an && !EVP_EncryptUpdate(&ctx, NULL, &outl, aad, an)) {
+ fprintf(stderr, "AAD set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(13);
+ }
+ } else if (mode == EVP_CIPH_CCM_MODE) {
+ if (!EVP_EncryptInit_ex(&ctx, c, NULL, NULL, NULL)) {
+ fprintf(stderr, "EncryptInit failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(10);
+ }
+ if (!EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_IVLEN, in, NULL)) {
+ fprintf(stderr, "IV length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(11);
+ }
+ if (!EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_TAG, tn, NULL)) {
+ fprintf(stderr, "Tag length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(11);
+ }
+ if (!EVP_EncryptInit_ex(&ctx, NULL, NULL, key, iv)) {
+ fprintf(stderr, "Key/IV set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(12);
+ }
+ if (!EVP_EncryptUpdate(&ctx, NULL, &outl, NULL, pn)) {
+ fprintf(stderr, "Plaintext length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(12);
+ }
+ if (an && !EVP_EncryptUpdate(&ctx, NULL, &outl, aad, an)) {
+ fprintf(stderr, "AAD set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(13);
+ }
+ } else if (mode == EVP_CIPH_WRAP_MODE) {
+ if (!EVP_EncryptInit_ex(&ctx, c, NULL, key, in ? iv : NULL)) {
+ fprintf(stderr, "EncryptInit failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(10);
+ }
+ } else if (!EVP_EncryptInit_ex(&ctx, c, NULL, key, iv)) {
fprintf(stderr, "EncryptInit failed\n");
ERR_print_errors_fp(stderr);
test1_exit(10);
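
[Editor's note: the GCM branch above encodes the required call order: select the cipher with a NULL key/IV, set the IV length via ctrl, then install key and IV, then feed AAD through an Update call with a NULL output buffer. A stand-alone sketch of the same sequence, assuming caller-supplied buffers:

#include <openssl/evp.h>

static int gcm_encrypt_init(EVP_CIPHER_CTX *ctx,
                            const unsigned char *key,
                            const unsigned char *iv, int ivlen,
                            const unsigned char *aad, int aadlen)
{
    int outl;

    if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
        return 0;
    /* the IV length must be set before the IV itself is installed */
    if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, ivlen, NULL))
        return 0;
    if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
        return 0;
    /* AAD goes through EncryptUpdate with a NULL output buffer */
    if (aadlen && !EVP_EncryptUpdate(ctx, NULL, &outl, aad, aadlen))
        return 0;
    return 1;
}
]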
@@ -185,10 +250,93 @@ static void test1(const EVP_CIPHER *c, const unsigned char *key, int kn,
hexdump(stderr, "Expected", ciphertext, cn);
test1_exit(9);
}
+ if (mode == EVP_CIPH_GCM_MODE || mode == EVP_CIPH_CCM_MODE) {
+ unsigned char rtag[16];
+ /*
+ * Note: EVP_CTRL_CCM_GET_TAG has the same value as
+ * EVP_CTRL_GCM_GET_TAG
+ */
+ if (!EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_GET_TAG, tn, rtag)) {
+ fprintf(stderr, "Get tag failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(14);
+ }
+ if (memcmp(rtag, tag, tn)) {
+ fprintf(stderr, "Tag mismatch\n");
+ hexdump(stderr, "Got", rtag, tn);
+ hexdump(stderr, "Expected", tag, tn);
+ test1_exit(9);
+ }
+ }
}
if (encdec <= 0) {
- if (!EVP_DecryptInit_ex(&ctx, c, NULL, key, iv)) {
+ if (mode == EVP_CIPH_GCM_MODE) {
+ if (!EVP_DecryptInit_ex(&ctx, c, NULL, NULL, NULL)) {
+ fprintf(stderr, "EncryptInit failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(10);
+ }
+ if (!EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_SET_IVLEN, in, NULL)) {
+ fprintf(stderr, "IV length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(11);
+ }
+ if (!EVP_DecryptInit_ex(&ctx, NULL, NULL, key, iv)) {
+ fprintf(stderr, "Key/IV set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(12);
+ }
+ if (!EVP_CIPHER_CTX_ctrl
+ (&ctx, EVP_CTRL_GCM_SET_TAG, tn, (void *)tag)) {
+ fprintf(stderr, "Set tag failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(14);
+ }
+ if (an && !EVP_DecryptUpdate(&ctx, NULL, &outl, aad, an)) {
+ fprintf(stderr, "AAD set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(13);
+ }
+ } else if (mode == EVP_CIPH_CCM_MODE) {
+ if (!EVP_DecryptInit_ex(&ctx, c, NULL, NULL, NULL)) {
+ fprintf(stderr, "DecryptInit failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(10);
+ }
+ if (!EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_IVLEN, in, NULL)) {
+ fprintf(stderr, "IV length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(11);
+ }
+ if (!EVP_CIPHER_CTX_ctrl
+ (&ctx, EVP_CTRL_CCM_SET_TAG, tn, (void *)tag)) {
+ fprintf(stderr, "Tag length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(11);
+ }
+ if (!EVP_DecryptInit_ex(&ctx, NULL, NULL, key, iv)) {
+ fprintf(stderr, "Key/Nonce set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(12);
+ }
+ if (!EVP_DecryptUpdate(&ctx, NULL, &outl, NULL, pn)) {
+ fprintf(stderr, "Plaintext length set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(12);
+ }
+ if (an && !EVP_EncryptUpdate(&ctx, NULL, &outl, aad, an)) {
+ fprintf(stderr, "AAD set failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(13);
+ }
+ } else if (mode == EVP_CIPH_WRAP_MODE) {
+ if (!EVP_DecryptInit_ex(&ctx, c, NULL, key, in ? iv : NULL)) {
+ fprintf(stderr, "EncryptInit failed\n");
+ ERR_print_errors_fp(stderr);
+ test1_exit(10);
+ }
+ } else if (!EVP_DecryptInit_ex(&ctx, c, NULL, key, iv)) {
fprintf(stderr, "DecryptInit failed\n");
ERR_print_errors_fp(stderr);
test1_exit(11);
@@ -200,7 +348,8 @@ static void test1(const EVP_CIPHER *c, const unsigned char *key, int kn,
ERR_print_errors_fp(stderr);
test1_exit(6);
}
- if (!EVP_DecryptFinal_ex(&ctx, out + outl, &outl2)) {
+ if (mode != EVP_CIPH_CCM_MODE
+ && !EVP_DecryptFinal_ex(&ctx, out + outl, &outl2)) {
fprintf(stderr, "DecryptFinal failed\n");
ERR_print_errors_fp(stderr);
test1_exit(7);
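
[Editor's note: CCM is the one mode where EVP_DecryptFinal_ex() is skipped, per the hunk just below: the expected tag is installed before the key, the total ciphertext length is declared with a NULL-input Update, and tag verification happens inside the data Update itself. A hedged sketch of that flow, AAD omitted for brevity and all buffers caller-supplied ("out" must hold at least ctlen bytes):

#include <openssl/evp.h>

static int ccm_decrypt(const unsigned char *key,
                       const unsigned char *nonce, int nlen,
                       const unsigned char *tag, int taglen,
                       const unsigned char *ct, int ctlen,
                       unsigned char *out)
{
    EVP_CIPHER_CTX ctx;
    int outl, ok;

    EVP_CIPHER_CTX_init(&ctx);
    ok = EVP_DecryptInit_ex(&ctx, EVP_aes_256_ccm(), NULL, NULL, NULL)
         && EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_IVLEN, nlen, NULL)
         && EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_TAG, taglen,
                                (void *)tag)
         && EVP_DecryptInit_ex(&ctx, NULL, NULL, key, nonce)
         /* declare the total ciphertext length up front */
         && EVP_DecryptUpdate(&ctx, NULL, &outl, NULL, ctlen)
         /* the tag is checked here; mismatch makes this call fail */
         && EVP_DecryptUpdate(&ctx, out, &outl, ct, ctlen);
    EVP_CIPHER_CTX_cleanup(&ctx);
    return ok;
}
]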
@@ -228,7 +377,9 @@ static void test1(const EVP_CIPHER *c, const unsigned char *key, int kn,
static int test_cipher(const char *cipher, const unsigned char *key, int kn,
const unsigned char *iv, int in,
const unsigned char *plaintext, int pn,
- const unsigned char *ciphertext, int cn, int encdec)
+ const unsigned char *ciphertext, int cn,
+ const unsigned char *aad, int an,
+ const unsigned char *tag, int tn, int encdec)
{
const EVP_CIPHER *c;
@@ -236,7 +387,8 @@ static int test_cipher(const char *cipher, const unsigned char *key, int kn,
if (!c)
return 0;
- test1(c, key, kn, iv, in, plaintext, pn, ciphertext, cn, encdec);
+ test1(c, key, kn, iv, in, plaintext, pn, ciphertext, cn, aad, an, tag, tn,
+ encdec);
return 1;
}
@@ -316,7 +468,7 @@ int main(int argc, char **argv)
perror(szTestFile);
EXIT(2);
}
-
+ ERR_load_crypto_strings();
/* Load up the software EVP_CIPHER and EVP_MD definitions */
OpenSSL_add_all_ciphers();
OpenSSL_add_all_digests();
@@ -346,9 +498,11 @@ int main(int argc, char **argv)
char line[4096];
char *p;
char *cipher;
- unsigned char *iv, *key, *plaintext, *ciphertext;
+ unsigned char *iv, *key, *plaintext, *ciphertext, *aad, *tag;
int encdec;
int kn, in, pn, cn;
+ int an = 0;
+ int tn = 0;
if (!fgets((char *)line, sizeof line, f))
break;
@@ -361,19 +515,37 @@ int main(int argc, char **argv)
plaintext = ustrsep(&p, ":");
ciphertext = ustrsep(&p, ":");
if (p[-1] == '\n') {
- p[-1] = '\0';
encdec = -1;
+ p[-1] = '\0';
+ tag = aad = NULL;
+ an = tn = 0;
} else {
- encdec = atoi(sstrsep(&p, "\n"));
+ aad = ustrsep(&p, ":");
+ tag = ustrsep(&p, ":");
+ if (tag == NULL) {
+ p = (char *)aad;
+ tag = aad = NULL;
+ an = tn = 0;
+ }
+ if (p[-1] == '\n') {
+ encdec = -1;
+ p[-1] = '\0';
+ } else
+ encdec = atoi(sstrsep(&p, "\n"));
}
kn = convert(key);
in = convert(iv);
pn = convert(plaintext);
cn = convert(ciphertext);
+ if (aad) {
+ an = convert(aad);
+ tn = convert(tag);
+ }
if (!test_cipher
- (cipher, key, kn, iv, in, plaintext, pn, ciphertext, cn, encdec)
+ (cipher, key, kn, iv, in, plaintext, pn, ciphertext, cn, aad, an,
+ tag, tn, encdec)
&& !test_digest(cipher, plaintext, pn, ciphertext, cn)) {
#ifdef OPENSSL_NO_AES
if (strstr(cipher, "AES") == cipher) {
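
[Editor's note: the parsing changes above extend the vector format with two optional fields, giving cipher:key:iv:plaintext:ciphertext[:aad:tag][:encdec]. Purely as illustration (the test itself uses its own sstrsep()/ustrsep() helpers plus hex conversion), a hypothetical splitter for the full eight-field form, assuming the non-standard BSD/glibc strsep() is available:

#include <stdio.h>
#include <string.h>

static void split_vector(char *line)
{
    const char *names[] = { "cipher", "key", "iv", "plaintext",
                            "ciphertext", "aad", "tag", "encdec" };
    unsigned i;
    char *field;

    /* split on ':' field separators and the trailing newline */
    for (i = 0; i < 8 && (field = strsep(&line, ":\n")) != NULL; i++)
        printf("%s=%s\n", names[i], field);
}
]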
diff --git a/crypto/evp/evptests.txt b/crypto/evp/evptests.txt
index c273707c1444..4e9958b3b5bc 100644
--- a/crypto/evp/evptests.txt
+++ b/crypto/evp/evptests.txt
@@ -1,4 +1,5 @@
#cipher:key:iv:plaintext:ciphertext:0/1(decrypt/encrypt)
+#aadcipher:key:iv:plaintext:ciphertext:aad:tag:0/1(decrypt/encrypt)
#digest:::input:output
# SHA(1) tests (from shatest.c)
@@ -332,3 +333,69 @@ SEED-ECB:00000000000000000000000000000000::000102030405060708090A0B0C0D0E0F:5EBA
SEED-ECB:000102030405060708090A0B0C0D0E0F::00000000000000000000000000000000:C11F22F20140505084483597E4370F43:1
SEED-ECB:4706480851E61BE85D74BFB3FD956185::83A2F8A288641FB9A4E9A5CC2F131C7D:EE54D13EBCAE706D226BC3142CD40D4A:1
SEED-ECB:28DBC3BC49FFD87DCFA509B11D422BE7::B41E6BE2EBA84A148E2EED84593C5EC7:9B9B7BFCD1813CB95D0B3618F40F5122:1
+
+# AES CCM 256 bit key
+aes-256-ccm:1bde3251d41a8b5ea013c195ae128b218b3e0306376357077ef1c1c78548b92e:5b8e40746f6b98e00f1d13ff41:53bd72a97089e312422bf72e242377b3c6ee3e2075389b999c4ef7f28bd2b80a:9a5fcccdb4cf04e7293d2775cc76a488f042382d949b43b7d6bb2b9864786726:c17a32514eb6103f3249e076d4c871dc97e04b286699e54491dc18f6d734d4c0:2024931d73bca480c24a24ece6b6c2bf
+
+# AES GCM test vectors from http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-spec.pdf
+aes-128-gcm:00000000000000000000000000000000:000000000000000000000000::::58e2fccefa7e3061367f1d57a4e7455a
+aes-128-gcm:00000000000000000000000000000000:000000000000000000000000:00000000000000000000000000000000:0388dace60b6a392f328c2b971b2fe78::ab6e47d42cec13bdf53a67b21257bddf
+aes-128-gcm:feffe9928665731c6d6a8f9467308308:cafebabefacedbaddecaf888:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255:42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091473f5985::4d5c2af327cd64a62cf35abd2ba6fab4
+aes-128-gcm:feffe9928665731c6d6a8f9467308308:cafebabefacedbaddecaf888:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091:feedfacedeadbeeffeedfacedeadbeefabaddad2:5bc94fbc3221a5db94fae95ae7121a47
+aes-128-gcm:feffe9928665731c6d6a8f9467308308:cafebabefacedbad:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:61353b4c2806934a777ff51fa22a4755699b2a714fcdc6f83766e5f97b6c742373806900e49f24b22b097544d4896b424989b5e1ebac0f07c23f4598:feedfacedeadbeeffeedfacedeadbeefabaddad2:3612d2e79e3b0785561be14aaca2fccb
+aes-128-gcm:feffe9928665731c6d6a8f9467308308:9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:8ce24998625615b603a033aca13fb894be9112a5c3a211a8ba262a3cca7e2ca701e4a9a4fba43c90ccdcb281d48c7c6fd62875d2aca417034c34aee5:feedfacedeadbeeffeedfacedeadbeefabaddad2:619cc5aefffe0bfa462af43c1699d050
+aes-192-gcm:000000000000000000000000000000000000000000000000:000000000000000000000000::::cd33b28ac773f74ba00ed1f312572435
+aes-192-gcm:000000000000000000000000000000000000000000000000:000000000000000000000000:00000000000000000000000000000000:98e7247c07f0fe411c267e4384b0f600::2ff58d80033927ab8ef4d4587514f0fb
+aes-192-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c:cafebabefacedbaddecaf888:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255:3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710acade256::9924a7c8587336bfb118024db8674a14
+aes-192-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c:cafebabefacedbaddecaf888:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710:feedfacedeadbeeffeedfacedeadbeefabaddad2:2519498e80f1478f37ba55bd6d27618c
+aes-192-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c:cafebabefacedbad:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7:feedfacedeadbeeffeedfacedeadbeefabaddad2:65dcc57fcf623a24094fcca40d3533f8
+aes-192-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c:9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:d27e88681ce3243c4830165a8fdcf9ff1de9a1d8e6b447ef6ef7b79828666e4581e79012af34ddd9e2f037589b292db3e67c036745fa22e7e9b7373b:feedfacedeadbeeffeedfacedeadbeefabaddad2:dcf566ff291c25bbb8568fc3d376a6d9
+aes-256-gcm:0000000000000000000000000000000000000000000000000000000000000000:000000000000000000000000::::530f8afbc74536b9a963b4f1c4cb738b
+aes-256-gcm:0000000000000000000000000000000000000000000000000000000000000000:000000000000000000000000:00000000000000000000000000000000:cea7403d4d606b6e074ec5d3baf39d18::d0d1c8a799996bf0265b98b5d48ab919
+aes-256-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308:cafebabefacedbaddecaf888:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255:522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad::b094dac5d93471bdec1a502270e3cc6c
+aes-256-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308:cafebabefacedbaddecaf888:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662:feedfacedeadbeeffeedfacedeadbeefabaddad2:76fc6ece0f4e1768cddf8853bb2d551b
+aes-256-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308:cafebabefacedbad:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:c3762df1ca787d32ae47c13bf19844cbaf1ae14d0b976afac52ff7d79bba9de0feb582d33934a4f0954cc2363bc73f7862ac430e64abe499f47c9b1f:feedfacedeadbeeffeedfacedeadbeefabaddad2:3a337dbf46a792c45e454913fe2ea8f2
+aes-256-gcm:feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308:9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b:d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39:5a8def2f0c9e53f1f75d7853659e2a20eeb2b22aafde6419a058ab4f6f746bf40fc0c3b780f244452da3ebf1c5d82cdea2418997200ef82e44ae7e3f:feedfacedeadbeeffeedfacedeadbeefabaddad2:a44a8266ee1c8eb0c8b5d4cf5ae9f19a
+# local add-ons, primarily streaming ghash tests
+# 128 bytes aad
+aes-128-gcm:00000000000000000000000000000000:000000000000000000000000:::d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad:5fea793a2d6f974d37e68e0cb8ff9492
+# 48 bytes plaintext
+aes-128-gcm:00000000000000000000000000000000:000000000000000000000000:000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000:0388dace60b6a392f328c2b971b2fe78f795aaab494b5923f7fd89ff948bc1e0200211214e7394da2089b6acd093abe0::9dd0a376b08e40eb00c35f29f9ea61a4
+# 80 bytes plaintext
+aes-128-gcm:00000000000000000000000000000000:000000000000000000000000:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000:0388dace60b6a392f328c2b971b2fe78f795aaab494b5923f7fd89ff948bc1e0200211214e7394da2089b6acd093abe0c94da219118e297d7b7ebcbcc9c388f28ade7d85a8ee35616f7124a9d5270291::98885a3a22bd4742fe7b72172193b163
+# 128 bytes plaintext
+aes-128-gcm:00000000000000000000000000000000:000000000000000000000000:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000:0388dace60b6a392f328c2b971b2fe78f795aaab494b5923f7fd89ff948bc1e0200211214e7394da2089b6acd093abe0c94da219118e297d7b7ebcbcc9c388f28ade7d85a8ee35616f7124a9d527029195b84d1b96c690ff2f2de30bf2ec89e00253786e126504f0dab90c48a30321de3345e6b0461e7c9e6c6b7afedde83f40::cac45f60e31efd3b5a43b98a22ce1aa1
+# 192 bytes plaintext, iv is chosen so that initial counter LSB is 0xFF
+aes-128-gcm:00000000000000000000000000000000:ffffffff000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000:000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000:56b3373ca9ef6e4a2b64fe1e9a17b61425f10d47a75a5fce13efc6bc784af24f4141bdd48cf7c770887afd573cca5418a9aeffcd7c5ceddfc6a78397b9a85b499da558257267caab2ad0b23ca476a53cb17fb41c4b8b475cb4f3f7165094c229c9e8c4dc0a2a5ff1903e501511221376a1cdb8364c5061a20cae74bc4acd76ceb0abc9fd3217ef9f8c90be402ddf6d8697f4f880dff15bfb7a6b28241ec8fe183c2d59e3f9dfff653c7126f0acb9e64211f42bae12af462b1070bef1ab5e3606::566f8ef683078bfdeeffa869d751a017
+# 80 bytes plaintext, submitted by Intel
+aes-128-gcm:843ffcf5d2b72694d19ed01d01249412:dbcca32ebf9b804617c3aa9e:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f:6268c6fa2a80b2d137467f092f657ac04d89be2beaa623d61b5a868c8f03ff95d3dcee23ad2f1ab3a6c80eaf4b140eb05de3457f0fbc111a6b43d0763aa422a3013cf1dc37fe417d1fbfc449b75d4cc5:00000000000000000000000000000000101112131415161718191a1b1c1d1e1f:3b629ccfbc1119b7319e1dce2cd6fd6d
+
+# AES XTS test vectors from IEEE Std 1619-2007
+aes-128-xts:0000000000000000000000000000000000000000000000000000000000000000:00000000000000000000000000000000:0000000000000000000000000000000000000000000000000000000000000000:917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e
+aes-128-xts:1111111111111111111111111111111122222222222222222222222222222222:33333333330000000000000000000000:4444444444444444444444444444444444444444444444444444444444444444:c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0
+aes-128-xts:fffefdfcfbfaf9f8f7f6f5f4f3f2f1f022222222222222222222222222222222:33333333330000000000000000000000:4444444444444444444444444444444444444444444444444444444444444444:af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89
+aes-128-xts:2718281828459045235360287471352631415926535897932384626433832795:00000000000000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89cc78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad02655ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f4341332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203ebb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18deb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+aes-128-xts:2718281828459045235360287471352631415926535897932384626433832795:01000000000000000000000000000000:27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89cc78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad02655ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f4341332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203ebb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18deb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568:264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee59d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c994c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a74079a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee383b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefbd7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b1147e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb6275aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad62844bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+aes-128-xts:2718281828459045235360287471352631415926535897932384626433832795:02000000000000000000000000000000:264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee59d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c994c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a74079a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee383b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefbd7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b1147e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb6275aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad62844bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd:fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb386fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec545365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b15110519655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a4231864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6
+aes-128-xts:2718281828459045235360287471352631415926535897932384626433832795:fd000000000000000000000000000000:8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45daad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731aad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db60267996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e033833fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd8023c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a805c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca721b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888:d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be86affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b823e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e9697617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f54512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0be95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeecae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b32528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+aes-128-xts:2718281828459045235360287471352631415926535897932384626433832795:fe000000000000000000000000000000:d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be86affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b823e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e9697617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f54512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0be95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeecae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b32528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637:72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c3957a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1aecb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a09c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af79288f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43ed14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed1839e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c49d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f10699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+aes-128-xts:2718281828459045235360287471352631415926535897932384626433832795:ff000000000000000000000000000000:72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c3957a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1aecb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a09c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af79288f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43ed14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed1839e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c49d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f10699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a:3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07af073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf325423bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f94ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c590376e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866fe9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac773421e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd38675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcfb3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a84b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd
+
+aes-256-xts:27182818284590452353602874713526624977572470936999595749669676273141592653589793238462643383279502884197169399375105820974944592:ff000000000000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed43851ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151
+aes-256-xts:27182818284590452353602874713526624977572470936999595749669676273141592653589793238462643383279502884197169399375105820974944592:ffff0000000000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b2013fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca1798d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922aaa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c69aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888abf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab018bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca686e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc09054fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803
+aes-256-xts:27182818284590452353602874713526624977572470936999595749669676273141592653589793238462643383279502884197169399375105820974944592:ffffff00000000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba0499298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a87815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e19110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe097b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799da3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7af08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826
+aes-256-xts:27182818284590452353602874713526624977572470936999595749669676273141592653589793238462643383279502884197169399375105820974944592:ffffffff000000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d350875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f569768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed90fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e75d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc0077ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a41556cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88cf6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220
+aes-256-xts:27182818284590452353602874713526624977572470936999595749669676273141592653589793238462643383279502884197169399375105820974944592:ffffffffff0000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed2350eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b808da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d876c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41bf6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f685b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abce19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd927212054681d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9
+
+aes-128-xts:fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0:9a785634120000000000000000000000:000102030405060708090a0b0c0d0e0f10:6c1625db4671522d3d7599601de7ca09ed
+aes-128-xts:fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0:9a785634120000000000000000000000:000102030405060708090a0b0c0d0e0f1011:d069444b7a7e0cab09e24447d24deb1fedbf
+aes-128-xts:fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0:9a785634120000000000000000000000:000102030405060708090a0b0c0d0e0f101112:e5df1351c0544ba1350b3363cd8ef4beedbf9d
+aes-128-xts:fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0:9a785634120000000000000000000000:000102030405060708090a0b0c0d0e0f10111213:9d84c813f719aa2c7be3f66171c7c5c2edbf9dac
+aes-128-xts:e0e1e2e3e4e5e6e7e8e9eaebecedeeefc0c1c2c3c4c5c6c7c8c9cacbcccdcecf:21436587a90000000000000000000000:000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff:38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be68b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42ccbd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad38549c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a17741990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee39936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d48b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b883342729e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3
+# AES wrap tests from RFC3394
+id-aes128-wrap:000102030405060708090A0B0C0D0E0F::00112233445566778899AABBCCDDEEFF:1FA68B0A8112B447AEF34BD8FB5A7B829D3E862371D2CFE5
+id-aes192-wrap:000102030405060708090A0B0C0D0E0F1011121314151617::00112233445566778899AABBCCDDEEFF:96778B25AE6CA435F92B5B97C050AED2468AB8A17AD84E5D
+id-aes256-wrap:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F::00112233445566778899AABBCCDDEEFF:64E8C3F9CE0F5BA263E9777905818A2A93C8191E7D6E8AE7
+id-aes192-wrap:000102030405060708090A0B0C0D0E0F1011121314151617::00112233445566778899AABBCCDDEEFF0001020304050607:031D33264E15D33268F24EC260743EDCE1C6C7DDEE725A936BA814915C6762D2
+id-aes256-wrap:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F::00112233445566778899AABBCCDDEEFF0001020304050607:A8F9BC1612C68B3FF6E6F4FBE30E71E4769C8B80A32CB8958CD5D17D6B254DA1
+id-aes256-wrap:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F::00112233445566778899AABBCCDDEEFF000102030405060708090A0B0C0D0E0F:28C9F404C4B810F4CBCCB35CFB87F8263F5786E2D80ED326CBC7F0E71A99F43BFB988B9B7A02DD21
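
[Editor's note: the RFC 3394 vectors above exercise the new wrap mode. Wrap-mode ciphers refuse to initialise unless EVP_CIPHER_CTX_FLAG_WRAP_ALLOW is set on the context first, which is why evp_test.c sets it unconditionally. A minimal wrap sketch, assuming EVP_aes_128_wrap() is available in this tree; with a NULL IV the default 0xA6A6A6A6A6A6A6A6 integrity check value is used and the output is inlen + 8 bytes:

#include <openssl/evp.h>

static int wrap_key(const unsigned char *kek,
                    const unsigned char *in, int inlen,
                    unsigned char *out, int *outlen)
{
    EVP_CIPHER_CTX ctx;
    int ok = 0;

    EVP_CIPHER_CTX_init(&ctx);
    /* must be set before init or wrap-mode init fails */
    EVP_CIPHER_CTX_set_flags(&ctx, EVP_CIPHER_CTX_FLAG_WRAP_ALLOW);
    if (EVP_EncryptInit_ex(&ctx, EVP_aes_128_wrap(), NULL, kek, NULL)
        /* the whole wrap transform completes in this one Update */
        && EVP_EncryptUpdate(&ctx, out, outlen, in, inlen))
        ok = 1;
    EVP_CIPHER_CTX_cleanup(&ctx);
    return ok;
}
]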
diff --git a/crypto/evp/m_dss.c b/crypto/evp/m_dss.c
index f22ba522c2f2..147844862d17 100644
--- a/crypto/evp/m_dss.c
+++ b/crypto/evp/m_dss.c
@@ -66,7 +66,6 @@
#endif
#ifndef OPENSSL_NO_SHA
-# ifndef OPENSSL_FIPS
static int init(EVP_MD_CTX *ctx)
{
@@ -102,5 +101,4 @@ const EVP_MD *EVP_dss(void)
{
return (&dsa_md);
}
-# endif
#endif
diff --git a/crypto/evp/m_dss1.c b/crypto/evp/m_dss1.c
index 976148e436a0..e36fabff700e 100644
--- a/crypto/evp/m_dss1.c
+++ b/crypto/evp/m_dss1.c
@@ -68,8 +68,6 @@
# include <openssl/dsa.h>
# endif
-# ifndef OPENSSL_FIPS
-
static int init(EVP_MD_CTX *ctx)
{
return SHA1_Init(ctx->md_data);
@@ -104,5 +102,4 @@ const EVP_MD *EVP_dss1(void)
{
return (&dss1_md);
}
-# endif
#endif
diff --git a/crypto/evp/m_ecdsa.c b/crypto/evp/m_ecdsa.c
index d11a13a6d7a7..803d31495577 100644
--- a/crypto/evp/m_ecdsa.c
+++ b/crypto/evp/m_ecdsa.c
@@ -116,7 +116,6 @@
#include <openssl/x509.h>
#ifndef OPENSSL_NO_SHA
-# ifndef OPENSSL_FIPS
static int init(EVP_MD_CTX *ctx)
{
@@ -152,5 +151,4 @@ const EVP_MD *EVP_ecdsa(void)
{
return (&ecdsa_md);
}
-# endif
#endif
diff --git a/crypto/evp/m_sha1.c b/crypto/evp/m_sha1.c
index 0cc63555ad48..a74e6b77948e 100644
--- a/crypto/evp/m_sha1.c
+++ b/crypto/evp/m_sha1.c
@@ -59,16 +59,14 @@
#include <stdio.h>
#include "cryptlib.h"
-#ifndef OPENSSL_FIPS
+#ifndef OPENSSL_NO_SHA
-# ifndef OPENSSL_NO_SHA
-
-# include <openssl/evp.h>
-# include <openssl/objects.h>
-# include <openssl/sha.h>
-# ifndef OPENSSL_NO_RSA
-# include <openssl/rsa.h>
-# endif
+# include <openssl/evp.h>
+# include <openssl/objects.h>
+# include <openssl/sha.h>
+# ifndef OPENSSL_NO_RSA
+# include <openssl/rsa.h>
+# endif
static int init(EVP_MD_CTX *ctx)
{
@@ -104,9 +102,9 @@ const EVP_MD *EVP_sha1(void)
{
return (&sha1_md);
}
-# endif
+#endif
-# ifndef OPENSSL_NO_SHA256
+#ifndef OPENSSL_NO_SHA256
static int init224(EVP_MD_CTX *ctx)
{
return SHA224_Init(ctx->md_data);
@@ -171,9 +169,9 @@ const EVP_MD *EVP_sha256(void)
{
return (&sha256_md);
}
-# endif /* ifndef OPENSSL_NO_SHA256 */
+#endif /* ifndef OPENSSL_NO_SHA256 */
-# ifndef OPENSSL_NO_SHA512
+#ifndef OPENSSL_NO_SHA512
static int init384(EVP_MD_CTX *ctx)
{
return SHA384_Init(ctx->md_data);
@@ -234,6 +232,4 @@ const EVP_MD *EVP_sha512(void)
{
return (&sha512_md);
}
-# endif /* ifndef OPENSSL_NO_SHA512 */
-
-#endif
+#endif /* ifndef OPENSSL_NO_SHA512 */
diff --git a/crypto/evp/m_sigver.c b/crypto/evp/m_sigver.c
index e153a1833e07..4492d207f28e 100644
--- a/crypto/evp/m_sigver.c
+++ b/crypto/evp/m_sigver.c
@@ -73,15 +73,18 @@ static int do_sigver_init(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx,
if (ctx->pctx == NULL)
return 0;
- if (type == NULL) {
- int def_nid;
- if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
- type = EVP_get_digestbynid(def_nid);
- }
+ if (!(ctx->pctx->pmeth->flags & EVP_PKEY_FLAG_SIGCTX_CUSTOM)) {
- if (type == NULL) {
- EVPerr(EVP_F_DO_SIGVER_INIT, EVP_R_NO_DEFAULT_DIGEST);
- return 0;
+ if (type == NULL) {
+ int def_nid;
+ if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
+ type = EVP_get_digestbynid(def_nid);
+ }
+
+ if (type == NULL) {
+ EVPerr(EVP_F_DO_SIGVER_INIT, EVP_R_NO_DEFAULT_DIGEST);
+ return 0;
+ }
}
if (ver) {
@@ -103,6 +106,8 @@ static int do_sigver_init(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx,
return 0;
if (pctx)
*pctx = ctx->pctx;
+ if (ctx->pctx->pmeth->flags & EVP_PKEY_FLAG_SIGCTX_CUSTOM)
+ return 1;
if (!EVP_DigestInit_ex(ctx, type, e))
return 0;
return 1;
@@ -124,7 +129,19 @@ int EVP_DigestSignFinal(EVP_MD_CTX *ctx, unsigned char *sigret,
size_t *siglen)
{
int sctx, r = 0;
- if (ctx->pctx->pmeth->signctx)
+ EVP_PKEY_CTX *pctx = ctx->pctx;
+ if (pctx->pmeth->flags & EVP_PKEY_FLAG_SIGCTX_CUSTOM) {
+ EVP_PKEY_CTX *dctx;
+ if (!sigret)
+ return pctx->pmeth->signctx(pctx, sigret, siglen, ctx);
+ dctx = EVP_PKEY_CTX_dup(ctx->pctx);
+ if (!dctx)
+ return 0;
+ r = dctx->pmeth->signctx(dctx, sigret, siglen, ctx);
+ EVP_PKEY_CTX_free(dctx);
+ return r;
+ }
+ if (pctx->pmeth->signctx)
sctx = 1;
else
sctx = 0;
@@ -147,20 +164,19 @@ int EVP_DigestSignFinal(EVP_MD_CTX *ctx, unsigned char *sigret,
return 0;
} else {
if (sctx) {
- if (ctx->pctx->pmeth->signctx(ctx->pctx, sigret, siglen, ctx) <=
- 0)
+ if (pctx->pmeth->signctx(pctx, sigret, siglen, ctx) <= 0)
return 0;
} else {
int s = EVP_MD_size(ctx->digest);
- if (s < 0
- || EVP_PKEY_sign(ctx->pctx, sigret, siglen, NULL, s) <= 0)
+ if (s < 0 || EVP_PKEY_sign(pctx, sigret, siglen, NULL, s) <= 0)
return 0;
}
}
return 1;
}
-int EVP_DigestVerifyFinal(EVP_MD_CTX *ctx, unsigned char *sig, size_t siglen)
+int EVP_DigestVerifyFinal(EVP_MD_CTX *ctx, const unsigned char *sig,
+ size_t siglen)
{
EVP_MD_CTX tmp_ctx;
unsigned char md[EVP_MAX_MD_SIZE];
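For context, a sketch of the two-call pattern that the sigret == NULL branch and the EVP_PKEY_CTX_dup() above keep working for EVP_PKEY_FLAG_SIGCTX_CUSTOM methods. This is a hypothetical caller, not code from the patch; mctx is assumed to have been set up with EVP_DigestSignInit() and fed via EVP_DigestSignUpdate():

    #include <openssl/crypto.h>
    #include <openssl/evp.h>

    static unsigned char *finish_sign(EVP_MD_CTX *mctx, size_t *siglen)
    {
        unsigned char *sig;

        /* Size query: with sigret == NULL, signctx() is invoked directly. */
        if (EVP_DigestSignFinal(mctx, NULL, siglen) <= 0)
            return NULL;
        if ((sig = OPENSSL_malloc(*siglen)) == NULL)
            return NULL;
        /* Signing pass: runs on a duplicated EVP_PKEY_CTX, so mctx can
         * be finalized again later. */
        if (EVP_DigestSignFinal(mctx, sig, siglen) <= 0) {
            OPENSSL_free(sig);
            return NULL;
        }
        return sig;
    }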
diff --git a/crypto/evp/p_lib.c b/crypto/evp/p_lib.c
index 2b84dc75ec3a..1171d3086d0b 100644
--- a/crypto/evp/p_lib.c
+++ b/crypto/evp/p_lib.c
@@ -337,7 +337,7 @@ int EVP_PKEY_set1_DH(EVP_PKEY *pkey, DH *key)
DH *EVP_PKEY_get1_DH(EVP_PKEY *pkey)
{
- if (pkey->type != EVP_PKEY_DH) {
+ if (pkey->type != EVP_PKEY_DH && pkey->type != EVP_PKEY_DHX) {
EVPerr(EVP_F_EVP_PKEY_GET1_DH, EVP_R_EXPECTING_A_DH_KEY);
return NULL;
}
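A short hypothetical usage sketch, not from the patch: with this change the accessor serves X9.42-style (EVP_PKEY_DHX) keys as well as PKCS#3 ones.

    #include <openssl/dh.h>
    #include <openssl/evp.h>

    /* Works for both EVP_PKEY_DH and, after this patch, EVP_PKEY_DHX. */
    static int use_dh_params(EVP_PKEY *pkey)
    {
        DH *dh = EVP_PKEY_get1_DH(pkey);
        if (dh == NULL)
            return 0;
        /* ... inspect dh->p, dh->g, etc. ... */
        DH_free(dh);        /* get1 returned our own reference */
        return 1;
    }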
diff --git a/crypto/evp/pmeth_lib.c b/crypto/evp/pmeth_lib.c
index ae8bccb0c6eb..9f81d10021a0 100644
--- a/crypto/evp/pmeth_lib.c
+++ b/crypto/evp/pmeth_lib.c
@@ -75,6 +75,7 @@ STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL;
extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth;
extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth;
+extern const EVP_PKEY_METHOD dhx_pkey_meth;
static const EVP_PKEY_METHOD *standard_methods[] = {
#ifndef OPENSSL_NO_RSA
@@ -90,7 +91,10 @@ static const EVP_PKEY_METHOD *standard_methods[] = {
&ec_pkey_meth,
#endif
&hmac_pkey_meth,
- &cmac_pkey_meth
+ &cmac_pkey_meth,
+#ifndef OPENSSL_NO_DH
+ &dhx_pkey_meth
+#endif
};
DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *,
diff --git a/crypto/hmac/hm_ameth.c b/crypto/hmac/hm_ameth.c
index 641c797ef1d5..29b2b5dffcf7 100644
--- a/crypto/hmac/hm_ameth.c
+++ b/crypto/hmac/hm_ameth.c
@@ -87,7 +87,7 @@ static int hmac_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
switch (op) {
case ASN1_PKEY_CTRL_DEFAULT_MD_NID:
- *(int *)arg2 = NID_sha1;
+ *(int *)arg2 = NID_sha256;
return 1;
default:
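The effect shows up whenever a caller leaves the digest unspecified for an HMAC key; a sketch under that assumption (hypothetical helper, error handling trimmed):

    #include <openssl/evp.h>

    /* With this change, a NULL digest for an HMAC key now resolves to
     * SHA-256 instead of SHA-1 via ASN1_PKEY_CTRL_DEFAULT_MD_NID. */
    static int hmac_with_default_md(const unsigned char *key, int keylen)
    {
        EVP_MD_CTX mctx;
        EVP_PKEY *pkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, NULL, key, keylen);
        int ok;

        if (pkey == NULL)
            return 0;
        EVP_MD_CTX_init(&mctx);
        ok = EVP_DigestSignInit(&mctx, NULL, NULL, NULL, pkey) > 0;
        /* ... Update/Final as usual; the MAC is now HMAC-SHA256 ... */
        EVP_MD_CTX_cleanup(&mctx);
        EVP_PKEY_free(pkey);
        return ok;
    }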
diff --git a/crypto/hmac/hmac.c b/crypto/hmac/hmac.c
index 33d88be1179b..51a0a3efcd67 100644
--- a/crypto/hmac/hmac.c
+++ b/crypto/hmac/hmac.c
@@ -72,6 +72,16 @@ int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, int len,
unsigned char pad[HMAC_MAX_MD_CBLOCK];
#ifdef OPENSSL_FIPS
+    /* If in FIPS mode, switch to the approved implementation if possible */
+ if (FIPS_mode()) {
+ const EVP_MD *fipsmd;
+ if (md) {
+ fipsmd = FIPS_get_digestbynid(EVP_MD_type(md));
+ if (fipsmd)
+ md = fipsmd;
+ }
+ }
+
if (FIPS_mode()) {
/* If we have an ENGINE need to allow non FIPS */
if ((impl || ctx->i_ctx.engine)
diff --git a/crypto/hmac/hmactest.c b/crypto/hmac/hmactest.c
index 271d0ebf264c..5a573950a453 100644
--- a/crypto/hmac/hmactest.c
+++ b/crypto/hmac/hmactest.c
@@ -195,6 +195,7 @@ int main(int argc, char *argv[])
}
printf("test 4 ok\n");
test5:
+ HMAC_CTX_cleanup(&ctx);
HMAC_CTX_init(&ctx);
if (HMAC_Init_ex(&ctx, test[4].key, test[4].key_len, NULL, NULL)) {
printf("Should fail to initialise HMAC with empty MD (test 5)\n");
@@ -284,6 +285,7 @@ test5:
printf("test 5 ok\n");
}
test6:
+ HMAC_CTX_cleanup(&ctx);
HMAC_CTX_init(&ctx);
if (!HMAC_Init_ex(&ctx, test[7].key, test[7].key_len, EVP_sha1(), NULL)) {
printf("Failed to initialise HMAC (test 6)\n");
@@ -314,6 +316,7 @@ test6:
printf("test 6 ok\n");
}
end:
+ HMAC_CTX_cleanup(&ctx);
EXIT(err);
return (0);
}
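The added calls enforce the rule that every HMAC_CTX_init() is paired with an HMAC_CTX_cleanup() before the context is re-initialised or leaves scope; a minimal sketch of that lifecycle (not from the patch):

    #include <openssl/evp.h>
    #include <openssl/hmac.h>

    /* One complete HMAC computation over a reusable stack context. */
    static int hmac_once(const void *key, int keylen,
                         const unsigned char *data, size_t datalen,
                         unsigned char *md, unsigned int *mdlen)
    {
        HMAC_CTX ctx;
        int ok;

        HMAC_CTX_init(&ctx);
        ok = HMAC_Init_ex(&ctx, key, keylen, EVP_sha1(), NULL)
             && HMAC_Update(&ctx, data, datalen)
             && HMAC_Final(&ctx, md, mdlen);
        HMAC_CTX_cleanup(&ctx);     /* frees md_data before any re-init */
        return ok;
    }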
diff --git a/crypto/jpake/jpake.c b/crypto/jpake/jpake.c
index ed2e888eb4c0..8c38727e20fd 100644
--- a/crypto/jpake/jpake.c
+++ b/crypto/jpake/jpake.c
@@ -4,6 +4,7 @@
#include <openssl/sha.h>
#include <openssl/err.h>
#include <memory.h>
+#include <string.h>
/*
* In the definition, (xa, xb, xc, xd) are Alice's (x1, x2, x3, x4) or
diff --git a/crypto/md32_common.h b/crypto/md32_common.h
index 1823833419c4..96828d2693a1 100644
--- a/crypto/md32_common.h
+++ b/crypto/md32_common.h
@@ -215,12 +215,30 @@
asm ("bswapl %0":"=r"(r):"0"(r)); \
*((unsigned int *)(c))=r; (c)+=4; r; })
# endif
+# elif defined(__aarch64__)
+# if defined(__BYTE_ORDER__)
+# if defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
+# define HOST_c2l(c,l) ({ unsigned int r; \
+ asm ("rev %w0,%w1" \
+ :"=r"(r) \
+ :"r"(*((const unsigned int *)(c))));\
+ (c)+=4; (l)=r; })
+# define HOST_l2c(l,c) ({ unsigned int r; \
+ asm ("rev %w0,%w1" \
+ :"=r"(r) \
+ :"r"((unsigned int)(l)));\
+ *((unsigned int *)(c))=r; (c)+=4; r; })
+# elif defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, (l))
+# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, (l))
+# endif
+# endif
# endif
# endif
-# endif
-# if defined(__s390__) || defined(__s390x__)
-# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, (l))
-# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, (l))
+# if defined(__s390__) || defined(__s390x__)
+# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, (l))
+# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, (l))
+# endif
# endif
# ifndef HOST_c2l
@@ -250,12 +268,12 @@
(c)+=4; (l); })
# endif
# endif
-# endif
-# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
-# ifndef B_ENDIAN
- /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
-# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, l)
-# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, l)
+# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+# ifndef B_ENDIAN
+ /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
+# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, l)
+# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, l)
+# endif
# endif
# endif
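For reference, a plain-C model of what the HOST_c2l/HOST_l2c pairs above compute in the DATA_ORDER_IS_BIG_ENDIAN case that the bswapl/rev variants belong to (a sketch, not part of the header):

    #include <stdint.h>

    /* HOST_c2l: read four bytes as a big-endian 32-bit value, advance c. */
    static uint32_t host_c2l_ref(const unsigned char **c)
    {
        const unsigned char *p = *c;
        uint32_t l = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16)
                   | ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
        *c = p + 4;
        return l;
    }

    /* HOST_l2c: store l as four big-endian bytes, advance c. */
    static void host_l2c_ref(uint32_t l, unsigned char **c)
    {
        unsigned char *p = *c;
        p[0] = (unsigned char)(l >> 24);
        p[1] = (unsigned char)(l >> 16);
        p[2] = (unsigned char)(l >> 8);
        p[3] = (unsigned char)l;
        *c = p + 4;
    }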
diff --git a/crypto/md5/Makefile b/crypto/md5/Makefile
index 0f87dbd54230..f5240da74cd7 100644
--- a/crypto/md5/Makefile
+++ b/crypto/md5/Makefile
@@ -52,6 +52,9 @@ md5-ia64.s: asm/md5-ia64.S
$(CC) $(CFLAGS) -E asm/md5-ia64.S | \
$(PERL) -ne 's/;\s+/;\n/g; print;' > $@
+md5-sparcv9.S: asm/md5-sparcv9.pl
+ $(PERL) asm/md5-sparcv9.pl $@ $(CFLAGS)
+
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/md5/asm/md5-sparcv9.pl b/crypto/md5/asm/md5-sparcv9.pl
new file mode 100755
index 000000000000..407da3c1b0df
--- /dev/null
+++ b/crypto/md5/asm/md5-sparcv9.pl
@@ -0,0 +1,430 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# ====================================================================
+
+# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
+# code generated by Sun C 5.2.
+
+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
+# faster than software. Multi-process benchmark saturates at 12x
+# single-process result on 8-core processor, or ~11GBps per 2.85GHz
+# socket.
+
+$output=shift;
+open STDOUT,">$output";
+
+use integer;
+
+($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
+
+# 64-bit values
+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
+$tx="%g3";
+($AB,$CD)=("%g4","%g5");
+
+# 32-bit values
+@V=($A,$B,$C,$D)=map("%l$_",(0..3));
+($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
+($shr,$shl1,$shl2)=("%i3","%i4","%i5");
+
+my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+ 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+ 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+ 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+ 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+ 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+ 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+ 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+ 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+ 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+ 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+ 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+ 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+ 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+ 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+ 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 );
+
+sub R0 {
+ my ($i,$a,$b,$c,$d) = @_;
+ my $rot = (7,12,17,22)[$i%4];
+ my $j = ($i+1)/2;
+
+ if ($i&1) {
+ $code.=<<___;
+ srlx @X[$j],$shr,@X[$j] ! align X[`$i+1`]
+ and $b,$t1,$t1 ! round $i
+ sllx @X[$j+1],$shl1,$tx
+ add $t2,$a,$a
+ sllx $tx,$shl2,$tx
+ xor $d,$t1,$t1
+ or $tx,@X[$j],@X[$j]
+ sethi %hi(@K[$i+1]),$t2
+ add $t1,$a,$a
+ or $t2,%lo(@K[$i+1]),$t2
+ sll $a,$rot,$t3
+ add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
+ srl $a,32-$rot,$a
+ add $b,$t3,$t3
+ xor $b,$c,$t1
+ add $t3,$a,$a
+___
+ } else {
+ $code.=<<___;
+ srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
+ and $b,$t1,$t1 ! round $i
+ add $t2,$a,$a
+ xor $d,$t1,$t1
+ sethi %hi(@K[$i+1]),$t2
+ add $t1,$a,$a
+ or $t2,%lo(@K[$i+1]),$t2
+ sll $a,$rot,$t3
+ add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
+ srl $a,32-$rot,$a
+ add $b,$t3,$t3
+ xor $b,$c,$t1
+ add $t3,$a,$a
+___
+ }
+}
+
+sub R0_1 {
+ my ($i,$a,$b,$c,$d) = @_;
+ my $rot = (7,12,17,22)[$i%4];
+
+$code.=<<___;
+ srlx @X[0],32,$tx ! extract X[1]
+ and $b,$t1,$t1 ! round $i
+ add $t2,$a,$a
+ xor $d,$t1,$t1
+ sethi %hi(@K[$i+1]),$t2
+ add $t1,$a,$a
+ or $t2,%lo(@K[$i+1]),$t2
+ sll $a,$rot,$t3
+ add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
+ srl $a,32-$rot,$a
+ add $b,$t3,$t3
+ andn $b,$c,$t1
+ add $t3,$a,$a
+___
+}
+
+sub R1 {
+ my ($i,$a,$b,$c,$d) = @_;
+ my $rot = (5,9,14,20)[$i%4];
+ my $j = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
+ my $xi = @X[$j/2];
+
+$code.=<<___ if ($j&1 && ($xi=$tx));
+ srlx @X[$j/2],32,$xi ! extract X[$j]
+___
+$code.=<<___;
+ and $b,$d,$t3 ! round $i
+ add $t2,$a,$a
+ or $t3,$t1,$t1
+ sethi %hi(@K[$i+1]),$t2
+ add $t1,$a,$a
+ or $t2,%lo(@K[$i+1]),$t2
+ sll $a,$rot,$t3
+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
+ srl $a,32-$rot,$a
+ add $b,$t3,$t3
+ `$i<31?"andn":"xor"` $b,$c,$t1
+ add $t3,$a,$a
+___
+}
+
+sub R2 {
+ my ($i,$a,$b,$c,$d) = @_;
+ my $rot = (4,11,16,23)[$i%4];
+ my $j = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
+ my $xi = @X[$j/2];
+
+$code.=<<___ if ($j&1 && ($xi=$tx));
+ srlx @X[$j/2],32,$xi ! extract X[$j]
+___
+$code.=<<___;
+ add $t2,$a,$a ! round $i
+ xor $b,$t1,$t1
+ sethi %hi(@K[$i+1]),$t2
+ add $t1,$a,$a
+ or $t2,%lo(@K[$i+1]),$t2
+ sll $a,$rot,$t3
+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
+ srl $a,32-$rot,$a
+ add $b,$t3,$t3
+ xor $b,$c,$t1
+ add $t3,$a,$a
+___
+}
+
+sub R3 {
+ my ($i,$a,$b,$c,$d) = @_;
+ my $rot = (6,10,15,21)[$i%4];
+ my $j = (0+7*($i+1))%16;
+ my $xi = @X[$j/2];
+
+$code.=<<___;
+ add $t2,$a,$a ! round $i
+___
+$code.=<<___ if ($j&1 && ($xi=$tx));
+ srlx @X[$j/2],32,$xi ! extract X[$j]
+___
+$code.=<<___;
+ orn $b,$d,$t1
+ sethi %hi(@K[$i+1]),$t2
+ xor $c,$t1,$t1
+ or $t2,%lo(@K[$i+1]),$t2
+ add $t1,$a,$a
+ sll $a,$rot,$t3
+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
+ srl $a,32-$rot,$a
+ add $b,$t3,$t3
+ add $t3,$a,$a
+___
+}
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.section ".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl md5_block_asm_data_order
+.align 32
+md5_block_asm_data_order:
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
+
+ andcc %g1, CFR_MD5, %g0
+ be .Lsoftware
+ nop
+
+ mov 4, %g1
+ andcc %o1, 0x7, %g0
+ lda [%o0 + %g0]0x88, %f0 ! load context
+ lda [%o0 + %g1]0x88, %f1
+ add %o0, 8, %o0
+ lda [%o0 + %g0]0x88, %f2
+ lda [%o0 + %g1]0x88, %f3
+ bne,pn %icc, .Lhwunaligned
+ sub %o0, 8, %o0
+
+.Lhw_loop:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x38], %f22
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ .word 0x81b02800 ! MD5
+
+ bne,pt SIZE_T_CC, .Lhw_loop
+ nop
+
+.Lhwfinish:
+ sta %f0, [%o0 + %g0]0x88 ! store context
+ sta %f1, [%o0 + %g1]0x88
+ add %o0, 8, %o0
+ sta %f2, [%o0 + %g0]0x88
+ sta %f3, [%o0 + %g1]0x88
+ retl
+ nop
+
+.align 8
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x40], %f26
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ .word 0x81b02800 ! MD5
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f26, %f26, %f10 ! %f10=%f26
+
+ ba .Lhwfinish
+ nop
+
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME,%sp
+
+ rd %asi,$saved_asi
+ wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
+ and $inp,7,$shr
+ andn $inp,7,$inp
+
+ sll $shr,3,$shr ! *=8
+ mov 56,$shl2
+ ld [$ctx+0],$A
+ sub $shl2,$shr,$shl2
+ ld [$ctx+4],$B
+ and $shl2,32,$shl1
+ add $shl2,8,$shl2
+ ld [$ctx+8],$C
+ sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64
+ ld [$ctx+12],$D
+ nop
+
+.Loop:
+ cmp $shr,0 ! was inp aligned?
+ ldxa [$inp+0]%asi,@X[0] ! load little-endian input
+ ldxa [$inp+8]%asi,@X[1]
+ ldxa [$inp+16]%asi,@X[2]
+ ldxa [$inp+24]%asi,@X[3]
+ ldxa [$inp+32]%asi,@X[4]
+ sllx $A,32,$AB ! pack A,B
+ ldxa [$inp+40]%asi,@X[5]
+ sllx $C,32,$CD ! pack C,D
+ ldxa [$inp+48]%asi,@X[6]
+ or $B,$AB,$AB
+ ldxa [$inp+56]%asi,@X[7]
+ or $D,$CD,$CD
+ bnz,a,pn %icc,.+8
+ ldxa [$inp+64]%asi,@X[8]
+
+ srlx @X[0],$shr,@X[0] ! align X[0]
+ sllx @X[1],$shl1,$tx
+ sethi %hi(@K[0]),$t2
+ sllx $tx,$shl2,$tx
+ or $t2,%lo(@K[0]),$t2
+ or $tx,@X[0],@X[0]
+ xor $C,$D,$t1
+ add @X[0],$t2,$t2 ! X[0]+K[0]
+___
+ for ($i=0;$i<15;$i++) { &R0($i,@V); unshift(@V,pop(@V)); }
+ for (;$i<16;$i++) { &R0_1($i,@V); unshift(@V,pop(@V)); }
+ for (;$i<32;$i++) { &R1($i,@V); unshift(@V,pop(@V)); }
+ for (;$i<48;$i++) { &R2($i,@V); unshift(@V,pop(@V)); }
+ for (;$i<64;$i++) { &R3($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate
+ add $inp,64,$inp ! advance inp
+ srlx $CD,32,$t2
+ add $t1,$A,$A
+ subcc $len,1,$len ! done yet?
+ add $AB,$B,$B
+ add $t2,$C,$C
+ add $CD,$D,$D
+ srl $B,0,$B ! clruw $B
+ bne SIZE_T_CC,.Loop
+ srl $D,0,$D ! clruw $D
+
+ st $A,[$ctx+0] ! write out ctx
+ st $B,[$ctx+4]
+ st $C,[$ctx+8]
+ st $D,[$ctx+12]
+
+ wr %g0,$saved_asi,%asi
+ ret
+ restore
+.type md5_block_asm_data_order,#function
+.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
+
+.asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+# The purpose of these subroutines is to explicitly encode VIS
+# instructions, so that the module can be compiled without specifying
+# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
+# -xarch=v9a. The idea is to keep open the option of producing a
+# "universal" binary in which the programmer detects at run-time
+# whether the current CPU is VIS-capable.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = ( "faligndata" => 0x048,
+ "for" => 0x07c );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+sub unalignaddr {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+ foreach ($rs1,$rs2,$rd) {
+ if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
+ else { return $ref; }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00300|$rd<<25|$rs1<<14|$rs2,
+ $ref;
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /ge;
+ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unalignaddr($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
+close STDOUT;
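For orientation, the R0..R3 generators above schedule the four MD5 round operations of RFC 1321; note that R0 computes F through the equivalent form d ^ (b & (c ^ d)). A compact C reference for one step (a sketch, not part of the module):

    #include <stdint.h>

    #define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* Round functions: F for rounds 0-15, G 16-31, H 32-47, I 48-63. */
    #define F(b,c,d) ((d) ^ ((b) & ((c) ^ (d))))   /* == (b&c)|(~b&d) */
    #define G(b,c,d) (((b) & (d)) | ((c) & ~(d)))
    #define H(b,c,d) ((b) ^ (c) ^ (d))
    #define I(b,c,d) ((c) ^ ((b) | ~(d)))

    /* One MD5 step: a = b + ((a + f(b,c,d) + X[k] + K[i]) <<< s). */
    static uint32_t md5_step(uint32_t f, uint32_t a, uint32_t b,
                             uint32_t xk, uint32_t ki, int s)
    {
        return b + ROTL32(a + f + xk + ki, s);
    }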
diff --git a/crypto/md5/md5_locl.h b/crypto/md5/md5_locl.h
index 5f6f2fdedd78..82e69218dae1 100644
--- a/crypto/md5/md5_locl.h
+++ b/crypto/md5/md5_locl.h
@@ -71,6 +71,8 @@
# define md5_block_data_order md5_block_asm_data_order
# elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
# define md5_block_data_order md5_block_asm_data_order
+# elif defined(__sparc) || defined(__sparc__)
+# define md5_block_data_order md5_block_asm_data_order
# endif
#endif
diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
index e278fa6a2528..a7863d98be2f 100644
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@@ -22,9 +22,9 @@ APPS=
LIB=$(TOP)/libcrypto.a
LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
- ccm128.c xts128.c
+ ccm128.c xts128.c wrap128.c
LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
- ccm128.o xts128.o $(MODES_ASM_OBJ)
+ ccm128.o xts128.o wrap128.o $(MODES_ASM_OBJ)
SRC= $(LIBSRC)
@@ -50,20 +50,26 @@ ghash-x86.s: asm/ghash-x86.pl
$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
ghash-x86_64.s: asm/ghash-x86_64.pl
$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-gcm-x86_64.s: asm/aesni-gcm-x86_64.pl
+ $(PERL) asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) > $@
ghash-sparcv9.s: asm/ghash-sparcv9.pl
$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
ghash-alpha.s: asm/ghash-alpha.pl
- (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
+ (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
$(PERL) asm/ghash-alpha.pl > $$preproc && \
- $(CC) -E $$preproc > $@ && rm $$preproc)
-
+ $(CC) -E -P $$preproc > $@ && rm $$preproc)
ghash-parisc.s: asm/ghash-parisc.pl
$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+ghashv8-armx.S: asm/ghashv8-armx.pl
+ $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
+ghashp8-ppc.s: asm/ghashp8-ppc.pl
+ $(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
# GNU make "catch all"
ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
ghash-armv4.o: ghash-armv4.S
+ghashv8-armx.o: ghashv8-armx.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
@@ -139,6 +145,14 @@ ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
+wrap128.o: ../../e_os.h ../../include/openssl/bio.h
+wrap128.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+wrap128.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+wrap128.o: ../../include/openssl/lhash.h ../../include/openssl/modes.h
+wrap128.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
+wrap128.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
+wrap128.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+wrap128.o: ../cryptlib.h wrap128.c
xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100755
index 000000000000..7e4e04ea2530
--- /dev/null
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# The OpenSSL GCM implementation is organized so that its performance
+# is rather close to the sum of its streamed components, in this
+# context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Previously no stitched implementation had
+# been observed to perform significantly better than that sum on
+# contemporary CPUs, so the effort was hard to justify. This module is
+# based on a combination of Intel submissions, [1] and [2], with a
+# MOVBE twist suggested by Ilya Albrekht and Max Locktyukhin of Intel
+# Corp., who verified that it reduces shuffle pressure with notable
+# relative improvement, achieving 1.0 cycle per byte processed with a
+# 128-bit key on Haswell and 0.74 on Broadwell. [Mentioned results are
+# raw profiled measurements for a favourable packet size, one
+# divisible by 96. Applications using the EVP interface will observe a
+# few percent worse performance.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_ghash_6x:
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ sub \$6,$len
+ vpxor $Z0,$Z0,$Z0 # $Z0 = 0
+ vmovdqu 0x00-0x80($key),$rndkey
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpaddb $T2,$inout2,$inout3
+ vpaddb $T2,$inout3,$inout4
+ vpaddb $T2,$inout4,$inout5
+ vpxor $rndkey,$T1,$inout0
+ vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32 # discard $inout[1-5]?
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddb $T2,$inout5,$T1 # next counter value
+ vpxor $rndkey,$inout1,$inout1
+ vpxor $rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+ vmovdqu $T1,($ivp) # save next counter value
+ vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
+ vpxor $rndkey,$inout3,$inout3
+ vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
+ vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
+ xor %r12,%r12
+ cmp $in0,$end0
+
+ vaesenc $T2,$inout0,$inout0
+ vmovdqu 0x30+8(%rsp),$Ii # I[4]
+ vpxor $rndkey,$inout4,$inout4
+ vpclmulqdq \$0x00,$Hkey,$Z3,$T1
+ vaesenc $T2,$inout1,$inout1
+ vpxor $rndkey,$inout5,$inout5
+ setnc %r12b
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vaesenc $T2,$inout2,$inout2
+ vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
+ neg %r12
+ vaesenc $T2,$inout3,$inout3
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+ vaesenc $T2,$inout4,$inout4
+ vpxor $Z1,$T1,$Z0
+ and \$0x60,%r12
+ vmovups 0x20-0x80($key),$rndkey
+ vpclmulqdq \$0x10,$Hkey,$Ii,$T1
+ vaesenc $T2,$inout5,$inout5
+
+ vpclmulqdq \$0x01,$Hkey,$Ii,$T2
+ lea ($in0,%r12),$in0
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
+ vmovdqu 0x40+8(%rsp),$Ii # I[3]
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x58($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x50($in0),%r12
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x20+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x28+8(%rsp)
+ vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x30-0x80($key),$rndkey
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Z1,$Ii,$T1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x10,$Z1,$Ii,$T2
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Hkey,$Z3,$Z3
+ vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
+ vaesenc $rndkey,$inout2,$inout2
+ vpclmulqdq \$0x11,$Z1,$Ii,$Z1
+ vmovdqu 0x50+8(%rsp),$Ii # I[2]
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor $T1,$Z0,$Z0
+ vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x40-0x80($key),$rndkey
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x00,$T1,$Ii,$T2
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x10,$T1,$Ii,$Hkey
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x48($in0),%r13
+ vpxor $Z1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T1,$Ii,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x40($in0),%r12
+ vpclmulqdq \$0x11,$T1,$Ii,$T1
+ vmovdqu 0x60+8(%rsp),$Ii # I[1]
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x30+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x38+8(%rsp)
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x50-0x80($key),$rndkey
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x00,$T2,$Ii,$Hkey
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$T2,$Ii,$Z1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x38($in0),%r13
+ vpxor $T1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T2,$Ii,$T1
+ vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x30($in0),%r12
+ vpclmulqdq \$0x11,$T2,$Ii,$T2
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x40+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x48+8(%rsp)
+ vpxor $Hkey,$Z0,$Z0
+ vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x60-0x80($key),$rndkey
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x01,$Hkey,$Xi,$T1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x28($in0),%r13
+ vpxor $T2,$Z3,$Z3
+ vpclmulqdq \$0x00,$Hkey,$Xi,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x20($in0),%r12
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x50+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x58+8(%rsp)
+ vpxor $Z1,$Z2,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor $T1,$Z2,$Z2
+
+ vmovups 0x70-0x80($key),$rndkey
+ vpslldq \$8,$Z2,$Z1
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Xi,$Z3,$Z3
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Z1,$Z0,$Z0
+ movbe 0x18($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x10($in0),%r12
+ vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ mov %r13,0x60+8(%rsp)
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r12,0x68+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vmovups 0x90-0x80($key),$rndkey
+ vaesenc $T1,$inout1,$inout1
+ vpsrldq \$8,$Z2,$Z2
+ vaesenc $T1,$inout2,$inout2
+ vpxor $Z2,$Z3,$Z3
+ vaesenc $T1,$inout3,$inout3
+ vpxor $Ii,$Z0,$Z0
+ movbe 0x08($in0),%r13
+ vaesenc $T1,$inout4,$inout4
+ movbe 0x00($in0),%r12
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xa0-0x80($key),$T1
+ cmp \$11,$rounds
+ jb .Lenc_tail # 128-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xb0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xc0-0x80($key),$T1
+ je .Lenc_tail # 192-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xd0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xe0-0x80($key),$T1
+ jmp .Lenc_tail # 256-bit key
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $rndkey,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $rndkey,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpshufb $Ii,$inout5,$inout5
+ vpshufb $Ii,$T1,$T1 # next counter value
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc $rndkey,$inout0,$inout0
+ vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
+ vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
+ vaesenc $rndkey,$inout1,$inout1
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ vpxor 0x00($inp),$T1,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x10($inp),$T1,$Ii
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x20($inp),$T1,$Z1
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x30($inp),$T1,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x40($inp),$T1,$Z3
+ vpxor 0x50($inp),$T1,$Hkey
+ vmovdqu ($ivp),$T1 # load next counter value
+
+ vaesenclast $T2,$inout0,$inout0
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ vaesenclast $Ii,$inout1,$inout1
+ vpaddb $T2,$T1,$Ii
+ mov %r13,0x70+8(%rsp)
+ lea 0x60($inp),$inp
+ vaesenclast $Z1,$inout2,$inout2
+ vpaddb $T2,$Ii,$Z1
+ mov %r12,0x78+8(%rsp)
+ lea 0x60($out),$out
+ vmovdqu 0x00-0x80($key),$rndkey
+ vaesenclast $Z2,$inout3,$inout3
+ vpaddb $T2,$Z1,$Z2
+ vaesenclast $Z3, $inout4,$inout4
+ vpaddb $T2,$Z2,$Z3
+ vaesenclast $Hkey,$inout5,$inout5
+ vpaddb $T2,$Z3,$Hkey
+
+ add \$0x60,$ret
+ sub \$0x6,$len
+ jc .L6x_done
+
+ vmovups $inout0,-0x60($out) # save output
+ vpxor $rndkey,$T1,$inout0
+ vmovups $inout1,-0x50($out)
+ vmovdqa $Ii,$inout1 # 0 latency
+ vmovups $inout2,-0x40($out)
+ vmovdqa $Z1,$inout2 # 0 latency
+ vmovups $inout3,-0x30($out)
+ vmovdqa $Z2,$inout3 # 0 latency
+ vmovups $inout4,-0x20($out)
+ vmovdqa $Z3,$inout4 # 0 latency
+ vmovups $inout5,-0x10($out)
+ vmovdqa $Hkey,$inout5 # 0 latency
+ vmovdqu 0x20+8(%rsp),$Z3 # I[5]
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+
+ ret
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+# const AES_KEY *key, unsigned char iv[16],
+# struct { u128 Xi,H,Htbl[9]; } *Xip);
+$code.=<<___;
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@function,6
+.align 32
+aesni_gcm_decrypt:
+ xor $ret,$ret
+ cmp \$0x60,$len # minimal accepted length
+ jb .Lgcm_dec_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ vmovdqu ($Xip),$Xi # load Xi
+ and \$-128,%rsp # ensure stack alignment
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ lea 0x80($key),$key # size optimization
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ mov 0xf0-0x80($key),$rounds
+ vpshufb $Ii,$Xi,$Xi
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Ldec_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Ldec_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+ vmovdqu 0x50($inp),$Z3 # I[5]
+ lea ($inp),$in0
+ vmovdqu 0x40($inp),$Z0
+ lea -0xc0($inp,$len),$end0
+ vmovdqu 0x30($inp),$Z1
+ shr \$4,$len
+ xor $ret,$ret
+ vmovdqu 0x20($inp),$Z2
+ vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu 0x10($inp),$T2
+ vpshufb $Ii,$Z0,$Z0
+ vmovdqu ($inp),$Hkey
+ vpshufb $Ii,$Z1,$Z1
+ vmovdqu $Z0,0x30(%rsp)
+ vpshufb $Ii,$Z2,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$T2,$T2
+ vmovdqu $Z2,0x50(%rsp)
+ vpshufb $Ii,$Hkey,$Hkey
+ vmovdqu $T2,0x60(%rsp)
+ vmovdqu $Hkey,0x70(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups $inout0,-0x60($out) # save output
+ vmovups $inout1,-0x50($out)
+ vmovups $inout2,-0x40($out)
+ vmovups $inout3,-0x30($out)
+ vmovups $inout4,-0x20($out)
+ vmovups $inout5,-0x10($out)
+
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_dec_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+
+$code.=<<___;
+.type _aesni_ctr32_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ lea -1($rounds),%r13
+ vmovups 0x10-0x80($key),$rndkey
+ lea 0x20-0x80($key),%r12
+ vpxor $Z0,$T1,$inout0
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32_2
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddb $T2,$inout2,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddb $T2,$inout3,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpaddb $T2,$inout4,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpaddb $T2,$inout5,$T1
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+ vmovups (%r12),$rndkey
+ lea 0x10(%r12),%r12
+ dec %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),$Hkey # last round key
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 0x00($inp),$Hkey,$Z0
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor 0x10($inp),$Hkey,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x20($inp),$Hkey,$Z2
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x30($inp),$Hkey,$Xi
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x40($inp),$Hkey,$T2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x50($inp),$Hkey,$Hkey
+ lea 0x60($inp),$inp
+
+ vaesenclast $Z0,$inout0,$inout0
+ vaesenclast $Z1,$inout1,$inout1
+ vaesenclast $Z2,$inout2,$inout2
+ vaesenclast $Xi,$inout3,$inout3
+ vaesenclast $T2,$inout4,$inout4
+ vaesenclast $Hkey,$inout5,$inout5
+ vmovups $inout0,0x00($out)
+ vmovups $inout1,0x10($out)
+ vmovups $inout2,0x20($out)
+ vmovups $inout3,0x30($out)
+ vmovups $inout4,0x40($out)
+ vmovups $inout5,0x50($out)
+ lea 0x60($out),$out
+
+ ret
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpshufb $Ii,$inout5,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpshufb $Ii,$T1,$T1 # next counter value
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@function,6
+.align 32
+aesni_gcm_encrypt:
+ xor $ret,$ret
+ cmp \$0x60*3,$len # minimal accepted length
+ jb .Lgcm_enc_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ lea 0x80($key),$key # size optimization
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ and \$-128,%rsp # ensure stack alignment
+ mov 0xf0-0x80($key),$rounds
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Lenc_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Lenc_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+ lea ($out),$in0
+ lea -0xc0($out,$len),$end0
+ shr \$4,$len
+
+ call _aesni_ctr32_6x
+ vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
+ vpshufb $Ii,$inout1,$T2
+ vmovdqu $Xi,0x70(%rsp)
+ vpshufb $Ii,$inout2,$Z0
+ vmovdqu $T2,0x60(%rsp)
+ vpshufb $Ii,$inout3,$Z1
+ vmovdqu $Z0,0x50(%rsp)
+ vpshufb $Ii,$inout4,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu $Z2,0x30(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu ($Xip),$Xi # load Xi
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ sub \$12,$len
+ mov \$0x60*2,$ret
+ vpshufb $Ii,$Xi,$Xi
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 0x20(%rsp),$Z3 # I[5]
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $Z3,$Z3,$T1
+ vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
+ vmovups $inout0,-0x60($out) # save output
+ vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
+ vpxor $Z3,$T1,$T1
+ vmovups $inout1,-0x50($out)
+ vpshufb $Ii,$inout1,$inout1
+ vmovups $inout2,-0x40($out)
+ vpshufb $Ii,$inout2,$inout2
+ vmovups $inout3,-0x30($out)
+ vpshufb $Ii,$inout3,$inout3
+ vmovups $inout4,-0x20($out)
+ vpshufb $Ii,$inout4,$inout4
+ vmovups $inout5,-0x10($out)
+ vpshufb $Ii,$inout5,$inout5
+ vmovdqu $inout0,0x10(%rsp) # free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+ vmovdqu 0x30(%rsp),$Z2 # I[4]
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpunpckhqdq $Z2,$Z2,$T2
+ vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
+ vpxor $Z2,$T2,$T2
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+
+ vmovdqu 0x40(%rsp),$T3 # I[3]
+ vpclmulqdq \$0x00,$Ii,$Z2,$Z0
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $T3,$T3,$Z1
+ vpclmulqdq \$0x11,$Ii,$Z2,$Z2
+ vpxor $T3,$Z1,$Z1
+ vpxor $Z3,$Z2,$Z2
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vmovdqu 0x50(%rsp),$T1 # I[2]
+ vpclmulqdq \$0x00,$Hkey,$T3,$Z3
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z0,$Z3,$Z3
+ vpunpckhqdq $T1,$T1,$Z0
+ vpclmulqdq \$0x11,$Hkey,$T3,$T3
+ vpxor $T1,$Z0,$Z0
+ vpxor $Z2,$T3,$T3
+ vpclmulqdq \$0x00,$HK,$Z1,$Z1
+ vpxor $T2,$Z1,$Z1
+
+ vmovdqu 0x60(%rsp),$T2 # I[1]
+ vpclmulqdq \$0x00,$Ii,$T1,$Z2
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z3,$Z2,$Z2
+ vpunpckhqdq $T2,$T2,$Z3
+ vpclmulqdq \$0x11,$Ii,$T1,$T1
+ vpxor $T2,$Z3,$Z3
+ vpxor $T3,$T1,$T1
+ vpclmulqdq \$0x10,$HK,$Z0,$Z0
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $Z1,$Z0,$Z0
+
+ vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
+ vpclmulqdq \$0x00,$Hkey,$T2,$Z1
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpxor $Z2,$Z1,$Z1
+ vpclmulqdq \$0x11,$Hkey,$T2,$T2
+ vpxor $Xi,$T3,$T3
+ vpxor $T1,$T2,$T2
+ vpclmulqdq \$0x00,$HK,$Z3,$Z3
+ vpxor $Z0,$Z3,$Z0
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $inout5,$inout5,$T1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Xi
+ vpxor $inout5,$T1,$T1
+ vpxor $Z1,$Z2,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$T3
+ vmovdqu 0x20-0x20($Xip),$HK
+ vpxor $T2,$Xi,$Z3
+ vpxor $Z0,$T3,$Z2
+
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
+ vpxor $T3,$Z2,$Z2
+ vpunpckhqdq $inout4,$inout4,$T2
+ vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
+ vpxor $inout4,$T2,$T2
+ vpslldq \$8,$Z2,$T3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+ vpxor $T3,$Z1,$Xi
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $Z2,$Z3,$Z3
+
+ vpclmulqdq \$0x00,$Ii,$inout4,$Z1
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout3,$inout3,$T3
+ vpclmulqdq \$0x11,$Ii,$inout4,$inout4
+ vpxor $inout3,$T3,$T3
+ vpxor $inout5,$inout4,$inout4
+ vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $inout2,$inout2,$T1
+ vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
+ vpxor $inout2,$T1,$T1
+ vpxor $inout4,$inout3,$inout3
+ vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
+ vpclmulqdq \$0x00,$HK,$T3,$T3
+ vpxor $T2,$T3,$T3
+
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Ii,$inout2,$Z1
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout1,$inout1,$T2
+ vpclmulqdq \$0x11,$Ii,$inout2,$inout2
+ vpxor $inout1,$T2,$T2
+ vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
+ vpxor $inout3,$inout2,$inout2
+ vpclmulqdq \$0x10,$HK,$T1,$T1
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $T3,$T1,$T1
+
+ vxorps $Z3,$inout5,$inout5
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
+ vpxor $Xi,$T3,$T3
+ vpxor $inout2,$inout1,$inout1
+ vpclmulqdq \$0x00,$HK,$T2,$T2
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Z3
+ vpxor $Z0,$Z1,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$Z2
+ vpxor $inout1,$Z3,$Z3
+ vpxor $T2,$Z2,$Z2
+
+ vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
+ vpxor $Z0,$Z2,$Z2
+ vpslldq \$8,$Z2,$T1
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $T1,$Z1,$Xi
+ vpxor $Z2,$Z3,$Z3
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $Z3,$T2,$T2
+ vpxor $T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_enc_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+ .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type gcm_se_handler,\@abi-omnipotent
+.align 16
+gcm_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 120($context),%rax # pull context->Rax
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size gcm_se_handler,.-gcm_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_gcm_decrypt
+ .rva .LSEH_end_aesni_gcm_decrypt
+ .rva .LSEH_gcm_dec_info
+
+ .rva .LSEH_begin_aesni_gcm_encrypt
+ .rva .LSEH_end_aesni_gcm_encrypt
+ .rva .LSEH_gcm_enc_info
+.section .xdata
+.align 8
+.LSEH_gcm_dec_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___; # assembler is too old
+.text
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
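A hypothetical caller sketch, an assumption about the integration rather than code from this patch: both entry points return the number of bytes they processed (a multiple of 96), refusing inputs below their minimum (0x60 bytes for decrypt, 0x60*3 for encrypt), and the generic CTR+GHASH path finishes the remainder.

    #include <stddef.h>
    #include <openssl/aes.h>

    /* Prototype as documented in the module above; Xip points at the
     * Xi/H/Htbl block inside the GCM context (declared void * here
     * because the struct layout is internal). */
    size_t aesni_gcm_encrypt(const void *inp, void *out, size_t len,
                             const AES_KEY *key, unsigned char ivec[16],
                             void *Xip);

    static size_t gcm_encrypt_bulk(const unsigned char *in, unsigned char *out,
                                   size_t len, const AES_KEY *key,
                                   unsigned char ivec[16], void *Xip)
    {
        /* Stitched fast path; returns 0 when len < 288 or AVX2 is absent
         * (the fallback stub above always returns 0). */
        size_t done = aesni_gcm_encrypt(in, out, len, key, ivec, Xip);
        /* ... process the remaining len - done bytes with the generic
         *     ctr128/ghash code ... */
        return done;
    }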
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index d91586ee2925..77fbf34465db 100755
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -35,6 +35,20 @@
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to the multiplication algorithm suggested in the paper
+# referenced below and combine it with the reduction algorithm from
+# the x86 module. The performance improvement over the previous
+# version varies from 65% on Snapdragon S4 to 110% on Cortex A9. In
+# absolute terms Cortex A8 processes one byte in 8.45 cycles, A9 in
+# 10.2, and Snapdragon S4 in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
@@ -303,117 +317,161 @@ $code.=<<___;
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
-my $cnt=$Htbl; # $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+___
+}
$code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
.fpu neon
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 $IN#hi,[r1,:64]! @ load H
+ vmov.i8 $t0,#0xe1
+ vld1.64 $IN#lo,[r1,:64]
+ vshl.i64 $t0#hi,#57
+ vshr.u64 $t0#lo,#63 @ t0=0xc2....01
+ vdup.8 $t1,$IN#hi[7]
+ vshr.u64 $Hlo,$IN#lo,#63
+ vshr.s8 $t1,#7 @ broadcast carry bit
+ vshl.i64 $IN,$IN,#1
+ vand $t0,$t0,$t1
+ vorr $IN#hi,$Hlo @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vstmia r0,{$IN}
+
+ ret @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
- sub $Htbl,#16 @ point at H in GCM128_CTX
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Htbl,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
+ vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $IN#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Qpost,$Qpost
- veor $R,$R
- mov $cnt,#16
- veor $Z,$Z
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
- veor $Zo,$Zo
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- b .Linner_neon
+ b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Xi,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
- nop
+ vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
-.Louter_neon:
- vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
- veor $Qpost,$Qpost
- vld1.64 `&Dlo($IN)`,[$inp]!
- veor $R,$R
- mov $cnt,#16
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 $IN#hi,[$inp]! @ load inp
+ vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Zo,$Zo
- veor $IN,$Z @ inp^=Xi
- veor $Z,$Z
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
-.Linner_neon:
- subs $cnt,$cnt,#1
- vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
- vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
- vext.8 $IN,$zero,#1 @ IN>>=8
-
- veor $Z,$Qpost @ modulo-scheduled part
- vshl.i64 `&Dlo("$R")`,#48
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
- veor `&Dhi("$Z")`,`&Dlo("$R")`
- vuzp.8 $Qlo,$Qhi
- vsli.8 $Zo,$T,#1 @ compose the "carry" byte
- vext.8 $Z,$zero,#1 @ Z>>=8
-
- vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
- vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
- vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
- veor $Z,$Qhi
- bne .Linner_neon
-
- veor $Z,$Qpost @ modulo-scheduled artefact
- vshl.i64 `&Dlo("$R")`,#48
- veor `&Dhi("$Z")`,`&Dlo("$R")`
-
- @ finalization, normalize Z:Zo
- vand $Zo,$mod @ suffices to mask the bit
- vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
- vshl.i64 $Z,#1
+ veor $IN,$Xl @ inp^=Xi
+.Lgmult_neon:
+___
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
+$code.=<<___;
+ veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
+___
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
+$code.=<<___;
+ veor $Xm,$Xm,$Xl @ Karatsuba post-processing
+ veor $Xm,$Xm,$Xh
+ veor $Xl#hi,$Xl#hi,$Xm#lo
+ veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 $t1,$Xl,#57 @ 1st phase
+ vshl.i64 $t2,$Xl,#62
+ veor $t2,$t2,$t1 @
+ vshl.i64 $t1,$Xl,#63
+ veor $t2, $t2, $t1 @
+ veor $Xl#hi,$Xl#hi,$t2#lo @
+ veor $Xh#lo,$Xh#lo,$t2#hi
+
+ vshr.u64 $t2,$Xl,#1 @ 2nd phase
+ veor $Xh,$Xh,$Xl
+ veor $Xl,$Xl,$t2 @
+ vshr.u64 $t2,$t2,#6
+ vshr.u64 $Xl,$Xl,#1 @
+ veor $Xl,$Xl,$Xh @
+ veor $Xl,$Xl,$t2 @
+
subs $len,#16
- vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
- bne .Louter_neon
+ bne .Loop_neon
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+ vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi,:64]
- bx lr
+ ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
@@ -423,7 +481,13 @@ $code.=<<___;
.align 2
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
close STDOUT; # enforce flush
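A minimal standalone sketch of the q<N>#lo/#hi rewriting rule applied in the loop above (illustration only; the sample instruction strings are made up): each 128-bit NEON q-register aliases two 64-bit d-registers, with q<N>#lo mapping to d<2N> and q<N>#hi to d<2N+1>.

    #!/usr/bin/env perl
    # demonstrate the register-alias substitution used by the output loop
    foreach my $line ("vld1.64 q5#hi,[r1,:64]!", "veor q0#lo,q0#lo,q0#hi") {
        (my $out = $line) =~
            s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge;
        print "$line\t-> $out\n";   # q5#hi -> d11, q0#lo -> d0, q0#hi -> d1
    }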
diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl
index 6a40d5d89c0c..39096b423ad8 100755
--- a/crypto/modes/asm/ghash-s390x.pl
+++ b/crypto/modes/asm/ghash-s390x.pl
@@ -186,13 +186,13 @@ $code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
- sllg $tmp,$Zhi,60
- xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
@@ -213,9 +213,9 @@ $code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
- sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl
index 70e7b044a3ec..0365e0f1ff42 100755
--- a/crypto/modes/asm/ghash-sparcv9.pl
+++ b/crypto/modes/asm/ghash-sparcv9.pl
@@ -36,6 +36,15 @@
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+#
+# October 2012
+#
+# Add a VIS3 lookup-table-free implementation using the polynomial
+# multiplication xmulx[hi] and extended addition addxc[cc]
+# instructions: a 4.52/7.63x improvement on T3/T4, or in absolute
+# terms 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
+# saturates at ~15.5x the single-process result on an 8-core
+# processor, or ~20.5GBps per 2.85GHz socket.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
@@ -66,6 +75,10 @@ $Htbl="%i1";
$inp="%i2";
$len="%i3";
+$code.=<<___ if ($bits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+___
$code.=<<___;
.section ".text",#alloc,#execinstr
@@ -321,10 +334,238 @@ gcm_gmult_4bit:
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
-.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{{{
+# Straightforward 128x128-bit multiplication using the Karatsuba
+# algorithm followed by a pair of 64-bit reductions [with a shortcut
+# in the first one, which allows us to break the dependency between
+# the reductions and remove one multiplication from the critical
+# path]. While it might be suboptimal with regard to the sheer number
+# of multiplications, other methods [such as aggregate reduction]
+# would require more 64-bit registers, which we don't have in a
+# 32-bit application context.
+
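As a plain-Perl reference model of the decomposition just described (an illustrative sketch, not part of the patch: Math::BigInt stands in for the xmulx/xmulxhi hardware, and the helper names are made up), three 64x64-bit carry-less multiplications yield the 128x128-bit product:

    use strict; use warnings;
    use Math::BigInt;

    sub clmul {                     # carry-less ("XOR") multiply
        my ($a, $b) = @_;
        my ($r, $t, $i) = (Math::BigInt->bzero, $b->copy, 0);
        while (!$t->is_zero) {
            $r->bxor($a->copy->blsft($i)) if $t->is_odd;
            $t->brsft(1); $i++;
        }
        return $r;
    }

    sub clmul128_karatsuba {        # three 64x64 multiplies, not four
        my ($ahi, $alo, $bhi, $blo) = map { Math::BigInt->new($_) } @_;
        my $lo  = clmul($alo, $blo);                     # A.lo*B.lo
        my $hi  = clmul($ahi, $bhi);                     # A.hi*B.hi
        my $mid = clmul($alo->copy->bxor($ahi),          # (A.lo^A.hi)*
                        $blo->copy->bxor($bhi));         # (B.lo^B.hi)
        $mid->bxor($lo)->bxor($hi);                      # post-processing
        return $lo->bxor($mid->blsft(64))->bxor($hi->blsft(128));
    }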
+($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
+
+($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
+ (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
+
+($shl,$shr)=map("%l$_",(0..7));
+
+# For details regarding "twisted H" see ghash-x86.pl.
+$code.=<<___;
+.globl gcm_init_vis3
+.align 32
+gcm_init_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [%i1+0],$Hhi
+ ldx [%i1+8],$Hlo
+ mov 0xE1,$Xhi
+ mov 1,$Xlo
+ sllx $Xhi,57,$Xhi
+ srax $Hhi,63,$C0 ! broadcast carry
+ addcc $Hlo,$Hlo,$Hlo ! H<<=1
+ addxc $Hhi,$Hhi,$Hhi
+ and $C0,$Xlo,$Xlo
+ and $C0,$Xhi,$Xhi
+ xor $Xlo,$Hlo,$Hlo
+ xor $Xhi,$Hhi,$Hhi
+ stx $Hlo,[%i0+8] ! save twisted H
+ stx $Hhi,[%i0+0]
+
+ sethi %hi(0xA0406080),$V
+ sethi %hi(0x20C0E000),%l0
+ or $V,%lo(0xA0406080),$V
+ or %l0,%lo(0x20C0E000),%l0
+ sllx $V,32,$V
+ or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
+ stx $V,[%i0+16]
+
+ ret
+ restore
+.type gcm_init_vis3,#function
+.size gcm_init_vis3,.-gcm_init_vis3
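The packed constant saved above holds, as byte i (counting from the least significant end), the low byte of the carry-less product i·0xE0; srlx then turns a 3-bit index into a table lookup. A quick check in plain Perl (illustration only; assumes a 64-bit perl):

    my $V = 0xA040608020C0E000;
    for my $i (0 .. 7) {
        my $cl = 0;                                     # carry-less i*0xE0
        for my $b (0 .. 2) { $cl ^= (0xE0 << $b) if ($i >> $b) & 1 }
        printf "i=%d: table=0x%02X clmul=0x%02X\n",
               $i, ($V >> ($i*8)) & 0xFF, $cl & 0xFF;   # columns must agree
    }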
+
+.globl gcm_gmult_vis3
+.align 32
+gcm_gmult_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$Xlo ! load Xi
+ ldx [$Xip+0],$Xhi
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_gmult_vis3,#function
+.size gcm_gmult_vis3,.-gcm_gmult_vis3
+
+.globl gcm_ghash_vis3
+.align 32
+gcm_ghash_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$C2 ! load Xi
+ ldx [$Xip+0],$C3
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ and $inp,7,$shl
+ andn $inp,7,$inp
+ sll $shl,3,$shl
+ prefetch [$inp+63], 20
+ sub %g0,$shl,$shr
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+.Loop:
+ ldx [$inp+8],$Xlo
+ brz,pt $shl,1f
+ ldx [$inp+0],$Xhi
+
+ ldx [$inp+16],$C1 ! align data
+ srlx $Xlo,$shr,$C0
+ sllx $Xlo,$shl,$Xlo
+ sllx $Xhi,$shl,$Xhi
+ srlx $C1,$shr,$C1
+ or $C0,$Xhi,$Xhi
+ or $C1,$Xlo,$Xlo
+1:
+ add $inp,16,$inp
+ sub $len,16,$len
+ xor $C2,$Xlo,$Xlo
+ xor $C3,$Xhi,$Xhi
+ prefetch [$inp+63], 20
+
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ brnz,pt $len,.Loop
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_ghash_vis3,#function
+.size gcm_ghash_vis3,.-gcm_ghash_vis3
+___
+}}}
+$code.=<<___;
+.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+
+# The purpose of these subroutines is to explicitly encode VIS
+# instructions, so that one can compile the module without having to
+# specify VIS extensions on the compiler command line, e.g. -xarch=v9
+# vs. -xarch=v9a. The idea is to reserve the option to produce a
+# "universal" binary and let the programmer detect at run-time whether
+# the current CPU is VIS-capable.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "xmulx" => 0x115,
+ "xmulxhi" => 0x116 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
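For illustration, running one VIS3 instruction through the subroutine above; the encoding value is computed by hand from the formula (0x81b00000|rd<<25|rs1<<14|opf<<5|rs2) and should be treated as an example, not an authoritative reference:

    print unvis3("xmulx", "%o0", "%o1", "%g1"), "\n";
    # emits (whitespace aside): .word 0x83b222a9 !xmulx %o0,%o1,%g1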
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
close STDOUT;
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index 83c727e07f95..23a5527b30af 100755
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -12,25 +12,27 @@
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
-# gcc 2.95.3(*) MMX assembler x86 assembler
+# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate a few percent slower code,
# which is one of the reasons why the 2.95.3 results were chosen;
# another reason is the lack of 3.4.x results for older CPUs;
-# comparison with MMX results is not completely fair, because C
+# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
@@ -40,8 +42,8 @@
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
# May 2010
#
@@ -113,6 +115,16 @@
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -822,17 +834,18 @@ $len="ebx";
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
- &pshufd ($T2,$Hkey,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
- &pxor ($T2,$Hkey);
+ &pxor ($T2,$Hkey) if (!defined($HK));
+ $HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pclmulqdq ($T1,$T2,0x00); #######
+ &pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
@@ -879,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist.
# below. Algorithm 9 was therefore chosen for
# further optimization...
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
- &movdqa ($T1,$Xi); #
+ &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2); #
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
+ &psrlq ($Xi,1);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
+ &pxor ($Xi,$Xhi) #
}
&function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
+ &pshufd ($T1,$Hkey,0b01001110);
+ &pshufd ($T2,$Xi,0b01001110);
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
+ &movups ($T2,&QWP(32,$Htbl));
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
@@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_;
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
+ &movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($T1,$Xn); #
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &nop ();
- &lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
+ &jmp (&label("mod_loop"));
-&set_label("mod_loop");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("mod_loop",32);
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
+ &nop ();
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &movdqa ($T3,&QWP(0,$const));
+ &xorps ($Xhi,$Xhn);
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pxor ($T1,$Xhi); #
- &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
- &movdqa ($Xhn,$Xn);
- &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+ &pshufb ($Xhn,$T3);
+ &pxor ($T2,$T1); #
- &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+ &pshufb ($Xn,$T3);
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
+
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
+ &movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pshufd ($T1,$T3,0b01001110);
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+ &pshufd ($T1,$Xhn,0b01001110);
+ &movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,1);
+ &pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
- &pxor ($T1,$T3);
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T3,$Hkey); #
-
&pclmulqdq ($Xhn,$Hkey,0x11); #######
- &movdqa ($T2,$Xi); # 2nd phase
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-
+ &pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &xorps ($T1,$Xn); #
- &xorps ($T1,$Xhn); #
-
- &movdqa ($T3,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T3,8); #
- &pxor ($Xhn,$T1);
- &pxor ($Xn,$T3); #
- &movdqa ($T3,&QWP(0,$const));
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &pxor ($T1,$Xhi); #
+
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
@@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_;
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
-}} # $sse2
-
-&set_label("rem_4bit",64);
- &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
- &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
- &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
- &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
&set_label("rem_8bit",64);
&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_;
&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
}}} # !$x86only
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 38d779edbcfd..6e656ca13b80 100755
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
# P4 28.6 14.0 +100%
# Opteron 19.3 7.7 +150%
# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
#
# (*) comparison is not completely fair, because C results are
# for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,44 @@
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter: ghash-x86.pl discusses why it makes less sense to
+# increase the aggregate factor. Then why increase it here? The
+# critical path consists of 3 independent pclmulqdq instructions,
+# Karatsuba post-processing and reduction. "On top" of this we lay
+# down aggregated multiplication operations, triplets of independent
+# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
+# less sense to aggregate more multiplications than it takes to
+# perform the remaining non-multiplication operations. 2x is a
+# near-optimal coefficient for contemporary Intel CPUs (hence the
+# modest improvement), but not for Bulldozer, because its logical
+# SIMD operations are twice as slow as Intel's, so the critical path
+# is longer. A CPU with a higher pclmulqdq issue rate would also
+# benefit from a higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+
+# March 2013
+#
+# ... the 8x aggregate factor AVX code path uses the reduction
+# algorithm suggested by Shay Gueron[1]. Even though contemporary
+# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
+# code performs sub-optimally in comparison to the above-mentioned
+# version. But thanks to Ilya Albrekht and Max Locktyukhin of Intel
+# Corp. we knew that it performs in 0.41 cycles per byte on a Haswell
+# processor, and in 0.29 on Broadwell.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
+$do4xaggr=1;
+
# common register layout
$nlo="%rax";
$nhi="%rbx";
@@ -160,6 +221,7 @@ ___
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
@@ -352,19 +414,27 @@ ___
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
+} else {
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+___
+}
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$T2,$T1 #######
+ pclmulqdq \$0x00,$HK,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
@@ -376,42 +446,53 @@ $code.=<<___;
___
}
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
- movdqa $Xi,$T1 #
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
# 2nd phase
movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pxor $Xhi,$Xi #
___
}
{ my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
@@ -430,13 +511,47 @@ gcm_init_clmul:
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
- movdqu $Hkey,($Htbl) # save H
- movdqu $Xi,16($Htbl) # save H^2
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+___
+if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqa $Xi,$T3
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
@@ -449,13 +564,38 @@ $code.=<<___;
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
pshufb $T3,$Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. The special thing about it is that
+	# there is no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+___
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
@@ -465,129 +605,327 @@ ___
}
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my $Xn="%xmm6";
- my $Xhn="%xmm7";
- my $Hkey2="%xmm8";
- my $T1n="%xmm9";
- my $T2n="%xmm10";
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
-.align 16
+.align 32
gcm_ghash_clmul:
+.L_ghash_clmul:
___
$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
- movdqu 16($Htbl),$Hkey2
+ movdqu 0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
+ sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ xorps $Xmn,$T1
+ pclmulqdq \$0x00,$Hkey,$Xl
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ movdqa $Xln,$Xhn
+ pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
+ movdqa $T1,$T2 #
+ pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+.Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+ &reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xn # Ii+1
+ movdqu 16($inp),$Xln # Ii+1
pshufb $T3,$T1
- pshufb $T3,$Xn
+ pshufb $T3,$Xln
pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
-$code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
+
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+.align 32
.Lmod_loop:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- movdqu ($inp),$T1 # Ii
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
- movdqa $Xn,$Xhn #
- pshufd \$0b01001110,$Xn,$T1n
- pshufd \$0b01001110,$Hkey,$T2n
- pxor $Xn,$T1n #
- pxor $Hkey,$T2n
- pxor $T1,$Xhi # "Ii+Xi", consume early
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ movdqu ($inp),$T2 # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pshufb $T3,$T2
+ movdqu 16($inp),$Xln # Ii+1
+
+ pxor $Xhi,$T1
+ pxor $T2,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$Xmn
+ pshufb $T3,$Xln
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
+
+ movdqa $Xln,$Xhn #
- movdqa $Xi,$T1 # 1st phase
+ movdqa $Xi,$T2 # 1st phase
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
+ pxor $T1,$Xhi #
+ pxor $Xhn,$Xmn #
- pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pclmulqdq \$0x00,$HK,$Xmn #######
+ pxor $Xhi,$Xi #
- pclmulqdq \$0x00,$T2n,$T1n #######
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- pxor $Xn,$T1n #
- pxor $Xhn,$T1n #
- movdqa $T1n,$T2n #
- psrldq \$8,$T1n
- pslldq \$8,$T2n #
- pxor $T1n,$Xhn
- pxor $T2n,$Xn #
-
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
+ pxor $Xi,$T1
+ pxor $Xhi,$T1
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
@@ -599,7 +937,7 @@ $code.=<<___;
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
@@ -612,21 +950,607 @@ $code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
- add \$0x58,%rsp
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
ret
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
+
+$code.=<<___;
+.globl gcm_init_avx
+.type gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Hkey
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ vpsrlq \$63,$Hkey,$T1
+ vpsllq \$1,$Hkey,$Hkey
+ vpxor $T3,$T3,$T3 #
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
+ vpslldq \$8,$T1,$T1
+ vpor $T1,$Hkey,$Hkey # H<<=1
+
+ # magic reduction
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ vpunpckhqdq $Hkey,$Hkey,$HK
+ vmovdqa $Hkey,$Xi
+ vpxor $Hkey,$HK,$HK
+ mov \$4,%r10 # up to H^8
+ jmp .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpunpckhqdq $Hkey,$Hkey,$T2
+ vpxor $Xi,$T1,$T1 #
+ vpxor $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpxor $Xi,$T1,$T1 #
+___
+}
+$code.=<<___;
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
+ vpxor $Xi,$Xhi,$T2 #
+ vpxor $T2,$T1,$T1 #
+
+ vpslldq \$8,$T1,$T2 #
+ vpsrldq \$8,$T1,$T1
+ vpxor $T2,$Xi,$Xi #
+ vpxor $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ vpsllq \$57,$Xi,$T1 # 1st phase
+ vpsllq \$62,$Xi,$T2
+ vpxor $T1,$T2,$T2 #
+ vpsllq \$63,$Xi,$T1
+ vpxor $T1,$T2,$T2 #
+ vpslldq \$8,$T2,$T1 #
+ vpsrldq \$8,$T2,$T2
+ vpxor $T1,$Xi,$Xi #
+ vpxor $T2,$Xhi,$Xhi
+
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
+ vpxor $Xi,$Xhi,$Xhi
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$5,$T2,$T2
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$1,$Xi,$Xi #
+ vpxor $Xhi,$Xi,$Xi #
+___
+}
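For reference, a schoolbook model of the reduction these two phases implement (an illustrative sketch, not the patch's code: it ignores the bit-reflected data convention, and Math::BigInt plus the helper name gf128_reduce are assumptions). It reduces a 256-bit carry-less product modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1:

    use Math::BigInt;

    sub gf128_reduce {              # schoolbook reduction, MSB first
        my ($x) = @_;               # Math::BigInt, up to 256 bits
        my $p = Math::BigInt->bone->blsft(128)
                            ->bxor(Math::BigInt->new(0x87));
        for (my $i = 255; $i >= 128; $i--) {
            $x->bxor($p->copy->blsft($i - 128))         # cancel bit i
                if $x->copy->brsft($i)->is_odd;
        }
        return $x;                  # now below 2^128
    }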
$code.=<<___;
+.align 32
+.Linit_loop_avx:
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+ vmovdqa $Xi,$T3
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+ vpshufd \$0b01001110,$T3,$T1
+ vpshufd \$0b01001110,$Xi,$T2
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
+ lea 0x30($Htbl),$Htbl
+ sub \$1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
+ vmovdqu $T3,-0x10($Htbl)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+ $Zlo,$Zhi,$Zmi,
+ $Hkey,$HK,$T1,$T2,
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Xi # load $Xi
+ lea .L0x1c2_polynomial(%rip),%r10
+ lea 0x40($Htbl),$Htbl # size optimization
+ vmovdqu .Lbswap_mask(%rip),$bswap
+ vpshufb $bswap,$Xi,$Xi
+ cmp \$0x80,$len
+ jb .Lshort_avx
+ sub \$0x80,$len
+
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpshufb $bswap,$Ii,$Ii
+ vmovdqu 0x20-0x40($Htbl),$HK
+
+ vpunpckhqdq $Ii,$Ii,$T2
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Ii,$T2,$T2
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpxor $Ii,$T2,$T2
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpxor $Xmi,$Zmi,$Zmi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
+
+ lea 0x80($inp),$inp
+ cmp \$0x80,$len
+ jb .Ltail_avx
+
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ sub \$0x80,$len
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpxor $Ij,$T1,$T1
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Zlo,$Xi,$Xi # collect result
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vxorps $Zhi,$Xo,$Xo
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Zmi,$Tred,$Tred
+ vxorps $Ij,$T1,$T1
+
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Xo,$Tred,$Tred
+ vpslldq \$8,$Tred,$T2
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vpsrldq \$8,$Tred,$Tred
+ vpxor $T2, $Xi, $Xi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ii
+ vxorps $Tred,$Xo, $Xo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vxorps $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+ vxorps $Tred,$Xi,$Xi
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vxorps $Xo,$Tred,$Tred
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Tred,$Ij,$Ij
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jnc .Loop8x_avx
+
+ add \$0x80,$len
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -0x10($inp,$len),$Ii # very last word
+ lea ($inp,$len),$inp
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpshufb $bswap,$Ii,$Ij
+
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
+ vmovdqa $Xhi,$Zhi # $Zhi and
+ vmovdqa $Xmi,$Zmi # $Zmi
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x20($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x30($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x40($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x50($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x60($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x70($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovq 0xb8-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+.Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+ .long 7,0,7,0
+.L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
.align 64
.type .Lrem_4bit,\@object
.Lrem_4bit:
@@ -774,10 +1698,24 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
@@ -788,14 +1726,23 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
- .byte 0x01,0x1f,0x0b,0x00
- .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
- .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
___
}
diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 000000000000..e76a58c343c1
--- /dev/null
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always a virtualized setup with a possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR and ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for further improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine "any"
+
+.text
+
+.globl .gcm_init_p8
+.align 5
+.gcm_init_p8:
+ lis r0,0xfff0
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $H,$H,$t1 # twisted H
+
+ vsldoi $H,$H,$H,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ stvx_u $H, r9,r3
+ stvx_u $Hh,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_p8,.-.gcm_init_p8
+
+.globl .gcm_gmult_p8
+.align 5
+.gcm_gmult_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $IN,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $zero,$zero,$zero
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl .gcm_ghash_p8
+.align 5
+.gcm_ghash_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $Xl,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ vxor $zero,$zero,$zero
+
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+ subi $len,$len,16
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $IN,$IN,$Xl
+ b Loop
+
+.align 5
+Loop:
+ subic $len,$len,16
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ subfe. r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ and r0,r0,$len
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+ add $inp,$inp,r0
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $t1,$t1,$Xh
+ vxor $IN,$IN,$t1
+ vxor $IN,$IN,$Xl
+ beq Loop # did $len-=16 borrow?
+
+ vxor $Xl,$Xl,$t1
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o;
+ } else {
+ s/le\?/#le#/o or
+ s/be\?//o;
+ }
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
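Both GHASH modules added by this patch accelerate the same multiplication in GF(2^128). For reference, the operation can be written as a plain shift-and-reduce multiply; the C sketch below is an illustrative rendition (the type and function names are hypothetical, not part of this patch) following the GCM specification, where the 0xE1 byte carries the reduction polynomial. The 0xc2....01 constant the assembly composes is that same value shifted left by one bit across the 128-bit word, visible in the doubling (vaddubm) steps of gcm_init_p8 above.

    /*
     * Bit-by-bit reference for GHASH multiplication in GF(2^128).
     * Xi and H are held MSB-first in two 64-bit halves.  Sketch only.
     */
    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128_t;

    static u128_t gf128_mul(u128_t X, u128_t H)
    {
        u128_t Z = { 0, 0 }, V = H;
        int i;

        for (i = 0; i < 128; i++) {
            /* scan X from its most significant bit downwards */
            uint64_t bit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                    : (X.lo >> (127 - i)) & 1;
            uint64_t carry;

            if (bit) {
                Z.hi ^= V.hi;
                Z.lo ^= V.lo;
            }
            /* V >>= 1, reducing by the polynomial on carry-out */
            carry = V.lo & 1;
            V.lo = (V.lo >> 1) | (V.hi << 63);
            V.hi >>= 1;
            if (carry)
                V.hi ^= 0xE100000000000000ULL;
        }
        return Z;
    }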
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
new file mode 100755
index 000000000000..0b9cd7359fba
--- /dev/null
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,409 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# July 2014
+#
+# Implement 2x aggregated reduction [see ghash-x86.pl for background
+# information].
+#
+# Current performance in cycles per processed byte:
+#
+# PMULL[2] 32-bit NEON(*)
+# Apple A7 0.92 5.62
+# Cortex-A53 1.01 8.39
+# Cortex-A57 1.17 7.61
+#
+# (*) presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0"; # argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+
+################################################################################
+# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
+#
+# input: 128-bit H - secret parameter E(K,0^128)
+# output: precomputed table filled with degrees of twisted H;
+# H is twisted to handle reverse bitness of GHASH;
+#		only a few of the 16 slots of Htable[16] are used;
+#		data is opaque to the outside world (which allows the
+#		code to be optimized independently);
+#
+$code.=<<___;
+.global gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ vld1.64 {$t1},[x1] @ load input H
+ vmov.i8 $xC2,#0xe1
+ vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
+ vext.8 $IN,$t1,$t1,#8
+ vshr.u64 $t2,$xC2,#63
+ vdup.32 $t1,${t1}[1]
+ vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
+ vshr.u64 $t2,$IN,#63
+ vshr.s32 $t1,$t1,#31 @ broadcast carry bit
+ vand $t2,$t2,$t0
+ vshl.i64 $IN,$IN,#1
+ vext.8 $t2,$t2,$t2,#8
+ vand $t0,$t0,$t1
+ vorr $IN,$IN,$t2 @ H<<<=1
+ veor $H,$IN,$t0 @ twisted H
+ vst1.64 {$H},[x0],#16 @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
+ vpmull.p64 $Xl,$H,$H
+ veor $t0,$t0,$H
+ vpmull2.p64 $Xh,$H,$H
+ vpmull.p64 $Xm,$t0,$t0
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $H2,$Xl,$t2
+
+ vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
+ veor $t1,$t1,$H2
+ vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
+ vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
+
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+___
+################################################################################
+# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+#
+# input: Xi - current hash value;
+# Htable - table precomputed in gcm_init_v8;
+# output: Xi - next hash value Xi;
+#
+$code.=<<___;
+.global gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
+ vshl.u64 $xC2,$xC2,#57
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $IN,$t1,$t1,#8
+
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+ ret
+.size gcm_gmult_v8,.-gcm_gmult_v8
+___
+################################################################################
+# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#
+# input: table precomputed in gcm_init_v8;
+# current hash value Xi;
+# pointer to input data;
+#		length of input data in bytes, which must be a multiple of the block size;
+# output: next hash value Xi;
+#
+$code.=<<___;
+.global gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vstmdb sp!,{d8-d15} @ 32-bit ABI says so
+___
+$code.=<<___;
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
+ @ "[rotated]" means that
+ @ loaded value would have
+ @ to be rotated in order to
+ @ make it appear as in
+					@ algorithm specification
+ subs $len,$len,#32 @ see if $len is 32 or larger
+ mov $inc,#16 @ $inc is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ $inc is zeroed just in time
+					@ to preclude overstepping
+ @ inp[len], which means that
+ @ last block[s] are actually
+ @ loaded twice, but last
+ @ copy is not processed
+ vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H2},[$Htbl]
+ cclr $inc,eq @ is it time to zero $inc?
+ vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
+ vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
+ vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+ vrev64.8 $t0,$t0
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
+ b.lo .Lodd_tail_v8 @ $len was less than 32
+___
+{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+$code.=<<___;
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $In,$t1,$t1,#8
+ veor $IN,$IN,$Xl @ I[i]^=Xi
+ vpmull.p64 $Xln,$H,$In @ H·Ii+1
+ veor $t1,$t1,$In @ Karatsuba pre-processing
+ vpmull2.p64 $Xhn,$H,$In
+ b .Loop_mod2x_v8
+
+.align 4
+.Loop_mod2x_v8:
+ vext.8 $t2,$IN,$IN,#8
+ subs $len,$len,#32 @ is there more data?
+ vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
+ cclr $inc,lo @ is it time to zero $inc?
+
+ vpmull.p64 $Xmn,$Hhl,$t1
+ veor $t2,$t2,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
+ veor $Xl,$Xl,$Xln @ accumulate
+ vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
+
+ veor $Xh,$Xh,$Xhn
+ cclr $inc,eq @ is it time to zero $inc?
+ veor $Xm,$Xm,$Xmn
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 $t0,$t0
+#endif
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ vext.8 $In,$t1,$t1,#8
+ vext.8 $IN,$t0,$t0,#8
+ veor $Xl,$Xm,$t2
+ vpmull.p64 $Xln,$H,$In @ H·Ii+1
+ veor $IN,$IN,$Xh @ accumulate $IN early
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $IN,$IN,$t2
+ veor $t1,$t1,$In @ Karatsuba pre-processing
+ veor $IN,$IN,$Xl
+ vpmull2.p64 $Xhn,$H,$In
+	b.hs	.Loop_mod2x_v8		@ there were at least 32 more bytes
+
+ veor $Xh,$Xh,$t2
+ vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
+ adds $len,$len,#32 @ re-construct $len
+ veor $Xl,$Xl,$Xh @ re-construct $Xl
+ b.eq .Ldone_v8 @ is $len zero?
+___
+}
+$code.=<<___;
+.Lodd_tail_v8:
+ vext.8 $t2,$Xl,$Xl,#8
+ veor $IN,$IN,$Xl @ inp^=Xi
+ veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
+
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+
+.Ldone_v8:
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15} @ 32-bit ABI says so
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+ sub unvmov {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+ }
+ foreach(split("\n",$code)) {
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vmov\s+(.*)/unvmov($1)/geo or
+ s/vext\.8/ext/o or
+ s/vshr\.s/sshr\.s/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+	# fix up remaining legacy suffixes
+ s/\.[ui]?8(\s)/$1/o;
+ s/\.[uis]?32//o and s/\.16b/\.4s/go;
+ m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
+ m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
+ s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+ sub unvpmullp64 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ $word |= 0x00010001 if ($mnemonic =~ "2");
+		# ARMv7 instructions are always encoded little-endian,
+		# so the raw bytes are emitted in that order. The correct
+		# solution is to use the .inst directive, but older
+		# assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+
+ foreach(split("\n",$code)) {
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+	# fix up remaining new-style suffixes
+ s/\],#[0-9]+/]!/o;
+
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT; # enforce flush
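The modulo-scheduled main loop above implements exactly the 2x aggregation stated in the comment: two blocks are folded per iteration against H^2 and H, so only one reduction is paid per pair. Stripped of the scheduling, the structure looks like the sketch below, an illustration that reuses the hypothetical gf128_mul() and u128_t from the earlier reference (the real module additionally operates on "twisted" H values, which the distributive law leaves unaffected).

    #include <stddef.h>

    /* 2x aggregated GHASH: Xi+2 = [H^2*(Ii ^ Xi) + H*Ii+1] mod P */
    static void ghash_2x(u128_t *Xi, u128_t H, u128_t H2,
                         const u128_t *inp, size_t nblocks)
    {
        size_t i;

        for (i = 0; i + 2 <= nblocks; i += 2) {
            u128_t t = { Xi->hi ^ inp[i].hi, Xi->lo ^ inp[i].lo };
            u128_t a = gf128_mul(t, H2);          /* H^2*(Ii + Xi) */
            u128_t b = gf128_mul(inp[i + 1], H);  /* H*Ii+1        */
            Xi->hi = a.hi ^ b.hi;
            Xi->lo = a.lo ^ b.lo;
        }
        if (i < nblocks) {                        /* odd tail      */
            u128_t t = { Xi->hi ^ inp[i].hi, Xi->lo ^ inp[i].lo };
            *Xi = gf128_mul(t, H);
        }
    }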
diff --git a/crypto/modes/cbc128.c b/crypto/modes/cbc128.c
index 1ed79672749a..c13caea5355b 100644
--- a/crypto/modes/cbc128.c
+++ b/crypto/modes/cbc128.c
@@ -59,7 +59,7 @@
#endif
#include <assert.h>
-#ifndef STRICT_ALIGNMENT
+#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
# define STRICT_ALIGNMENT 0
#endif
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index 0ee569fb7af0..e299131c1382 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -687,20 +687,31 @@ static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
#endif
-#if TABLE_BITS==4 && defined(GHASH_ASM)
+#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define GHASH_ASM_X86_OR_64
# define GCM_FUNCREF_4BIT
-extern unsigned int OPENSSL_ia32cap_P[2];
+extern unsigned int OPENSSL_ia32cap_P[];
void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
size_t len);
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define gcm_init_avx gcm_init_clmul
+# define gcm_gmult_avx gcm_gmult_clmul
+# define gcm_ghash_avx gcm_ghash_clmul
+# else
+void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
@@ -711,15 +722,41 @@ void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
size_t len);
# endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
# include "arm_arch.h"
-# if __ARM_ARCH__>=7
+# if __ARM_MAX_ARCH__>=7
# define GHASH_ASM_ARM
# define GCM_FUNCREF_4BIT
+# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
+# if defined(__arm__) || defined(__arm)
+# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
size_t len);
+void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
# endif
+# elif defined(__sparc__) || defined(__sparc)
+# include "sparc_arch.h"
+# define GHASH_ASM_SPARC
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_sparcv9cap_P[];
+void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+# include "ppc_arch.h"
+# define GHASH_ASM_PPC
+# define GCM_FUNCREF_4BIT
+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
# endif
#endif
@@ -768,9 +805,15 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
- gcm_init_clmul(ctx->Htable, ctx->H.u);
- ctx->gmult = gcm_gmult_clmul;
- ctx->ghash = gcm_ghash_clmul;
+ if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
+ gcm_init_avx(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_avx;
+ ctx->ghash = gcm_ghash_avx;
+ } else {
+ gcm_init_clmul(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_clmul;
+ ctx->ghash = gcm_ghash_clmul;
+ }
return;
}
# endif
@@ -792,13 +835,52 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
ctx->ghash = gcm_ghash_4bit;
# endif
# elif defined(GHASH_ASM_ARM)
- if (OPENSSL_armcap_P & ARMV7_NEON) {
+# ifdef PMULL_CAPABLE
+ if (PMULL_CAPABLE) {
+ gcm_init_v8(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_v8;
+ ctx->ghash = gcm_ghash_v8;
+ } else
+# endif
+# ifdef NEON_CAPABLE
+ if (NEON_CAPABLE) {
+ gcm_init_neon(ctx->Htable, ctx->H.u);
ctx->gmult = gcm_gmult_neon;
ctx->ghash = gcm_ghash_neon;
+ } else
+# endif
+ {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+# if defined(GHASH)
+ ctx->ghash = gcm_ghash_4bit;
+# else
+ ctx->ghash = NULL;
+# endif
+ }
+# elif defined(GHASH_ASM_SPARC)
+ if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
+ gcm_init_vis3(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_vis3;
+ ctx->ghash = gcm_ghash_vis3;
+ } else {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
+# elif defined(GHASH_ASM_PPC)
+ if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
+ gcm_init_p8(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_p8;
+ ctx->ghash = gcm_ghash_p8;
} else {
gcm_init_4bit(ctx->Htable, ctx->H.u);
ctx->gmult = gcm_gmult_4bit;
+# if defined(GHASH)
ctx->ghash = gcm_ghash_4bit;
+# else
+ ctx->ghash = NULL;
+# endif
}
# else
gcm_init_4bit(ctx->Htable, ctx->H.u);
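The x86 dispatch above packs its two CPUID tests into a single mask: OPENSSL_ia32cap_P[1] mirrors ECX of CPUID leaf 1, where (per the Intel manuals) bit 22 is MOVBE and bit 28 is AVX, so shifting right by 22 lines both features up under the mask 0x41 (bits 0 and 6). A minimal stand-alone illustration with a synthetic capability word:

    #include <stdio.h>

    int main(void)
    {
        /* synthetic ECX image: pretend the CPU advertises MOVBE and AVX */
        unsigned int cap = (1u << 22) | (1u << 28);

        if (((cap >> 22) & 0x41) == 0x41)
            printf("AVX+MOVBE: would select the gcm_*_avx path\n");
        else
            printf("would fall back to gcm_*_clmul\n");
        return 0;
    }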
diff --git a/crypto/modes/modes.h b/crypto/modes/modes.h
index 880f020d58e2..fd488499a0b4 100644
--- a/crypto/modes/modes.h
+++ b/crypto/modes/modes.h
@@ -148,6 +148,16 @@ int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len, int enc);
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block);
+
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block);
+
#ifdef __cplusplus
}
#endif
diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h
index 296849b8670c..fe14ec7002f0 100644
--- a/crypto/modes/modes_lcl.h
+++ b/crypto/modes/modes_lcl.h
@@ -25,39 +25,49 @@ typedef unsigned int u32;
typedef unsigned char u8;
#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
+#ifndef PEDANTIC
+# if defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__aarch64__) || \
+ defined(__s390__) || defined(__s390x__)
+# undef STRICT_ALIGNMENT
+# endif
#endif
#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
# if defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
-# define BSWAP8(x) ({ u64 ret=(x); \
+# define BSWAP8(x) ({ u64 ret_=(x); \
asm ("bswapq %0" \
- : "+r"(ret)); ret; })
-# define BSWAP4(x) ({ u32 ret=(x); \
+ : "+r"(ret_)); ret_; })
+# define BSWAP4(x) ({ u32 ret_=(x); \
asm ("bswapl %0" \
- : "+r"(ret)); ret; })
+ : "+r"(ret_)); ret_; })
# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
-# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \
asm ("bswapl %0; bswapl %1" \
- : "+r"(hi),"+r"(lo)); \
- (u64)hi<<32|lo; })
-# define BSWAP4(x) ({ u32 ret=(x); \
+ : "+r"(hi_),"+r"(lo_)); \
+ (u64)hi_<<32|lo_; })
+# define BSWAP4(x) ({ u32 ret_=(x); \
asm ("bswapl %0" \
- : "+r"(ret)); ret; })
+ : "+r"(ret_)); ret_; })
+# elif defined(__aarch64__)
+# define BSWAP8(x) ({ u64 ret_; \
+ asm ("rev %0,%1" \
+ : "=r"(ret_) : "r"(x)); ret_; })
+# define BSWAP4(x) ({ u32 ret_; \
+ asm ("rev %w0,%w1" \
+ : "=r"(ret_) : "r"(x)); ret_; })
# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
-# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \
asm ("rev %0,%0; rev %1,%1" \
- : "+r"(hi),"+r"(lo)); \
- (u64)hi<<32|lo; })
-# define BSWAP4(x) ({ u32 ret; \
+ : "+r"(hi_),"+r"(lo_)); \
+ (u64)hi_<<32|lo_; })
+# define BSWAP4(x) ({ u32 ret_; \
asm ("rev %0,%1" \
- : "=r"(ret) : "r"((u32)(x))); \
- ret; })
+ : "=r"(ret_) : "r"((u32)(x))); \
+ ret_; })
# endif
# elif defined(_MSC_VER)
# if _MSC_VER>=1300
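The ret/ret_ renames in the BSWAP macros are not cosmetic: a GNU statement expression opens a new scope, so with the old names a call such as BSWAP4(ret) expanded to ({ u32 ret = (ret); ... }) and self-initialized the shadowing temporary. A hypothetical demonstration of the hazard, with __builtin_bswap32 standing in for the inline asm:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t u32;

    /* old spelling: inner "ret" shadows any caller variable of that name */
    #define BSWAP4_OLD(x) ({ u32 ret = (x);  __builtin_bswap32(ret);  })
    /* new spelling: the underscore keeps the temporary out of the way   */
    #define BSWAP4_NEW(x) ({ u32 ret_ = (x); __builtin_bswap32(ret_); })

    int main(void)
    {
        u32 ret = 0x11223344;

        /* BSWAP4_OLD(ret) reads the fresh, uninitialized inner "ret" */
        printf("old: %08x (indeterminate)\n", BSWAP4_OLD(ret));
        printf("new: %08x\n", BSWAP4_NEW(ret)); /* 44332211 */
        return 0;
    }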
diff --git a/crypto/modes/wrap128.c b/crypto/modes/wrap128.c
new file mode 100644
index 000000000000..4dcaf0326fa8
--- /dev/null
+++ b/crypto/modes/wrap128.c
@@ -0,0 +1,138 @@
+/* crypto/modes/wrap128.c */
+/*
+ * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/modes.h>
+
+static const unsigned char default_iv[] = {
+ 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
+};
+
+/*
+ * Input size limit: lower than the maximum allowed by the standards but
+ * far larger than anything that will be used in practice.
+ */
+#define CRYPTO128_WRAP_MAX (1UL << 31)
+
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ unsigned char *A, B[16], *R;
+ size_t i, j, t;
+ if ((inlen & 0x7) || (inlen < 8) || (inlen > CRYPTO128_WRAP_MAX))
+ return 0;
+ A = B;
+ t = 1;
+ memcpy(out + 8, in, inlen);
+ if (!iv)
+ iv = default_iv;
+
+ memcpy(A, iv, 8);
+
+ for (j = 0; j < 6; j++) {
+ R = out + 8;
+ for (i = 0; i < inlen; i += 8, t++, R += 8) {
+ memcpy(B + 8, R, 8);
+ block(B, B, key);
+ A[7] ^= (unsigned char)(t & 0xff);
+ if (t > 0xff) {
+ A[6] ^= (unsigned char)((t >> 8) & 0xff);
+ A[5] ^= (unsigned char)((t >> 16) & 0xff);
+ A[4] ^= (unsigned char)((t >> 24) & 0xff);
+ }
+ memcpy(R, B + 8, 8);
+ }
+ }
+ memcpy(out, A, 8);
+ return inlen + 8;
+}
+
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ unsigned char *A, B[16], *R;
+ size_t i, j, t;
+ inlen -= 8;
+ if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
+ return 0;
+ A = B;
+ t = 6 * (inlen >> 3);
+ memcpy(A, in, 8);
+ memcpy(out, in + 8, inlen);
+ for (j = 0; j < 6; j++) {
+ R = out + inlen - 8;
+ for (i = 0; i < inlen; i += 8, t--, R -= 8) {
+ A[7] ^= (unsigned char)(t & 0xff);
+ if (t > 0xff) {
+ A[6] ^= (unsigned char)((t >> 8) & 0xff);
+ A[5] ^= (unsigned char)((t >> 16) & 0xff);
+ A[4] ^= (unsigned char)((t >> 24) & 0xff);
+ }
+ memcpy(B + 8, R, 8);
+ block(B, B, key);
+ memcpy(R, B + 8, 8);
+ }
+ }
+ if (!iv)
+ iv = default_iv;
+ if (memcmp(A, iv, 8)) {
+ OPENSSL_cleanse(out, inlen);
+ return 0;
+ }
+ return inlen;
+}
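The block argument is an ordinary block128_f, so the new wrap primitives plug directly into the low-level AES interface. A minimal round-trip sketch, assuming <openssl/aes.h> from this tree and using the key/KEK values from the RFC 3394 test vectors; a NULL iv selects the default 0xA6A6... integrity check value:

    #include <string.h>
    #include <openssl/aes.h>
    #include <openssl/modes.h>

    int main(void)
    {
        static const unsigned char kek[16] = {
            0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
            0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
        };
        static const unsigned char key[16] = {
            0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
            0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
        };
        unsigned char wrapped[16 + 8], unwrapped[16];
        AES_KEY ek, dk;
        size_t len;

        AES_set_encrypt_key(kek, 128, &ek);
        len = CRYPTO_128_wrap(&ek, NULL, wrapped, key, sizeof(key),
                              (block128_f)AES_encrypt);
        if (len != sizeof(wrapped))
            return 1;

        AES_set_decrypt_key(kek, 128, &dk);
        len = CRYPTO_128_unwrap(&dk, NULL, unwrapped, wrapped,
                                sizeof(wrapped), (block128_f)AES_decrypt);
        return !(len == sizeof(key) && memcmp(key, unwrapped, len) == 0);
    }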
diff --git a/crypto/o_str.c b/crypto/o_str.c
index b23ef323b124..4e2d096704f0 100644
--- a/crypto/o_str.c
+++ b/crypto/o_str.c
@@ -62,7 +62,7 @@
#include "o_str.h"
#if !defined(OPENSSL_IMPLEMENTS_strncasecmp) && \
- !defined(OPENSSL_SYSNAME_WIN32) && \
+ !defined(OPENSSL_SYSNAME_WIN32) && !defined(OPENSSL_SYSNAME_WINCE) && \
!defined(NETWARE_CLIB)
# include <strings.h>
#endif
diff --git a/crypto/o_time.c b/crypto/o_time.c
index e18b71d484ff..58413fe97d09 100644
--- a/crypto/o_time.c
+++ b/crypto/o_time.c
@@ -246,9 +246,73 @@ struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result)
static long date_to_julian(int y, int m, int d);
static void julian_to_date(long jd, int *y, int *m, int *d);
+static int julian_adj(const struct tm *tm, int off_day, long offset_sec,
+ long *pday, int *psec);
int OPENSSL_gmtime_adj(struct tm *tm, int off_day, long offset_sec)
{
+ int time_sec, time_year, time_month, time_day;
+ long time_jd;
+
+ /* Convert time and offset into julian day and seconds */
+ if (!julian_adj(tm, off_day, offset_sec, &time_jd, &time_sec))
+ return 0;
+
+ /* Convert Julian day back to date */
+
+ julian_to_date(time_jd, &time_year, &time_month, &time_day);
+
+ if (time_year < 1900 || time_year > 9999)
+ return 0;
+
+ /* Update tm structure */
+
+ tm->tm_year = time_year - 1900;
+ tm->tm_mon = time_month - 1;
+ tm->tm_mday = time_day;
+
+ tm->tm_hour = time_sec / 3600;
+ tm->tm_min = (time_sec / 60) % 60;
+ tm->tm_sec = time_sec % 60;
+
+ return 1;
+
+}
+
+int OPENSSL_gmtime_diff(int *pday, int *psec,
+ const struct tm *from, const struct tm *to)
+{
+ int from_sec, to_sec, diff_sec;
+ long from_jd, to_jd, diff_day;
+ if (!julian_adj(from, 0, 0, &from_jd, &from_sec))
+ return 0;
+ if (!julian_adj(to, 0, 0, &to_jd, &to_sec))
+ return 0;
+ diff_day = to_jd - from_jd;
+ diff_sec = to_sec - from_sec;
+ /* Adjust differences so both positive or both negative */
+ if (diff_day > 0 && diff_sec < 0) {
+ diff_day--;
+ diff_sec += SECS_PER_DAY;
+ }
+ if (diff_day < 0 && diff_sec > 0) {
+ diff_day++;
+ diff_sec -= SECS_PER_DAY;
+ }
+
+ if (pday)
+ *pday = (int)diff_day;
+ if (psec)
+ *psec = diff_sec;
+
+ return 1;
+
+}
+
+/* Convert tm structure and offset into julian day and seconds */
+static int julian_adj(const struct tm *tm, int off_day, long offset_sec,
+ long *pday, int *psec)
+{
int offset_hms, offset_day;
long time_jd;
int time_year, time_month, time_day;
@@ -284,25 +348,9 @@ int OPENSSL_gmtime_adj(struct tm *tm, int off_day, long offset_sec)
if (time_jd < 0)
return 0;
- /* Convert Julian day back to date */
-
- julian_to_date(time_jd, &time_year, &time_month, &time_day);
-
- if (time_year < 1900 || time_year > 9999)
- return 0;
-
- /* Update tm structure */
-
- tm->tm_year = time_year - 1900;
- tm->tm_mon = time_month - 1;
- tm->tm_mday = time_day;
-
- tm->tm_hour = offset_hms / 3600;
- tm->tm_min = (offset_hms / 60) % 60;
- tm->tm_sec = offset_hms % 60;
-
+ *pday = time_jd;
+ *psec = offset_hms;
return 1;
-
}
/*
@@ -354,27 +402,39 @@ int main(int argc, char **argv)
int check_time(long offset)
{
- struct tm tm1, tm2;
+ struct tm tm1, tm2, o1;
+ int off_day, off_sec;
+ long toffset;
time_t t1, t2;
time(&t1);
t2 = t1 + offset;
OPENSSL_gmtime(&t2, &tm2);
OPENSSL_gmtime(&t1, &tm1);
+ o1 = tm1;
OPENSSL_gmtime_adj(&tm1, 0, offset);
- if ((tm1.tm_year == tm2.tm_year) &&
- (tm1.tm_mon == tm2.tm_mon) &&
- (tm1.tm_mday == tm2.tm_mday) &&
- (tm1.tm_hour == tm2.tm_hour) &&
- (tm1.tm_min == tm2.tm_min) && (tm1.tm_sec == tm2.tm_sec))
- return 1;
- fprintf(stderr, "TIME ERROR!!\n");
- fprintf(stderr, "Time1: %d/%d/%d, %d:%02d:%02d\n",
- tm2.tm_mday, tm2.tm_mon + 1, tm2.tm_year + 1900,
- tm2.tm_hour, tm2.tm_min, tm2.tm_sec);
- fprintf(stderr, "Time2: %d/%d/%d, %d:%02d:%02d\n",
- tm1.tm_mday, tm1.tm_mon + 1, tm1.tm_year + 1900,
- tm1.tm_hour, tm1.tm_min, tm1.tm_sec);
- return 0;
+ if ((tm1.tm_year != tm2.tm_year) ||
+ (tm1.tm_mon != tm2.tm_mon) ||
+ (tm1.tm_mday != tm2.tm_mday) ||
+ (tm1.tm_hour != tm2.tm_hour) ||
+ (tm1.tm_min != tm2.tm_min) || (tm1.tm_sec != tm2.tm_sec)) {
+ fprintf(stderr, "TIME ERROR!!\n");
+ fprintf(stderr, "Time1: %d/%d/%d, %d:%02d:%02d\n",
+ tm2.tm_mday, tm2.tm_mon + 1, tm2.tm_year + 1900,
+ tm2.tm_hour, tm2.tm_min, tm2.tm_sec);
+ fprintf(stderr, "Time2: %d/%d/%d, %d:%02d:%02d\n",
+ tm1.tm_mday, tm1.tm_mon + 1, tm1.tm_year + 1900,
+ tm1.tm_hour, tm1.tm_min, tm1.tm_sec);
+ return 0;
+ }
+    OPENSSL_gmtime_diff(&off_day, &off_sec, &o1, &tm1);
+    toffset = (long)off_day * SECS_PER_DAY + off_sec;
+ if (offset != toffset) {
+ fprintf(stderr, "TIME OFFSET ERROR!!\n");
+ fprintf(stderr, "Expected %ld, Got %ld (%d:%d)\n",
+ offset, toffset, off_day, off_sec);
+ return 0;
+ }
+ return 1;
}
#endif
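With the julian-day conversion factored into julian_adj(), OPENSSL_gmtime_adj() and the new OPENSSL_gmtime_diff() share one code path. Typical use of the new call, sketched against the declarations in o_time.h:

    #include <stdio.h>
    #include <time.h>
    #include "o_time.h"

    int main(void)
    {
        time_t now = time(NULL);
        time_t later = now + 2 * 86400 + 90;   /* two days and 90 seconds */
        struct tm from, to;
        int days, secs;

        OPENSSL_gmtime(&now, &from);
        OPENSSL_gmtime(&later, &to);
        if (!OPENSSL_gmtime_diff(&days, &secs, &from, &to))
            return 1;
        printf("difference: %d day(s), %d second(s)\n", days, secs);
        return 0;
    }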
diff --git a/crypto/o_time.h b/crypto/o_time.h
index 901b2005677e..a83a3d247d41 100644
--- a/crypto/o_time.h
+++ b/crypto/o_time.h
@@ -64,5 +64,7 @@
struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result);
int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec);
+int OPENSSL_gmtime_diff(int *pday, int *psec,
+ const struct tm *from, const struct tm *to);
#endif
diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h
index bc69665bc582..b7e3cf280ee0 100644
--- a/crypto/objects/obj_dat.h
+++ b/crypto/objects/obj_dat.h
@@ -62,12 +62,12 @@
* [including the GNU Public Licence.]
*/
-#define NUM_NID 920
-#define NUM_SN 913
-#define NUM_LN 913
-#define NUM_OBJ 857
+#define NUM_NID 958
+#define NUM_SN 951
+#define NUM_LN 951
+#define NUM_OBJ 890
-static const unsigned char lvalues[5974]={
+static const unsigned char lvalues[6255]={
0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 0] OBJ_rsadsi */
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 6] OBJ_pkcs */
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x02, /* [ 13] OBJ_md2 */
@@ -919,6 +919,39 @@ static const unsigned char lvalues[5974]={
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x08,/* [5946] OBJ_mgf1 */
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x0A,/* [5955] OBJ_rsassaPss */
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x07,/* [5964] OBJ_rsaesOaep */
+0x2A,0x86,0x48,0xCE,0x3E,0x02,0x01, /* [5973] OBJ_dhpublicnumber */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x01,/* [5980] OBJ_brainpoolP160r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x02,/* [5989] OBJ_brainpoolP160t1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x03,/* [5998] OBJ_brainpoolP192r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x04,/* [6007] OBJ_brainpoolP192t1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x05,/* [6016] OBJ_brainpoolP224r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x06,/* [6025] OBJ_brainpoolP224t1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x07,/* [6034] OBJ_brainpoolP256r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x08,/* [6043] OBJ_brainpoolP256t1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x09,/* [6052] OBJ_brainpoolP320r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x0A,/* [6061] OBJ_brainpoolP320t1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x0B,/* [6070] OBJ_brainpoolP384r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x0C,/* [6079] OBJ_brainpoolP384t1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x0D,/* [6088] OBJ_brainpoolP512r1 */
+0x2B,0x24,0x03,0x03,0x02,0x08,0x01,0x01,0x0E,/* [6097] OBJ_brainpoolP512t1 */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x09,/* [6106] OBJ_pSpecified */
+0x2B,0x81,0x05,0x10,0x86,0x48,0x3F,0x00,0x02,/* [6115] OBJ_dhSinglePass_stdDH_sha1kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0B,0x00, /* [6124] OBJ_dhSinglePass_stdDH_sha224kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0B,0x01, /* [6130] OBJ_dhSinglePass_stdDH_sha256kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0B,0x02, /* [6136] OBJ_dhSinglePass_stdDH_sha384kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0B,0x03, /* [6142] OBJ_dhSinglePass_stdDH_sha512kdf_scheme */
+0x2B,0x81,0x05,0x10,0x86,0x48,0x3F,0x00,0x03,/* [6148] OBJ_dhSinglePass_cofactorDH_sha1kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0E,0x00, /* [6157] OBJ_dhSinglePass_cofactorDH_sha224kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0E,0x01, /* [6163] OBJ_dhSinglePass_cofactorDH_sha256kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0E,0x02, /* [6169] OBJ_dhSinglePass_cofactorDH_sha384kdf_scheme */
+0x2B,0x81,0x04,0x01,0x0E,0x03, /* [6175] OBJ_dhSinglePass_cofactorDH_sha512kdf_scheme */
+0x2B,0x06,0x01,0x04,0x01,0xD6,0x79,0x02,0x04,0x02,/* [6181] OBJ_ct_precert_scts */
+0x2B,0x06,0x01,0x04,0x01,0xD6,0x79,0x02,0x04,0x03,/* [6191] OBJ_ct_precert_poison */
+0x2B,0x06,0x01,0x04,0x01,0xD6,0x79,0x02,0x04,0x04,/* [6201] OBJ_ct_precert_signer */
+0x2B,0x06,0x01,0x04,0x01,0xD6,0x79,0x02,0x04,0x05,/* [6211] OBJ_ct_cert_scts */
+0x2B,0x06,0x01,0x04,0x01,0x82,0x37,0x3C,0x02,0x01,0x01,/* [6221] OBJ_jurisdictionLocalityName */
+0x2B,0x06,0x01,0x04,0x01,0x82,0x37,0x3C,0x02,0x01,0x02,/* [6232] OBJ_jurisdictionStateOrProvinceName */
+0x2B,0x06,0x01,0x04,0x01,0x82,0x37,0x3C,0x02,0x01,0x03,/* [6243] OBJ_jurisdictionCountryName */
};
static const ASN1_OBJECT nid_objs[NUM_NID]={
@@ -2399,12 +2432,95 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={
{"AES-256-CBC-HMAC-SHA1","aes-256-cbc-hmac-sha1",
NID_aes_256_cbc_hmac_sha1,0,NULL,0},
{"RSAES-OAEP","rsaesOaep",NID_rsaesOaep,9,&(lvalues[5964]),0},
+{"dhpublicnumber","X9.42 DH",NID_dhpublicnumber,7,&(lvalues[5973]),0},
+{"brainpoolP160r1","brainpoolP160r1",NID_brainpoolP160r1,9,
+ &(lvalues[5980]),0},
+{"brainpoolP160t1","brainpoolP160t1",NID_brainpoolP160t1,9,
+ &(lvalues[5989]),0},
+{"brainpoolP192r1","brainpoolP192r1",NID_brainpoolP192r1,9,
+ &(lvalues[5998]),0},
+{"brainpoolP192t1","brainpoolP192t1",NID_brainpoolP192t1,9,
+ &(lvalues[6007]),0},
+{"brainpoolP224r1","brainpoolP224r1",NID_brainpoolP224r1,9,
+ &(lvalues[6016]),0},
+{"brainpoolP224t1","brainpoolP224t1",NID_brainpoolP224t1,9,
+ &(lvalues[6025]),0},
+{"brainpoolP256r1","brainpoolP256r1",NID_brainpoolP256r1,9,
+ &(lvalues[6034]),0},
+{"brainpoolP256t1","brainpoolP256t1",NID_brainpoolP256t1,9,
+ &(lvalues[6043]),0},
+{"brainpoolP320r1","brainpoolP320r1",NID_brainpoolP320r1,9,
+ &(lvalues[6052]),0},
+{"brainpoolP320t1","brainpoolP320t1",NID_brainpoolP320t1,9,
+ &(lvalues[6061]),0},
+{"brainpoolP384r1","brainpoolP384r1",NID_brainpoolP384r1,9,
+ &(lvalues[6070]),0},
+{"brainpoolP384t1","brainpoolP384t1",NID_brainpoolP384t1,9,
+ &(lvalues[6079]),0},
+{"brainpoolP512r1","brainpoolP512r1",NID_brainpoolP512r1,9,
+ &(lvalues[6088]),0},
+{"brainpoolP512t1","brainpoolP512t1",NID_brainpoolP512t1,9,
+ &(lvalues[6097]),0},
+{"PSPECIFIED","pSpecified",NID_pSpecified,9,&(lvalues[6106]),0},
+{"dhSinglePass-stdDH-sha1kdf-scheme",
+ "dhSinglePass-stdDH-sha1kdf-scheme",
+ NID_dhSinglePass_stdDH_sha1kdf_scheme,9,&(lvalues[6115]),0},
+{"dhSinglePass-stdDH-sha224kdf-scheme",
+ "dhSinglePass-stdDH-sha224kdf-scheme",
+ NID_dhSinglePass_stdDH_sha224kdf_scheme,6,&(lvalues[6124]),0},
+{"dhSinglePass-stdDH-sha256kdf-scheme",
+ "dhSinglePass-stdDH-sha256kdf-scheme",
+ NID_dhSinglePass_stdDH_sha256kdf_scheme,6,&(lvalues[6130]),0},
+{"dhSinglePass-stdDH-sha384kdf-scheme",
+ "dhSinglePass-stdDH-sha384kdf-scheme",
+ NID_dhSinglePass_stdDH_sha384kdf_scheme,6,&(lvalues[6136]),0},
+{"dhSinglePass-stdDH-sha512kdf-scheme",
+ "dhSinglePass-stdDH-sha512kdf-scheme",
+ NID_dhSinglePass_stdDH_sha512kdf_scheme,6,&(lvalues[6142]),0},
+{"dhSinglePass-cofactorDH-sha1kdf-scheme",
+ "dhSinglePass-cofactorDH-sha1kdf-scheme",
+ NID_dhSinglePass_cofactorDH_sha1kdf_scheme,9,&(lvalues[6148]),0},
+{"dhSinglePass-cofactorDH-sha224kdf-scheme",
+ "dhSinglePass-cofactorDH-sha224kdf-scheme",
+ NID_dhSinglePass_cofactorDH_sha224kdf_scheme,6,&(lvalues[6157]),0},
+{"dhSinglePass-cofactorDH-sha256kdf-scheme",
+ "dhSinglePass-cofactorDH-sha256kdf-scheme",
+ NID_dhSinglePass_cofactorDH_sha256kdf_scheme,6,&(lvalues[6163]),0},
+{"dhSinglePass-cofactorDH-sha384kdf-scheme",
+ "dhSinglePass-cofactorDH-sha384kdf-scheme",
+ NID_dhSinglePass_cofactorDH_sha384kdf_scheme,6,&(lvalues[6169]),0},
+{"dhSinglePass-cofactorDH-sha512kdf-scheme",
+ "dhSinglePass-cofactorDH-sha512kdf-scheme",
+ NID_dhSinglePass_cofactorDH_sha512kdf_scheme,6,&(lvalues[6175]),0},
+{"dh-std-kdf","dh-std-kdf",NID_dh_std_kdf,0,NULL,0},
+{"dh-cofactor-kdf","dh-cofactor-kdf",NID_dh_cofactor_kdf,0,NULL,0},
+{"AES-128-CBC-HMAC-SHA256","aes-128-cbc-hmac-sha256",
+ NID_aes_128_cbc_hmac_sha256,0,NULL,0},
+{"AES-192-CBC-HMAC-SHA256","aes-192-cbc-hmac-sha256",
+ NID_aes_192_cbc_hmac_sha256,0,NULL,0},
+{"AES-256-CBC-HMAC-SHA256","aes-256-cbc-hmac-sha256",
+ NID_aes_256_cbc_hmac_sha256,0,NULL,0},
+{"ct_precert_scts","CT Precertificate SCTs",NID_ct_precert_scts,10,
+ &(lvalues[6181]),0},
+{"ct_precert_poison","CT Precertificate Poison",NID_ct_precert_poison,
+ 10,&(lvalues[6191]),0},
+{"ct_precert_signer","CT Precertificate Signer",NID_ct_precert_signer,
+ 10,&(lvalues[6201]),0},
+{"ct_cert_scts","CT Certificate SCTs",NID_ct_cert_scts,10,
+ &(lvalues[6211]),0},
+{"jurisdictionL","jurisdictionLocalityName",
+ NID_jurisdictionLocalityName,11,&(lvalues[6221]),0},
+{"jurisdictionST","jurisdictionStateOrProvinceName",
+ NID_jurisdictionStateOrProvinceName,11,&(lvalues[6232]),0},
+{"jurisdictionC","jurisdictionCountryName",
+ NID_jurisdictionCountryName,11,&(lvalues[6243]),0},
};
static const unsigned int sn_objs[NUM_SN]={
364, /* "AD_DVCS" */
419, /* "AES-128-CBC" */
916, /* "AES-128-CBC-HMAC-SHA1" */
+948, /* "AES-128-CBC-HMAC-SHA256" */
421, /* "AES-128-CFB" */
650, /* "AES-128-CFB1" */
653, /* "AES-128-CFB8" */
@@ -2414,6 +2530,7 @@ static const unsigned int sn_objs[NUM_SN]={
913, /* "AES-128-XTS" */
423, /* "AES-192-CBC" */
917, /* "AES-192-CBC-HMAC-SHA1" */
+949, /* "AES-192-CBC-HMAC-SHA256" */
425, /* "AES-192-CFB" */
651, /* "AES-192-CFB1" */
654, /* "AES-192-CFB8" */
@@ -2422,6 +2539,7 @@ static const unsigned int sn_objs[NUM_SN]={
424, /* "AES-192-OFB" */
427, /* "AES-256-CBC" */
918, /* "AES-256-CBC-HMAC-SHA1" */
+950, /* "AES-256-CBC-HMAC-SHA256" */
429, /* "AES-256-CFB" */
652, /* "AES-256-CFB1" */
655, /* "AES-256-CFB8" */
@@ -2537,6 +2655,7 @@ static const unsigned int sn_objs[NUM_SN]={
69, /* "PBKDF2" */
162, /* "PBMAC1" */
127, /* "PKIX" */
+935, /* "PSPECIFIED" */
98, /* "RC2-40-CBC" */
166, /* "RC2-64-CBC" */
37, /* "RC2-CBC" */
@@ -2613,6 +2732,20 @@ static const unsigned int sn_objs[NUM_SN]={
87, /* "basicConstraints" */
365, /* "basicOCSPResponse" */
285, /* "biometricInfo" */
+921, /* "brainpoolP160r1" */
+922, /* "brainpoolP160t1" */
+923, /* "brainpoolP192r1" */
+924, /* "brainpoolP192t1" */
+925, /* "brainpoolP224r1" */
+926, /* "brainpoolP224t1" */
+927, /* "brainpoolP256r1" */
+928, /* "brainpoolP256t1" */
+929, /* "brainpoolP320r1" */
+930, /* "brainpoolP320t1" */
+931, /* "brainpoolP384r1" */
+932, /* "brainpoolP384t1" */
+933, /* "brainpoolP512r1" */
+934, /* "brainpoolP512t1" */
494, /* "buildingName" */
860, /* "businessCategory" */
691, /* "c2onb191v4" */
@@ -2658,6 +2791,10 @@ static const unsigned int sn_objs[NUM_SN]={
884, /* "crossCertificatePair" */
806, /* "cryptocom" */
805, /* "cryptopro" */
+954, /* "ct_cert_scts" */
+952, /* "ct_precert_poison" */
+951, /* "ct_precert_scts" */
+953, /* "ct_precert_signer" */
500, /* "dITRedirect" */
451, /* "dNSDomain" */
495, /* "dSAQuality" */
@@ -2667,7 +2804,20 @@ static const unsigned int sn_objs[NUM_SN]={
891, /* "deltaRevocationList" */
107, /* "description" */
871, /* "destinationIndicator" */
+947, /* "dh-cofactor-kdf" */
+946, /* "dh-std-kdf" */
28, /* "dhKeyAgreement" */
+941, /* "dhSinglePass-cofactorDH-sha1kdf-scheme" */
+942, /* "dhSinglePass-cofactorDH-sha224kdf-scheme" */
+943, /* "dhSinglePass-cofactorDH-sha256kdf-scheme" */
+944, /* "dhSinglePass-cofactorDH-sha384kdf-scheme" */
+945, /* "dhSinglePass-cofactorDH-sha512kdf-scheme" */
+936, /* "dhSinglePass-stdDH-sha1kdf-scheme" */
+937, /* "dhSinglePass-stdDH-sha224kdf-scheme" */
+938, /* "dhSinglePass-stdDH-sha256kdf-scheme" */
+939, /* "dhSinglePass-stdDH-sha384kdf-scheme" */
+940, /* "dhSinglePass-stdDH-sha512kdf-scheme" */
+920, /* "dhpublicnumber" */
382, /* "directory" */
887, /* "distinguishedName" */
892, /* "dmdName" */
@@ -2978,6 +3128,9 @@ static const unsigned int sn_objs[NUM_SN]={
86, /* "issuerAltName" */
770, /* "issuingDistributionPoint" */
492, /* "janetMailbox" */
+957, /* "jurisdictionC" */
+955, /* "jurisdictionL" */
+956, /* "jurisdictionST" */
150, /* "keyBag" */
83, /* "keyUsage" */
477, /* "lastModifiedBy" */
@@ -3328,6 +3481,10 @@ static const unsigned int ln_objs[NUM_LN]={
285, /* "Biometric Info" */
179, /* "CA Issuers" */
785, /* "CA Repository" */
+954, /* "CT Certificate SCTs" */
+952, /* "CT Precertificate Poison" */
+951, /* "CT Precertificate SCTs" */
+953, /* "CT Precertificate Signer" */
131, /* "Code Signing" */
783, /* "Diffie-Hellman based MAC" */
382, /* "Directory" */
@@ -3451,6 +3608,7 @@ static const unsigned int ln_objs[NUM_LN]={
85, /* "X509v3 Subject Alternative Name" */
769, /* "X509v3 Subject Directory Attributes" */
82, /* "X509v3 Subject Key Identifier" */
+920, /* "X9.42 DH" */
184, /* "X9.57" */
185, /* "X9.57 CM ?" */
478, /* "aRecord" */
@@ -3463,6 +3621,7 @@ static const unsigned int ln_objs[NUM_LN]={
606, /* "additional verification" */
419, /* "aes-128-cbc" */
916, /* "aes-128-cbc-hmac-sha1" */
+948, /* "aes-128-cbc-hmac-sha256" */
896, /* "aes-128-ccm" */
421, /* "aes-128-cfb" */
650, /* "aes-128-cfb1" */
@@ -3474,6 +3633,7 @@ static const unsigned int ln_objs[NUM_LN]={
913, /* "aes-128-xts" */
423, /* "aes-192-cbc" */
917, /* "aes-192-cbc-hmac-sha1" */
+949, /* "aes-192-cbc-hmac-sha256" */
899, /* "aes-192-ccm" */
425, /* "aes-192-cfb" */
651, /* "aes-192-cfb1" */
@@ -3484,6 +3644,7 @@ static const unsigned int ln_objs[NUM_LN]={
424, /* "aes-192-ofb" */
427, /* "aes-256-cbc" */
918, /* "aes-256-cbc-hmac-sha1" */
+950, /* "aes-256-cbc-hmac-sha256" */
902, /* "aes-256-ccm" */
429, /* "aes-256-cfb" */
652, /* "aes-256-cfb1" */
@@ -3502,6 +3663,20 @@ static const unsigned int ln_objs[NUM_LN]={
93, /* "bf-cfb" */
92, /* "bf-ecb" */
94, /* "bf-ofb" */
+921, /* "brainpoolP160r1" */
+922, /* "brainpoolP160t1" */
+923, /* "brainpoolP192r1" */
+924, /* "brainpoolP192t1" */
+925, /* "brainpoolP224r1" */
+926, /* "brainpoolP224t1" */
+927, /* "brainpoolP256r1" */
+928, /* "brainpoolP256t1" */
+929, /* "brainpoolP320r1" */
+930, /* "brainpoolP320t1" */
+931, /* "brainpoolP384r1" */
+932, /* "brainpoolP384t1" */
+933, /* "brainpoolP512r1" */
+934, /* "brainpoolP512t1" */
494, /* "buildingName" */
860, /* "businessCategory" */
691, /* "c2onb191v4" */
@@ -3593,7 +3768,19 @@ static const unsigned int ln_objs[NUM_LN]={
107, /* "description" */
871, /* "destinationIndicator" */
80, /* "desx-cbc" */
+947, /* "dh-cofactor-kdf" */
+946, /* "dh-std-kdf" */
28, /* "dhKeyAgreement" */
+941, /* "dhSinglePass-cofactorDH-sha1kdf-scheme" */
+942, /* "dhSinglePass-cofactorDH-sha224kdf-scheme" */
+943, /* "dhSinglePass-cofactorDH-sha256kdf-scheme" */
+944, /* "dhSinglePass-cofactorDH-sha384kdf-scheme" */
+945, /* "dhSinglePass-cofactorDH-sha512kdf-scheme" */
+936, /* "dhSinglePass-stdDH-sha1kdf-scheme" */
+937, /* "dhSinglePass-stdDH-sha224kdf-scheme" */
+938, /* "dhSinglePass-stdDH-sha256kdf-scheme" */
+939, /* "dhSinglePass-stdDH-sha384kdf-scheme" */
+940, /* "dhSinglePass-stdDH-sha512kdf-scheme" */
11, /* "directory services (X.500)" */
378, /* "directory services - algorithms" */
887, /* "distinguishedName" */
@@ -3881,6 +4068,9 @@ static const unsigned int ln_objs[NUM_LN]={
645, /* "itu-t" */
492, /* "janetMailbox" */
646, /* "joint-iso-itu-t" */
+957, /* "jurisdictionCountryName" */
+955, /* "jurisdictionLocalityName" */
+956, /* "jurisdictionStateOrProvinceName" */
150, /* "keyBag" */
773, /* "kisa" */
477, /* "lastModifiedBy" */
@@ -3917,6 +4107,7 @@ static const unsigned int ln_objs[NUM_LN]={
18, /* "organizationalUnitName" */
475, /* "otherMailbox" */
876, /* "owner" */
+935, /* "pSpecified" */
489, /* "pagerTelephoneNumber" */
782, /* "password based MAC" */
374, /* "path" */
@@ -4560,6 +4751,14 @@ static const unsigned int obj_objs[NUM_OBJ]={
505, /* OBJ_mime_mhs_headings 1 3 6 1 7 1 1 */
506, /* OBJ_mime_mhs_bodies 1 3 6 1 7 1 2 */
119, /* OBJ_ripemd160WithRSA 1 3 36 3 3 1 2 */
+937, /* OBJ_dhSinglePass_stdDH_sha224kdf_scheme 1 3 132 1 11 0 */
+938, /* OBJ_dhSinglePass_stdDH_sha256kdf_scheme 1 3 132 1 11 1 */
+939, /* OBJ_dhSinglePass_stdDH_sha384kdf_scheme 1 3 132 1 11 2 */
+940, /* OBJ_dhSinglePass_stdDH_sha512kdf_scheme 1 3 132 1 11 3 */
+942, /* OBJ_dhSinglePass_cofactorDH_sha224kdf_scheme 1 3 132 1 14 0 */
+943, /* OBJ_dhSinglePass_cofactorDH_sha256kdf_scheme 1 3 132 1 14 1 */
+944, /* OBJ_dhSinglePass_cofactorDH_sha384kdf_scheme 1 3 132 1 14 2 */
+945, /* OBJ_dhSinglePass_cofactorDH_sha512kdf_scheme 1 3 132 1 14 3 */
631, /* OBJ_setAttr_GenCryptgrm 2 23 42 3 3 3 1 */
632, /* OBJ_setAttr_T2Enc 2 23 42 3 3 4 1 */
633, /* OBJ_setAttr_T2cleartxt 2 23 42 3 3 4 2 */
@@ -4608,6 +4807,7 @@ static const unsigned int obj_objs[NUM_OBJ]={
416, /* OBJ_ecdsa_with_SHA1 1 2 840 10045 4 1 */
791, /* OBJ_ecdsa_with_Recommended 1 2 840 10045 4 2 */
792, /* OBJ_ecdsa_with_Specified 1 2 840 10045 4 3 */
+920, /* OBJ_dhpublicnumber 1 2 840 10046 2 1 */
258, /* OBJ_id_pkix_mod 1 3 6 1 5 5 7 0 */
175, /* OBJ_id_pe 1 3 6 1 5 5 7 1 */
259, /* OBJ_id_qt 1 3 6 1 5 5 7 2 */
@@ -4825,6 +5025,7 @@ static const unsigned int obj_objs[NUM_OBJ]={
644, /* OBJ_rsaOAEPEncryptionSET 1 2 840 113549 1 1 6 */
919, /* OBJ_rsaesOaep 1 2 840 113549 1 1 7 */
911, /* OBJ_mgf1 1 2 840 113549 1 1 8 */
+935, /* OBJ_pSpecified 1 2 840 113549 1 1 9 */
912, /* OBJ_rsassaPss 1 2 840 113549 1 1 10 */
668, /* OBJ_sha256WithRSAEncryption 1 2 840 113549 1 1 11 */
669, /* OBJ_sha384WithRSAEncryption 1 2 840 113549 1 1 12 */
@@ -4886,6 +5087,22 @@ static const unsigned int obj_objs[NUM_OBJ]={
373, /* OBJ_id_pkix_OCSP_valid 1 3 6 1 5 5 7 48 1 9 */
374, /* OBJ_id_pkix_OCSP_path 1 3 6 1 5 5 7 48 1 10 */
375, /* OBJ_id_pkix_OCSP_trustRoot 1 3 6 1 5 5 7 48 1 11 */
+921, /* OBJ_brainpoolP160r1 1 3 36 3 3 2 8 1 1 1 */
+922, /* OBJ_brainpoolP160t1 1 3 36 3 3 2 8 1 1 2 */
+923, /* OBJ_brainpoolP192r1 1 3 36 3 3 2 8 1 1 3 */
+924, /* OBJ_brainpoolP192t1 1 3 36 3 3 2 8 1 1 4 */
+925, /* OBJ_brainpoolP224r1 1 3 36 3 3 2 8 1 1 5 */
+926, /* OBJ_brainpoolP224t1 1 3 36 3 3 2 8 1 1 6 */
+927, /* OBJ_brainpoolP256r1 1 3 36 3 3 2 8 1 1 7 */
+928, /* OBJ_brainpoolP256t1 1 3 36 3 3 2 8 1 1 8 */
+929, /* OBJ_brainpoolP320r1 1 3 36 3 3 2 8 1 1 9 */
+930, /* OBJ_brainpoolP320t1 1 3 36 3 3 2 8 1 1 10 */
+931, /* OBJ_brainpoolP384r1 1 3 36 3 3 2 8 1 1 11 */
+932, /* OBJ_brainpoolP384t1 1 3 36 3 3 2 8 1 1 12 */
+933, /* OBJ_brainpoolP512r1 1 3 36 3 3 2 8 1 1 13 */
+934, /* OBJ_brainpoolP512t1 1 3 36 3 3 2 8 1 1 14 */
+936, /* OBJ_dhSinglePass_stdDH_sha1kdf_scheme 1 3 133 16 840 63 0 2 */
+941, /* OBJ_dhSinglePass_cofactorDH_sha1kdf_scheme 1 3 133 16 840 63 0 3 */
418, /* OBJ_aes_128_ecb 2 16 840 1 101 3 4 1 1 */
419, /* OBJ_aes_128_cbc 2 16 840 1 101 3 4 1 2 */
420, /* OBJ_aes_128_ofb128 2 16 840 1 101 3 4 1 3 */
@@ -5013,6 +5230,10 @@ static const unsigned int obj_objs[NUM_OBJ]={
138, /* OBJ_ms_efs 1 3 6 1 4 1 311 10 3 4 */
648, /* OBJ_ms_smartcard_login 1 3 6 1 4 1 311 20 2 2 */
649, /* OBJ_ms_upn 1 3 6 1 4 1 311 20 2 3 */
+951, /* OBJ_ct_precert_scts 1 3 6 1 4 1 11129 2 4 2 */
+952, /* OBJ_ct_precert_poison 1 3 6 1 4 1 11129 2 4 3 */
+953, /* OBJ_ct_precert_signer 1 3 6 1 4 1 11129 2 4 4 */
+954, /* OBJ_ct_cert_scts 1 3 6 1 4 1 11129 2 4 5 */
751, /* OBJ_camellia_128_cbc 1 2 392 200011 61 1 1 1 2 */
752, /* OBJ_camellia_192_cbc 1 2 392 200011 61 1 1 1 3 */
753, /* OBJ_camellia_256_cbc 1 2 392 200011 61 1 1 1 4 */
@@ -5091,5 +5312,8 @@ static const unsigned int obj_objs[NUM_OBJ]={
154, /* OBJ_secretBag 1 2 840 113549 1 12 10 1 5 */
155, /* OBJ_safeContentsBag 1 2 840 113549 1 12 10 1 6 */
34, /* OBJ_idea_cbc 1 3 6 1 4 1 188 7 1 1 2 */
+955, /* OBJ_jurisdictionLocalityName 1 3 6 1 4 1 311 60 2 1 1 */
+956, /* OBJ_jurisdictionStateOrProvinceName 1 3 6 1 4 1 311 60 2 1 2 */
+957, /* OBJ_jurisdictionCountryName 1 3 6 1 4 1 311 60 2 1 3 */
};
diff --git a/crypto/objects/obj_mac.h b/crypto/objects/obj_mac.h
index f752aeff75b1..779c309b869b 100644
--- a/crypto/objects/obj_mac.h
+++ b/crypto/objects/obj_mac.h
@@ -590,6 +590,11 @@
#define NID_mgf1 911
#define OBJ_mgf1 OBJ_pkcs1,8L
+#define SN_pSpecified "PSPECIFIED"
+#define LN_pSpecified "pSpecified"
+#define NID_pSpecified 935
+#define OBJ_pSpecified OBJ_pkcs1,9L
+
#define SN_rsassaPss "RSASSA-PSS"
#define LN_rsassaPss "rsassaPss"
#define NID_rsassaPss 912
@@ -4029,3 +4034,161 @@
#define SN_aes_256_cbc_hmac_sha1 "AES-256-CBC-HMAC-SHA1"
#define LN_aes_256_cbc_hmac_sha1 "aes-256-cbc-hmac-sha1"
#define NID_aes_256_cbc_hmac_sha1 918
+
+#define SN_aes_128_cbc_hmac_sha256 "AES-128-CBC-HMAC-SHA256"
+#define LN_aes_128_cbc_hmac_sha256 "aes-128-cbc-hmac-sha256"
+#define NID_aes_128_cbc_hmac_sha256 948
+
+#define SN_aes_192_cbc_hmac_sha256 "AES-192-CBC-HMAC-SHA256"
+#define LN_aes_192_cbc_hmac_sha256 "aes-192-cbc-hmac-sha256"
+#define NID_aes_192_cbc_hmac_sha256 949
+
+#define SN_aes_256_cbc_hmac_sha256 "AES-256-CBC-HMAC-SHA256"
+#define LN_aes_256_cbc_hmac_sha256 "aes-256-cbc-hmac-sha256"
+#define NID_aes_256_cbc_hmac_sha256 950
+
+#define SN_dhpublicnumber "dhpublicnumber"
+#define LN_dhpublicnumber "X9.42 DH"
+#define NID_dhpublicnumber 920
+#define OBJ_dhpublicnumber OBJ_ISO_US,10046L,2L,1L
+
+#define SN_brainpoolP160r1 "brainpoolP160r1"
+#define NID_brainpoolP160r1 921
+#define OBJ_brainpoolP160r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,1L
+
+#define SN_brainpoolP160t1 "brainpoolP160t1"
+#define NID_brainpoolP160t1 922
+#define OBJ_brainpoolP160t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,2L
+
+#define SN_brainpoolP192r1 "brainpoolP192r1"
+#define NID_brainpoolP192r1 923
+#define OBJ_brainpoolP192r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,3L
+
+#define SN_brainpoolP192t1 "brainpoolP192t1"
+#define NID_brainpoolP192t1 924
+#define OBJ_brainpoolP192t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,4L
+
+#define SN_brainpoolP224r1 "brainpoolP224r1"
+#define NID_brainpoolP224r1 925
+#define OBJ_brainpoolP224r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,5L
+
+#define SN_brainpoolP224t1 "brainpoolP224t1"
+#define NID_brainpoolP224t1 926
+#define OBJ_brainpoolP224t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,6L
+
+#define SN_brainpoolP256r1 "brainpoolP256r1"
+#define NID_brainpoolP256r1 927
+#define OBJ_brainpoolP256r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,7L
+
+#define SN_brainpoolP256t1 "brainpoolP256t1"
+#define NID_brainpoolP256t1 928
+#define OBJ_brainpoolP256t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,8L
+
+#define SN_brainpoolP320r1 "brainpoolP320r1"
+#define NID_brainpoolP320r1 929
+#define OBJ_brainpoolP320r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,9L
+
+#define SN_brainpoolP320t1 "brainpoolP320t1"
+#define NID_brainpoolP320t1 930
+#define OBJ_brainpoolP320t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,10L
+
+#define SN_brainpoolP384r1 "brainpoolP384r1"
+#define NID_brainpoolP384r1 931
+#define OBJ_brainpoolP384r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,11L
+
+#define SN_brainpoolP384t1 "brainpoolP384t1"
+#define NID_brainpoolP384t1 932
+#define OBJ_brainpoolP384t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,12L
+
+#define SN_brainpoolP512r1 "brainpoolP512r1"
+#define NID_brainpoolP512r1 933
+#define OBJ_brainpoolP512r1 1L,3L,36L,3L,3L,2L,8L,1L,1L,13L
+
+#define SN_brainpoolP512t1 "brainpoolP512t1"
+#define NID_brainpoolP512t1 934
+#define OBJ_brainpoolP512t1 1L,3L,36L,3L,3L,2L,8L,1L,1L,14L
+
+#define OBJ_x9_63_scheme 1L,3L,133L,16L,840L,63L,0L
+
+#define OBJ_secg_scheme OBJ_certicom_arc,1L
+
+#define SN_dhSinglePass_stdDH_sha1kdf_scheme "dhSinglePass-stdDH-sha1kdf-scheme"
+#define NID_dhSinglePass_stdDH_sha1kdf_scheme 936
+#define OBJ_dhSinglePass_stdDH_sha1kdf_scheme OBJ_x9_63_scheme,2L
+
+#define SN_dhSinglePass_stdDH_sha224kdf_scheme "dhSinglePass-stdDH-sha224kdf-scheme"
+#define NID_dhSinglePass_stdDH_sha224kdf_scheme 937
+#define OBJ_dhSinglePass_stdDH_sha224kdf_scheme OBJ_secg_scheme,11L,0L
+
+#define SN_dhSinglePass_stdDH_sha256kdf_scheme "dhSinglePass-stdDH-sha256kdf-scheme"
+#define NID_dhSinglePass_stdDH_sha256kdf_scheme 938
+#define OBJ_dhSinglePass_stdDH_sha256kdf_scheme OBJ_secg_scheme,11L,1L
+
+#define SN_dhSinglePass_stdDH_sha384kdf_scheme "dhSinglePass-stdDH-sha384kdf-scheme"
+#define NID_dhSinglePass_stdDH_sha384kdf_scheme 939
+#define OBJ_dhSinglePass_stdDH_sha384kdf_scheme OBJ_secg_scheme,11L,2L
+
+#define SN_dhSinglePass_stdDH_sha512kdf_scheme "dhSinglePass-stdDH-sha512kdf-scheme"
+#define NID_dhSinglePass_stdDH_sha512kdf_scheme 940
+#define OBJ_dhSinglePass_stdDH_sha512kdf_scheme OBJ_secg_scheme,11L,3L
+
+#define SN_dhSinglePass_cofactorDH_sha1kdf_scheme "dhSinglePass-cofactorDH-sha1kdf-scheme"
+#define NID_dhSinglePass_cofactorDH_sha1kdf_scheme 941
+#define OBJ_dhSinglePass_cofactorDH_sha1kdf_scheme OBJ_x9_63_scheme,3L
+
+#define SN_dhSinglePass_cofactorDH_sha224kdf_scheme "dhSinglePass-cofactorDH-sha224kdf-scheme"
+#define NID_dhSinglePass_cofactorDH_sha224kdf_scheme 942
+#define OBJ_dhSinglePass_cofactorDH_sha224kdf_scheme OBJ_secg_scheme,14L,0L
+
+#define SN_dhSinglePass_cofactorDH_sha256kdf_scheme "dhSinglePass-cofactorDH-sha256kdf-scheme"
+#define NID_dhSinglePass_cofactorDH_sha256kdf_scheme 943
+#define OBJ_dhSinglePass_cofactorDH_sha256kdf_scheme OBJ_secg_scheme,14L,1L
+
+#define SN_dhSinglePass_cofactorDH_sha384kdf_scheme "dhSinglePass-cofactorDH-sha384kdf-scheme"
+#define NID_dhSinglePass_cofactorDH_sha384kdf_scheme 944
+#define OBJ_dhSinglePass_cofactorDH_sha384kdf_scheme OBJ_secg_scheme,14L,2L
+
+#define SN_dhSinglePass_cofactorDH_sha512kdf_scheme "dhSinglePass-cofactorDH-sha512kdf-scheme"
+#define NID_dhSinglePass_cofactorDH_sha512kdf_scheme 945
+#define OBJ_dhSinglePass_cofactorDH_sha512kdf_scheme OBJ_secg_scheme,14L,3L
+
+#define SN_dh_std_kdf "dh-std-kdf"
+#define NID_dh_std_kdf 946
+
+#define SN_dh_cofactor_kdf "dh-cofactor-kdf"
+#define NID_dh_cofactor_kdf 947
+
+#define SN_ct_precert_scts "ct_precert_scts"
+#define LN_ct_precert_scts "CT Precertificate SCTs"
+#define NID_ct_precert_scts 951
+#define OBJ_ct_precert_scts 1L,3L,6L,1L,4L,1L,11129L,2L,4L,2L
+
+#define SN_ct_precert_poison "ct_precert_poison"
+#define LN_ct_precert_poison "CT Precertificate Poison"
+#define NID_ct_precert_poison 952
+#define OBJ_ct_precert_poison 1L,3L,6L,1L,4L,1L,11129L,2L,4L,3L
+
+#define SN_ct_precert_signer "ct_precert_signer"
+#define LN_ct_precert_signer "CT Precertificate Signer"
+#define NID_ct_precert_signer 953
+#define OBJ_ct_precert_signer 1L,3L,6L,1L,4L,1L,11129L,2L,4L,4L
+
+#define SN_ct_cert_scts "ct_cert_scts"
+#define LN_ct_cert_scts "CT Certificate SCTs"
+#define NID_ct_cert_scts 954
+#define OBJ_ct_cert_scts 1L,3L,6L,1L,4L,1L,11129L,2L,4L,5L
+
+#define SN_jurisdictionLocalityName "jurisdictionL"
+#define LN_jurisdictionLocalityName "jurisdictionLocalityName"
+#define NID_jurisdictionLocalityName 955
+#define OBJ_jurisdictionLocalityName 1L,3L,6L,1L,4L,1L,311L,60L,2L,1L,1L
+
+#define SN_jurisdictionStateOrProvinceName "jurisdictionST"
+#define LN_jurisdictionStateOrProvinceName "jurisdictionStateOrProvinceName"
+#define NID_jurisdictionStateOrProvinceName 956
+#define OBJ_jurisdictionStateOrProvinceName 1L,3L,6L,1L,4L,1L,311L,60L,2L,1L,2L
+
+#define SN_jurisdictionCountryName "jurisdictionC"
+#define LN_jurisdictionCountryName "jurisdictionCountryName"
+#define NID_jurisdictionCountryName 957
+#define OBJ_jurisdictionCountryName 1L,3L,6L,1L,4L,1L,311L,60L,2L,1L,3L
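
Once the generated tables are rebuilt, the object identifiers added above resolve through the usual objects API. A minimal sketch (the lookups and expected NID values come straight from the definitions above; the program itself is illustrative only):

    #include <stdio.h>
    #include <openssl/objects.h>

    int main(void)
    {
        /* Short names map to the new NIDs via the object table. */
        printf("brainpoolP512r1 -> NID %d\n",
               OBJ_sn2nid("brainpoolP512r1"));          /* expect 933 */

        /* Numeric OIDs resolve the same way. */
        printf("CT poison       -> NID %d\n",
               OBJ_txt2nid("1.3.6.1.4.1.11129.2.4.3")); /* expect 952 */
        return 0;
    }
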
diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num
index 1d0a7c802daf..8e5ea8336331 100644
--- a/crypto/objects/obj_mac.num
+++ b/crypto/objects/obj_mac.num
@@ -917,3 +917,41 @@ aes_128_cbc_hmac_sha1 916
aes_192_cbc_hmac_sha1 917
aes_256_cbc_hmac_sha1 918
rsaesOaep 919
+dhpublicnumber 920
+brainpoolP160r1 921
+brainpoolP160t1 922
+brainpoolP192r1 923
+brainpoolP192t1 924
+brainpoolP224r1 925
+brainpoolP224t1 926
+brainpoolP256r1 927
+brainpoolP256t1 928
+brainpoolP320r1 929
+brainpoolP320t1 930
+brainpoolP384r1 931
+brainpoolP384t1 932
+brainpoolP512r1 933
+brainpoolP512t1 934
+pSpecified 935
+dhSinglePass_stdDH_sha1kdf_scheme 936
+dhSinglePass_stdDH_sha224kdf_scheme 937
+dhSinglePass_stdDH_sha256kdf_scheme 938
+dhSinglePass_stdDH_sha384kdf_scheme 939
+dhSinglePass_stdDH_sha512kdf_scheme 940
+dhSinglePass_cofactorDH_sha1kdf_scheme 941
+dhSinglePass_cofactorDH_sha224kdf_scheme 942
+dhSinglePass_cofactorDH_sha256kdf_scheme 943
+dhSinglePass_cofactorDH_sha384kdf_scheme 944
+dhSinglePass_cofactorDH_sha512kdf_scheme 945
+dh_std_kdf 946
+dh_cofactor_kdf 947
+aes_128_cbc_hmac_sha256 948
+aes_192_cbc_hmac_sha256 949
+aes_256_cbc_hmac_sha256 950
+ct_precert_scts 951
+ct_precert_poison 952
+ct_precert_signer 953
+ct_cert_scts 954
+jurisdictionLocalityName 955
+jurisdictionStateOrProvinceName 956
+jurisdictionCountryName 957
diff --git a/crypto/objects/obj_xref.h b/crypto/objects/obj_xref.h
index b8f7d341f787..e453e99f8336 100644
--- a/crypto/objects/obj_xref.h
+++ b/crypto/objects/obj_xref.h
@@ -41,6 +41,21 @@ static const nid_triple sigoid_srt[] = {
{NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94,
NID_id_GostR3410_2001_cc},
{NID_rsassaPss, NID_undef, NID_rsaEncryption},
+ {NID_dhSinglePass_stdDH_sha1kdf_scheme, NID_sha1, NID_dh_std_kdf},
+ {NID_dhSinglePass_stdDH_sha224kdf_scheme, NID_sha224, NID_dh_std_kdf},
+ {NID_dhSinglePass_stdDH_sha256kdf_scheme, NID_sha256, NID_dh_std_kdf},
+ {NID_dhSinglePass_stdDH_sha384kdf_scheme, NID_sha384, NID_dh_std_kdf},
+ {NID_dhSinglePass_stdDH_sha512kdf_scheme, NID_sha512, NID_dh_std_kdf},
+ {NID_dhSinglePass_cofactorDH_sha1kdf_scheme, NID_sha1,
+ NID_dh_cofactor_kdf},
+ {NID_dhSinglePass_cofactorDH_sha224kdf_scheme, NID_sha224,
+ NID_dh_cofactor_kdf},
+ {NID_dhSinglePass_cofactorDH_sha256kdf_scheme, NID_sha256,
+ NID_dh_cofactor_kdf},
+ {NID_dhSinglePass_cofactorDH_sha384kdf_scheme, NID_sha384,
+ NID_dh_cofactor_kdf},
+ {NID_dhSinglePass_cofactorDH_sha512kdf_scheme, NID_sha512,
+ NID_dh_cofactor_kdf},
};
static const nid_triple *const sigoid_srt_xref[] = {
@@ -54,19 +69,29 @@ static const nid_triple *const sigoid_srt_xref[] = {
&sigoid_srt[5],
&sigoid_srt[8],
&sigoid_srt[12],
+ &sigoid_srt[30],
+ &sigoid_srt[35],
&sigoid_srt[6],
&sigoid_srt[10],
&sigoid_srt[11],
&sigoid_srt[13],
&sigoid_srt[24],
&sigoid_srt[20],
+ &sigoid_srt[32],
+ &sigoid_srt[37],
&sigoid_srt[14],
&sigoid_srt[21],
+ &sigoid_srt[33],
+ &sigoid_srt[38],
&sigoid_srt[15],
&sigoid_srt[22],
+ &sigoid_srt[34],
+ &sigoid_srt[39],
&sigoid_srt[16],
&sigoid_srt[23],
&sigoid_srt[19],
+ &sigoid_srt[31],
+ &sigoid_srt[36],
&sigoid_srt[25],
&sigoid_srt[26],
&sigoid_srt[27],
diff --git a/crypto/objects/obj_xref.txt b/crypto/objects/obj_xref.txt
index cb917182ee2f..19c94226b20c 100644
--- a/crypto/objects/obj_xref.txt
+++ b/crypto/objects/obj_xref.txt
@@ -44,3 +44,15 @@ id_GostR3411_94_with_GostR3410_2001 id_GostR3411_94 id_GostR3410_2001
id_GostR3411_94_with_GostR3410_94 id_GostR3411_94 id_GostR3410_94
id_GostR3411_94_with_GostR3410_94_cc id_GostR3411_94 id_GostR3410_94_cc
id_GostR3411_94_with_GostR3410_2001_cc id_GostR3411_94 id_GostR3410_2001_cc
+# ECDH KDFs and their corresponding message digests and schemes
+dhSinglePass_stdDH_sha1kdf_scheme sha1 dh_std_kdf
+dhSinglePass_stdDH_sha224kdf_scheme sha224 dh_std_kdf
+dhSinglePass_stdDH_sha256kdf_scheme sha256 dh_std_kdf
+dhSinglePass_stdDH_sha384kdf_scheme sha384 dh_std_kdf
+dhSinglePass_stdDH_sha512kdf_scheme sha512 dh_std_kdf
+
+dhSinglePass_cofactorDH_sha1kdf_scheme sha1 dh_cofactor_kdf
+dhSinglePass_cofactorDH_sha224kdf_scheme sha224 dh_cofactor_kdf
+dhSinglePass_cofactorDH_sha256kdf_scheme sha256 dh_cofactor_kdf
+dhSinglePass_cofactorDH_sha384kdf_scheme sha384 dh_cofactor_kdf
+dhSinglePass_cofactorDH_sha512kdf_scheme sha512 dh_cofactor_kdf
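
The cross-reference rows added above are consumed through OBJ_find_sigid_algs(), which splits a combined algorithm NID into its digest and "pkey" components; for these ECDH schemes the second component is one of the two new KDF NIDs. A hedged sketch of the lookup:

    #include <stdio.h>
    #include <openssl/objects.h>

    int main(void)
    {
        int md_nid, kdf_nid;

        /* Look up the digest/KDF pair for a single-pass ECDH scheme. */
        if (OBJ_find_sigid_algs(NID_dhSinglePass_stdDH_sha256kdf_scheme,
                                &md_nid, &kdf_nid)) {
            printf("digest=%s kdf=%s\n",
                   OBJ_nid2sn(md_nid), OBJ_nid2sn(kdf_nid));
            /* expected: digest=SHA256 kdf=dh-std-kdf */
        }
        return 0;
    }
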
diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt
index d3bfad72a2ac..b57aabb22650 100644
--- a/crypto/objects/objects.txt
+++ b/crypto/objects/objects.txt
@@ -168,6 +168,7 @@ pkcs1 5 : RSA-SHA1 : sha1WithRSAEncryption
# According to PKCS #1 version 2.1
pkcs1 7 : RSAES-OAEP : rsaesOaep
pkcs1 8 : MGF1 : mgf1
+pkcs1 9 : PSPECIFIED : pSpecified
pkcs1 10 : RSASSA-PSS : rsassaPss
pkcs1 11 : RSA-SHA256 : sha256WithRSAEncryption
@@ -1290,3 +1291,60 @@ kisa 1 6 : SEED-OFB : seed-ofb
: AES-128-CBC-HMAC-SHA1 : aes-128-cbc-hmac-sha1
: AES-192-CBC-HMAC-SHA1 : aes-192-cbc-hmac-sha1
: AES-256-CBC-HMAC-SHA1 : aes-256-cbc-hmac-sha1
+ : AES-128-CBC-HMAC-SHA256 : aes-128-cbc-hmac-sha256
+ : AES-192-CBC-HMAC-SHA256 : aes-192-cbc-hmac-sha256
+ : AES-256-CBC-HMAC-SHA256 : aes-256-cbc-hmac-sha256
+
+ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH
+
+# RFC 5639 curve OIDs (see http://www.ietf.org/rfc/rfc5639.txt)
+# versionOne OBJECT IDENTIFIER ::= {
+# iso(1) identified-organization(3) teletrust(36) algorithm(3)
+# signature-algorithm(3) ecSign(2) ecStdCurvesAndGeneration(8)
+# ellipticCurve(1) 1 }
+1 3 36 3 3 2 8 1 1 1 : brainpoolP160r1
+1 3 36 3 3 2 8 1 1 2 : brainpoolP160t1
+1 3 36 3 3 2 8 1 1 3 : brainpoolP192r1
+1 3 36 3 3 2 8 1 1 4 : brainpoolP192t1
+1 3 36 3 3 2 8 1 1 5 : brainpoolP224r1
+1 3 36 3 3 2 8 1 1 6 : brainpoolP224t1
+1 3 36 3 3 2 8 1 1 7 : brainpoolP256r1
+1 3 36 3 3 2 8 1 1 8 : brainpoolP256t1
+1 3 36 3 3 2 8 1 1 9 : brainpoolP320r1
+1 3 36 3 3 2 8 1 1 10 : brainpoolP320t1
+1 3 36 3 3 2 8 1 1 11 : brainpoolP384r1
+1 3 36 3 3 2 8 1 1 12 : brainpoolP384t1
+1 3 36 3 3 2 8 1 1 13 : brainpoolP512r1
+1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1
+
+# ECDH schemes from RFC5753
+!Alias x9-63-scheme 1 3 133 16 840 63 0
+!Alias secg-scheme certicom-arc 1
+
+x9-63-scheme 2 : dhSinglePass-stdDH-sha1kdf-scheme
+secg-scheme 11 0 : dhSinglePass-stdDH-sha224kdf-scheme
+secg-scheme 11 1 : dhSinglePass-stdDH-sha256kdf-scheme
+secg-scheme 11 2 : dhSinglePass-stdDH-sha384kdf-scheme
+secg-scheme 11 3 : dhSinglePass-stdDH-sha512kdf-scheme
+
+x9-63-scheme 3 : dhSinglePass-cofactorDH-sha1kdf-scheme
+secg-scheme 14 0 : dhSinglePass-cofactorDH-sha224kdf-scheme
+secg-scheme 14 1 : dhSinglePass-cofactorDH-sha256kdf-scheme
+secg-scheme 14 2 : dhSinglePass-cofactorDH-sha384kdf-scheme
+secg-scheme 14 3 : dhSinglePass-cofactorDH-sha512kdf-scheme
+# NIDs for use with lookup tables.
+ : dh-std-kdf
+ : dh-cofactor-kdf
+
+# RFC 6962 Extension OIDs (see http://www.ietf.org/rfc/rfc6962.txt)
+1 3 6 1 4 1 11129 2 4 2 : ct_precert_scts : CT Precertificate SCTs
+1 3 6 1 4 1 11129 2 4 3 : ct_precert_poison : CT Precertificate Poison
+1 3 6 1 4 1 11129 2 4 4 : ct_precert_signer : CT Precertificate Signer
+1 3 6 1 4 1 11129 2 4 5 : ct_cert_scts : CT Certificate SCTs
+
+# CABForum EV SSL Certificate Guidelines
+# (see https://cabforum.org/extended-validation/)
+# OIDs for Subject Jurisdiction of Incorporation or Registration
+1 3 6 1 4 1 311 60 2 1 1 : jurisdictionL : jurisdictionLocalityName
+1 3 6 1 4 1 311 60 2 1 2 : jurisdictionST : jurisdictionStateOrProvinceName
+1 3 6 1 4 1 311 60 2 1 3 : jurisdictionC : jurisdictionCountryName
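
With these entries regenerated into obj_mac.h, the CT and EV-jurisdiction extensions can be located in a certificate by NID instead of raw OID. A small sketch (the helper name is hypothetical; error handling elided, `cert` assumed to be a parsed X509):

    #include <openssl/x509.h>
    #include <openssl/objects.h>

    /* Returns 1 if the certificate carries the RFC 6962 poison extension,
     * i.e. it is a precertificate. */
    static int is_precert(X509 *cert)
    {
        return X509_get_ext_by_NID(cert, NID_ct_precert_poison, -1) >= 0;
    }
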
diff --git a/crypto/objects/objxref.pl b/crypto/objects/objxref.pl
index 35c06514b932..1913b9d133c2 100755
--- a/crypto/objects/objxref.pl
+++ b/crypto/objects/objxref.pl
@@ -39,7 +39,8 @@ my @xrkeys = keys %xref_tbl;
my @srt1 = sort { $oid_tbl{$a} <=> $oid_tbl{$b}} @xrkeys;
-for(my $i = 0; $i <= $#srt1; $i++)
+my $i;
+for($i = 0; $i <= $#srt1; $i++)
{
$xref_tbl{$srt1[$i]}[2] = $i;
}
diff --git a/crypto/ocsp/ocsp.h b/crypto/ocsp/ocsp.h
index 25ef01956be2..ca2ee76dce47 100644
--- a/crypto/ocsp/ocsp.h
+++ b/crypto/ocsp/ocsp.h
@@ -394,11 +394,22 @@ typedef struct ocsp_service_locator_st {
OCSP_CERTID *OCSP_CERTID_dup(OCSP_CERTID *id);
-OCSP_RESPONSE *OCSP_sendreq_bio(BIO *b, char *path, OCSP_REQUEST *req);
-OCSP_REQ_CTX *OCSP_sendreq_new(BIO *io, char *path, OCSP_REQUEST *req,
+OCSP_RESPONSE *OCSP_sendreq_bio(BIO *b, const char *path, OCSP_REQUEST *req);
+OCSP_REQ_CTX *OCSP_sendreq_new(BIO *io, const char *path, OCSP_REQUEST *req,
int maxline);
+int OCSP_REQ_CTX_nbio(OCSP_REQ_CTX *rctx);
int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx);
+OCSP_REQ_CTX *OCSP_REQ_CTX_new(BIO *io, int maxline);
void OCSP_REQ_CTX_free(OCSP_REQ_CTX *rctx);
+void OCSP_set_max_response_length(OCSP_REQ_CTX *rctx, unsigned long len);
+int OCSP_REQ_CTX_i2d(OCSP_REQ_CTX *rctx, const ASN1_ITEM *it,
+ ASN1_VALUE *val);
+int OCSP_REQ_CTX_nbio_d2i(OCSP_REQ_CTX *rctx, ASN1_VALUE **pval,
+ const ASN1_ITEM *it);
+BIO *OCSP_REQ_CTX_get0_mem_bio(OCSP_REQ_CTX *rctx);
+int OCSP_REQ_CTX_http(OCSP_REQ_CTX *rctx, const char *op, const char *path);
int OCSP_REQ_CTX_set1_req(OCSP_REQ_CTX *rctx, OCSP_REQUEST *req);
int OCSP_REQ_CTX_add1_header(OCSP_REQ_CTX *rctx,
const char *name, const char *value);
@@ -447,7 +458,7 @@ int OCSP_check_validity(ASN1_GENERALIZEDTIME *thisupd,
int OCSP_request_verify(OCSP_REQUEST *req, STACK_OF(X509) *certs,
X509_STORE *store, unsigned long flags);
-int OCSP_parse_url(char *url, char **phost, char **pport, char **ppath,
+int OCSP_parse_url(const char *url, char **phost, char **pport, char **ppath,
int *pssl);
int OCSP_id_issuer_cmp(OCSP_CERTID *a, OCSP_CERTID *b);
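
Taken together, the declarations above split request construction out of OCSP_sendreq_new(): a context is created, an HTTP request line and headers are written, and the DER body is attached. A minimal sketch of the new sequence (host and path values are placeholders; the helper name is not part of the patch):

    #include <openssl/ocsp.h>

    /* Build a POST request for 'req' on an already-connected BIO. */
    static OCSP_REQ_CTX *build_request(BIO *cbio, OCSP_REQUEST *req)
    {
        OCSP_REQ_CTX *rctx = OCSP_REQ_CTX_new(cbio, 0); /* 0 = default line length */
        if (rctx == NULL)
            return NULL;
        if (!OCSP_REQ_CTX_http(rctx, "POST", "/ocsp")
            || !OCSP_REQ_CTX_add1_header(rctx, "Host", "ocsp.example.com")
            || !OCSP_REQ_CTX_set1_req(rctx, req)) {
            OCSP_REQ_CTX_free(rctx);
            return NULL;
        }
        return rctx;
    }
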
diff --git a/crypto/ocsp/ocsp_ht.c b/crypto/ocsp/ocsp_ht.c
index 970fea4a70a8..88b26b38e8d9 100644
--- a/crypto/ocsp/ocsp_ht.c
+++ b/crypto/ocsp/ocsp_ht.c
@@ -81,9 +81,10 @@ struct ocsp_req_ctx_st {
BIO *io; /* BIO to perform I/O with */
BIO *mem; /* Memory BIO response is built into */
unsigned long asn1_len; /* ASN1 length of response */
+ unsigned long max_resp_len; /* Maximum length of response */
};
-#define OCSP_MAX_REQUEST_LENGTH (100 * 1024)
+#define OCSP_MAX_RESP_LENGTH (100 * 1024)
#define OCSP_MAX_LINE_LEN 4096
/* OCSP states */
@@ -100,15 +101,42 @@ struct ocsp_req_ctx_st {
#define OHS_ASN1_HEADER 3
/* OCSP content octets being read */
#define OHS_ASN1_CONTENT 4
+/* First call: ready to start I/O */
+#define OHS_ASN1_WRITE_INIT (5 | OHS_NOREAD)
/* Request being sent */
#define OHS_ASN1_WRITE (6 | OHS_NOREAD)
/* Request being flushed */
#define OHS_ASN1_FLUSH (7 | OHS_NOREAD)
/* Completed */
#define OHS_DONE (8 | OHS_NOREAD)
+/* Headers set, no final \r\n included */
+#define OHS_HTTP_HEADER (9 | OHS_NOREAD)
static int parse_http_line1(char *line);
+OCSP_REQ_CTX *OCSP_REQ_CTX_new(BIO *io, int maxline)
+{
+ OCSP_REQ_CTX *rctx;
+ rctx = OPENSSL_malloc(sizeof(OCSP_REQ_CTX));
+ if (!rctx)
+ return NULL;
+ rctx->state = OHS_ERROR;
+ rctx->max_resp_len = OCSP_MAX_RESP_LENGTH;
+ rctx->mem = BIO_new(BIO_s_mem());
+ rctx->io = io;
+ rctx->asn1_len = 0;
+ if (maxline > 0)
+ rctx->iobuflen = maxline;
+ else
+ rctx->iobuflen = OCSP_MAX_LINE_LEN;
+ rctx->iobuf = OPENSSL_malloc(rctx->iobuflen);
+ if (!rctx->iobuf || !rctx->mem) {
+ OCSP_REQ_CTX_free(rctx);
+ return NULL;
+ }
+ return rctx;
+}
+
void OCSP_REQ_CTX_free(OCSP_REQ_CTX *rctx)
{
if (rctx->mem)
@@ -118,20 +146,71 @@ void OCSP_REQ_CTX_free(OCSP_REQ_CTX *rctx)
OPENSSL_free(rctx);
}
-int OCSP_REQ_CTX_set1_req(OCSP_REQ_CTX *rctx, OCSP_REQUEST *req)
+BIO *OCSP_REQ_CTX_get0_mem_bio(OCSP_REQ_CTX *rctx)
+{
+ return rctx->mem;
+}
+
+void OCSP_set_max_response_length(OCSP_REQ_CTX *rctx, unsigned long len)
+{
+ if (len == 0)
+ rctx->max_resp_len = OCSP_MAX_RESP_LENGTH;
+ else
+ rctx->max_resp_len = len;
+}
+
+int OCSP_REQ_CTX_i2d(OCSP_REQ_CTX *rctx, const ASN1_ITEM *it, ASN1_VALUE *val)
{
static const char req_hdr[] =
"Content-Type: application/ocsp-request\r\n"
"Content-Length: %d\r\n\r\n";
- if (BIO_printf(rctx->mem, req_hdr, i2d_OCSP_REQUEST(req, NULL)) <= 0)
+ int reqlen = ASN1_item_i2d(val, NULL, it);
+ if (BIO_printf(rctx->mem, req_hdr, reqlen) <= 0)
+ return 0;
+ if (ASN1_item_i2d_bio(it, rctx->mem, val) <= 0)
+ return 0;
+ rctx->state = OHS_ASN1_WRITE_INIT;
+ return 1;
+}
+
+int OCSP_REQ_CTX_nbio_d2i(OCSP_REQ_CTX *rctx,
+ ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ int rv, len;
+ const unsigned char *p;
+
+ rv = OCSP_REQ_CTX_nbio(rctx);
+ if (rv != 1)
+ return rv;
+
+ len = BIO_get_mem_data(rctx->mem, &p);
+ *pval = ASN1_item_d2i(NULL, &p, len, it);
+ if (*pval == NULL) {
+ rctx->state = OHS_ERROR;
return 0;
- if (i2d_OCSP_REQUEST_bio(rctx->mem, req) <= 0)
+ }
+ return 1;
+}
+
+int OCSP_REQ_CTX_http(OCSP_REQ_CTX *rctx, const char *op, const char *path)
+{
+ static const char http_hdr[] = "%s %s HTTP/1.0\r\n";
+
+ if (!path)
+ path = "/";
+
+ if (BIO_printf(rctx->mem, http_hdr, op, path) <= 0)
return 0;
- rctx->state = OHS_ASN1_WRITE;
- rctx->asn1_len = BIO_get_mem_data(rctx->mem, NULL);
+ rctx->state = OHS_HTTP_HEADER;
return 1;
}
+int OCSP_REQ_CTX_set1_req(OCSP_REQ_CTX *rctx, OCSP_REQUEST *req)
+{
+ return OCSP_REQ_CTX_i2d(rctx, ASN1_ITEM_rptr(OCSP_REQUEST),
+ (ASN1_VALUE *)req);
+}
+
int OCSP_REQ_CTX_add1_header(OCSP_REQ_CTX *rctx,
const char *name, const char *value)
{
@@ -147,39 +226,27 @@ int OCSP_REQ_CTX_add1_header(OCSP_REQ_CTX *rctx,
}
if (BIO_write(rctx->mem, "\r\n", 2) != 2)
return 0;
+ rctx->state = OHS_HTTP_HEADER;
return 1;
}
-OCSP_REQ_CTX *OCSP_sendreq_new(BIO *io, char *path, OCSP_REQUEST *req,
+OCSP_REQ_CTX *OCSP_sendreq_new(BIO *io, const char *path, OCSP_REQUEST *req,
int maxline)
{
- static const char post_hdr[] = "POST %s HTTP/1.0\r\n";
- OCSP_REQ_CTX *rctx;
- rctx = OPENSSL_malloc(sizeof(OCSP_REQ_CTX));
+ OCSP_REQ_CTX *rctx = NULL;
+ rctx = OCSP_REQ_CTX_new(io, maxline);
if (!rctx)
return NULL;
- rctx->state = OHS_ERROR;
- rctx->mem = BIO_new(BIO_s_mem());
- rctx->io = io;
- rctx->asn1_len = 0;
- if (maxline > 0)
- rctx->iobuflen = maxline;
- else
- rctx->iobuflen = OCSP_MAX_LINE_LEN;
- rctx->iobuf = OPENSSL_malloc(rctx->iobuflen);
- if (!rctx->mem || !rctx->iobuf)
- goto err;
- if (!path)
- path = "/";
- if (BIO_printf(rctx->mem, post_hdr, path) <= 0)
+ if (!OCSP_REQ_CTX_http(rctx, "POST", path))
goto err;
if (req && !OCSP_REQ_CTX_set1_req(rctx, req))
goto err;
return rctx;
+
err:
OCSP_REQ_CTX_free(rctx);
return NULL;
@@ -256,7 +323,7 @@ static int parse_http_line1(char *line)
}
-int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx)
+int OCSP_REQ_CTX_nbio(OCSP_REQ_CTX *rctx)
{
int i, n;
const unsigned char *p;
@@ -277,6 +344,17 @@ int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx)
}
switch (rctx->state) {
+ case OHS_HTTP_HEADER:
+ /* Last operation was adding headers: need a final \r\n */
+ if (BIO_write(rctx->mem, "\r\n", 2) != 2) {
+ rctx->state = OHS_ERROR;
+ return 0;
+ }
+ rctx->state = OHS_ASN1_WRITE_INIT;
+
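+        /* Fall thru */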
+ case OHS_ASN1_WRITE_INIT:
+ rctx->asn1_len = BIO_get_mem_data(rctx->mem, NULL);
+ rctx->state = OHS_ASN1_WRITE;
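+        /* Fall thru */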
case OHS_ASN1_WRITE:
n = BIO_get_mem_data(rctx->mem, &p);
@@ -412,7 +490,7 @@ int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx)
rctx->asn1_len |= *p++;
}
- if (rctx->asn1_len > OCSP_MAX_REQUEST_LENGTH) {
+ if (rctx->asn1_len > rctx->max_resp_len) {
rctx->state = OHS_ERROR;
return 0;
}
@@ -426,18 +504,12 @@ int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx)
/* Fall thru */
case OHS_ASN1_CONTENT:
- n = BIO_get_mem_data(rctx->mem, &p);
+ n = BIO_get_mem_data(rctx->mem, NULL);
if (n < (int)rctx->asn1_len)
goto next_io;
- *presp = d2i_OCSP_RESPONSE(NULL, &p, rctx->asn1_len);
- if (*presp) {
- rctx->state = OHS_DONE;
- return 1;
- }
-
- rctx->state = OHS_ERROR;
- return 0;
+ rctx->state = OHS_DONE;
+ return 1;
break;
@@ -450,9 +522,16 @@ int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx)
}
+int OCSP_sendreq_nbio(OCSP_RESPONSE **presp, OCSP_REQ_CTX *rctx)
+{
+ return OCSP_REQ_CTX_nbio_d2i(rctx,
+ (ASN1_VALUE **)presp,
+ ASN1_ITEM_rptr(OCSP_RESPONSE));
+}
+
/* Blocking OCSP request handler: now a special case of non-blocking I/O */
-OCSP_RESPONSE *OCSP_sendreq_bio(BIO *b, char *path, OCSP_REQUEST *req)
+OCSP_RESPONSE *OCSP_sendreq_bio(BIO *b, const char *path, OCSP_REQUEST *req)
{
OCSP_RESPONSE *resp = NULL;
OCSP_REQ_CTX *ctx;
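
With the state machine reworked as above, a caller drives the exchange by polling OCSP_sendreq_nbio() until it stops returning -1 (the "try again" indication). A hedged sketch of such a loop over a non-blocking BIO (helper name hypothetical):

    #include <openssl/ocsp.h>

    /* Drive a prepared request context to completion; returns the response
     * or NULL on error. Busy-waits for brevity; real code would select()
     * or poll() on the underlying socket between retries. */
    static OCSP_RESPONSE *run_request(BIO *cbio, OCSP_REQ_CTX *rctx)
    {
        OCSP_RESPONSE *resp = NULL;
        int rv;

        do {
            rv = OCSP_sendreq_nbio(&resp, rctx);
        } while (rv == -1 && BIO_should_retry(cbio));

        return rv == 1 ? resp : NULL;
    }
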
diff --git a/crypto/ocsp/ocsp_lib.c b/crypto/ocsp/ocsp_lib.c
index d28d6b5c042e..442a5b63d4ba 100755
--- a/crypto/ocsp/ocsp_lib.c
+++ b/crypto/ocsp/ocsp_lib.c
@@ -175,7 +175,7 @@ int OCSP_id_cmp(OCSP_CERTID *a, OCSP_CERTID *b)
* whether it is SSL.
*/
-int OCSP_parse_url(char *url, char **phost, char **pport, char **ppath,
+int OCSP_parse_url(const char *url, char **phost, char **pport, char **ppath,
int *pssl)
{
char *p, *buf;
diff --git a/crypto/opensslconf.h b/crypto/opensslconf.h
index 4a6e2766e0bb..15487c9f93be 100644
--- a/crypto/opensslconf.h
+++ b/crypto/opensslconf.h
@@ -20,6 +20,9 @@ extern "C" {
#ifndef OPENSSL_NO_KRB5
# define OPENSSL_NO_KRB5
#endif
+#ifndef OPENSSL_NO_LIBUNBOUND
+# define OPENSSL_NO_LIBUNBOUND
+#endif
#ifndef OPENSSL_NO_MD2
# define OPENSSL_NO_MD2
#endif
@@ -32,6 +35,9 @@ extern "C" {
#ifndef OPENSSL_NO_SCTP
# define OPENSSL_NO_SCTP
#endif
+#ifndef OPENSSL_NO_SSL_TRACE
+# define OPENSSL_NO_SSL_TRACE
+#endif
#ifndef OPENSSL_NO_STORE
# define OPENSSL_NO_STORE
#endif
@@ -62,6 +68,9 @@ extern "C" {
# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
# define NO_KRB5
# endif
+# if defined(OPENSSL_NO_LIBUNBOUND) && !defined(NO_LIBUNBOUND)
+# define NO_LIBUNBOUND
+# endif
# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
# define NO_MD2
# endif
@@ -74,6 +83,9 @@ extern "C" {
# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
# define NO_SCTP
# endif
+# if defined(OPENSSL_NO_SSL_TRACE) && !defined(NO_SSL_TRACE)
+# define NO_SSL_TRACE
+# endif
# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
# define NO_STORE
# endif
diff --git a/crypto/opensslv.h b/crypto/opensslv.h
index bd66999c1c6b..c06b13ac6b0f 100644
--- a/crypto/opensslv.h
+++ b/crypto/opensslv.h
@@ -30,11 +30,11 @@ extern "C" {
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-# define OPENSSL_VERSION_NUMBER 0x1000110fL
+# define OPENSSL_VERSION_NUMBER 0x1000204fL
# ifdef OPENSSL_FIPS
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1p-fips 9 Jul 2015"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2d-fips 9 Jul 2015"
# else
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1p 9 Jul 2015"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2d 9 Jul 2015"
# endif
# define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
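
The version number keeps the MMNNFFPPS layout described in the surrounding comment: 0x1000204fL decodes as major 1, minor 00, fix 02, patch 04 (the "d" in 1.0.2d), status 0xf (release). Code that needs the 1.0.2 APIs added in this patch can therefore gate on it at compile time:

    #include <openssl/opensslv.h>

    #if OPENSSL_VERSION_NUMBER < 0x10002000L
    # error "OpenSSL 1.0.2 or newer is required"
    #endif
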
diff --git a/crypto/ossl_typ.h b/crypto/ossl_typ.h
index 0fcb0cea6c61..9144ea2cf60b 100644
--- a/crypto/ossl_typ.h
+++ b/crypto/ossl_typ.h
@@ -100,6 +100,8 @@ typedef int ASN1_BOOLEAN;
typedef int ASN1_NULL;
# endif
+typedef struct asn1_object_st ASN1_OBJECT;
+
typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct asn1_pctx_st ASN1_PCTX;
diff --git a/crypto/pem/Makefile b/crypto/pem/Makefile
index 7a930eeae774..65de60e2a12e 100644
--- a/crypto/pem/Makefile
+++ b/crypto/pem/Makefile
@@ -171,12 +171,13 @@ pem_pk8.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h
pem_pk8.o: ../cryptlib.h pem_pk8.c
pem_pkey.o: ../../e_os.h ../../include/openssl/asn1.h
pem_pkey.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
-pem_pkey.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-pem_pkey.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
-pem_pkey.o: ../../include/openssl/ecdsa.h ../../include/openssl/engine.h
-pem_pkey.o: ../../include/openssl/err.h ../../include/openssl/evp.h
-pem_pkey.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
-pem_pkey.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
+pem_pkey.o: ../../include/openssl/crypto.h ../../include/openssl/dh.h
+pem_pkey.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+pem_pkey.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+pem_pkey.o: ../../include/openssl/engine.h ../../include/openssl/err.h
+pem_pkey.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
+pem_pkey.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+pem_pkey.o: ../../include/openssl/opensslconf.h
pem_pkey.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
pem_pkey.o: ../../include/openssl/pem.h ../../include/openssl/pem2.h
pem_pkey.o: ../../include/openssl/pkcs12.h ../../include/openssl/pkcs7.h
diff --git a/crypto/pem/pem.h b/crypto/pem/pem.h
index 2cdad8ac2669..d3b23fc997d6 100644
--- a/crypto/pem/pem.h
+++ b/crypto/pem/pem.h
@@ -129,6 +129,7 @@ extern "C" {
# define PEM_STRING_PKCS8 "ENCRYPTED PRIVATE KEY"
# define PEM_STRING_PKCS8INF "PRIVATE KEY"
# define PEM_STRING_DHPARAMS "DH PARAMETERS"
+# define PEM_STRING_DHXPARAMS "X9.42 DH PARAMETERS"
# define PEM_STRING_SSL_SESSION "SSL SESSION PARAMETERS"
# define PEM_STRING_DSAPARAMS "DSA PARAMETERS"
# define PEM_STRING_ECDSA_PUBLIC "ECDSA PUBLIC KEY"
@@ -181,7 +182,6 @@ typedef struct pem_ctx_st {
int num_recipient;
PEM_USER **recipient;
-
/*-
    XXX(ben): don't think this is used!
STACK *x509_chain; / * certificate chain */
@@ -399,8 +399,8 @@ int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *len,
# ifndef OPENSSL_NO_BIO
int PEM_read_bio(BIO *bp, char **name, char **header,
unsigned char **data, long *len);
-int PEM_write_bio(BIO *bp, const char *name, char *hdr, unsigned char *data,
- long len);
+int PEM_write_bio(BIO *bp, const char *name, const char *hdr,
+ const unsigned char *data, long len);
int PEM_bytes_read_bio(unsigned char **pdata, long *plen, char **pnm,
const char *name, BIO *bp, pem_password_cb *cb,
void *u);
@@ -419,7 +419,8 @@ int PEM_X509_INFO_write_bio(BIO *bp, X509_INFO *xi, EVP_CIPHER *enc,
int PEM_read(FILE *fp, char **name, char **header,
unsigned char **data, long *len);
-int PEM_write(FILE *fp, char *name, char *hdr, unsigned char *data, long len);
+int PEM_write(FILE *fp, const char *name, const char *hdr,
+ const unsigned char *data, long len);
void *PEM_ASN1_read(d2i_of_void *d2i, const char *name, FILE *fp, void **x,
pem_password_cb *cb, void *u);
int PEM_ASN1_write(i2d_of_void *i2d, const char *name, FILE *fp,
@@ -474,6 +475,7 @@ DECLARE_PEM_rw(EC_PUBKEY, EC_KEY)
# endif
# ifndef OPENSSL_NO_DH
DECLARE_PEM_rw_const(DHparams, DH)
+DECLARE_PEM_write_const(DHxparams, DH)
# endif
DECLARE_PEM_rw_cb(PrivateKey, EVP_PKEY)
DECLARE_PEM_rw(PUBKEY, EVP_PKEY)
@@ -562,8 +564,10 @@ void ERR_load_PEM_strings(void);
# define PEM_F_PEM_PK8PKEY 119
# define PEM_F_PEM_READ 108
# define PEM_F_PEM_READ_BIO 109
+# define PEM_F_PEM_READ_BIO_DHPARAMS 141
# define PEM_F_PEM_READ_BIO_PARAMETERS 140
# define PEM_F_PEM_READ_BIO_PRIVATEKEY 123
+# define PEM_F_PEM_READ_DHPARAMS 142
# define PEM_F_PEM_READ_PRIVATEKEY 124
# define PEM_F_PEM_SEALFINAL 110
# define PEM_F_PEM_SEALINIT 111
diff --git a/crypto/pem/pem_all.c b/crypto/pem/pem_all.c
index 64b8ba7c256a..0e5be63ef089 100644
--- a/crypto/pem/pem_all.c
+++ b/crypto/pem/pem_all.c
@@ -421,6 +421,7 @@ EC_KEY *PEM_read_ECPrivateKey(FILE *fp, EC_KEY **eckey, pem_password_cb *cb,
#ifndef OPENSSL_NO_DH
-IMPLEMENT_PEM_rw_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams)
+IMPLEMENT_PEM_write_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams)
+ IMPLEMENT_PEM_write_const(DHxparams, DH, PEM_STRING_DHXPARAMS, DHxparams)
#endif
- IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY)
+IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY)
diff --git a/crypto/pem/pem_err.c b/crypto/pem/pem_err.c
index 702c5adecb86..e1f4fdb432d0 100644
--- a/crypto/pem/pem_err.c
+++ b/crypto/pem/pem_err.c
@@ -1,6 +1,6 @@
/* crypto/pem/pem_err.c */
/* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -99,8 +99,10 @@ static ERR_STRING_DATA PEM_str_functs[] = {
{ERR_FUNC(PEM_F_PEM_PK8PKEY), "PEM_PK8PKEY"},
{ERR_FUNC(PEM_F_PEM_READ), "PEM_read"},
{ERR_FUNC(PEM_F_PEM_READ_BIO), "PEM_read_bio"},
+ {ERR_FUNC(PEM_F_PEM_READ_BIO_DHPARAMS), "PEM_READ_BIO_DHPARAMS"},
{ERR_FUNC(PEM_F_PEM_READ_BIO_PARAMETERS), "PEM_read_bio_Parameters"},
{ERR_FUNC(PEM_F_PEM_READ_BIO_PRIVATEKEY), "PEM_READ_BIO_PRIVATEKEY"},
+ {ERR_FUNC(PEM_F_PEM_READ_DHPARAMS), "PEM_READ_DHPARAMS"},
{ERR_FUNC(PEM_F_PEM_READ_PRIVATEKEY), "PEM_READ_PRIVATEKEY"},
{ERR_FUNC(PEM_F_PEM_SEALFINAL), "PEM_SealFinal"},
{ERR_FUNC(PEM_F_PEM_SEALINIT), "PEM_SealInit"},
diff --git a/crypto/pem/pem_lib.c b/crypto/pem/pem_lib.c
index 55071616e203..a29821aab2eb 100644
--- a/crypto/pem/pem_lib.c
+++ b/crypto/pem/pem_lib.c
@@ -229,6 +229,10 @@ static int check_pem(const char *nm, const char *name)
}
return 0;
}
+    /* If reading DH parameters, handle the X9.42 DH format too */
+ if (!strcmp(nm, PEM_STRING_DHXPARAMS) &&
+ !strcmp(name, PEM_STRING_DHPARAMS))
+ return 1;
/* Permit older strings */
@@ -472,8 +476,9 @@ int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *plen,
EVP_CIPHER_CTX_cleanup(&ctx);
OPENSSL_cleanse((char *)buf, sizeof(buf));
OPENSSL_cleanse((char *)key, sizeof(key));
- j += i;
- if (!o) {
+ if (o)
+ j += i;
+ else {
PEMerr(PEM_F_PEM_DO_HEADER, PEM_R_BAD_DECRYPT);
return (0);
}
@@ -574,8 +579,8 @@ static int load_iv(char **fromp, unsigned char *to, int num)
}
#ifndef OPENSSL_NO_FP_API
-int PEM_write(FILE *fp, char *name, char *header, unsigned char *data,
- long len)
+int PEM_write(FILE *fp, const char *name, const char *header,
+ const unsigned char *data, long len)
{
BIO *b;
int ret;
@@ -591,8 +596,8 @@ int PEM_write(FILE *fp, char *name, char *header, unsigned char *data,
}
#endif
-int PEM_write_bio(BIO *bp, const char *name, char *header,
- unsigned char *data, long len)
+int PEM_write_bio(BIO *bp, const char *name, const char *header,
+ const unsigned char *data, long len)
{
int nlen, n, i, j, outl;
unsigned char *buf = NULL;
diff --git a/crypto/pem/pem_pkey.c b/crypto/pem/pem_pkey.c
index 0b05e63fd7aa..04d6319a225b 100644
--- a/crypto/pem/pem_pkey.c
+++ b/crypto/pem/pem_pkey.c
@@ -68,6 +68,9 @@
#ifndef OPENSSL_NO_ENGINE
# include <openssl/engine.h>
#endif
+#ifndef OPENSSL_NO_DH
+# include <openssl/dh.h>
+#endif
#include "asn1_locl.h"
int pem_check_suffix(const char *pem_str, const char *suffix);
@@ -241,3 +244,50 @@ int PEM_write_PrivateKey(FILE *fp, EVP_PKEY *x, const EVP_CIPHER *enc,
}
#endif
+
+#ifndef OPENSSL_NO_DH
+
+/* Transparently read in PKCS#3 or X9.42 DH parameters */
+
+DH *PEM_read_bio_DHparams(BIO *bp, DH **x, pem_password_cb *cb, void *u)
+{
+ char *nm = NULL;
+ const unsigned char *p = NULL;
+ unsigned char *data = NULL;
+ long len;
+ DH *ret = NULL;
+
+ if (!PEM_bytes_read_bio(&data, &len, &nm, PEM_STRING_DHPARAMS, bp, cb, u))
+ return NULL;
+ p = data;
+
+ if (!strcmp(nm, PEM_STRING_DHXPARAMS))
+ ret = d2i_DHxparams(x, &p, len);
+ else
+ ret = d2i_DHparams(x, &p, len);
+
+ if (ret == NULL)
+ PEMerr(PEM_F_PEM_READ_BIO_DHPARAMS, ERR_R_ASN1_LIB);
+ OPENSSL_free(nm);
+ OPENSSL_free(data);
+ return ret;
+}
+
+# ifndef OPENSSL_NO_FP_API
+DH *PEM_read_DHparams(FILE *fp, DH **x, pem_password_cb *cb, void *u)
+{
+ BIO *b;
+ DH *ret;
+
+ if ((b = BIO_new(BIO_s_file())) == NULL) {
+ PEMerr(PEM_F_PEM_READ_DHPARAMS, ERR_R_BUF_LIB);
+ return (0);
+ }
+ BIO_set_fp(b, fp, BIO_NOCLOSE);
+ ret = PEM_read_bio_DHparams(b, x, cb, u);
+ BIO_free(b);
+ return (ret);
+}
+# endif
+
+#endif
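
The net effect of the PEM changes is that writers choose the format explicitly while readers need not: PEM_write_bio_DHxparams() emits the new "X9.42 DH PARAMETERS" header, and PEM_read_bio_DHparams() above accepts either header and dispatches to the matching d2i routine. A round-trip sketch (helper name hypothetical; assumes 'dh' holds X9.42-style parameters):

    #include <openssl/pem.h>
    #include <openssl/dh.h>

    /* Serialize X9.42 DH parameters and read them back through the
     * format-agnostic reader. */
    static DH *roundtrip(DH *dh)
    {
        DH *copy = NULL;
        BIO *mem = BIO_new(BIO_s_mem());

        if (mem != NULL
            && PEM_write_bio_DHxparams(mem, dh))  /* "X9.42 DH PARAMETERS" */
            copy = PEM_read_bio_DHparams(mem, NULL, NULL, NULL);
        BIO_free(mem);
        return copy;
    }
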
diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
index a3edd982b664..f89e81429931 100755
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@@ -27,7 +27,8 @@ my $globl = sub {
/osx/ && do { $name = "_$name";
last;
};
- /linux.*32/ && do { $ret .= ".globl $name\n";
+ /linux.*(32|64le)/
+ && do { $ret .= ".globl $name\n";
$ret .= ".type $name,\@function";
last;
};
@@ -37,7 +38,6 @@ my $globl = sub {
$ret .= ".align 3\n";
$ret .= "$name:\n";
$ret .= ".quad .$name,.TOC.\@tocbase,0\n";
- $ret .= ".size $name,24\n";
$ret .= ".previous\n";
$name = ".$name";
@@ -50,7 +50,9 @@ my $globl = sub {
$ret;
};
my $text = sub {
- ($flavour =~ /aix/) ? ".csect" : ".text";
+ my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text";
+ $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/);
+ $ret;
};
my $machine = sub {
my $junk = shift;
@@ -62,9 +64,12 @@ my $machine = sub {
".machine $arch";
};
my $size = sub {
- if ($flavour =~ /linux.*32/)
+ if ($flavour =~ /linux/)
{ shift;
- ".size " . join(",",@_);
+ my $name = shift; $name =~ s|^[\.\_]||;
+ my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name;
+ $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/);
+ $ret;
}
else
{ ""; }
@@ -77,6 +82,25 @@ my $asciz = sub {
else
{ ""; }
};
+my $quad = sub {
+ shift;
+ my @ret;
+ my ($hi,$lo);
+ for (@_) {
+ if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
+ { $hi=$1?"0x$1":"0"; $lo="0x$2"; }
+ elsif (/^([0-9]+)$/o)
+ { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl
+ else
+ { $hi=undef; $lo=$_; }
+
+ if (defined($hi))
+ { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); }
+ else
+ { push(@ret,".quad $lo"); }
+ }
+ join("\n",@ret);
+};
################################################################
# simplified mnemonics not handled by at least one assembler
@@ -122,6 +146,46 @@ my $extrdi = sub {
$b = ($b+$n)&63; $n = 64-$n;
" rldicl $ra,$rs,$b,$n";
};
+my $vmr = sub {
+ my ($f,$vx,$vy) = @_;
+ " vor $vx,$vy,$vy";
+};
+
+# PowerISA 2.06 stuff
+sub vsxmem_op {
+ my ($f, $vrt, $ra, $rb, $op) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1);
+}
+# made-up unaligned memory reference AltiVec/VMX instructions
+my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x
+my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x
+my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx
+my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx
+my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x
+my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x
+
+# PowerISA 2.07 stuff
+sub vcrypto_op {
+ my ($f, $vrt, $vra, $vrb, $op) = @_;
+ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
+}
+my $vcipher = sub { vcrypto_op(@_, 1288); };
+my $vcipherlast = sub { vcrypto_op(@_, 1289); };
+my $vncipher = sub { vcrypto_op(@_, 1352); };
+my $vncipherlast= sub { vcrypto_op(@_, 1353); };
+my $vsbox = sub { vcrypto_op(@_, 0, 1480); };
+my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
+my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
+my $vpmsumb = sub { vcrypto_op(@_, 1032); };
+my $vpmsumd = sub { vcrypto_op(@_, 1224); };
+my $vpmsubh = sub { vcrypto_op(@_, 1096); };
+my $vpmsumw = sub { vcrypto_op(@_, 1160); };
+my $vaddudm = sub { vcrypto_op(@_, 192); };
+
+my $mtsle = sub {
+ my ($f, $arg) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
+};
while($line=<>) {
@@ -138,7 +202,10 @@ while($line=<>) {
{
$line =~ s|(^[\.\w]+)\:\s*||;
my $label = $1;
- printf "%s:",($GLOBALS{$label} or $label) if ($label);
+ if ($label) {
+ printf "%s:",($GLOBALS{$label} or $label);
+ printf "\n.localentry\t$GLOBALS{$label},0" if ($GLOBALS{$label} && $flavour =~ /linux.*64le/);
+ }
}
{
@@ -147,7 +214,7 @@ while($line=<>) {
my $mnemonic = $2;
my $f = $3;
my $opcode = eval("\$$mnemonic");
- $line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/);
+ $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/);
if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; }
}
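
The vcrypto_op() helper added above hand-assembles POWER8 crypto instructions as raw .long words, for toolchains whose assemblers predate ISA 2.07. The packing is straightforward: primary opcode 4 in the top six bits, then the three 5-bit vector register fields, then the extended opcode. A small C sketch of the same arithmetic (mirror of the perl, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* Mirror of ppc-xlate.pl's vcrypto_op(): VA-form, primary opcode 4. */
    static uint32_t vcrypto_op(unsigned vrt, unsigned vra, unsigned vrb,
                               unsigned xo)
    {
        return (4u << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo;
    }

    int main(void)
    {
        /* vcipher v0,v1,v2 has extended opcode 1288 */
        printf("0x%08X\n", vcrypto_op(0, 1, 2, 1288)); /* prints 0x10011508 */
        return 0;
    }
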
diff --git a/crypto/perlasm/sparcv9_modes.pl b/crypto/perlasm/sparcv9_modes.pl
new file mode 100755
index 000000000000..eb267a57ed81
--- /dev/null
+++ b/crypto/perlasm/sparcv9_modes.pl
@@ -0,0 +1,1687 @@
+#!/usr/bin/env perl
+
+# Mode-specific implementations for SPARC Architecture 2011. There is
+# one T4 dependency, though: an ASI value that is not specified in the
+# Architecture Manual. But as the SPARC universe is rather
+# monocultural, we assume that any processor capable of executing the
+# crypto instructions can handle the ASI in question as well. This
+# means we ought to keep our eyes open when new processors emerge...
+#
+# As for the above-mentioned ASI: it's the so-called "block
+# initializing store", which cancels the "read" in "read-update-write"
+# on cache lines. This is a "cooperative" optimization, in that it
+# reduces overall pressure on the memory interface. The benefit can't
+# be observed/quantified with the usual benchmarks; on the contrary,
+# single-thread performance for parallelizable modes is ~1.5% worse
+# for the largest block sizes [though a few percent better for shorter
+# ones]. All this is based on suggestions from David Miller.
+
+sub asm_init { # to be called with @ARGV as argument
+ for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
+ if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
+ else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
+}
+
+# unified interface
+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
+# local variables
+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
+
+sub alg_cbc_encrypt_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_cbc_encrypt
+.align 32
+${alg}${bits}_t4_cbc_encrypt:
+ save %sp, -$::frame, %sp
+ cmp $len, 0
+ be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
+ sub $inp, $out, $blk_init ! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+ andcc $ivec, 7, $ivoff
+ alignaddr $ivec, %g0, $ivec
+
+ ldd [$ivec + 0], %f0 ! load ivec
+ bz,pt %icc, 1f
+ ldd [$ivec + 8], %f2
+ ldd [$ivec + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+1:
+___
+$::code.=<<___ if ($::evp);
+ ld [$ivec + 0], %f0
+ ld [$ivec + 4], %f1
+ ld [$ivec + 8], %f2
+ ld [$ivec + 12], %f3
+___
+$::code.=<<___;
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_enckey
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 127
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
+ brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ alignaddrl $out, %g0, $out
+ srlx $len, 4, $len
+ prefetch [$out], 22
+
+.L${bits}_cbc_enc_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_encrypt_1x
+ add $inp, 16, $inp
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_cbc_enc_loop
+ add $out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3f
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+___
+$::code.=<<___;
+.L${bits}_cbc_enc_abort:
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_enc_loop+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3f
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+ ret
+ restore
+
+.align 16
+3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
+ mov 0xff, $omask
+ srl $omask, $ivoff, $omask
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$ivec + $omask]0xc0
+ std %f6, [$ivec + 8]
+ add $ivec, 16, $ivec
+ orn %g0, $omask, $omask
+ stda %f8, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}cbc_enc_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+
+.L${bits}_cbc_enc_blk_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 5f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+5:
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_encrypt_1x
+ add $inp, 16, $inp
+ sub $len, 1, $len
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ brnz,pt $len, .L${bits}_cbc_enc_blk_loop
+ add $out, 8, $out
+
+ membar #StoreLoad|#StoreStore
+ brnz,pt $blk_init, .L${bits}_cbc_enc_loop
+ mov $blk_init, $len
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3b
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_cbc_encrypt,#function
+.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
+___
+}
+
+sub alg_cbc_decrypt_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_cbc_decrypt
+.align 32
+${alg}${bits}_t4_cbc_decrypt:
+ save %sp, -$::frame, %sp
+ cmp $len, 0
+ be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
+ sub $inp, $out, $blk_init ! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+ andcc $ivec, 7, $ivoff
+ alignaddr $ivec, %g0, $ivec
+
+ ldd [$ivec + 0], %f12 ! load ivec
+ bz,pt %icc, 1f
+ ldd [$ivec + 8], %f14
+ ldd [$ivec + 16], %f0
+ faligndata %f12, %f14, %f12
+ faligndata %f14, %f0, %f14
+1:
+___
+$::code.=<<___ if ($::evp);
+ ld [$ivec + 0], %f12 ! load ivec
+ ld [$ivec + 4], %f13
+ ld [$ivec + 8], %f14
+ ld [$ivec + 12], %f15
+___
+$::code.=<<___;
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_deckey
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+ srlx $len, 4, $len
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_cbc_dec_loop2x
+ prefetch [$out], 22
+.L${bits}_cbc_dec_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g4, %o0, %o2 ! ^= rk[0]
+ xor %g5, %o1, %o3
+ movxtod %o2, %f0
+ movxtod %o3, %f2
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_decrypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
+ add $out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+.L${bits}_cbc_dec_abort:
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_cbc_dec_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ xor %g4, %o0, %o4 ! ^= rk[0]
+ xor %g5, %o1, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ xor %g4, %o2, %o4
+ xor %g5, %o3, %o5
+ movxtod %o4, %f4
+ movxtod %o5, %f6
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_decrypt_2x
+ add $inp, 32, $inp
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
+ add $out, 32, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f6, %f6
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f0, [$out + 8]
+ std %f2, [$out + 16]
+ std %f4, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f6, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+ ret
+ restore
+
+.align 16
+.L${bits}_cbc_dec_unaligned_ivec:
+ alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
+ mov 0xff, $omask
+ srl $omask, $ivoff, $omask
+ faligndata %f12, %f12, %f0
+ faligndata %f12, %f14, %f2
+ faligndata %f14, %f14, %f4
+ stda %f0, [$ivec + $omask]0xc0
+ std %f2, [$ivec + 8]
+ add $ivec, 16, $ivec
+ orn %g0, $omask, $omask
+ stda %f4, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}cbc_dec_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_cbc_dec_blk_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ xor %g4, %o0, %o4 ! ^= rk[0]
+ xor %g5, %o1, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ xor %g4, %o2, %o4
+ xor %g5, %o3, %o5
+ movxtod %o4, %f4
+ movxtod %o5, %f6
+
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_decrypt_2x
+ add $inp, 32, $inp
+ subcc $len, 2, $len
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_cbc_dec_loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_cbc_dec_loop2x
+ nop
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0] ! write out ivec
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3b
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_cbc_decrypt,#function
+.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
+___
+}
+
+sub alg_ctr32_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_ctr32_encrypt
+.align 32
+${alg}${bits}_t4_ctr32_encrypt:
+ save %sp, -$::frame, %sp
+
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_enckey
+ sllx $len, 4, $len
+
+ ld [$ivec + 0], %l4 ! counter
+ ld [$ivec + 4], %l5
+ ld [$ivec + 8], %l6
+ ld [$ivec + 12], %l7
+
+ sllx %l4, 32, %o5
+ or %l5, %o5, %o5
+ sllx %l6, 32, %g1
+ xor %o5, %g4, %g4 ! ^= rk[0]
+ xor %g1, %g5, %g5
+ movxtod %g4, %f14 ! most significant 64 bits
+
+ sub $inp, $out, $blk_init ! $inp!=$out
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_ctr32_loop2x
+ srlx $len, 4, $len
+.L${bits}_ctr32_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f4
+ aes_eround23 %f18, %f14, %f2, %f2
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f18, %f14, %f2, %f0
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_1x+8
+ add $inp, 16, $inp
+
+ movxtod %o0, %f10
+ movxtod %o1, %f12
+ fxor %f10, %f0, %f0 ! ^= inp
+ fxor %f12, %f2, %f2
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_ctr32_loop2x
+ add $out, 16, $out
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
+ orn %g0, $omask, $omask
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_ctr32_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ xor %g5, %l7, %g1
+ add %l7, 1, %l7
+ movxtod %g1, %f6
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f8
+ aes_eround23 %f18, %f14, %f2, %f2
+ aes_eround01 %f16, %f14, %f6, %f10
+ aes_eround23 %f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f16, %f6, %f14, %f6
+ camellia_f %f18, %f14, %f2, %f0
+ camellia_f %f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_2x+16
+ add $inp, 32, $inp
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ movxtod %o2, %f12
+ fxor %f8, %f0, %f0 ! ^= inp
+ movxtod %o3, %f8
+ fxor %f10, %f2, %f2
+ fxor %f12, %f4, %f4
+ fxor %f8, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_ctr32_loop2x
+ add $out, 32, $out
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f6, %f6
+
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f0, [$out + 8]
+ std %f2, [$out + 16]
+ std %f4, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f6, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
+ orn %g0, $omask, $omask
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_ctr32_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_ctr32_blk_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ xor %g5, %l7, %g1
+ add %l7, 1, %l7
+ movxtod %g1, %f6
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f8
+ aes_eround23 %f18, %f14, %f2, %f2
+ aes_eround01 %f16, %f14, %f6, %f10
+ aes_eround23 %f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f16, %f6, %f14, %f6
+ camellia_f %f18, %f14, %f2, %f0
+ camellia_f %f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_2x+16
+ add $inp, 32, $inp
+ subcc $len, 2, $len
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ movxtod %o2, %f12
+ fxor %f8, %f0, %f0 ! ^= inp
+ movxtod %o3, %f8
+ fxor %f10, %f2, %f2
+ fxor %f12, %f4, %f4
+ fxor %f8, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_ctr32_loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_ctr32_loop2x
+ nop
+
+ ret
+ restore
+.type ${alg}${bits}_t4_ctr32_encrypt,#function
+.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
+___
+}
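
The "clruw" annotations in the CTR path above mark where the counter word is truncated back to 32 bits (srl %l7, 0, %l7): only the last big-endian word of the counter block is incremented, wrapping mod 2^32, which is what the _ctr32_ naming promises. A C reference model of that update (hypothetical helper, not part of the patch):

    #include <stdint.h>

    /* Increment the final 32-bit big-endian word of a CTR block,
     * wrapping mod 2^32 -- the 'srl %l7, 0, %l7' (clruw) in the asm. */
    static void ctr32_inc(unsigned char counter[16])
    {
        uint32_t c = ((uint32_t)counter[12] << 24) | ((uint32_t)counter[13] << 16)
                   | ((uint32_t)counter[14] << 8)  |  (uint32_t)counter[15];
        c++;                                /* wraps naturally in uint32_t */
        counter[12] = (unsigned char)(c >> 24);
        counter[13] = (unsigned char)(c >> 16);
        counter[14] = (unsigned char)(c >> 8);
        counter[15] = (unsigned char)c;
    }
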
+
+sub alg_xts_implement {
+my ($alg,$bits,$dir) = @_;
+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
+my $rem=$ivec;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_xts_${dir}crypt
+.align 32
+${alg}${bits}_t4_xts_${dir}crypt:
+ save %sp, -$::frame-16, %sp
+
+ mov $ivec, %o0
+ add %fp, $::bias-16, %o1
+ call ${alg}_t4_encrypt
+ mov $key2, %o2
+
+ add %fp, $::bias-16, %l7
+ ldxa [%l7]0x88, %g2
+ add %fp, $::bias-8, %l7
+ ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
+
+ sethi %hi(0x76543210), %l7
+ or %l7, %lo(0x76543210), %l7
+ bmask %l7, %g0, %g0 ! byte swap mask
+
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_${dir}ckey
+ and $len, 15, $rem
+ and $len, -16, $len
+___
+$::code.=<<___ if ($dir eq "de");
+ mov 0, %l7
+ movrnz $rem, 16, %l7
+ sub $len, %l7, $len
+___
+$::code.=<<___;
+
+ sub $inp, $out, $blk_init ! $inp!=$out
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+___
+$::code.=<<___ if ($dir eq "de");
+ brz,pn $len, .L${bits}_xts_${dir}steal
+___
+$::code.=<<___;
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_xts_${dir}loop2x
+ srlx $len, 4, $len
+.L${bits}_xts_${dir}loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_${dir}crypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
+ add $out, 16, $out
+
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
+ orn %g0, $omask, $omask
+
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_xts_${dir}loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ xor %g4, %o2, %o2 ! ^= rk[0]
+ xor %g5, %o3, %o3
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+ movxtod %o2, %f4
+ movxtod %o3, %f6
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4 ! ^= tweak[0]
+ fxor %f10, %f6, %f6
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_${dir}crypt_2x
+ add $inp, 32, $inp
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
+ add $out, 32, $out
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f10
+ faligndata %f2, %f4, %f12
+ faligndata %f4, %f6, %f14
+ faligndata %f6, %f6, %f0
+
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
+ orn %g0, $omask, $omask
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_xts_${dir}blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_xts_${dir}blk2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ xor %g4, %o2, %o2 ! ^= rk[0]
+ xor %g5, %o3, %o3
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+ movxtod %o2, %f4
+ movxtod %o3, %f6
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4 ! ^= tweak[0]
+ fxor %f10, %f6, %f6
+
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_${dir}crypt_2x
+ add $inp, 32, $inp
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ subcc $len, 2, $len
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_xts_${dir}loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_xts_${dir}loop2x
+ nop
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+___
+$code.=<<___ if ($dir eq "en");
+.align 32
+.L${bits}_xts_${dir}steal:
+ std %f0, [%fp + $::bias-16] ! copy of output
+ std %f2, [%fp + $::bias-8]
+
+ srl $ileft, 3, $ileft
+ add %fp, $::bias-16, %l7
+ add $inp, $ileft, $inp ! original $inp+$len&-15
+ add $out, $ooff, $out ! original $out+$len&-15
+ mov 0, $ileft
+ nop ! align
+
+.L${bits}_xts_${dir}stealing:
+ ldub [$inp + $ileft], %o0
+ ldub [%l7 + $ileft], %o1
+ dec $rem
+ stb %o0, [%l7 + $ileft]
+ stb %o1, [$out + $ileft]
+ brnz $rem, .L${bits}_xts_${dir}stealing
+ inc $ileft
+
+ mov %l7, $inp
+ sub $out, 16, $out
+ mov 0, $ileft
+ sub $out, $ooff, $out
+ ba .L${bits}_xts_${dir}loop ! one more time
+ mov 1, $len ! $rem is 0
+___
+$code.=<<___ if ($dir eq "de");
+.align 32
+.L${bits}_xts_${dir}steal:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 8f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+8:
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %o2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %o3
+ xor %l7, %o2, %o2
+
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ call _${alg}${bits}_${dir}crypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ std %f0, [%fp + $::bias-16]
+ std %f2, [%fp + $::bias-8]
+
+ srl $ileft, 3, $ileft
+ add %fp, $::bias-16, %l7
+ add $inp, $ileft, $inp ! original $inp+$len&-15
+ add $out, $ooff, $out ! original $out+$len&-15
+ mov 0, $ileft
+ add $out, 16, $out
+ nop ! align
+
+.L${bits}_xts_${dir}stealing:
+ ldub [$inp + $ileft], %o0
+ ldub [%l7 + $ileft], %o1
+ dec $rem
+ stb %o0, [%l7 + $ileft]
+ stb %o1, [$out + $ileft]
+ brnz $rem, .L${bits}_xts_${dir}stealing
+ inc $ileft
+
+ mov %l7, $inp
+ sub $out, 16, $out
+ mov 0, $ileft
+ sub $out, $ooff, $out
+ ba .L${bits}_xts_${dir}loop ! one more time
+ mov 1, $len ! $rem is 0
+___
+$code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_xts_${dir}crypt,#function
+.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
+___
+}
+
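[Editorial note: the five-instruction "next tweak value" sequences above (srax/addcc/and/addxc/xor) double the 128-bit XTS tweak in GF(2^128) with reduction polynomial x^128+x^7+x^2+x+1. A minimal C sketch of the same computation, with lo/hi standing in for the %g2/%g3 register pair; the function name is illustrative:]

    #include <stdint.h>

    static void xts_next_tweak(uint64_t *lo, uint64_t *hi)
    {
        uint64_t fb = (uint64_t)((int64_t)*hi >> 63) & 0x87; /* srax + and */
        *hi = (*hi << 1) | (*lo >> 63);                      /* addcc/addxc */
        *lo = (*lo << 1) ^ fb;                               /* xor feedback */
    }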
+# The purpose of these subroutines is to explicitly encode VIS instructions,
+# so that the module can be compiled without specifying VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to preserve the option of producing a "universal" binary,
+# letting the program detect at run-time whether the current CPU is VIS-capable.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = ( "faligndata" => 0x048,
+ "bshuffle" => 0x04c,
+ "fnot2" => 0x066,
+ "fxor" => 0x06c,
+ "fsrc2" => 0x078 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
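[Editorial note: the 32-bit word printed by unvis() packs the SPARC FPop fields op=2, rd, op3=0x36 (IMPDEP1), rs1, opf, rs2; the constant 0x81b00000 is exactly 2<<30|0x36<<19. A sketch of the same packing in C, with register numbers and the bshuffle opf 0x04c taken from the table above:]

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t vis_word(uint32_t rd, uint32_t rs1, uint32_t opf, uint32_t rs2)
    {
        /* matches the sprintf in unvis()/unvis3() above */
        return 0x81b00000u | rd << 25 | rs1 << 14 | opf << 5 | rs2;
    }

    int main(void)
    {
        printf(".word\t0x%08x\n", vis_word(12, 12, 0x04c, 12)); /* bshuffle %f12,%f12,%f12 */
        return 0;
    }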
+
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "umulxhi" => 0x016,
+ "alignaddr" => 0x018,
+ "bmask" => 0x019,
+ "alignaddrl" => 0x01a );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unaes_round { # 4-argument instructions
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "aes_eround01" => 0,
+ "aes_eround23" => 1,
+ "aes_dround01" => 2,
+ "aes_dround23" => 3,
+ "aes_eround01_l"=> 4,
+ "aes_eround23_l"=> 5,
+ "aes_dround01_l"=> 6,
+ "aes_dround23_l"=> 7,
+ "aes_kexpand1" => 8 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unaes_kexpand { # 3-argument instructions
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "aes_kexpand0" => 0x130,
+ "aes_kexpand2" => 0x131 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub uncamellia_f { # 4-argument instructions
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+ if (1) {
+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub uncamellia3 { # 3-argument instructions
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %cmllopf = ( "camellia_fl" => 0x13c,
+ "camellia_fli" => 0x13d );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if (defined($opf=$cmllopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unmovxtox { # 2-argument instructions
+my ($mnemonic,$rs,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
+my ($ref,$opf);
+my %movxopf = ( "movdtox" => 0x110,
+ "movstouw" => 0x111,
+ "movstosw" => 0x113,
+ "movxtod" => 0x118,
+ "movwtos" => 0x119 );
+
+ $ref = "$mnemonic\t$rs,$rd";
+
+ if (defined($opf=$movxopf{$mnemonic})) {
+ foreach ($rs,$rd) {
+ return $ref if (!/%([fgoli])([0-9]{1,2})/);
+ $_=$bias{$1}+$2;
+ if ($2>=32) {
+ return $ref if ($2&1);
+ # re-encode for upper double register addressing
+ $_=($2|$2>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub undes {
+my ($mnemonic)=shift;
+my @args=@_;
+my ($ref,$opf);
+my %desopf = ( "des_round" => 0b1001,
+ "des_ip" => 0b100110100,
+ "des_iip" => 0b100110101,
+ "des_kexpand" => 0b100110110 );
+
+ $ref = "$mnemonic\t".join(",",@_);
+
+ if (defined($opf=$desopf{$mnemonic})) { # 4-arg
+ if ($mnemonic eq "des_round") {
+ foreach (@args[0..3]) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
+ $ref;
+ } elsif ($mnemonic eq "des_kexpand") { # 3-arg
+ foreach (@args[0..2]) {
+ return $ref if (!/(%f)?([0-9]{1,2})/);
+ $_=$2;
+ if ($2>=32) {
+ return $ref if ($2&1);
+ # re-encode for upper double register addressing
+ $_=($2|$2>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
+ $ref;
+ } else { # 2-arg
+ foreach (@args[0..1]) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
+ $ref;
+ }
+ } else {
+ return $ref;
+ }
+}
+
+sub emit_assembler {
+ foreach (split("\n",$::code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
+
+ s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &unaes_round($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unaes_kexpand($1,$2,$3,$4)
+ /geo or
+ s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &uncamellia_f($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &uncamellia3($1,$2,$3,$4)
+ /geo or
+ s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
+ &undes($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
+ &unmovxtox($1,$2,$3)
+ /geo or
+ s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
+ &unmovxtox($1,$2,$3)
+ /geo or
+ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /geo or
+ s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /geo;
+
+ print $_,"\n";
+ }
+}
+
+1;
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 56d9b64b6fb3..9c70b8c2c6e9 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -121,7 +121,7 @@ my %globals;
$self->{sz} = "";
} elsif ($self->{op} =~ /^v/) { # VEX
$self->{sz} = "";
- } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
+ } elsif ($self->{op} =~ /mov[dq]/ && $line =~ /%xmm/) {
$self->{sz} = "";
} elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
$self->{op} = $1;
@@ -250,8 +250,13 @@ my %globals;
# in $self->{label}, new gas requires sign extension...
use integer;
$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
- $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+ $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
+ $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg;
+
+ if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
+ $self->{base} =~ /(rbp|r13)/) {
+ $self->{base} = $self->{index}; $self->{index} = $1;
+ }
if ($gas) {
$self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64");
@@ -265,14 +270,20 @@ my %globals;
sprintf "%s%s(%%%s)", $self->{asterisk},$self->{label},$self->{base};
}
} else {
- %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
- q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
+ %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR",
+ l=>"DWORD$PTR", d=>"DWORD$PTR",
+ q=>"QWORD$PTR", o=>"OWORD$PTR",
+ x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", z=>"ZMMWORD$PTR" );
$self->{label} =~ s/\./\$/g;
$self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
$self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
- $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
- $sz="l" if (opcode->mnemonic() eq "movd");
+
+ ($self->{asterisk}) && ($sz="q") ||
+ (opcode->mnemonic() =~ /^v?mov([qd])$/) && ($sz=$1) ||
+ (opcode->mnemonic() =~ /^v?pinsr([qdwb])$/) && ($sz=$1) ||
+ (opcode->mnemonic() =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) ||
+ (opcode->mnemonic() =~ /^vinsert[fi]128$/) && ($sz="x");
if (defined($self->{index})) {
sprintf "%s[%s%s*%d%s]",$szmap{$sz},
@@ -412,7 +423,7 @@ my %globals;
}
sub out {
my $self = shift;
- if ($nasm && opcode->mnemonic()=~m/^j/) {
+ if ($nasm && opcode->mnemonic()=~m/^j(?![re]cxz)/) {
"NEAR ".$self->{value};
} else {
$self->{value};
@@ -530,7 +541,7 @@ my %globals;
$v="$current_segment\tENDS\n" if ($current_segment);
$current_segment = ".text\$";
$v.="$current_segment\tSEGMENT ";
- $v.=$masm>=$masmref ? "ALIGN(64)" : "PAGE";
+ $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE";
$v.=" 'CODE'";
}
$self->{value} = $v;
@@ -772,10 +783,64 @@ my $rdrand = sub {
}
};
+my $rdseed = sub {
+ if (shift =~ /%[er](\w+)/) {
+ my @opcode=();
+ my $dst=$1;
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,0,$1,8);
+ push @opcode,0x0f,0xc7,0xf8|($dst&7);
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+my $vprotd = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $vprotq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc3;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
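[Editorial note: these helpers ($rdrand, $rdseed, $vprotd, $vprotq) bypass the assembler and emit literal opcode bytes, so the module still builds with assemblers that predate the instructions. As a hedged illustration (register choices arbitrary), the byte sequences they produce look like this in C:]

    /* rdseed %rax: REX.W, 0f c7, ModRM /7 -> 0xf8|(reg&7) */
    unsigned char rdseed_rax[] = { 0x48, 0x0f, 0xc7, 0xf8 };

    /* vprotd $1,%xmm2,%xmm3: XOP escape 8f, RXB/map byte e8 (map 8),
     * 78 (W=0, vvvv=1111, L=0, pp=00), opcode c2, ModRM c0|2|(3<<3), imm */
    unsigned char vprotd_x2_x3[] = { 0x8f, 0xe8, 0x78, 0xc2, 0xda, 0x01 };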
if ($nasm) {
print <<___;
default rel
%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
___
} elsif ($masm) {
print <<___;
@@ -789,6 +854,7 @@ while($line=<>) {
$line =~ s|[#!].*$||; # get rid of asm-style comments...
$line =~ s|/\*.*\*/||; # ... and C-style comments...
$line =~ s|^\s+||; # ... and skip white spaces in beginning
+ $line =~ s|\s+$||; # ... and at the end
undef $label;
undef $opcode;
@@ -837,6 +903,8 @@ while($line=<>) {
my $arg = $_->out();
# $insn.=$sz compensates for movq, pinsrw, ...
if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+ if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
+ if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; }
if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
}
@args = reverse(@args);
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index eb543db2f66e..cae156ae63ce 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -131,6 +131,40 @@ sub ::rdrand
{ &::generic("rdrand",@_); }
}
+sub ::rdseed
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+sub ::vprotd
+{ my $args=join(',',@_);
+ if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/)
+ { my @opcode=(0x8f);
+ rxb(\@opcode,$1,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M
+ my $c=$3;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ &::data_byte(@opcode);
+ }
+ else
+ { &::generic("vprotd",@_); }
+}
+
# label management
$lbdecor="L"; # local label decoration, set by package
$label="000";
@@ -221,6 +255,8 @@ sub ::asm_init
$elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0;
if (($type eq "elf"))
{ $elf=1; require "x86gas.pl"; }
+ elsif (($type eq "elf-1"))
+ { $elf=-1; require "x86gas.pl"; }
elsif (($type eq "a\.out"))
{ $aout=1; require "x86gas.pl"; }
elsif (($type eq "coff" or $type eq "gaswin"))
@@ -257,4 +293,6 @@ EOF
&file($filename);
}
+sub ::hidden {}
+
1;
diff --git a/crypto/perlasm/x86gas.pl b/crypto/perlasm/x86gas.pl
index 682a3a3163e2..63b2301fd1f0 100755
--- a/crypto/perlasm/x86gas.pl
+++ b/crypto/perlasm/x86gas.pl
@@ -70,6 +70,8 @@ sub ::DWP
{ my($addr,$reg1,$reg2,$idx)=@_;
my $ret="";
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
$addr =~ s/^\s+//;
# prepend global references with optional underscore
$addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige;
@@ -157,7 +159,7 @@ sub ::file_end
}
}
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
- my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16";
if ($::macosx) { push (@out,"$tmp,2\n"); }
elsif ($::elf) { push (@out,"$tmp,4\n"); }
else { push (@out,"$tmp\n"); }
@@ -170,10 +172,9 @@ sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
sub ::align
-{ my $val=$_[0],$p2,$i;
+{ my $val=$_[0];
if ($::aout)
- { for ($p2=0;$val!=0;$val>>=1) { $p2++; }
- $val=$p2-1;
+ { $val=int(log($val)/log(2));
$val.=",0x90";
}
push(@out,".align\t$val\n");
@@ -195,6 +196,8 @@ sub ::picmeup
&::mov($dst,&::DWP("$indirect-$reflabel",$base));
$non_lazy_ptr{"$nmdecor$sym"}=$indirect;
}
+ elsif ($sym eq "OPENSSL_ia32cap_P" && $::elf>0)
+ { &::lea($dst,&::DWP("$sym-$reflabel",$base)); }
else
{ &::lea($dst,&::DWP("_GLOBAL_OFFSET_TABLE_+[.-$reflabel]",
$base));
@@ -250,4 +253,6 @@ ___
sub ::dataseg
{ push(@out,".data\n"); }
+*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf);
+
1;
diff --git a/crypto/perlasm/x86masm.pl b/crypto/perlasm/x86masm.pl
index f937d07c87df..1741342c3af3 100755
--- a/crypto/perlasm/x86masm.pl
+++ b/crypto/perlasm/x86masm.pl
@@ -39,6 +39,8 @@ sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
my($post,$ret);
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
$ret .= "$size PTR " if ($size ne "");
$addr =~ s/^\s+//;
@@ -133,7 +135,7 @@ ___
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
.bss SEGMENT 'BSS'
-COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD
+COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD:4
.bss ENDS
___
# comment out OPENSSL_ia32cap_P declarations
diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl
index ca2511c9eb91..5d92f6092ac9 100644
--- a/crypto/perlasm/x86nasm.pl
+++ b/crypto/perlasm/x86nasm.pl
@@ -36,6 +36,8 @@ sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
my($post,$ret);
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
if ($size ne "")
{ $ret .= "$size";
$ret .= " PTR" if ($::mwerks);
@@ -117,7 +119,7 @@ sub ::file_end
{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
${drdecor}segment .bss
-${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 8
+${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16
___
# comment out OPENSSL_ia32cap_P declarations
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
diff --git a/crypto/pkcs12/p12_decr.c b/crypto/pkcs12/p12_decr.c
index d46eae3c66a6..b40ea10ccbee 100644
--- a/crypto/pkcs12/p12_decr.c
+++ b/crypto/pkcs12/p12_decr.c
@@ -171,28 +171,32 @@ ASN1_OCTET_STRING *PKCS12_item_i2d_encrypt(X509_ALGOR *algor,
const char *pass, int passlen,
void *obj, int zbuf)
{
- ASN1_OCTET_STRING *oct;
+ ASN1_OCTET_STRING *oct = NULL;
unsigned char *in = NULL;
int inlen;
if (!(oct = M_ASN1_OCTET_STRING_new())) {
PKCS12err(PKCS12_F_PKCS12_ITEM_I2D_ENCRYPT, ERR_R_MALLOC_FAILURE);
- return NULL;
+ goto err;
}
inlen = ASN1_item_i2d(obj, &in, it);
if (!in) {
PKCS12err(PKCS12_F_PKCS12_ITEM_I2D_ENCRYPT, PKCS12_R_ENCODE_ERROR);
- return NULL;
+ goto err;
}
if (!PKCS12_pbe_crypt(algor, pass, passlen, in, inlen, &oct->data,
&oct->length, 1)) {
PKCS12err(PKCS12_F_PKCS12_ITEM_I2D_ENCRYPT, PKCS12_R_ENCRYPT_ERROR);
OPENSSL_free(in);
- return NULL;
+ goto err;
}
if (zbuf)
OPENSSL_cleanse(in, inlen);
OPENSSL_free(in);
return oct;
+ err:
+ if (oct)
+ ASN1_OCTET_STRING_free(oct);
+ return NULL;
}
IMPLEMENT_PKCS12_STACK_OF(PKCS7)
diff --git a/crypto/pkcs12/p12_p8e.c b/crypto/pkcs12/p12_p8e.c
index d970f0544c5e..861a087f80d7 100644
--- a/crypto/pkcs12/p12_p8e.c
+++ b/crypto/pkcs12/p12_p8e.c
@@ -76,8 +76,12 @@ X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher,
if (pbe_nid == -1)
pbe = PKCS5_pbe2_set(cipher, iter, salt, saltlen);
- else
+ else if (EVP_PBE_find(EVP_PBE_TYPE_PRF, pbe_nid, NULL, NULL, 0))
+ pbe = PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, pbe_nid);
+ else {
+ ERR_clear_error();
pbe = PKCS5_pbe_set(pbe_nid, iter, salt, saltlen);
+ }
if (!pbe) {
PKCS12err(PKCS12_F_PKCS8_ENCRYPT, ERR_R_ASN1_LIB);
goto err;
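[Editorial note: with this change, passing a PRF NID to PKCS8_encrypt() selects PBES2 with that PRF instead of failing the legacy PBE lookup. A hedged usage sketch; p8inf and pass are assumed to exist, and the cipher/iteration choices are arbitrary:]

    #include <string.h>
    #include <openssl/evp.h>
    #include <openssl/objects.h>
    #include <openssl/x509.h>

    /* PBES2 with AES-256-CBC and HMAC-SHA256 as the KDF PRF. */
    X509_SIG *encrypt_key(PKCS8_PRIV_KEY_INFO *p8inf, const char *pass)
    {
        return PKCS8_encrypt(NID_hmacWithSHA256, EVP_aes_256_cbc(),
                             pass, (int)strlen(pass), NULL, 0, 2048, p8inf);
    }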
diff --git a/crypto/ppc_arch.h b/crypto/ppc_arch.h
new file mode 100644
index 000000000000..b50ec996a5a3
--- /dev/null
+++ b/crypto/ppc_arch.h
@@ -0,0 +1,10 @@
+#ifndef __PPC_ARCH_H__
+# define __PPC_ARCH_H__
+
+extern unsigned int OPENSSL_ppccap_P;
+
+# define PPC_FPU64 (1<<0)
+# define PPC_ALTIVEC (1<<1)
+# define PPC_CRYPTO207 (1<<2)
+
+#endif
diff --git a/crypto/ppccap.c b/crypto/ppccap.c
index 5242294310b1..2b7f704cd82a 100644
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@@ -4,13 +4,15 @@
#include <setjmp.h>
#include <signal.h>
#include <unistd.h>
+#if defined(__linux) || defined(_AIX)
+# include <sys/utsname.h>
+#endif
#include <crypto.h>
#include <openssl/bn.h>
-#define PPC_FPU64 (1<<0)
-#define PPC_ALTIVEC (1<<1)
+#include "ppc_arch.h"
-static int OPENSSL_ppccap_P = 0;
+unsigned int OPENSSL_ppccap_P = 0;
static sigset_t all_masked;
@@ -25,7 +27,7 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
if (sizeof(size_t) == 4) {
-# if (defined(__APPLE__) && defined(__MACH__))
+# if 1 || (defined(__APPLE__) && defined(__MACH__))
if (num >= 8 && (num & 3) == 0 && (OPENSSL_ppccap_P & PPC_FPU64))
return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
# else
@@ -55,6 +57,22 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
}
#endif
+void sha256_block_p8(void *ctx, const void *inp, size_t len);
+void sha256_block_ppc(void *ctx, const void *inp, size_t len);
+void sha256_block_data_order(void *ctx, const void *inp, size_t len)
+{
+ OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha256_block_p8(ctx, inp, len) :
+ sha256_block_ppc(ctx, inp, len);
+}
+
+void sha512_block_p8(void *ctx, const void *inp, size_t len);
+void sha512_block_ppc(void *ctx, const void *inp, size_t len);
+void sha512_block_data_order(void *ctx, const void *inp, size_t len)
+{
+ OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha512_block_p8(ctx, inp, len) :
+ sha512_block_ppc(ctx, inp, len);
+}
+
static sigjmp_buf ill_jmp;
static void ill_handler(int sig)
{
@@ -63,6 +81,7 @@ static void ill_handler(int sig)
void OPENSSL_ppc64_probe(void);
void OPENSSL_altivec_probe(void);
+void OPENSSL_crypto207_probe(void);
void OPENSSL_cpuid_setup(void)
{
@@ -93,12 +112,15 @@ void OPENSSL_cpuid_setup(void)
OPENSSL_ppccap_P = 0;
#if defined(_AIX)
- if (sizeof(size_t) == 4
+ if (sizeof(size_t) == 4) {
+ struct utsname uts;
# if defined(_SC_AIX_KERNEL_BITMODE)
- && sysconf(_SC_AIX_KERNEL_BITMODE) != 64
+ if (sysconf(_SC_AIX_KERNEL_BITMODE) != 64)
+ return;
# endif
- )
- return;
+ if (uname(&uts) != 0 || atoi(uts.version) < 6)
+ return;
+ }
#endif
memset(&ill_act, 0, sizeof(ill_act));
@@ -109,10 +131,14 @@ void OPENSSL_cpuid_setup(void)
sigaction(SIGILL, &ill_act, &ill_oact);
if (sizeof(size_t) == 4) {
- if (sigsetjmp(ill_jmp, 1) == 0) {
- OPENSSL_ppc64_probe();
- OPENSSL_ppccap_P |= PPC_FPU64;
- }
+#ifdef __linux
+ struct utsname uts;
+ if (uname(&uts) == 0 && strcmp(uts.machine, "ppc64") == 0)
+#endif
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ OPENSSL_ppc64_probe();
+ OPENSSL_ppccap_P |= PPC_FPU64;
+ }
} else {
/*
* Wanted code detecting POWER6 CPU and setting PPC_FPU64
@@ -122,6 +148,10 @@ void OPENSSL_cpuid_setup(void)
if (sigsetjmp(ill_jmp, 1) == 0) {
OPENSSL_altivec_probe();
OPENSSL_ppccap_P |= PPC_ALTIVEC;
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ OPENSSL_crypto207_probe();
+ OPENSSL_ppccap_P |= PPC_CRYPTO207;
+ }
}
sigaction(SIGILL, &ill_oact, NULL);
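[Editorial note: the probe pattern used throughout OPENSSL_cpuid_setup() executes one candidate instruction under a temporary SIGILL handler; if the CPU traps, siglongjmp skips setting the capability bit. A condensed sketch of the idiom — have_feature() and probe_insn are illustrative names, with OPENSSL_crypto207_probe() being one real probe:]

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    static sigjmp_buf probe_jmp;
    static void probe_ill(int sig) { siglongjmp(probe_jmp, sig); }

    static int have_feature(void (*probe_insn)(void))
    {
        struct sigaction sa, old;
        int ok = 0;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = probe_ill;
        sigaction(SIGILL, &sa, &old);
        if (sigsetjmp(probe_jmp, 1) == 0) {
            (*probe_insn)();        /* raises SIGILL if unsupported */
            ok = 1;
        }
        sigaction(SIGILL, &old, NULL);
        return ok;
    }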
diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
index 4ba736a1d1bd..8d800fe7d36f 100755
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@@ -31,6 +31,7 @@ $code=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .OPENSSL_ppc64_probe,.-.OPENSSL_ppc64_probe
.globl .OPENSSL_altivec_probe
.align 4
@@ -39,6 +40,17 @@ $code=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .OPENSSL_altivec_probe,.-.OPENSSL_altivec_probe
+
+.globl .OPENSSL_crypto207_probe
+.align 4
+.OPENSSL_crypto207_probe:
+ lvx_u v0,0,r1
+ vcipher v0,v0,v0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size .OPENSSL_crypto207_probe,.-.OPENSSL_crypto207_probe
.globl .OPENSSL_wipe_cpu
.align 4
@@ -71,6 +83,7 @@ $code=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .OPENSSL_wipe_cpu,.-.OPENSSL_wipe_cpu
.globl .OPENSSL_atomic_add
.align 4
@@ -84,6 +97,7 @@ Ladd: lwarx r5,0,r3
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
+.size .OPENSSL_atomic_add,.-.OPENSSL_atomic_add
.globl .OPENSSL_rdtsc
.align 4
@@ -93,6 +107,7 @@ Ladd: lwarx r5,0,r3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .OPENSSL_rdtsc,.-.OPENSSL_rdtsc
.globl .OPENSSL_cleanse
.align 4
@@ -125,6 +140,7 @@ Laligned:
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
+.size .OPENSSL_cleanse,.-.OPENSSL_cleanse
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile
index f3d6e0dc0405..7434ff737e66 100644
--- a/crypto/rc4/Makefile
+++ b/crypto/rc4/Makefile
@@ -42,7 +42,7 @@ lib: $(LIBOBJ)
@touch lib
rc4-586.s: asm/rc4-586.pl ../perlasm/x86asm.pl
- $(PERL) asm/rc4-586.pl $(PERLASM_SCHEME) $(CFLAGS) > $@
+ $(PERL) asm/rc4-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
rc4-x86_64.s: asm/rc4-x86_64.pl
$(PERL) asm/rc4-x86_64.pl $(PERLASM_SCHEME) > $@
@@ -66,7 +66,7 @@ rc4-ia64.s: rc4-ia64.S
rc4-%.s: asm/rc4-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
files:
- $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+ $(PERL) $(TOP)/util/files.pl "RC4_ENC=$(RC4_ENC)" Makefile >> $(TOP)/MINFO
links:
@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 5c9ac6ad286e..1d55d551e9bb 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -60,7 +60,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"rc4-586.pl");
+&asm_init($ARGV[0],"rc4-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
$xx="eax";
$yy="ebx";
@@ -184,8 +184,11 @@ if ($alt=0) {
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
- &test ($ty,-8);
&mov (&wparam(3),$out); # $out as accumulator in these loops
+ if ($x86only) {
+ &jmp (&label("go4loop4"));
+ } else {
+ &test ($ty,-8);
&jz (&label("go4loop4"));
&picmeup($out,"OPENSSL_ia32cap_P");
@@ -228,6 +231,7 @@ if ($alt=0) {
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&jmp (&label("loop1"));
+ }
&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
diff --git a/crypto/rc4/rc4_enc.c b/crypto/rc4/rc4_enc.c
index 6ebd54d46cad..0f0a2487a7f7 100644
--- a/crypto/rc4/rc4_enc.c
+++ b/crypto/rc4/rc4_enc.c
@@ -79,7 +79,7 @@ void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
y = key->y;
d = key->data;
-#if defined(RC4_CHUNK)
+#if defined(RC4_CHUNK) && !defined(PEDANTIC)
/*-
* The original reason for implementing this(*) was the fact that
* pre-21164a Alpha CPUs don't have byte load/store instructions
diff --git a/crypto/rc5/rc5_locl.h b/crypto/rc5/rc5_locl.h
index 1e83f19fecb7..ee757e647765 100644
--- a/crypto/rc5/rc5_locl.h
+++ b/crypto/rc5/rc5_locl.h
@@ -146,9 +146,12 @@
*((c)++)=(unsigned char)(((l)>> 8L)&0xff), \
*((c)++)=(unsigned char)(((l) )&0xff))
-#if (defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER)) || defined(__ICC)
+#if (defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER))
# define ROTATE_l32(a,n) _lrotl(a,n)
# define ROTATE_r32(a,n) _lrotr(a,n)
+#elif defined(__ICC)
+# define ROTATE_l32(a,n) _rotl(a,n)
+# define ROTATE_r32(a,n) _rotr(a,n)
#elif defined(__GNUC__) && __GNUC__>=2 && !defined(__STRICT_ANSI__) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
# define ROTATE_l32(a,n) ({ register unsigned int ret; \
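[Editorial note: for compilers that hit neither the intrinsic branch above nor the inline-asm branch, an equivalent portable fallback would look like this sketch (not part of the header):]

    static unsigned int rotate_l32(unsigned int a, unsigned int n)
    {
        n &= 31;
        return (a << n) | (a >> ((32 - n) & 31));
    }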
diff --git a/crypto/rsa/Makefile b/crypto/rsa/Makefile
index 79c7c421bc09..e292e84db3c9 100644
--- a/crypto/rsa/Makefile
+++ b/crypto/rsa/Makefile
@@ -228,19 +228,20 @@ rsa_pk1.o: ../cryptlib.h rsa_pk1.c
rsa_pmeth.o: ../../e_os.h ../../include/openssl/asn1.h
rsa_pmeth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h
rsa_pmeth.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h
-rsa_pmeth.o: ../../include/openssl/cms.h ../../include/openssl/crypto.h
-rsa_pmeth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
-rsa_pmeth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
-rsa_pmeth.o: ../../include/openssl/err.h ../../include/openssl/evp.h
-rsa_pmeth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
-rsa_pmeth.o: ../../include/openssl/objects.h
+rsa_pmeth.o: ../../include/openssl/cms.h ../../include/openssl/conf.h
+rsa_pmeth.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+rsa_pmeth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
+rsa_pmeth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
+rsa_pmeth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
+rsa_pmeth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
rsa_pmeth.o: ../../include/openssl/opensslconf.h
rsa_pmeth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
rsa_pmeth.o: ../../include/openssl/pkcs7.h ../../include/openssl/rsa.h
rsa_pmeth.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
rsa_pmeth.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
rsa_pmeth.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h
-rsa_pmeth.o: ../cryptlib.h ../evp/evp_locl.h rsa_locl.h rsa_pmeth.c
+rsa_pmeth.o: ../../include/openssl/x509v3.h ../cryptlib.h ../evp/evp_locl.h
+rsa_pmeth.o: rsa_locl.h rsa_pmeth.c
rsa_prn.o: ../../e_os.h ../../include/openssl/asn1.h
rsa_prn.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
rsa_prn.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
diff --git a/crypto/rsa/rsa.h b/crypto/rsa/rsa.h
index a8b59a9559a9..d2ee37406e3c 100644
--- a/crypto/rsa/rsa.h
+++ b/crypto/rsa/rsa.h
@@ -262,13 +262,31 @@ struct rsa_st {
EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp)
# define EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md) \
- EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+ EVP_PKEY_OP_TYPE_SIG | EVP_PKEY_OP_TYPE_CRYPT, \
EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md)
+# define EVP_PKEY_CTX_set_rsa_oaep_md(ctx, md) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, \
+ EVP_PKEY_CTRL_RSA_OAEP_MD, 0, (void *)md)
+
# define EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd) \
- EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+ EVP_PKEY_OP_TYPE_SIG | EVP_PKEY_OP_TYPE_CRYPT, \
EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd)
+# define EVP_PKEY_CTX_get_rsa_oaep_md(ctx, pmd) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, \
+ EVP_PKEY_CTRL_GET_RSA_OAEP_MD, 0, (void *)pmd)
+
+# define EVP_PKEY_CTX_set0_rsa_oaep_label(ctx, l, llen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, \
+ EVP_PKEY_CTRL_RSA_OAEP_LABEL, llen, (void *)l)
+
+# define EVP_PKEY_CTX_get0_rsa_oaep_label(ctx, l) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, \
+ EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL, 0, (void *)l)
+
# define EVP_PKEY_CTRL_RSA_PADDING (EVP_PKEY_ALG_CTRL + 1)
# define EVP_PKEY_CTRL_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 2)
@@ -280,6 +298,12 @@ struct rsa_st {
# define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 7)
# define EVP_PKEY_CTRL_GET_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 8)
+# define EVP_PKEY_CTRL_RSA_OAEP_MD (EVP_PKEY_ALG_CTRL + 9)
+# define EVP_PKEY_CTRL_RSA_OAEP_LABEL (EVP_PKEY_ALG_CTRL + 10)
+
+# define EVP_PKEY_CTRL_GET_RSA_OAEP_MD (EVP_PKEY_ALG_CTRL + 11)
+# define EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL (EVP_PKEY_ALG_CTRL + 12)
+
# define RSA_PKCS1_PADDING 1
# define RSA_SSLV23_PADDING 2
# define RSA_NO_PADDING 3
@@ -347,6 +371,14 @@ typedef struct rsa_pss_params_st {
DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+typedef struct rsa_oaep_params_st {
+ X509_ALGOR *hashFunc;
+ X509_ALGOR *maskGenFunc;
+ X509_ALGOR *pSourceFunc;
+} RSA_OAEP_PARAMS;
+
+DECLARE_ASN1_FUNCTIONS(RSA_OAEP_PARAMS)
+
# ifndef OPENSSL_NO_FP_API
int RSA_print_fp(FILE *fp, const RSA *r, int offset);
# endif
@@ -414,6 +446,15 @@ int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen,
int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
const unsigned char *f, int fl, int rsa_len,
const unsigned char *p, int pl);
+int RSA_padding_add_PKCS1_OAEP_mgf1(unsigned char *to, int tlen,
+ const unsigned char *from, int flen,
+ const unsigned char *param, int plen,
+ const EVP_MD *md, const EVP_MD *mgf1md);
+int RSA_padding_check_PKCS1_OAEP_mgf1(unsigned char *to, int tlen,
+ const unsigned char *from, int flen,
+ int num, const unsigned char *param,
+ int plen, const EVP_MD *md,
+ const EVP_MD *mgf1md);
int RSA_padding_add_SSLv23(unsigned char *to, int tlen,
const unsigned char *f, int fl);
int RSA_padding_check_SSLv23(unsigned char *to, int tlen,
@@ -494,8 +535,10 @@ void ERR_load_RSA_strings(void);
# define RSA_F_PKEY_RSA_SIGN 142
# define RSA_F_PKEY_RSA_VERIFY 154
# define RSA_F_PKEY_RSA_VERIFYRECOVER 141
+# define RSA_F_RSA_ALGOR_TO_MD 157
# define RSA_F_RSA_BUILTIN_KEYGEN 129
# define RSA_F_RSA_CHECK_KEY 123
+# define RSA_F_RSA_CMS_DECRYPT 158
# define RSA_F_RSA_EAY_PRIVATE_DECRYPT 101
# define RSA_F_RSA_EAY_PRIVATE_ENCRYPT 102
# define RSA_F_RSA_EAY_PUBLIC_DECRYPT 103
@@ -504,6 +547,7 @@ void ERR_load_RSA_strings(void);
# define RSA_F_RSA_GENERATE_KEY_EX 155
# define RSA_F_RSA_ITEM_VERIFY 156
# define RSA_F_RSA_MEMORY_LOCK 130
+# define RSA_F_RSA_MGF1_TO_MD 159
# define RSA_F_RSA_NEW_METHOD 106
# define RSA_F_RSA_NULL 124
# define RSA_F_RSA_NULL_MOD_EXP 131
@@ -513,6 +557,7 @@ void ERR_load_RSA_strings(void);
# define RSA_F_RSA_NULL_PUBLIC_ENCRYPT 135
# define RSA_F_RSA_PADDING_ADD_NONE 107
# define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP 121
+# define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP_MGF1 160
# define RSA_F_RSA_PADDING_ADD_PKCS1_PSS 125
# define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1 148
# define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1 108
@@ -521,6 +566,7 @@ void ERR_load_RSA_strings(void);
# define RSA_F_RSA_PADDING_ADD_X931 127
# define RSA_F_RSA_PADDING_CHECK_NONE 111
# define RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP 122
+# define RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP_MGF1 161
# define RSA_F_RSA_PADDING_CHECK_PKCS1_TYPE_1 112
# define RSA_F_RSA_PADDING_CHECK_PKCS1_TYPE_2 113
# define RSA_F_RSA_PADDING_CHECK_SSLV23 114
@@ -531,6 +577,7 @@ void ERR_load_RSA_strings(void);
# define RSA_F_RSA_PRIVATE_ENCRYPT 151
# define RSA_F_RSA_PRIV_DECODE 137
# define RSA_F_RSA_PRIV_ENCODE 138
+# define RSA_F_RSA_PSS_TO_CTX 162
# define RSA_F_RSA_PUBLIC_DECRYPT 152
# define RSA_F_RSA_PUBLIC_ENCRYPT 153
# define RSA_F_RSA_PUB_DECODE 139
@@ -556,17 +603,21 @@ void ERR_load_RSA_strings(void);
# define RSA_R_DATA_TOO_LARGE_FOR_MODULUS 132
# define RSA_R_DATA_TOO_SMALL 111
# define RSA_R_DATA_TOO_SMALL_FOR_KEY_SIZE 122
+# define RSA_R_DIGEST_DOES_NOT_MATCH 166
# define RSA_R_DIGEST_TOO_BIG_FOR_RSA_KEY 112
# define RSA_R_DMP1_NOT_CONGRUENT_TO_D 124
# define RSA_R_DMQ1_NOT_CONGRUENT_TO_D 125
# define RSA_R_D_E_NOT_CONGRUENT_TO_1 123
# define RSA_R_FIRST_OCTET_INVALID 133
# define RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE 144
+# define RSA_R_INVALID_DIGEST 160
# define RSA_R_INVALID_DIGEST_LENGTH 143
# define RSA_R_INVALID_HEADER 137
# define RSA_R_INVALID_KEYBITS 145
+# define RSA_R_INVALID_LABEL 161
# define RSA_R_INVALID_MESSAGE_LENGTH 131
# define RSA_R_INVALID_MGF1_MD 156
+# define RSA_R_INVALID_OAEP_PARAMETERS 162
# define RSA_R_INVALID_PADDING 138
# define RSA_R_INVALID_PADDING_MODE 141
# define RSA_R_INVALID_PSS_PARAMETERS 149
@@ -595,9 +646,12 @@ void ERR_load_RSA_strings(void);
# define RSA_R_SSLV3_ROLLBACK_ATTACK 115
# define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116
# define RSA_R_UNKNOWN_ALGORITHM_TYPE 117
+# define RSA_R_UNKNOWN_DIGEST 163
# define RSA_R_UNKNOWN_MASK_DIGEST 151
# define RSA_R_UNKNOWN_PADDING_TYPE 118
# define RSA_R_UNKNOWN_PSS_DIGEST 152
+# define RSA_R_UNSUPPORTED_ENCRYPTION_TYPE 164
+# define RSA_R_UNSUPPORTED_LABEL_SOURCE 165
# define RSA_R_UNSUPPORTED_MASK_ALGORITHM 153
# define RSA_R_UNSUPPORTED_MASK_PARAMETER 154
# define RSA_R_UNSUPPORTED_SIGNATURE_TYPE 155
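[Editorial note: taken together, the new controls above let a caller configure RSAES-OAEP on an EVP_PKEY_CTX. A hedged usage sketch — error handling is abbreviated, the label content is arbitrary, and since set0 transfers ownership the label must be OPENSSL_malloc'd:]

    #include <string.h>
    #include <openssl/crypto.h>
    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    int setup_oaep(EVP_PKEY_CTX *ctx)
    {
        unsigned char *label = OPENSSL_malloc(5);
        if (label == NULL)
            return 0;
        memcpy(label, "hello", 5);
        if (EVP_PKEY_encrypt_init(ctx) <= 0
            || EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_OAEP_PADDING) <= 0
            || EVP_PKEY_CTX_set_rsa_oaep_md(ctx, EVP_sha256()) <= 0
            || EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, EVP_sha256()) <= 0
            || EVP_PKEY_CTX_set0_rsa_oaep_label(ctx, label, 5) <= 0)
            return 0;
        return 1;
    }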
diff --git a/crypto/rsa/rsa_ameth.c b/crypto/rsa/rsa_ameth.c
index 93e071de75e2..ca3922e6c298 100644
--- a/crypto/rsa/rsa_ameth.c
+++ b/crypto/rsa/rsa_ameth.c
@@ -68,6 +68,11 @@
#endif
#include "asn1_locl.h"
+static int rsa_cms_sign(CMS_SignerInfo *si);
+static int rsa_cms_verify(CMS_SignerInfo *si);
+static int rsa_cms_decrypt(CMS_RecipientInfo *ri);
+static int rsa_cms_encrypt(CMS_RecipientInfo *ri);
+
static int rsa_pub_encode(X509_PUBKEY *pk, const EVP_PKEY *pkey)
{
unsigned char *penc = NULL;
@@ -258,6 +263,23 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent,
return do_rsa_print(bp, pkey->pkey.rsa, indent, 1);
}
+/* Given an MGF1 Algorithm ID, decode its parameter to the underlying hash Algorithm Identifier */
+static X509_ALGOR *rsa_mgf1_decode(X509_ALGOR *alg)
+{
+ const unsigned char *p;
+ int plen;
+ if (alg == NULL)
+ return NULL;
+ if (OBJ_obj2nid(alg->algorithm) != NID_mgf1)
+ return NULL;
+ if (alg->parameter->type != V_ASN1_SEQUENCE)
+ return NULL;
+
+ p = alg->parameter->value.sequence->data;
+ plen = alg->parameter->value.sequence->length;
+ return d2i_X509_ALGOR(NULL, &p, plen);
+}
+
static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg,
X509_ALGOR **pmaskHash)
{
@@ -276,15 +298,7 @@ static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg,
if (!pss)
return NULL;
- if (pss->maskGenAlgorithm) {
- ASN1_TYPE *param = pss->maskGenAlgorithm->parameter;
- if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1
- && param->type == V_ASN1_SEQUENCE) {
- p = param->value.sequence->data;
- plen = param->value.sequence->length;
- *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen);
- }
- }
+ *pmaskHash = rsa_mgf1_decode(pss->maskGenAlgorithm);
return pss;
}
@@ -401,17 +415,25 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
#ifndef OPENSSL_NO_CMS
case ASN1_PKEY_CTRL_CMS_SIGN:
if (arg1 == 0)
- CMS_SignerInfo_get0_algs(arg2, NULL, NULL, NULL, &alg);
+ return rsa_cms_sign(arg2);
+ else if (arg1 == 1)
+ return rsa_cms_verify(arg2);
break;
case ASN1_PKEY_CTRL_CMS_ENVELOPE:
if (arg1 == 0)
- CMS_RecipientInfo_ktri_get0_algs(arg2, NULL, NULL, &alg);
+ return rsa_cms_encrypt(arg2);
+ else if (arg1 == 1)
+ return rsa_cms_decrypt(arg2);
break;
+
+ case ASN1_PKEY_CTRL_CMS_RI_TYPE:
+ *(int *)arg2 = CMS_RECIPINFO_TRANS;
+ return 1;
#endif
case ASN1_PKEY_CTRL_DEFAULT_MD_NID:
- *(int *)arg2 = NID_sha1;
+ *(int *)arg2 = NID_sha256;
return 1;
default:
@@ -426,59 +448,166 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
}
+/* Allocate and set algorithm ID from EVP_MD; SHA1 (the default) is omitted */
+static int rsa_md_to_algor(X509_ALGOR **palg, const EVP_MD *md)
+{
+ if (EVP_MD_type(md) == NID_sha1)
+ return 1;
+ *palg = X509_ALGOR_new();
+ if (!*palg)
+ return 0;
+ X509_ALGOR_set_md(*palg, md);
+ return 1;
+}
+
+/* Allocate and set MGF1 algorithm ID from EVP_MD */
+static int rsa_md_to_mgf1(X509_ALGOR **palg, const EVP_MD *mgf1md)
+{
+ X509_ALGOR *algtmp = NULL;
+ ASN1_STRING *stmp = NULL;
+ *palg = NULL;
+ if (EVP_MD_type(mgf1md) == NID_sha1)
+ return 1;
+ /* need to embed algorithm ID inside another */
+ if (!rsa_md_to_algor(&algtmp, mgf1md))
+ goto err;
+ if (!ASN1_item_pack(algtmp, ASN1_ITEM_rptr(X509_ALGOR), &stmp))
+ goto err;
+ *palg = X509_ALGOR_new();
+ if (!*palg)
+ goto err;
+ X509_ALGOR_set0(*palg, OBJ_nid2obj(NID_mgf1), V_ASN1_SEQUENCE, stmp);
+ stmp = NULL;
+ err:
+ if (stmp)
+ ASN1_STRING_free(stmp);
+ if (algtmp)
+ X509_ALGOR_free(algtmp);
+ if (*palg)
+ return 1;
+ return 0;
+}
+
+/* convert algorithm ID to EVP_MD, default SHA1 */
+static const EVP_MD *rsa_algor_to_md(X509_ALGOR *alg)
+{
+ const EVP_MD *md;
+ if (!alg)
+ return EVP_sha1();
+ md = EVP_get_digestbyobj(alg->algorithm);
+ if (md == NULL)
+ RSAerr(RSA_F_RSA_ALGOR_TO_MD, RSA_R_UNKNOWN_DIGEST);
+ return md;
+}
+
+/* convert MGF1 algorithm ID to EVP_MD, default SHA1 */
+static const EVP_MD *rsa_mgf1_to_md(X509_ALGOR *alg, X509_ALGOR *maskHash)
+{
+ const EVP_MD *md;
+ if (!alg)
+ return EVP_sha1();
+ /* Check mask and lookup mask hash algorithm */
+ if (OBJ_obj2nid(alg->algorithm) != NID_mgf1) {
+ RSAerr(RSA_F_RSA_MGF1_TO_MD, RSA_R_UNSUPPORTED_MASK_ALGORITHM);
+ return NULL;
+ }
+ if (!maskHash) {
+ RSAerr(RSA_F_RSA_MGF1_TO_MD, RSA_R_UNSUPPORTED_MASK_PARAMETER);
+ return NULL;
+ }
+ md = EVP_get_digestbyobj(maskHash->algorithm);
+ if (md == NULL) {
+ RSAerr(RSA_F_RSA_MGF1_TO_MD, RSA_R_UNKNOWN_MASK_DIGEST);
+ return NULL;
+ }
+ return md;
+}
+
/*
- * Customised RSA item verification routine. This is called when a signature
- * is encountered requiring special handling. We currently only handle PSS.
+ * Convert an EVP_PKEY_CTX in PSS mode into the corresponding algorithm parameter,
+ * suitable for setting an AlgorithmIdentifier.
*/
-static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
- X509_ALGOR *sigalg, ASN1_BIT_STRING *sig,
- EVP_PKEY *pkey)
+static ASN1_STRING *rsa_ctx_to_pss(EVP_PKEY_CTX *pkctx)
+{
+ const EVP_MD *sigmd, *mgf1md;
+ RSA_PSS_PARAMS *pss = NULL;
+ ASN1_STRING *os = NULL;
+ EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx);
+ int saltlen, rv = 0;
+ if (EVP_PKEY_CTX_get_signature_md(pkctx, &sigmd) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
+ goto err;
+ if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen))
+ goto err;
+ if (saltlen == -1)
+ saltlen = EVP_MD_size(sigmd);
+ else if (saltlen == -2) {
+ saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2;
+ if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0)
+ saltlen--;
+ }
+ pss = RSA_PSS_PARAMS_new();
+ if (!pss)
+ goto err;
+ if (saltlen != 20) {
+ pss->saltLength = ASN1_INTEGER_new();
+ if (!pss->saltLength)
+ goto err;
+ if (!ASN1_INTEGER_set(pss->saltLength, saltlen))
+ goto err;
+ }
+ if (!rsa_md_to_algor(&pss->hashAlgorithm, sigmd))
+ goto err;
+ if (!rsa_md_to_mgf1(&pss->maskGenAlgorithm, mgf1md))
+ goto err;
+ /* Finally create string with pss parameter encoding. */
+ if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os))
+ goto err;
+ rv = 1;
+ err:
+ if (pss)
+ RSA_PSS_PARAMS_free(pss);
+ if (rv)
+ return os;
+ if (os)
+ ASN1_STRING_free(os);
+ return NULL;
+}
+
+/*
+ * Set public key parameters from a PSS AlgorithmIdentifier. If pkey isn't
+ * NULL the EVP_MD_CTX is set up and initialised. If it is NULL the
+ * parameters are passed to pkctx instead.
+ */
+
+static int rsa_pss_to_ctx(EVP_MD_CTX *ctx, EVP_PKEY_CTX *pkctx,
+ X509_ALGOR *sigalg, EVP_PKEY *pkey)
{
int rv = -1;
int saltlen;
const EVP_MD *mgf1md = NULL, *md = NULL;
RSA_PSS_PARAMS *pss;
X509_ALGOR *maskHash;
- EVP_PKEY_CTX *pkctx;
/* Sanity check: make sure it is PSS */
if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
+ RSAerr(RSA_F_RSA_PSS_TO_CTX, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
return -1;
}
/* Decode PSS parameters */
pss = rsa_pss_decode(sigalg, &maskHash);
if (pss == NULL) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS);
+ RSAerr(RSA_F_RSA_PSS_TO_CTX, RSA_R_INVALID_PSS_PARAMETERS);
goto err;
}
- /* Check mask and lookup mask hash algorithm */
- if (pss->maskGenAlgorithm) {
- if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM);
- goto err;
- }
- if (!maskHash) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER);
- goto err;
- }
- mgf1md = EVP_get_digestbyobj(maskHash->algorithm);
- if (mgf1md == NULL) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST);
- goto err;
- }
- } else
- mgf1md = EVP_sha1();
-
- if (pss->hashAlgorithm) {
- md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm);
- if (md == NULL) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST);
- goto err;
- }
- } else
- md = EVP_sha1();
+ mgf1md = rsa_mgf1_to_md(pss->maskGenAlgorithm, maskHash);
+ if (!mgf1md)
+ goto err;
+ md = rsa_algor_to_md(pss->hashAlgorithm);
+ if (!md)
+ goto err;
if (pss->saltLength) {
saltlen = ASN1_INTEGER_get(pss->saltLength);
@@ -488,7 +617,7 @@ static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
* routines will trap other invalid values anyway.
*/
if (saltlen < 0) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH);
+ RSAerr(RSA_F_RSA_PSS_TO_CTX, RSA_R_INVALID_SALT_LENGTH);
goto err;
}
} else
@@ -499,14 +628,24 @@ static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
* PKCS#1 says we should reject any other value anyway.
*/
if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1) {
- RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER);
+ RSAerr(RSA_F_RSA_PSS_TO_CTX, RSA_R_INVALID_TRAILER);
goto err;
}
/* We have all parameters now set up context */
- if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey))
- goto err;
+ if (pkey) {
+ if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey))
+ goto err;
+ } else {
+ const EVP_MD *checkmd;
+ if (EVP_PKEY_CTX_get_signature_md(pkctx, &checkmd) <= 0)
+ goto err;
+ if (EVP_MD_type(md) != EVP_MD_type(checkmd)) {
+ RSAerr(RSA_F_RSA_PSS_TO_CTX, RSA_R_DIGEST_DOES_NOT_MATCH);
+ goto err;
+ }
+ }
if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0)
goto err;
@@ -517,7 +656,7 @@ static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0)
goto err;
/* Carry on */
- rv = 2;
+ rv = 1;
err:
RSA_PSS_PARAMS_free(pss);
@@ -526,6 +665,71 @@ static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
return rv;
}
+static int rsa_cms_verify(CMS_SignerInfo *si)
+{
+ int nid, nid2;
+ X509_ALGOR *alg;
+ EVP_PKEY_CTX *pkctx = CMS_SignerInfo_get0_pkey_ctx(si);
+ CMS_SignerInfo_get0_algs(si, NULL, NULL, NULL, &alg);
+ nid = OBJ_obj2nid(alg->algorithm);
+ if (nid == NID_rsaEncryption)
+ return 1;
+ if (nid == NID_rsassaPss)
+ return rsa_pss_to_ctx(NULL, pkctx, alg, NULL);
+ /* Workaround for some implementations that use a signature OID */
+ if (OBJ_find_sigid_algs(nid, NULL, &nid2)) {
+ if (nid2 == NID_rsaEncryption)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Customised RSA item verification routine. This is called when a signature
+ * is encountered requiring special handling. We currently only handle PSS.
+ */
+
+static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *sigalg, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey)
+{
+ /* Sanity check: make sure it is PSS */
+ if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
+ return -1;
+ }
+ if (rsa_pss_to_ctx(ctx, NULL, sigalg, pkey) > 0) {
+ /* Carry on */
+ return 2;
+ }
+ return -1;
+}
+
+static int rsa_cms_sign(CMS_SignerInfo *si)
+{
+ int pad_mode = RSA_PKCS1_PADDING;
+ X509_ALGOR *alg;
+ EVP_PKEY_CTX *pkctx = CMS_SignerInfo_get0_pkey_ctx(si);
+ ASN1_STRING *os = NULL;
+ CMS_SignerInfo_get0_algs(si, NULL, NULL, NULL, &alg);
+ if (pkctx) {
+ if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0)
+ return 0;
+ }
+ if (pad_mode == RSA_PKCS1_PADDING) {
+ X509_ALGOR_set0(alg, OBJ_nid2obj(NID_rsaEncryption), V_ASN1_NULL, 0);
+ return 1;
+ }
+ /* We don't support it */
+ if (pad_mode != RSA_PKCS1_PSS_PADDING)
+ return 0;
+ os = rsa_ctx_to_pss(pkctx);
+ if (!os)
+ return 0;
+ X509_ALGOR_set0(alg, OBJ_nid2obj(NID_rsassaPss), V_ASN1_SEQUENCE, os);
+ return 1;
+}
+
static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
X509_ALGOR *alg1, X509_ALGOR *alg2,
ASN1_BIT_STRING *sig)
@@ -537,78 +741,184 @@ static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
if (pad_mode == RSA_PKCS1_PADDING)
return 2;
if (pad_mode == RSA_PKCS1_PSS_PADDING) {
- const EVP_MD *sigmd, *mgf1md;
- RSA_PSS_PARAMS *pss = NULL;
- X509_ALGOR *mgf1alg = NULL;
- ASN1_STRING *os1 = NULL, *os2 = NULL;
- EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx);
- int saltlen, rv = 0;
- sigmd = EVP_MD_CTX_md(ctx);
- if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
- goto err;
- if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen))
- goto err;
- if (saltlen == -1)
- saltlen = EVP_MD_size(sigmd);
- else if (saltlen == -2) {
- saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2;
- if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0)
- saltlen--;
- }
- pss = RSA_PSS_PARAMS_new();
- if (!pss)
- goto err;
- if (saltlen != 20) {
- pss->saltLength = ASN1_INTEGER_new();
- if (!pss->saltLength)
- goto err;
- if (!ASN1_INTEGER_set(pss->saltLength, saltlen))
- goto err;
- }
- if (EVP_MD_type(sigmd) != NID_sha1) {
- pss->hashAlgorithm = X509_ALGOR_new();
- if (!pss->hashAlgorithm)
- goto err;
- X509_ALGOR_set_md(pss->hashAlgorithm, sigmd);
- }
- if (EVP_MD_type(mgf1md) != NID_sha1) {
- ASN1_STRING *stmp = NULL;
- /* need to embed algorithm ID inside another */
- mgf1alg = X509_ALGOR_new();
- X509_ALGOR_set_md(mgf1alg, mgf1md);
- if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR), &stmp))
- goto err;
- pss->maskGenAlgorithm = X509_ALGOR_new();
- if (!pss->maskGenAlgorithm)
- goto err;
- X509_ALGOR_set0(pss->maskGenAlgorithm,
- OBJ_nid2obj(NID_mgf1), V_ASN1_SEQUENCE, stmp);
- }
- /* Finally create string with pss parameter encoding. */
- if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1))
- goto err;
+ ASN1_STRING *os1 = NULL;
+ os1 = rsa_ctx_to_pss(pkctx);
+ if (!os1)
+ return 0;
+ /* Duplicate parameters if we have to */
if (alg2) {
- os2 = ASN1_STRING_dup(os1);
- if (!os2)
- goto err;
+ ASN1_STRING *os2 = ASN1_STRING_dup(os1);
+ if (!os2) {
+ ASN1_STRING_free(os1);
+ return 0;
+ }
X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss),
V_ASN1_SEQUENCE, os2);
}
X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss),
V_ASN1_SEQUENCE, os1);
- os1 = os2 = NULL;
- rv = 3;
+ return 3;
+ }
+ return 2;
+}
+
+static RSA_OAEP_PARAMS *rsa_oaep_decode(const X509_ALGOR *alg,
+ X509_ALGOR **pmaskHash)
+{
+ const unsigned char *p;
+ int plen;
+ RSA_OAEP_PARAMS *pss;
+
+ *pmaskHash = NULL;
+
+ if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE)
+ return NULL;
+ p = alg->parameter->value.sequence->data;
+ plen = alg->parameter->value.sequence->length;
+ pss = d2i_RSA_OAEP_PARAMS(NULL, &p, plen);
+
+ if (!pss)
+ return NULL;
+
+ *pmaskHash = rsa_mgf1_decode(pss->maskGenFunc);
+
+ return pss;
+}
+
+static int rsa_cms_decrypt(CMS_RecipientInfo *ri)
+{
+ EVP_PKEY_CTX *pkctx;
+ X509_ALGOR *cmsalg;
+ int nid;
+ int rv = -1;
+ unsigned char *label = NULL;
+ int labellen = 0;
+ const EVP_MD *mgf1md = NULL, *md = NULL;
+ RSA_OAEP_PARAMS *oaep;
+ X509_ALGOR *maskHash;
+ pkctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
+ if (!pkctx)
+ return 0;
+ if (!CMS_RecipientInfo_ktri_get0_algs(ri, NULL, NULL, &cmsalg))
+ return -1;
+ nid = OBJ_obj2nid(cmsalg->algorithm);
+ if (nid == NID_rsaEncryption)
+ return 1;
+ if (nid != NID_rsaesOaep) {
+ RSAerr(RSA_F_RSA_CMS_DECRYPT, RSA_R_UNSUPPORTED_ENCRYPTION_TYPE);
+ return -1;
+ }
+ /* Decode OAEP parameters */
+ oaep = rsa_oaep_decode(cmsalg, &maskHash);
+
+ if (oaep == NULL) {
+ RSAerr(RSA_F_RSA_CMS_DECRYPT, RSA_R_INVALID_OAEP_PARAMETERS);
+ goto err;
+ }
+
+ mgf1md = rsa_mgf1_to_md(oaep->maskGenFunc, maskHash);
+ if (!mgf1md)
+ goto err;
+ md = rsa_algor_to_md(oaep->hashFunc);
+ if (!md)
+ goto err;
+
+ if (oaep->pSourceFunc) {
+ X509_ALGOR *plab = oaep->pSourceFunc;
+ if (OBJ_obj2nid(plab->algorithm) != NID_pSpecified) {
+ RSAerr(RSA_F_RSA_CMS_DECRYPT, RSA_R_UNSUPPORTED_LABEL_SOURCE);
+ goto err;
+ }
+ if (plab->parameter->type != V_ASN1_OCTET_STRING) {
+ RSAerr(RSA_F_RSA_CMS_DECRYPT, RSA_R_INVALID_LABEL);
+ goto err;
+ }
+
+ label = plab->parameter->value.octet_string->data;
+ /* Stop label being freed when OAEP parameters are freed */
+ plab->parameter->value.octet_string->data = NULL;
+ labellen = plab->parameter->value.octet_string->length;
+ }
+
+ if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_OAEP_PADDING) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_set_rsa_oaep_md(pkctx, md) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_set0_rsa_oaep_label(pkctx, label, labellen) <= 0)
+ goto err;
+ /* Carry on */
+ rv = 1;
+
err:
- if (mgf1alg)
- X509_ALGOR_free(mgf1alg);
- if (pss)
- RSA_PSS_PARAMS_free(pss);
- if (os1)
- ASN1_STRING_free(os1);
- return rv;
+ RSA_OAEP_PARAMS_free(oaep);
+ if (maskHash)
+ X509_ALGOR_free(maskHash);
+ return rv;
+}
+
+static int rsa_cms_encrypt(CMS_RecipientInfo *ri)
+{
+ const EVP_MD *md, *mgf1md;
+ RSA_OAEP_PARAMS *oaep = NULL;
+ ASN1_STRING *os = NULL;
+ X509_ALGOR *alg;
+ EVP_PKEY_CTX *pkctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
+ int pad_mode = RSA_PKCS1_PADDING, rv = 0, labellen;
+ unsigned char *label;
+ CMS_RecipientInfo_ktri_get0_algs(ri, NULL, NULL, &alg);
+ if (pkctx) {
+ if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0)
+ return 0;
}
- return 2;
+ if (pad_mode == RSA_PKCS1_PADDING) {
+ X509_ALGOR_set0(alg, OBJ_nid2obj(NID_rsaEncryption), V_ASN1_NULL, 0);
+ return 1;
+ }
+ /* Not supported */
+ if (pad_mode != RSA_PKCS1_OAEP_PADDING)
+ return 0;
+ if (EVP_PKEY_CTX_get_rsa_oaep_md(pkctx, &md) <= 0)
+ goto err;
+ if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
+ goto err;
+ labellen = EVP_PKEY_CTX_get0_rsa_oaep_label(pkctx, &label);
+ if (labellen < 0)
+ goto err;
+ oaep = RSA_OAEP_PARAMS_new();
+ if (!oaep)
+ goto err;
+ if (!rsa_md_to_algor(&oaep->hashFunc, md))
+ goto err;
+ if (!rsa_md_to_mgf1(&oaep->maskGenFunc, mgf1md))
+ goto err;
+ if (labellen > 0) {
+ ASN1_OCTET_STRING *los = ASN1_OCTET_STRING_new();
+ oaep->pSourceFunc = X509_ALGOR_new();
+ if (!oaep->pSourceFunc)
+ goto err;
+ if (!los)
+ goto err;
+ if (!ASN1_OCTET_STRING_set(los, label, labellen)) {
+ ASN1_OCTET_STRING_free(los);
+ goto err;
+ }
+ X509_ALGOR_set0(oaep->pSourceFunc, OBJ_nid2obj(NID_pSpecified),
+ V_ASN1_OCTET_STRING, los);
+ }
+ /* Finally create string with OAEP parameter encoding. */
+ if (!ASN1_item_pack(oaep, ASN1_ITEM_rptr(RSA_OAEP_PARAMS), &os))
+ goto err;
+ X509_ALGOR_set0(alg, OBJ_nid2obj(NID_rsaesOaep), V_ASN1_SEQUENCE, os);
+ os = NULL;
+ rv = 1;
+ err:
+ if (oaep)
+ RSA_OAEP_PARAMS_free(oaep);
+ if (os)
+ ASN1_STRING_free(os);
+ return rv;
}
const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = {
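The rsa_cms_encrypt()/rsa_cms_decrypt() pair above translates between
EVP_PKEY_CTX padding state and the CMS RecipientInfo algorithm
identifier. A minimal sketch of the application-side flow that ends up
in the encrypt path, assuming a recipient certificate rcert and an
input BIO in are already loaded (error checks elided):

    #include <openssl/cms.h>
    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    /* Sketch: CMS envelope with an RSA-OAEP/SHA-256 recipient. */
    CMS_ContentInfo *cms = CMS_encrypt(NULL, in, EVP_aes_128_cbc(),
                                       CMS_PARTIAL);
    CMS_RecipientInfo *ri = CMS_add1_recipient_cert(cms, rcert,
                                                    CMS_KEY_PARAM);
    EVP_PKEY_CTX *pctx = CMS_RecipientInfo_get0_pkey_ctx(ri);
    EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_OAEP_PADDING);
    EVP_PKEY_CTX_set_rsa_oaep_md(pctx, EVP_sha256());
    CMS_final(cms, in, NULL, 0); /* rsa_cms_encrypt() fires on finalize */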
diff --git a/crypto/rsa/rsa_asn1.c b/crypto/rsa/rsa_asn1.c
index 3d82c1d0c715..aff8b583fa72 100644
--- a/crypto/rsa/rsa_asn1.c
+++ b/crypto/rsa/rsa_asn1.c
@@ -108,6 +108,14 @@ ASN1_SEQUENCE(RSA_PSS_PARAMS) = {
IMPLEMENT_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+ASN1_SEQUENCE(RSA_OAEP_PARAMS) = {
+ ASN1_EXP_OPT(RSA_OAEP_PARAMS, hashFunc, X509_ALGOR, 0),
+ ASN1_EXP_OPT(RSA_OAEP_PARAMS, maskGenFunc, X509_ALGOR, 1),
+ ASN1_EXP_OPT(RSA_OAEP_PARAMS, pSourceFunc, X509_ALGOR, 2),
+} ASN1_SEQUENCE_END(RSA_OAEP_PARAMS)
+
+IMPLEMENT_ASN1_FUNCTIONS(RSA_OAEP_PARAMS)
+
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPrivateKey, RSAPrivateKey)
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPublicKey, RSAPublicKey)
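For reference, the template above implements the RSAES-OAEP-params
structure from PKCS #1 v2.1 (RFC 3447, Appendix A.2.1); each field is
an explicitly tagged optional because each has a DEFAULT (SHA-1, MGF1
with SHA-1, empty label):

    RSAES-OAEP-params ::= SEQUENCE {
        hashAlgorithm     [0] HashAlgorithm    DEFAULT sha1,
        maskGenAlgorithm  [1] MaskGenAlgorithm DEFAULT mgf1SHA1,
        pSourceAlgorithm  [2] PSourceAlgorithm DEFAULT pSpecifiedEmpty
    }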
diff --git a/crypto/rsa/rsa_err.c b/crypto/rsa/rsa_err.c
index 25b3fa743d41..0bab05efcfca 100644
--- a/crypto/rsa/rsa_err.c
+++ b/crypto/rsa/rsa_err.c
@@ -1,6 +1,6 @@
/* crypto/rsa/rsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2014 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -80,8 +80,10 @@ static ERR_STRING_DATA RSA_str_functs[] = {
{ERR_FUNC(RSA_F_PKEY_RSA_SIGN), "PKEY_RSA_SIGN"},
{ERR_FUNC(RSA_F_PKEY_RSA_VERIFY), "PKEY_RSA_VERIFY"},
{ERR_FUNC(RSA_F_PKEY_RSA_VERIFYRECOVER), "PKEY_RSA_VERIFYRECOVER"},
+ {ERR_FUNC(RSA_F_RSA_ALGOR_TO_MD), "RSA_ALGOR_TO_MD"},
{ERR_FUNC(RSA_F_RSA_BUILTIN_KEYGEN), "RSA_BUILTIN_KEYGEN"},
{ERR_FUNC(RSA_F_RSA_CHECK_KEY), "RSA_check_key"},
+ {ERR_FUNC(RSA_F_RSA_CMS_DECRYPT), "RSA_CMS_DECRYPT"},
{ERR_FUNC(RSA_F_RSA_EAY_PRIVATE_DECRYPT), "RSA_EAY_PRIVATE_DECRYPT"},
{ERR_FUNC(RSA_F_RSA_EAY_PRIVATE_ENCRYPT), "RSA_EAY_PRIVATE_ENCRYPT"},
{ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_DECRYPT), "RSA_EAY_PUBLIC_DECRYPT"},
@@ -90,6 +92,7 @@ static ERR_STRING_DATA RSA_str_functs[] = {
{ERR_FUNC(RSA_F_RSA_GENERATE_KEY_EX), "RSA_generate_key_ex"},
{ERR_FUNC(RSA_F_RSA_ITEM_VERIFY), "RSA_ITEM_VERIFY"},
{ERR_FUNC(RSA_F_RSA_MEMORY_LOCK), "RSA_memory_lock"},
+ {ERR_FUNC(RSA_F_RSA_MGF1_TO_MD), "RSA_MGF1_TO_MD"},
{ERR_FUNC(RSA_F_RSA_NEW_METHOD), "RSA_new_method"},
{ERR_FUNC(RSA_F_RSA_NULL), "RSA_NULL"},
{ERR_FUNC(RSA_F_RSA_NULL_MOD_EXP), "RSA_NULL_MOD_EXP"},
@@ -100,6 +103,8 @@ static ERR_STRING_DATA RSA_str_functs[] = {
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_NONE), "RSA_padding_add_none"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP),
"RSA_padding_add_PKCS1_OAEP"},
+ {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP_MGF1),
+ "RSA_padding_add_PKCS1_OAEP_mgf1"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS), "RSA_padding_add_PKCS1_PSS"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1),
"RSA_padding_add_PKCS1_PSS_mgf1"},
@@ -112,6 +117,8 @@ static ERR_STRING_DATA RSA_str_functs[] = {
{ERR_FUNC(RSA_F_RSA_PADDING_CHECK_NONE), "RSA_padding_check_none"},
{ERR_FUNC(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP),
"RSA_padding_check_PKCS1_OAEP"},
+ {ERR_FUNC(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP_MGF1),
+ "RSA_padding_check_PKCS1_OAEP_mgf1"},
{ERR_FUNC(RSA_F_RSA_PADDING_CHECK_PKCS1_TYPE_1),
"RSA_padding_check_PKCS1_type_1"},
{ERR_FUNC(RSA_F_RSA_PADDING_CHECK_PKCS1_TYPE_2),
@@ -124,6 +131,7 @@ static ERR_STRING_DATA RSA_str_functs[] = {
{ERR_FUNC(RSA_F_RSA_PRIVATE_ENCRYPT), "RSA_private_encrypt"},
{ERR_FUNC(RSA_F_RSA_PRIV_DECODE), "RSA_PRIV_DECODE"},
{ERR_FUNC(RSA_F_RSA_PRIV_ENCODE), "RSA_PRIV_ENCODE"},
+ {ERR_FUNC(RSA_F_RSA_PSS_TO_CTX), "RSA_PSS_TO_CTX"},
{ERR_FUNC(RSA_F_RSA_PUBLIC_DECRYPT), "RSA_public_decrypt"},
{ERR_FUNC(RSA_F_RSA_PUBLIC_ENCRYPT), "RSA_public_encrypt"},
{ERR_FUNC(RSA_F_RSA_PUB_DECODE), "RSA_PUB_DECODE"},
@@ -157,6 +165,7 @@ static ERR_STRING_DATA RSA_str_reasons[] = {
{ERR_REASON(RSA_R_DATA_TOO_SMALL), "data too small"},
{ERR_REASON(RSA_R_DATA_TOO_SMALL_FOR_KEY_SIZE),
"data too small for key size"},
+ {ERR_REASON(RSA_R_DIGEST_DOES_NOT_MATCH), "digest does not match"},
{ERR_REASON(RSA_R_DIGEST_TOO_BIG_FOR_RSA_KEY),
"digest too big for rsa key"},
{ERR_REASON(RSA_R_DMP1_NOT_CONGRUENT_TO_D), "dmp1 not congruent to d"},
@@ -165,11 +174,14 @@ static ERR_STRING_DATA RSA_str_reasons[] = {
{ERR_REASON(RSA_R_FIRST_OCTET_INVALID), "first octet invalid"},
{ERR_REASON(RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE),
"illegal or unsupported padding mode"},
+ {ERR_REASON(RSA_R_INVALID_DIGEST), "invalid digest"},
{ERR_REASON(RSA_R_INVALID_DIGEST_LENGTH), "invalid digest length"},
{ERR_REASON(RSA_R_INVALID_HEADER), "invalid header"},
{ERR_REASON(RSA_R_INVALID_KEYBITS), "invalid keybits"},
+ {ERR_REASON(RSA_R_INVALID_LABEL), "invalid label"},
{ERR_REASON(RSA_R_INVALID_MESSAGE_LENGTH), "invalid message length"},
{ERR_REASON(RSA_R_INVALID_MGF1_MD), "invalid mgf1 md"},
+ {ERR_REASON(RSA_R_INVALID_OAEP_PARAMETERS), "invalid oaep parameters"},
{ERR_REASON(RSA_R_INVALID_PADDING), "invalid padding"},
{ERR_REASON(RSA_R_INVALID_PADDING_MODE), "invalid padding mode"},
{ERR_REASON(RSA_R_INVALID_PSS_PARAMETERS), "invalid pss parameters"},
@@ -203,9 +215,13 @@ static ERR_STRING_DATA RSA_str_reasons[] = {
{ERR_REASON(RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),
"the asn1 object identifier is not known for this md"},
{ERR_REASON(RSA_R_UNKNOWN_ALGORITHM_TYPE), "unknown algorithm type"},
+ {ERR_REASON(RSA_R_UNKNOWN_DIGEST), "unknown digest"},
{ERR_REASON(RSA_R_UNKNOWN_MASK_DIGEST), "unknown mask digest"},
{ERR_REASON(RSA_R_UNKNOWN_PADDING_TYPE), "unknown padding type"},
{ERR_REASON(RSA_R_UNKNOWN_PSS_DIGEST), "unknown pss digest"},
+ {ERR_REASON(RSA_R_UNSUPPORTED_ENCRYPTION_TYPE),
+ "unsupported encryption type"},
+ {ERR_REASON(RSA_R_UNSUPPORTED_LABEL_SOURCE), "unsupported label source"},
{ERR_REASON(RSA_R_UNSUPPORTED_MASK_ALGORITHM),
"unsupported mask algorithm"},
{ERR_REASON(RSA_R_UNSUPPORTED_MASK_PARAMETER),
diff --git a/crypto/rsa/rsa_oaep.c b/crypto/rsa/rsa_oaep.c
index 499835f8143d..9c2a943cf778 100644
--- a/crypto/rsa/rsa_oaep.c
+++ b/crypto/rsa/rsa_oaep.c
@@ -28,39 +28,53 @@
# include <openssl/rand.h>
# include <openssl/sha.h>
-static int MGF1(unsigned char *mask, long len,
- const unsigned char *seed, long seedlen);
-
int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen,
const unsigned char *from, int flen,
const unsigned char *param, int plen)
{
+ return RSA_padding_add_PKCS1_OAEP_mgf1(to, tlen, from, flen,
+ param, plen, NULL, NULL);
+}
+
+int RSA_padding_add_PKCS1_OAEP_mgf1(unsigned char *to, int tlen,
+ const unsigned char *from, int flen,
+ const unsigned char *param, int plen,
+ const EVP_MD *md, const EVP_MD *mgf1md)
+{
int i, emlen = tlen - 1;
unsigned char *db, *seed;
- unsigned char *dbmask, seedmask[SHA_DIGEST_LENGTH];
+ unsigned char *dbmask, seedmask[EVP_MAX_MD_SIZE];
+ int mdlen;
+
+ if (md == NULL)
+ md = EVP_sha1();
+ if (mgf1md == NULL)
+ mgf1md = md;
- if (flen > emlen - 2 * SHA_DIGEST_LENGTH - 1) {
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP,
+ mdlen = EVP_MD_size(md);
+
+ if (flen > emlen - 2 * mdlen - 1) {
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP_MGF1,
RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
return 0;
}
- if (emlen < 2 * SHA_DIGEST_LENGTH + 1) {
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP, RSA_R_KEY_SIZE_TOO_SMALL);
+ if (emlen < 2 * mdlen + 1) {
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP_MGF1,
+ RSA_R_KEY_SIZE_TOO_SMALL);
return 0;
}
to[0] = 0;
seed = to + 1;
- db = to + SHA_DIGEST_LENGTH + 1;
+ db = to + mdlen + 1;
- if (!EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL))
+ if (!EVP_Digest((void *)param, plen, db, NULL, md, NULL))
return 0;
- memset(db + SHA_DIGEST_LENGTH, 0,
- emlen - flen - 2 * SHA_DIGEST_LENGTH - 1);
- db[emlen - flen - SHA_DIGEST_LENGTH - 1] = 0x01;
- memcpy(db + emlen - flen - SHA_DIGEST_LENGTH, from, (unsigned int)flen);
- if (RAND_bytes(seed, SHA_DIGEST_LENGTH) <= 0)
+ memset(db + mdlen, 0, emlen - flen - 2 * mdlen - 1);
+ db[emlen - flen - mdlen - 1] = 0x01;
+ memcpy(db + emlen - flen - mdlen, from, (unsigned int)flen);
+ if (RAND_bytes(seed, mdlen) <= 0)
return 0;
# ifdef PKCS_TESTVECT
memcpy(seed,
@@ -68,20 +82,20 @@ int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen,
20);
# endif
- dbmask = OPENSSL_malloc(emlen - SHA_DIGEST_LENGTH);
+ dbmask = OPENSSL_malloc(emlen - mdlen);
if (dbmask == NULL) {
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP, ERR_R_MALLOC_FAILURE);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP_MGF1, ERR_R_MALLOC_FAILURE);
return 0;
}
- if (MGF1(dbmask, emlen - SHA_DIGEST_LENGTH, seed, SHA_DIGEST_LENGTH) < 0)
- return 0;
+ if (PKCS1_MGF1(dbmask, emlen - mdlen, seed, mdlen, mgf1md) < 0) {
+ OPENSSL_free(dbmask); /* do not leak dbmask on MGF1 failure */
+ return 0;
+ }
- for (i = 0; i < emlen - SHA_DIGEST_LENGTH; i++)
+ for (i = 0; i < emlen - mdlen; i++)
db[i] ^= dbmask[i];
- if (MGF1(seedmask, SHA_DIGEST_LENGTH, db, emlen - SHA_DIGEST_LENGTH) < 0)
- return 0;
+ if (PKCS1_MGF1(seedmask, mdlen, db, emlen - mdlen, mgf1md) < 0) {
+ OPENSSL_free(dbmask); /* likewise for the second mask */
+ return 0;
+ }
- for (i = 0; i < SHA_DIGEST_LENGTH; i++)
+ for (i = 0; i < mdlen; i++)
seed[i] ^= seedmask[i];
OPENSSL_free(dbmask);
@@ -92,6 +106,16 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
const unsigned char *from, int flen, int num,
const unsigned char *param, int plen)
{
+ return RSA_padding_check_PKCS1_OAEP_mgf1(to, tlen, from, flen, num,
+ param, plen, NULL, NULL);
+}
+
+int RSA_padding_check_PKCS1_OAEP_mgf1(unsigned char *to, int tlen,
+ const unsigned char *from, int flen,
+ int num, const unsigned char *param,
+ int plen, const EVP_MD *md,
+ const EVP_MD *mgf1md)
+{
int i, dblen, mlen = -1, one_index = 0, msg_index;
unsigned int good, found_one_byte;
const unsigned char *maskedseed, *maskeddb;
@@ -101,26 +125,33 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
*/
unsigned char *db = NULL, *em = NULL, seed[EVP_MAX_MD_SIZE],
phash[EVP_MAX_MD_SIZE];
+ int mdlen;
+
+ if (md == NULL)
+ md = EVP_sha1();
+ if (mgf1md == NULL)
+ mgf1md = md;
+
+ mdlen = EVP_MD_size(md);
if (tlen <= 0 || flen <= 0)
return -1;
-
/*
* |num| is the length of the modulus; |flen| is the length of the
* encoded message. Therefore, for any |from| that was obtained by
* decrypting a ciphertext, we must have |flen| <= |num|. Similarly,
- * num < 2 * SHA_DIGEST_LENGTH + 2 must hold for the modulus
- * irrespective of the ciphertext, see PKCS #1 v2.2, section 7.1.2.
+ * num < 2 * mdlen + 2 must hold for the modulus irrespective of
+ * the ciphertext, see PKCS #1 v2.2, section 7.1.2.
* This does not leak any side-channel information.
*/
- if (num < flen || num < 2 * SHA_DIGEST_LENGTH + 2)
+ if (num < flen || num < 2 * mdlen + 2)
goto decoding_err;
- dblen = num - SHA_DIGEST_LENGTH - 1;
+ dblen = num - mdlen - 1;
db = OPENSSL_malloc(dblen);
em = OPENSSL_malloc(num);
if (db == NULL || em == NULL) {
- RSAerr(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP, ERR_R_MALLOC_FAILURE);
+ RSAerr(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP_MGF1, ERR_R_MALLOC_FAILURE);
goto cleanup;
}
@@ -143,26 +174,25 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
good = constant_time_is_zero(em[0]);
maskedseed = em + 1;
- maskeddb = em + 1 + SHA_DIGEST_LENGTH;
+ maskeddb = em + 1 + mdlen;
- if (MGF1(seed, SHA_DIGEST_LENGTH, maskeddb, dblen))
+ if (PKCS1_MGF1(seed, mdlen, maskeddb, dblen, mgf1md))
goto cleanup;
- for (i = 0; i < SHA_DIGEST_LENGTH; i++)
+ for (i = 0; i < mdlen; i++)
seed[i] ^= maskedseed[i];
- if (MGF1(db, dblen, seed, SHA_DIGEST_LENGTH))
+ if (PKCS1_MGF1(db, dblen, seed, mdlen, mgf1md))
goto cleanup;
for (i = 0; i < dblen; i++)
db[i] ^= maskeddb[i];
- if (!EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL))
+ if (!EVP_Digest((void *)param, plen, phash, NULL, md, NULL))
goto cleanup;
- good &=
- constant_time_is_zero(CRYPTO_memcmp(db, phash, SHA_DIGEST_LENGTH));
+ good &= constant_time_is_zero(CRYPTO_memcmp(db, phash, mdlen));
found_one_byte = 0;
- for (i = SHA_DIGEST_LENGTH; i < dblen; i++) {
+ for (i = mdlen; i < dblen; i++) {
/*
* Padding consists of a number of 0-bytes, followed by a 1.
*/
@@ -188,7 +218,7 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
mlen = dblen - msg_index;
if (tlen < mlen) {
- RSAerr(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP, RSA_R_DATA_TOO_LARGE);
+ RSAerr(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP_MGF1, RSA_R_DATA_TOO_LARGE);
mlen = -1;
} else {
memcpy(to, db + msg_index, mlen);
@@ -200,7 +230,8 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
* To avoid chosen ciphertext attacks, the error message should not
* reveal which kind of decoding error happened.
*/
- RSAerr(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP, RSA_R_OAEP_DECODING_ERROR);
+ RSAerr(RSA_F_RSA_PADDING_CHECK_PKCS1_OAEP_MGF1,
+ RSA_R_OAEP_DECODING_ERROR);
cleanup:
if (db != NULL)
OPENSSL_free(db);
@@ -249,9 +280,4 @@ int PKCS1_MGF1(unsigned char *mask, long len,
return rv;
}
-static int MGF1(unsigned char *mask, long len, const unsigned char *seed,
- long seedlen)
-{
- return PKCS1_MGF1(mask, len, seed, seedlen, EVP_sha1());
-}
#endif
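With the _mgf1 variants in place, OAEP is no longer hardwired to
SHA-1. A sketch of the low-level sequence that pkey_rsa_encrypt()
(below) now performs, assuming an RSA key rsa and a message
msg/msglen (error paths collapsed into one chain):

    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    /* Sketch: SHA-256 OAEP encode into em, then raw RSA encrypt. */
    int klen = RSA_size(rsa);
    unsigned char *em = OPENSSL_malloc(klen);
    unsigned char *ct = OPENSSL_malloc(klen);
    if (em != NULL && ct != NULL
        && RSA_padding_add_PKCS1_OAEP_mgf1(em, klen, msg, msglen,
                                           NULL, 0, /* empty label */
                                           EVP_sha256(), EVP_sha256())
        && RSA_public_encrypt(klen, em, ct, rsa, RSA_NO_PADDING) == klen)
        ; /* ct now holds the OAEP ciphertext */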
diff --git a/crypto/rsa/rsa_pmeth.c b/crypto/rsa/rsa_pmeth.c
index 6a7c67cdb86c..203635595f4c 100644
--- a/crypto/rsa/rsa_pmeth.c
+++ b/crypto/rsa/rsa_pmeth.c
@@ -64,6 +64,7 @@
#include <openssl/rsa.h>
#include <openssl/bn.h>
#include <openssl/evp.h>
+#include <openssl/x509v3.h>
#ifndef OPENSSL_NO_CMS
# include <openssl/cms.h>
#endif
@@ -87,10 +88,13 @@ typedef struct {
const EVP_MD *md;
/* message digest for MGF1 */
const EVP_MD *mgf1md;
- /* PSS/OAEP salt length */
+ /* PSS salt length */
int saltlen;
/* Temp buffer */
unsigned char *tbuf;
+ /* OAEP label */
+ unsigned char *oaep_label;
+ size_t oaep_labellen;
} RSA_PKEY_CTX;
static int pkey_rsa_init(EVP_PKEY_CTX *ctx)
@@ -108,6 +112,9 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx)
rctx->saltlen = -2;
+ rctx->oaep_label = NULL;
+ rctx->oaep_labellen = 0;
+
ctx->data = rctx;
ctx->keygen_info = rctx->gentmp;
ctx->keygen_info_count = 2;
@@ -130,6 +137,15 @@ static int pkey_rsa_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
}
dctx->pad_mode = sctx->pad_mode;
dctx->md = sctx->md;
+ dctx->mgf1md = sctx->mgf1md;
+ if (sctx->oaep_label) {
+ if (dctx->oaep_label)
+ OPENSSL_free(dctx->oaep_label);
+ dctx->oaep_label = BUF_memdup(sctx->oaep_label, sctx->oaep_labellen);
+ if (!dctx->oaep_label)
+ return 0;
+ dctx->oaep_labellen = sctx->oaep_labellen;
+ }
return 1;
}
@@ -151,6 +167,8 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx)
BN_free(rctx->pub_exp);
if (rctx->tbuf)
OPENSSL_free(rctx->tbuf);
+ if (rctx->oaep_label)
+ OPENSSL_free(rctx->oaep_label);
OPENSSL_free(rctx);
}
}
@@ -173,10 +191,18 @@ static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx)
rv = 0;
if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv)
return -1;
- if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS))
- return rv;
- if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS))
- return rv;
+ if (rctx->md) {
+ const EVP_MD *fmd;
+ fmd = FIPS_get_digestbynid(EVP_MD_type(rctx->md));
+ if (!fmd || !(fmd->flags & EVP_MD_FLAG_FIPS))
+ return rv;
+ }
+ if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS)) {
+ const EVP_MD *fmd;
+ fmd = FIPS_get_digestbynid(EVP_MD_type(rctx->mgf1md));
+ if (!fmd || !(fmd->flags & EVP_MD_FLAG_FIPS))
+ return rv;
+ }
return 1;
}
#endif
@@ -388,8 +414,21 @@ static int pkey_rsa_encrypt(EVP_PKEY_CTX *ctx,
{
int ret;
RSA_PKEY_CTX *rctx = ctx->data;
- ret = RSA_public_encrypt(inlen, in, out, ctx->pkey->pkey.rsa,
- rctx->pad_mode);
+ if (rctx->pad_mode == RSA_PKCS1_OAEP_PADDING) {
+ int klen = RSA_size(ctx->pkey->pkey.rsa);
+ if (!setup_tbuf(rctx, ctx))
+ return -1;
+ if (!RSA_padding_add_PKCS1_OAEP_mgf1(rctx->tbuf, klen,
+ in, inlen,
+ rctx->oaep_label,
+ rctx->oaep_labellen,
+ rctx->md, rctx->mgf1md))
+ return -1;
+ ret = RSA_public_encrypt(klen, rctx->tbuf, out,
+ ctx->pkey->pkey.rsa, RSA_NO_PADDING);
+ } else
+ ret = RSA_public_encrypt(inlen, in, out, ctx->pkey->pkey.rsa,
+ rctx->pad_mode);
if (ret < 0)
return ret;
*outlen = ret;
@@ -402,8 +441,26 @@ static int pkey_rsa_decrypt(EVP_PKEY_CTX *ctx,
{
int ret;
RSA_PKEY_CTX *rctx = ctx->data;
- ret = RSA_private_decrypt(inlen, in, out, ctx->pkey->pkey.rsa,
- rctx->pad_mode);
+ if (rctx->pad_mode == RSA_PKCS1_OAEP_PADDING) {
+ int i;
+ if (!setup_tbuf(rctx, ctx))
+ return -1;
+ ret = RSA_private_decrypt(inlen, in, rctx->tbuf,
+ ctx->pkey->pkey.rsa, RSA_NO_PADDING);
+ if (ret <= 0)
+ return ret;
+ for (i = 0; i < ret; i++) {
+ if (rctx->tbuf[i])
+ break;
+ }
+ ret = RSA_padding_check_PKCS1_OAEP_mgf1(out, ret, rctx->tbuf + i,
+ ret - i, ret,
+ rctx->oaep_label,
+ rctx->oaep_labellen,
+ rctx->md, rctx->mgf1md);
+ } else
+ ret = RSA_private_decrypt(inlen, in, out, ctx->pkey->pkey.rsa,
+ rctx->pad_mode);
if (ret < 0)
return ret;
*outlen = ret;
@@ -490,18 +547,36 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP:
if (!p2)
return -2;
+ BN_free(rctx->pub_exp);
rctx->pub_exp = p2;
return 1;
+ case EVP_PKEY_CTRL_RSA_OAEP_MD:
+ case EVP_PKEY_CTRL_GET_RSA_OAEP_MD:
+ if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) {
+ RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PADDING_MODE);
+ return -2;
+ }
+ if (type == EVP_PKEY_CTRL_GET_RSA_OAEP_MD)
+ *(const EVP_MD **)p2 = rctx->md;
+ else
+ rctx->md = p2;
+ return 1;
+
case EVP_PKEY_CTRL_MD:
if (!check_padding_md(p2, rctx->pad_mode))
return 0;
rctx->md = p2;
return 1;
+ case EVP_PKEY_CTRL_GET_MD:
+ *(const EVP_MD **)p2 = rctx->md;
+ return 1;
+
case EVP_PKEY_CTRL_RSA_MGF1_MD:
case EVP_PKEY_CTRL_GET_RSA_MGF1_MD:
- if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) {
+ if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING
+ && rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) {
RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD);
return -2;
}
@@ -514,6 +589,30 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
rctx->mgf1md = p2;
return 1;
+ case EVP_PKEY_CTRL_RSA_OAEP_LABEL:
+ if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) {
+ RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PADDING_MODE);
+ return -2;
+ }
+ if (rctx->oaep_label)
+ OPENSSL_free(rctx->oaep_label);
+ if (p2 && p1 > 0) {
+ rctx->oaep_label = p2;
+ rctx->oaep_labellen = p1;
+ } else {
+ rctx->oaep_label = NULL;
+ rctx->oaep_labellen = 0;
+ }
+ return 1;
+
+ case EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL:
+ if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) {
+ RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PADDING_MODE);
+ return -2;
+ }
+ *(unsigned char **)p2 = rctx->oaep_label;
+ return rctx->oaep_labellen;
+
case EVP_PKEY_CTRL_DIGESTINIT:
case EVP_PKEY_CTRL_PKCS7_ENCRYPT:
case EVP_PKEY_CTRL_PKCS7_DECRYPT:
@@ -521,16 +620,6 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
return 1;
#ifndef OPENSSL_NO_CMS
case EVP_PKEY_CTRL_CMS_DECRYPT:
- {
- X509_ALGOR *alg = NULL;
- ASN1_OBJECT *encalg = NULL;
- if (p2)
- CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg);
- if (alg)
- X509_ALGOR_get0(&encalg, NULL, NULL, alg);
- if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep)
- rctx->pad_mode = RSA_PKCS1_OAEP_PADDING;
- }
case EVP_PKEY_CTRL_CMS_ENCRYPT:
case EVP_PKEY_CTRL_CMS_SIGN:
return 1;
@@ -599,6 +688,36 @@ static int pkey_rsa_ctrl_str(EVP_PKEY_CTX *ctx,
return ret;
}
+ if (!strcmp(type, "rsa_mgf1_md")) {
+ const EVP_MD *md;
+ if (!(md = EVP_get_digestbyname(value))) {
+ RSAerr(RSA_F_PKEY_RSA_CTRL_STR, RSA_R_INVALID_DIGEST);
+ return 0;
+ }
+ return EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md);
+ }
+
+ if (!strcmp(type, "rsa_oaep_md")) {
+ const EVP_MD *md;
+ if (!(md = EVP_get_digestbyname(value))) {
+ RSAerr(RSA_F_PKEY_RSA_CTRL_STR, RSA_R_INVALID_DIGEST);
+ return 0;
+ }
+ return EVP_PKEY_CTX_set_rsa_oaep_md(ctx, md);
+ }
+ if (!strcmp(type, "rsa_oaep_label")) {
+ unsigned char *lab;
+ long lablen;
+ int ret;
+ lab = string_to_hex(value, &lablen);
+ if (!lab)
+ return 0;
+ ret = EVP_PKEY_CTX_set0_rsa_oaep_label(ctx, lab, lablen);
+ if (ret <= 0)
+ OPENSSL_free(lab);
+ return ret;
+ }
+
return -2;
}
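The new ctrl strings expose the OAEP knobs to anything driving
EVP_PKEY_CTX_ctrl_str(), which is also what openssl pkeyutl's -pkeyopt
options feed. A sketch, assuming pctx is an RSA EVP_PKEY_CTX already
set up for encryption and that the padding-mode ctrl accepts "oaep":

    /* Sketch: configure OAEP via the string-based ctrl interface.
     * The label value is hex; string_to_hex() above decodes it. */
    if (EVP_PKEY_CTX_ctrl_str(pctx, "rsa_padding_mode", "oaep") <= 0
        || EVP_PKEY_CTX_ctrl_str(pctx, "rsa_oaep_md", "sha256") <= 0
        || EVP_PKEY_CTX_ctrl_str(pctx, "rsa_oaep_label", "deadbeef") <= 0)
        /* handle error */;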
diff --git a/crypto/rsa/rsa_sign.c b/crypto/rsa/rsa_sign.c
index bc91da2c1fe9..19461c6364d4 100644
--- a/crypto/rsa/rsa_sign.c
+++ b/crypto/rsa/rsa_sign.c
@@ -261,19 +261,8 @@ int int_rsa_verify(int dtype, const unsigned char *m,
OBJ_nid2ln(dtype));
#endif
if (sigtype != dtype) {
- if (((dtype == NID_md5) &&
- (sigtype == NID_md5WithRSAEncryption)) ||
- ((dtype == NID_md2) &&
- (sigtype == NID_md2WithRSAEncryption))) {
- /* ok, we will let it through */
-#if !defined(OPENSSL_NO_STDIO) && !defined(OPENSSL_SYS_WIN16)
- fprintf(stderr,
- "signature has problems, re-make with post SSLeay045\n");
-#endif
- } else {
- RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_ALGORITHM_MISMATCH);
- goto err;
- }
+ RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_ALGORITHM_MISMATCH);
+ goto err;
}
if (rm) {
const EVP_MD *md;
diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
index ceb8094b0cba..de6cdde58a9c 100644
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile
@@ -60,21 +60,25 @@ sha256-armv4.S: asm/sha256-armv4.pl
$(PERL) $< $(PERLASM_SCHEME) $@
sha1-alpha.s: asm/sha1-alpha.pl
- (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
+ (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
$(PERL) asm/sha1-alpha.pl > $$preproc && \
- $(CC) -E $$preproc > $@ && rm $$preproc)
+ $(CC) -E -P $$preproc > $@ && rm $$preproc)
# Solaris make has to be explicitly told
sha1-x86_64.s: asm/sha1-x86_64.pl; $(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@
+sha1-mb-x86_64.s: asm/sha1-mb-x86_64.pl; $(PERL) asm/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
sha256-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+sha256-mb-x86_64.s: asm/sha256-mb-x86_64.pl; $(PERL) asm/sha256-mb-x86_64.pl $(PERLASM_SCHEME) > $@
sha512-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
-sha1-sparcv9.s: asm/sha1-sparcv9.pl; $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
-sha256-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
-sha512-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+sha1-sparcv9.S: asm/sha1-sparcv9.pl; $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
+sha256-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+sha512-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
+sha256p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
+sha512p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@
sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@
@@ -92,6 +96,9 @@ sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha1-armv4-large.o: sha1-armv4-large.S
sha256-armv4.o: sha256-armv4.S
sha512-armv4.o: sha512-armv4.S
+sha1-armv8.o: sha1-armv8.S
+sha256-armv8.o: sha256-armv8.S
+sha512-armv8.o: sha512-armv8.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index 1084d227fe06..4895eb3ddf85 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -79,6 +79,10 @@
# strongly, it's probably more appropriate to discuss possibility of
# using vector rotate XOP on AMD...
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
@@ -88,13 +92,20 @@
# PIII 11.5 -
# P4 10.6 -
# AMD K8 7.1 -
-# Core2 7.3 6.1/+20% -
-# Atom 12.5 9.5(*)/+32% -
-# Westmere 7.3 5.6/+30% -
-# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
+# Core2 7.3 6.0/+22% -
+# Westmere 7.3 5.5/+33% -
+# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
+# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
+# Haswell 6.5 4.3/+51% 4.1(**)/+58%
+# Bulldozer 11.6 6.0/+92%
+# VIA Nano 10.6 7.5/+41%
+# Atom 12.5 9.3(*)/+35%
+# Silvermont 14.5 9.9(*)/+46%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
-# It remains mystery [to me] why ILP is limited to 1.7.
+# The discrepancy is because of front-end limitations, the
+# so-called MS-ROM penalties, and on Silvermont even the rotate
+# instruction's limited parallelism.
#
# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
@@ -116,6 +127,15 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.03); # first version supporting AVX
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
+ `ml 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10); # first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
+ $2>=3.0); # first version supporting AVX
+
+$shaext=$xmm; ### set to zero if compiling for 1.0.1
+
&external_label("OPENSSL_ia32cap_P") if ($xmm);
@@ -295,6 +315,7 @@ if ($alt) {
&function_begin("sha1_block_data_order");
if ($xmm) {
+ &static_label("shaext_shortcut") if ($shaext);
&static_label("ssse3_shortcut");
&static_label("avx_shortcut") if ($ymm);
&static_label("K_XX_XX");
@@ -309,8 +330,13 @@ if ($xmm) {
&mov ($D,&DWP(4,$T));
&test ($D,1<<9); # check SSSE3 bit
&jz (&label("x86"));
+ &mov ($C,&DWP(8,$T));
&test ($A,1<<24); # check FXSR bit
&jz (&label("x86"));
+ if ($shaext) {
+ &test ($C,1<<29); # check SHA bit
+ &jnz (&label("shaext_shortcut"));
+ }
if ($ymm) {
&and ($D,1<<28); # mask AVX bit
&and ($A,1<<30); # mask "Intel CPU" bit
@@ -389,6 +415,117 @@ if ($xmm) {
&function_end("sha1_block_data_order");
if ($xmm) {
+if ($shaext) {
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("edi","esi","ecx");
+my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3));
+my @MSG=map("xmm$_",(4..7));
+
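+# Note: the assemblers of the day lack SHA Extensions mnemonics, so the
+# helpers below hand-assemble the instructions as raw bytes: sha1rnds4
+# is 0f 3a cc /r ib, and sha1nexte/sha1msg1/sha1msg2 are 0f 38 c8/c9/ca
+# /r (register-register forms only, hence the xmm0-7 pattern match).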
+sub sha1rnds4 {
+ my ($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm); }
+}
+sub sha1op38 {
+ my ($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); }
+}
+sub sha1nexte { sha1op38(0xc8,@_); }
+sub sha1msg1 { sha1op38(0xc9,@_); }
+sub sha1msg2 { sha1op38(0xca,@_); }
+
+&function_begin("_sha1_block_data_order_shaext");
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("shaext_shortcut");
+ &mov ($ctx,&wparam(0));
+ &mov ("ebx","esp");
+ &mov ($inp,&wparam(1));
+ &mov ($num,&wparam(2));
+ &sub ("esp",32);
+
+ &movdqu ($ABCD,&QWP(0,$ctx));
+ &movd ($E,&DWP(16,$ctx));
+ &and ("esp",-32);
+ &movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap
+
+ &movdqu (@MSG[0],&QWP(0,$inp));
+ &pshufd ($ABCD,$ABCD,0b00011011); # flip word order
+ &movdqu (@MSG[1],&QWP(0x10,$inp));
+ &pshufd ($E,$E,0b00011011); # flip word order
+ &movdqu (@MSG[2],&QWP(0x20,$inp));
+ &pshufb (@MSG[0],$BSWAP);
+ &movdqu (@MSG[3],&QWP(0x30,$inp));
+ &pshufb (@MSG[1],$BSWAP);
+ &pshufb (@MSG[2],$BSWAP);
+ &pshufb (@MSG[3],$BSWAP);
+ &jmp (&label("loop_shaext"));
+
+&set_label("loop_shaext",16);
+ &dec ($num);
+ &lea ("eax",&DWP(0x40,$inp));
+ &movdqa (&QWP(0,"esp"),$E); # offload $E
+ &paddd ($E,@MSG[0]);
+ &cmovne ($inp,"eax");
+ &movdqa (&QWP(16,"esp"),$ABCD); # offload $ABCD
+
+for($i=0;$i<20-4;$i+=2) {
+ &sha1msg1 (@MSG[0],@MSG[1]);
+ &movdqa ($E_,$ABCD);
+ &sha1rnds4 ($ABCD,$E,int($i/5)); # 0-3...
+ &sha1nexte ($E_,@MSG[1]);
+ &pxor (@MSG[0],@MSG[2]);
+ &sha1msg1 (@MSG[1],@MSG[2]);
+ &sha1msg2 (@MSG[0],@MSG[3]);
+
+ &movdqa ($E,$ABCD);
+ &sha1rnds4 ($ABCD,$E_,int(($i+1)/5));
+ &sha1nexte ($E,@MSG[2]);
+ &pxor (@MSG[1],@MSG[3]);
+ &sha1msg2 (@MSG[1],@MSG[0]);
+
+ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
+}
+ &movdqu (@MSG[0],&QWP(0,$inp));
+ &movdqa ($E_,$ABCD);
+ &sha1rnds4 ($ABCD,$E,3); # 64-67
+ &sha1nexte ($E_,@MSG[1]);
+ &movdqu (@MSG[1],&QWP(0x10,$inp));
+ &pshufb (@MSG[0],$BSWAP);
+
+ &movdqa ($E,$ABCD);
+ &sha1rnds4 ($ABCD,$E_,3); # 68-71
+ &sha1nexte ($E,@MSG[2]);
+ &movdqu (@MSG[2],&QWP(0x20,$inp));
+ &pshufb (@MSG[1],$BSWAP);
+
+ &movdqa ($E_,$ABCD);
+ &sha1rnds4 ($ABCD,$E,3); # 72-75
+ &sha1nexte ($E_,@MSG[3]);
+ &movdqu (@MSG[3],&QWP(0x30,$inp));
+ &pshufb (@MSG[2],$BSWAP);
+
+ &movdqa ($E,$ABCD);
+ &sha1rnds4 ($ABCD,$E_,3); # 76-79
+ &movdqa ($E_,&QWP(0,"esp"));
+ &pshufb (@MSG[3],$BSWAP);
+ &sha1nexte ($E,$E_);
+ &paddd ($ABCD,&QWP(16,"esp"));
+
+ &jnz (&label("loop_shaext"));
+
+ &pshufd ($ABCD,$ABCD,0b00011011);
+ &pshufd ($E,$E,0b00011011);
+ &movdqu (&QWP(0,$ctx),$ABCD);
+ &movd (&DWP(16,$ctx),$E);
+ &mov ("esp","ebx");
+&function_end("_sha1_block_data_order_shaext");
+}
######################################################################
# The SSSE3 implementation.
#
@@ -416,6 +553,7 @@ my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0; # hash round
+my $rx=0;
my @T=($T,$tmp1);
my $inp;
@@ -501,8 +639,11 @@ my $_ror=sub { &ror(@_) };
&movdqa (&QWP(0+16,"esp"),@X[-3&7]);
&psubd (@X[-3&7],@X[3]);
&movdqa (&QWP(0+32,"esp"),@X[-2&7]);
+ &mov (@T[1],$C);
&psubd (@X[-2&7],@X[3]);
- &movdqa (@X[0],@X[-3&7]);
+ &xor (@T[1],$D);
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ &and (@T[0],@T[1]);
&jmp (&label("loop"));
######################################################################
@@ -528,76 +669,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
+ eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns));
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
&movdqa (@X[2],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@X[2],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
&pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@X[4],@X[0]);
- &movdqa (@X[2],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@X[2],@X[0]);
eval(shift(@insns));
&pslldq (@X[4],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@X[2],31);
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (@X[3],@X[4]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&psrld (@X[4],30);
- &por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
&movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
&pslld (@X[3],2);
- &pxor (@X[0],@X[4]);
- eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ &pxor (@X[0],@X[4]);
&movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
- &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
+ &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7])
+ &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7);
eval(shift(@insns));
eval(shift(@insns));
@@ -609,13 +751,12 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
eval(shift(@insns)); # body_20_39
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
+ &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
@@ -624,13 +765,14 @@ sub Xupdate_ssse3_32_79()
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
if ($Xi%5) {
&movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
}
- &paddd (@X[3],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
@@ -645,6 +787,7 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&pslld (@X[0],2);
eval(shift(@insns)); # body_20_39
@@ -656,6 +799,8 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&por (@X[0],@X[2]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
@@ -666,7 +811,7 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- &movdqa (@X[3],@X[0]) if ($Xi<19);
+ &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0])
eval(shift(@insns));
foreach (@insns) { eval; } # remaining instructions
@@ -681,6 +826,12 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
@@ -717,9 +868,16 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@X[3]);
eval(shift(@insns));
eval(shift(@insns));
@@ -728,6 +886,8 @@ sub Xloop_ssse3()
&movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@X[3]);
foreach (@insns) { eval; }
@@ -743,51 +903,124 @@ sub Xtail_ssse3()
foreach (@insns) { eval; }
}
-sub body_00_19 () {
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
- '&xor ($c,$d);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&xor (@T[0],$d);',
'&mov (@T[1],$a);', # $b in next round
+
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&xor ($b,$c);', # $c^$d for next round
+
'&$_rol ($a,5);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&xor ($c,$d);', # restore $c
- '&xor (@T[0],$d);',
- '&add ($e,$a);',
- '&$_ror ($b,$j?7:2);', # $b>>>2
- '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,@T[0]);',
+ '&and (@T[1],$b);', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c);', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_20_39 () {
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
- '&xor (@T[0],$d);', # ($b^$d)
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
'&mov (@T[1],$a);', # $b in next round
+
'&$_rol ($a,5);',
- '&xor (@T[0],$c);', # ($b^$d^$c)
- '&add ($e,$a);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round
+
'&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_40_59 () {
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&mov (@T[1],$c);',
- '&xor ($c,$d);',
- '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
- '&and (@T[1],$d);',
- '&and (@T[0],$c);', # ($b&($c^$d))
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40);', # restore $c
+
'&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[1]);',
- '&mov (@T[1],$a);', # $b in next round
+ '&mov (@T[1],$a);', # $b for next round
+ '&xor (@T[0],$c);',
+
'&$_rol ($a,5);',
'&add ($e,@T[0]);',
- '&xor ($c,$d);', # restore $c
- '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59);', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59);', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+######
+sub bodyx_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
+ return &bodyx_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2
+ '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2
+ '&lea ($e,&DWP(0,$e,@T[0]));',
+ '&rorx (@T[0],$a,5);',
+
+ '&andn (@T[1],$a,$c);',
+ '&and ($a,$b)',
+ '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer
+
+ '&xor (@T[1],$a)',
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub bodyx_20_39 () { # b^d^c
+ # on start $b=b^c^d
+ return &bodyx_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&add ($e,($j==19?@T[0]:$b))',
+ '&rorx ($b,@T[1],7);', # $b>>>2
+ '&rorx (@T[0],$a,5);',
+
+ '&xor ($a,$b) if ($j<79);',
+ '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer
+ '&xor ($a,$c) if ($j<79);',
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub bodyx_40_59 () { # ((b^c)&(c^d))^c
+ # on start $b=((b^c)&(c^d))^c
+ return &bodyx_20_39() if ($rx==59); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&rorx (@T[0],$a,5)',
+ '&lea ($e,&DWP(0,$e,$b))',
+ '&rorx ($b,@T[1],7)', # $b>>>2
+ '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer
+
+ '&mov (@T[1],$c)',
+ '&xor ($a,$b)', # b^c for next round
+ '&xor (@T[1],$b)', # c^d for next round
+
+ '&and ($a,@T[1])',
+ '&add ($e,@T[0])',
+ '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
@@ -825,10 +1058,14 @@ sub body_40_59 () {
&mov (&DWP(4,@T[1]),@T[0]);
&add ($E,&DWP(16,@T[1]));
&mov (&DWP(8,@T[1]),$C);
- &mov ($B,@T[0]);
+ &mov ($B,$C);
&mov (&DWP(12,@T[1]),$D);
+ &xor ($B,$D);
&mov (&DWP(16,@T[1]),$E);
- &movdqa (@X[0],@X[-3&7]);
+ &mov (@T[1],@T[0]);
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ &and (@T[0],$B);
+ &mov ($B,$T[1]);
&jmp (&label("loop"));
@@ -853,6 +1090,8 @@ sub body_40_59 () {
&function_end("_sha1_block_data_order_ssse3");
+$rx=0; # reset
+
if ($ymm) {
my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
@@ -940,8 +1179,11 @@ my $_ror=sub { &shrd(@_[0],@_) };
&vpaddd (@X[1],@X[-3&7],@X[3]);
&vpaddd (@X[2],@X[-2&7],@X[3]);
&vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
+ &mov (@T[1],$C);
&vmovdqa(&QWP(0+16,"esp"),@X[1]);
+ &xor (@T[1],$D);
&vmovdqa(&QWP(0+32,"esp"),@X[2]);
+ &and (@T[0],@T[1]);
&jmp (&label("loop"));
sub Xupdate_avx_16_31() # recall that $Xi starts with 4
@@ -1025,7 +1267,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
@@ -1188,10 +1430,14 @@ sub Xtail_avx()
&add ($D,&DWP(12,@T[1]));
&mov (&DWP(4,@T[1]),@T[0]);
&add ($E,&DWP(16,@T[1]));
+ &mov ($B,$C);
&mov (&DWP(8,@T[1]),$C);
- &mov ($B,@T[0]);
+ &xor ($B,$D);
&mov (&DWP(12,@T[1]),$D);
&mov (&DWP(16,@T[1]),$E);
+ &mov (@T[1],@T[0]);
+ &and (@T[0],$B);
+ &mov ($B,@T[1]);
&jmp (&label("loop"));
@@ -1223,6 +1469,7 @@ sub Xtail_avx()
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
+&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0);
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 33da3e0e3c0d..b2c30322c351 100755
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -52,6 +52,20 @@
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
+# September 2013.
+#
+# Add NEON implementation (see sha1-586.pl for background info). On
+# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
+# faster than integer-only code. Because [fully unrolled] NEON code
+# is ~2.5x larger and there are some redundant instructions executed
+# when processing last block, improvement is not as big for smallest
+# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
+# byte, which is also >80% faster than integer-only code.
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -153,12 +167,22 @@ $code=<<___;
#include "arm_arch.h"
.text
+.code 32
.global sha1_block_data_order
.type sha1_block_data_order,%function
-.align 2
+.align 5
sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+ sub r3,pc,#8 @ sha1_block_data_order
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA1
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -233,16 +257,427 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.align 2
+.size sha1_block_data_order,.-sha1_block_data_order
+
+.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
-.size sha1_block_data_order,.-sha1_block_data_order
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha1_block_data_order
+#endif
+.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+___
+#####################################################################
+# NEON stuff
+#
+{{{
+my @V=($a,$b,$c,$d,$e);
+my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
+my $Xi=4;
+my @X=map("q$_",(8..11,0..3));
+my @Tx=("q12","q13");
+my ($K,$zero)=("q14","q15");
+my $j=0;
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
+ '&bic ($t0,$d,$b)',
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
+ '&and ($t1,$c,$b)',
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
+ '&eor ($t1,$t1,$t0)', # F_00_19
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
+ '&add ($e,$e,$t1);'. # e+=F_00_19
+ '$j++; unshift(@V,pop(@V));'
+ )
+}
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
+ '&eor ($t0,$b,$d)',
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
+ '&eor ($t1,$t0,$c)', # F_20_39
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
+ '&add ($e,$e,$t1);'. # e+=F_20_39
+ '$j++; unshift(@V,pop(@V));'
+ )
+}
+sub body_40_59 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
+ '&and ($t0,$c,$d)',
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
+ '&eor ($t1,$c,$d)',
+ '&add ($e,$e,$t0)',
+ '&and ($t1,$t1,$b)',
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
+ '&add ($e,$e,$t1);'. # e+=F_40_59
+ '$j++; unshift(@V,pop(@V));'
+ )
+}
+
+sub Xupdate_16_31 ()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
+ eval(shift(@insns));
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
+ eval(shift(@insns));
+ &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
+ &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 (@Tx[0],@Tx[1],30);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshl_u32 (@Tx[1],@Tx[1],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@Tx[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_32_79 ()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
+ eval(shift(@insns));
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
+ eval(shift(@insns));
+ &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 (@X[0],@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
+ &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_80 ()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
+ &sub ($Xfer,$Xfer,64);
+
+ &teq ($inp,$len);
+ &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+ &subeq ($inp,$inp,64); # reload last block to avoid SEGV
+ &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[-4&7],@X[-4&7]);
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi=0;
+}
+
+sub Xloop()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
+
+ foreach (@insns) { eval; }
+
+ $Xi++;
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type sha1_block_data_order_neon,%function
+.align 4
+sha1_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+ add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
+ @ dmb @ errata #451034 on early Cortex A8
+ @ vstmdb sp!,{d8-d15} @ ABI specification says so
+ mov $saved_sp,sp
+ sub sp,sp,#64 @ alloca
+ adr $K_XX_XX,.LK_00_19
+ bic sp,sp,#15 @ align for 128-bit stores
+
+ ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
+ mov $Xfer,sp
+
+ vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
+ veor $zero,$zero,$zero
+ vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
+ vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
+ vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
+ vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
+ vrev32.8 @X[-2&7],@X[-2&7]
+ vadd.i32 @X[0],@X[-4&7],$K
+ vrev32.8 @X[-1&7],@X[-1&7]
+ vadd.i32 @X[1],@X[-3&7],$K
+ vst1.32 {@X[0]},[$Xfer,:128]!
+ vadd.i32 @X[2],@X[-2&7],$K
+ vst1.32 {@X[1]},[$Xfer,:128]!
+ vst1.32 {@X[2]},[$Xfer,:128]!
+ ldr $Ki,[sp] @ big RAW stall
+
+.Loop_neon:
+___
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_32_79(\&body_00_19);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_20_39);
+ &Xuplast_80(\&body_20_39);
+ &Xloop(\&body_20_39);
+ &Xloop(\&body_20_39);
+ &Xloop(\&body_20_39);
+$code.=<<___;
+ ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
+ add $a,$a,$Ki
+ ldr $Ki,[$ctx,#16]
+ add $b,$b,$t0
+ add $c,$c,$t1
+ add $d,$d,$Xfer
+ moveq sp,$saved_sp
+ add $e,$e,$Ki
+ ldrne $Ki,[sp]
+ stmia $ctx,{$a,$b,$c,$d,$e}
+ addne $Xfer,sp,#3*16
+ bne .Loop_neon
+
+ @ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r12,pc}
+.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+___
+}}}
+#####################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
+my @MSG=map("q$_",(4..7));
+my @Kxx=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.type sha1_block_data_order_armv8,%function
+.align 5
+sha1_block_data_order_armv8:
+.LARMv8:
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ veor $E,$E,$E
+ adr r3,.LK_00_19
+ vld1.32 {$ABCD},[$ctx]!
+ vld1.32 {$E\[0]},[$ctx]
+ sub $ctx,$ctx,#16
+ vld1.32 {@Kxx[0]\[]},[r3,:32]!
+ vld1.32 {@Kxx[1]\[]},[r3,:32]!
+ vld1.32 {@Kxx[2]\[]},[r3,:32]!
+ vld1.32 {@Kxx[3]\[]},[r3,:32]
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+
+ vadd.i32 $W0,@Kxx[0],@MSG[0]
+ vrev32.8 @MSG[2],@MSG[2]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ subs $len,$len,#1
+
+ vadd.i32 $W1,@Kxx[0],@MSG[1]
+ vrev32.8 @MSG[3],@MSG[3]
+ sha1h $E1,$ABCD @ 0
+ sha1c $ABCD,$E,$W0
+ vadd.i32 $W0,@Kxx[$j],@MSG[2]
+ sha1su0 @MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+ sha1h $E0,$ABCD @ $i
+ sha1$f $ABCD,$E1,$W1
+ vadd.i32 $W1,@Kxx[$j],@MSG[3]
+ sha1su1 @MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+ sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
+ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
+ push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+ sha1h $E0,$ABCD @ $i
+ sha1p $ABCD,$E1,$W1
+ vadd.i32 $W1,@Kxx[$j],@MSG[3]
+
+ sha1h $E1,$ABCD @ 18
+ sha1p $ABCD,$E0,$W0
+
+ sha1h $E0,$ABCD @ 19
+ sha1p $ABCD,$E1,$W1
+
+ vadd.i32 $E,$E,$E0
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ bne .Loop_v8
+
+ vst1.32 {$ABCD},[$ctx]!
+ vst1.32 {$E\[0]},[$ctx]
+
+ vldmia sp!,{d8-d15}
+ ret @ bx lr
+.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+{ my %opcode = (
+ "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
+ "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
+ "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
+
+ sub unsha1 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # ARMv7 instructions are always encoded little-endian, hence
+ # the byte order below. The correct solution is to use the
+ # .inst directive, but older assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+ s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
+ s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
+
+ s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
+
+ s/\bret\b/bx lr/o or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
+
+ print $_,$/;
+}
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
new file mode 100755
index 000000000000..c04432a54394
--- /dev/null
+++ b/crypto/sha/asm/sha1-armv8.pl
@@ -0,0 +1,338 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		hardware-assisted	software(*)
+# Apple A7	2.31			4.13 (+14%)
+# Cortex-A53	2.24			8.03 (+97%)
+# Cortex-A57	2.35			7.88 (+74%)
+# Denver	2.13			3.97 (+0%)(**)
+# X-Gene				8.80 (+200%)
+#
+# (*) Software results are presented mostly for reference purposes.
+# (**) Keep in mind that Denver relies on binary translation, which
+# optimizes compiler output at run-time.
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+($ctx,$inp,$num)=("x0","x1","x2");
+@Xw=map("w$_",(3..17,19));
+@Xx=map("x$_",(3..17,19));
+@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
+($t0,$t1,$t2,$K)=map("w$_",(25..28));
+
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i<15 && !($i&1));
+ lsr @Xx[$i+1],@Xx[$i],#32
+___
+$code.=<<___ if ($i<14 && !($i&1));
+ ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
+___
+$code.=<<___ if ($i<14 && ($i&1));
+#ifdef __ARMEB__
+ ror @Xx[$i+1],@Xx[$i+1],#32
+#else
+ rev32 @Xx[$i+1],@Xx[$i+1]
+#endif
+___
+$code.=<<___ if ($i<14);
+ bic $t0,$d,$b
+ and $t1,$c,$b
+ ror $t2,$a,#27
+ add $d,$d,$K // future e+=K
+ orr $t0,$t0,$t1
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+___
+$code.=<<___ if ($i==19);
+ movz $K,#0xeba1
+ movk $K,#0x6ed9,lsl#16
+___
+$code.=<<___ if ($i>=14);
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+ bic $t0,$d,$b
+ and $t1,$c,$b
+ ror $t2,$a,#27
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+ add $d,$d,$K // future e+=K
+ orr $t0,$t0,$t1
+ add $e,$e,$t2 // e+=rot(a,5)
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+ ror $b,$b,#2
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+ ror @Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==59);
+ movz $K,#0xc1d6
+ movk $K,#0xca62,lsl#16
+___
+$code.=<<___;
+ orr $t0,$b,$c
+ and $t1,$b,$c
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+ ror $t2,$a,#27
+ and $t0,$t0,$d
+ add $d,$d,$K // future e+=K
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+ add $e,$e,$t2 // e+=rot(a,5)
+ orr $t0,$t0,$t1
+ ror $b,$b,#2
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+ ror @Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==39);
+ movz $K,#0xbcdc
+ movk $K,#0x8f1b,lsl#16
+___
+$code.=<<___ if ($i<78);
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+ eor $t0,$d,$b
+ ror $t2,$a,#27
+ add $d,$d,$K // future e+=K
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+ eor $t0,$t0,$c
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+ ror @Xw[$j],@Xw[$j],#31
+___
+$code.=<<___ if ($i==78);
+ ldp @Xw[1],@Xw[2],[$ctx]
+ eor $t0,$d,$b
+ ror $t2,$a,#27
+ add $d,$d,$K // future e+=K
+ eor $t0,$t0,$c
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+___
+$code.=<<___ if ($i==79);
+ ldp @Xw[3],@Xw[4],[$ctx,#8]
+ eor $t0,$d,$b
+ ror $t2,$a,#27
+ eor $t0,$t0,$c
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ ldr @Xw[5],[$ctx,#16]
+ add $e,$e,$t0 // e+=F(b,c,d)
+___
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.globl sha1_block_data_order
+.type sha1_block_data_order,%function
+.align 6
+sha1_block_data_order:
+ ldr x16,.LOPENSSL_armcap_P
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA1
+ b.ne .Lv8_entry
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp $A,$B,[$ctx]
+ ldp $C,$D,[$ctx,#8]
+ ldr $E,[$ctx,#16]
+
+.Loop:
+ ldr @Xx[0],[$inp],#64
+ movz $K,#0x7999
+ sub $num,$num,#1
+ movk $K,#0x5a82,lsl#16
+#ifdef __ARMEB__
+	ror	@Xx[0],@Xx[0],#32
+#else
+ rev32 @Xx[0],@Xx[0]
+#endif
+ add $E,$E,$K // warm it up
+ add $E,$E,@Xw[0]
+___
+for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ add $B,$B,@Xw[2]
+ add $C,$C,@Xw[3]
+ add $A,$A,@Xw[1]
+ add $D,$D,@Xw[4]
+ add $E,$E,@Xw[5]
+ stp $A,$B,[$ctx]
+ stp $C,$D,[$ctx,#8]
+ str $E,[$ctx,#16]
+ cbnz $num,.Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+.size sha1_block_data_order,.-sha1_block_data_order
+___
+{{{
+my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
+my @MSG=map("v$_.16b",(4..7));
+my @Kxx=map("v$_.4s",(16..19));
+my ($W0,$W1)=("v20.4s","v21.4s");
+my $ABCD_SAVE="v22.16b";
+
+$code.=<<___;
+.type sha1_block_armv8,%function
+.align 6
+sha1_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ adr x4,.Lconst
+ eor $E,$E,$E
+ ld1.32 {$ABCD},[$ctx],#16
+ ld1.32 {$E}[0],[$ctx]
+ sub $ctx,$ctx,#16
+ ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
+
+.Loop_hw:
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
+ sub $num,$num,#1
+ rev32 @MSG[0],@MSG[0]
+ rev32 @MSG[1],@MSG[1]
+
+ add.i32 $W0,@Kxx[0],@MSG[0]
+ rev32 @MSG[2],@MSG[2]
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
+
+ add.i32 $W1,@Kxx[0],@MSG[1]
+ rev32 @MSG[3],@MSG[3]
+ sha1h $E1,$ABCD
+ sha1c $ABCD,$E,$W0 // 0
+ add.i32 $W0,@Kxx[$j],@MSG[2]
+ sha1su0 @MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+ sha1h $E0,$ABCD // $i
+ sha1$f $ABCD,$E1,$W1
+ add.i32 $W1,@Kxx[$j],@MSG[3]
+ sha1su1 @MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+ sha1su0 @MSG[1],@MSG[2],@MSG[3]
+___
+ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
+ push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+ sha1h $E0,$ABCD // $i
+ sha1p $ABCD,$E1,$W1
+ add.i32 $W1,@Kxx[$j],@MSG[3]
+
+ sha1h $E1,$ABCD // 18
+ sha1p $ABCD,$E0,$W0
+
+ sha1h $E0,$ABCD // 19
+ sha1p $ABCD,$E1,$W1
+
+ add.i32 $E,$E,$E0
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
+
+ cbnz $num,.Loop_hw
+
+ st1.32 {$ABCD},[$ctx],#16
+ st1.32 {$E}[0],[$ctx]
+
+ ldr x29,[sp],#16
+ ret
+.size sha1_block_armv8,.-sha1_block_armv8
+.align 6
+.Lconst:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.LOPENSSL_armcap_P:
+.quad OPENSSL_armcap_P-.
+.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+.comm OPENSSL_armcap_P,4,4
+___
+}}}
+
+{ my %opcode = (
+ "sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
+ "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
+ "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
+
+ sub unsha1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
+
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl
new file mode 100755
index 000000000000..a8ee075eaaa0
--- /dev/null
+++ b/crypto/sha/asm/sha1-mb-x86_64.pl
@@ -0,0 +1,1574 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Multi-buffer SHA1 procedure processes n buffers in parallel by
+# placing buffer data into designated lanes of SIMD registers. n is
+# naturally limited to 4 on pre-AVX2 processors and to 8 on
+# AVX2-capable processors such as Haswell.
+#
+#		this		+aesni(i)	sha1	aesni-sha1	gain(iv)
+# -------------------------------------------------------------------
+# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
+# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
+# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
+# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
+# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
+# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
+#
+# (i) multi-block CBC encrypt with 128-bit key;
+# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
+# because of lower AES-NI instruction throughput;
+# (iii) "this" is for n=8, when we gather twice as much data, result
+# for n=4 is 8.00+4.44=12.4;
+# (iv) presented improvement coefficients are asymptotic limits and
+#	in real-life applications are somewhat lower, e.g. for 2KB
+# fragments they range from 30% to 100% (on Haswell);
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=0;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void sha1_multi_block (
+# struct { unsigned int A[8];
+# unsigned int B[8];
+# unsigned int C[8];
+# unsigned int D[8];
+# unsigned int E[8]; } *ctx,
+# struct { void *ptr; int blocks; } inp[8],
+# int num); /* 1 or 2 */
+#
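+# A hypothetical caller (illustrative sketch only; variable names are
+# made up) fills one descriptor per stream and passes num=1 for up to
+# 4 lanes or num=2 for up to 8:
+#
+#	inp[i].ptr    = data[i];
+#	inp[i].blocks = len[i]/64;	/* whole 64-byte blocks */
+#	sha1_multi_block(&ctx, inp, 2);
+#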
+$ctx="%rdi"; # 1st arg
+$inp="%rsi"; # 2nd arg
+$num="%edx";
+@ptr=map("%r$_",(8..11));
+$Tbl="%rbp";
+
+@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
+($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
+@Xi=map("%xmm$_",(10..14));
+$K="%xmm15";
+
+if (1) {
+	# Atom-specific optimization aiming to eliminate pshufb with high
+	# registers [and thus avoid 48 cycles of accumulated penalty]
+ @Xi=map("%xmm$_",(0..4));
+ ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
+ @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
+}
+
+$REG_SZ=16;
+
+sub Xi_off {
+my $off = shift;
+
+ $off %= 16; $off *= $REG_SZ;
+ $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
+}
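+# E.g. (illustrative, with $REG_SZ==16): Xi_off(0) yields "0-128(%rax)",
+# Xi_off(17) wraps to "16-128(%rax)", and offsets of 256 and above spill
+# to the %rbx-based half of the ring buffer.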
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $k=$i+2;
+
+# Loads are performed 2+3/4 iterations in advance. The 3/4 means that of
+# the 4 words you would expect to be loaded in a given iteration, one is
+# spilled to the next iteration. In other words, indices in the four input
+# streams are distributed as follows:
+#
+# $i==0: 0,0,0,0,1,1,1,1,2,2,2,
+# $i==1: 2,3,3,3,
+# $i==2: 3,4,4,4,
+# ...
+# $i==13: 14,15,15,15,
+# $i==14: 15
+#
+# Then at $i==15 Xupdate is applied one iteration in advance...
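+# A minimal Perl sketch of that schedule (illustrative only):
+#
+#	my @sched = ([ (0) x 4, (1) x 4, (2) x 3 ]);	# $i==0
+#	push @sched, [ $_+1, ($_+2) x 3 ] for (1..13);	# $i==1..13
+#	push @sched, [ 15 ];				# $i==14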
+$code.=<<___ if ($i==0);
+ movd (@ptr[0]),@Xi[0]
+ lea `16*4`(@ptr[0]),@ptr[0]
+ movd (@ptr[1]),@Xi[2] # borrow @Xi[2]
+ lea `16*4`(@ptr[1]),@ptr[1]
+ movd (@ptr[2]),@Xi[3] # borrow @Xi[3]
+ lea `16*4`(@ptr[2]),@ptr[2]
+ movd (@ptr[3]),@Xi[4] # borrow @Xi[4]
+ lea `16*4`(@ptr[3]),@ptr[3]
+ punpckldq @Xi[3],@Xi[0]
+ movd `4*$j-16*4`(@ptr[0]),@Xi[1]
+ punpckldq @Xi[4],@Xi[2]
+ movd `4*$j-16*4`(@ptr[1]),$t3
+ punpckldq @Xi[2],@Xi[0]
+ movd `4*$j-16*4`(@ptr[2]),$t2
+ pshufb $tx,@Xi[0]
+___
+$code.=<<___ if ($i<14); # just load input
+ movd `4*$j-16*4`(@ptr[3]),$t1
+ punpckldq $t2,@Xi[1]
+ movdqa $a,$t2
+ paddd $K,$e # e+=K_00_19
+ punpckldq $t1,$t3
+ movdqa $b,$t1
+ movdqa $b,$t0
+ pslld \$5,$t2
+ pandn $d,$t1
+ pand $c,$t0
+ punpckldq $t3,@Xi[1]
+ movdqa $a,$t3
+
+ movdqa @Xi[0],`&Xi_off($i)`
+ paddd @Xi[0],$e # e+=X[i]
+ movd `4*$k-16*4`(@ptr[0]),@Xi[2]
+ psrld \$27,$t3
+ pxor $t1,$t0 # Ch(b,c,d)
+ movdqa $b,$t1
+
+ por $t3,$t2 # rol(a,5)
+ movd `4*$k-16*4`(@ptr[1]),$t3
+ pslld \$30,$t1
+ paddd $t0,$e # e+=Ch(b,c,d)
+
+ psrld \$2,$b
+ paddd $t2,$e # e+=rol(a,5)
+ pshufb $tx,@Xi[1]
+ movd `4*$k-16*4`(@ptr[2]),$t2
+ por $t1,$b # b=rol(b,30)
+___
+$code.=<<___ if ($i==14); # just load input
+ movd `4*$j-16*4`(@ptr[3]),$t1
+ punpckldq $t2,@Xi[1]
+ movdqa $a,$t2
+ paddd $K,$e # e+=K_00_19
+ punpckldq $t1,$t3
+ movdqa $b,$t1
+ movdqa $b,$t0
+ pslld \$5,$t2
+ prefetcht0 63(@ptr[0])
+ pandn $d,$t1
+ pand $c,$t0
+ punpckldq $t3,@Xi[1]
+ movdqa $a,$t3
+
+ movdqa @Xi[0],`&Xi_off($i)`
+ paddd @Xi[0],$e # e+=X[i]
+ psrld \$27,$t3
+ pxor $t1,$t0 # Ch(b,c,d)
+ movdqa $b,$t1
+ prefetcht0 63(@ptr[1])
+
+ por $t3,$t2 # rol(a,5)
+ pslld \$30,$t1
+ paddd $t0,$e # e+=Ch(b,c,d)
+ prefetcht0 63(@ptr[2])
+
+ psrld \$2,$b
+ paddd $t2,$e # e+=rol(a,5)
+ pshufb $tx,@Xi[1]
+ prefetcht0 63(@ptr[3])
+ por $t1,$b # b=rol(b,30)
+___
+$code.=<<___ if ($i>=13 && $i<15);
+ movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
+___
+$code.=<<___ if ($i>=15); # apply Xupdate
+ pxor @Xi[-2],@Xi[1] # "X[13]"
+ movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
+
+ movdqa $a,$t2
+ pxor `&Xi_off($j+8)`,@Xi[1]
+ paddd $K,$e # e+=K_00_19
+ movdqa $b,$t1
+ pslld \$5,$t2
+ pxor @Xi[3],@Xi[1]
+ movdqa $b,$t0
+ pandn $d,$t1
+ movdqa @Xi[1],$tx
+ pand $c,$t0
+ movdqa $a,$t3
+ psrld \$31,$tx
+ paddd @Xi[1],@Xi[1]
+
+ movdqa @Xi[0],`&Xi_off($i)`
+ paddd @Xi[0],$e # e+=X[i]
+ psrld \$27,$t3
+ pxor $t1,$t0 # Ch(b,c,d)
+
+ movdqa $b,$t1
+ por $t3,$t2 # rol(a,5)
+ pslld \$30,$t1
+ paddd $t0,$e # e+=Ch(b,c,d)
+
+ psrld \$2,$b
+ paddd $t2,$e # e+=rol(a,5)
+ por $tx,@Xi[1] # rol \$1,@Xi[1]
+ por $t1,$b # b=rol(b,30)
+___
+push(@Xi,shift(@Xi));
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___ if ($i<79);
+ pxor @Xi[-2],@Xi[1] # "X[13]"
+ movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
+
+ movdqa $a,$t2
+ movdqa $d,$t0
+ pxor `&Xi_off($j+8)`,@Xi[1]
+ paddd $K,$e # e+=K_20_39
+ pslld \$5,$t2
+ pxor $b,$t0
+
+ movdqa $a,$t3
+___
+$code.=<<___ if ($i<72);
+ movdqa @Xi[0],`&Xi_off($i)`
+___
+$code.=<<___ if ($i<79);
+ paddd @Xi[0],$e # e+=X[i]
+ pxor @Xi[3],@Xi[1]
+ psrld \$27,$t3
+ pxor $c,$t0 # Parity(b,c,d)
+ movdqa $b,$t1
+
+ pslld \$30,$t1
+ movdqa @Xi[1],$tx
+ por $t3,$t2 # rol(a,5)
+ psrld \$31,$tx
+ paddd $t0,$e # e+=Parity(b,c,d)
+ paddd @Xi[1],@Xi[1]
+
+ psrld \$2,$b
+ paddd $t2,$e # e+=rol(a,5)
+ por $tx,@Xi[1] # rol(@Xi[1],1)
+ por $t1,$b # b=rol(b,30)
+___
+$code.=<<___ if ($i==79);
+ movdqa $a,$t2
+ paddd $K,$e # e+=K_20_39
+ movdqa $d,$t0
+ pslld \$5,$t2
+ pxor $b,$t0
+
+ movdqa $a,$t3
+ paddd @Xi[0],$e # e+=X[i]
+ psrld \$27,$t3
+ movdqa $b,$t1
+ pxor $c,$t0 # Parity(b,c,d)
+
+ pslld \$30,$t1
+ por $t3,$t2 # rol(a,5)
+ paddd $t0,$e # e+=Parity(b,c,d)
+
+ psrld \$2,$b
+ paddd $t2,$e # e+=rol(a,5)
+ por $t1,$b # b=rol(b,30)
+___
+push(@Xi,shift(@Xi));
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___;
+ pxor @Xi[-2],@Xi[1] # "X[13]"
+ movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
+
+ movdqa $a,$t2
+ movdqa $d,$t1
+ pxor `&Xi_off($j+8)`,@Xi[1]
+ pxor @Xi[3],@Xi[1]
+ paddd $K,$e # e+=K_40_59
+ pslld \$5,$t2
+ movdqa $a,$t3
+ pand $c,$t1
+
+ movdqa $d,$t0
+ movdqa @Xi[1],$tx
+ psrld \$27,$t3
+ paddd $t1,$e
+ pxor $c,$t0
+
+ movdqa @Xi[0],`&Xi_off($i)`
+ paddd @Xi[0],$e # e+=X[i]
+ por $t3,$t2 # rol(a,5)
+ psrld \$31,$tx
+ pand $b,$t0
+ movdqa $b,$t1
+
+ pslld \$30,$t1
+ paddd @Xi[1],@Xi[1]
+ paddd $t0,$e # e+=Maj(b,d,c)
+
+ psrld \$2,$b
+ paddd $t2,$e # e+=rol(a,5)
+ por $tx,@Xi[1] # rol(@X[1],1)
+ por $t1,$b # b=rol(b,30)
+___
+push(@Xi,shift(@Xi));
+}
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl sha1_multi_block
+.type sha1_multi_block,\@function,3
+.align 32
+sha1_multi_block:
+ mov OPENSSL_ia32cap_P+4(%rip),%rcx
+ bt \$61,%rcx # check SHA bit
+ jc _shaext_shortcut
+___
+$code.=<<___ if ($avx);
+ test \$`1<<28`,%ecx
+ jnz _avx_shortcut
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,-0x78(%rax)
+ movaps %xmm11,-0x68(%rax)
+ movaps %xmm12,-0x58(%rax)
+ movaps %xmm13,-0x48(%rax)
+ movaps %xmm14,-0x38(%rax)
+ movaps %xmm15,-0x28(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`,%rsp
+ and \$-256,%rsp
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody:
+ lea K_XX_XX(%rip),$Tbl
+ lea `$REG_SZ*16`(%rsp),%rbx
+
+.Loop_grande:
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldone
+
+ movdqu 0x00($ctx),$A # load context
+ lea 128(%rsp),%rax
+ movdqu 0x20($ctx),$B
+ movdqu 0x40($ctx),$C
+ movdqu 0x60($ctx),$D
+ movdqu 0x80($ctx),$E
+ movdqa 0x60($Tbl),$tx # pbswap_mask
+ movdqa -0x20($Tbl),$K # K_00_19
+ jmp .Loop
+
+.align 32
+.Loop:
+___
+for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
+for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59
+for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79
+for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ movdqa (%rbx),@Xi[0] # pull counters
+ mov \$1,%ecx
+	cmp	4*0(%rbx),%ecx		# examine counters
+ pxor $t2,$t2
+ cmovge $Tbl,@ptr[0] # cancel input
+ cmp 4*1(%rbx),%ecx
+ movdqa @Xi[0],@Xi[1]
+ cmovge $Tbl,@ptr[1]
+ cmp 4*2(%rbx),%ecx
+ pcmpgtd $t2,@Xi[1] # mask value
+ cmovge $Tbl,@ptr[2]
+ cmp 4*3(%rbx),%ecx
+ paddd @Xi[1],@Xi[0] # counters--
+ cmovge $Tbl,@ptr[3]
+
+ movdqu 0x00($ctx),$t0
+ pand @Xi[1],$A
+ movdqu 0x20($ctx),$t1
+ pand @Xi[1],$B
+ paddd $t0,$A
+ movdqu 0x40($ctx),$t2
+ pand @Xi[1],$C
+ paddd $t1,$B
+ movdqu 0x60($ctx),$t3
+ pand @Xi[1],$D
+ paddd $t2,$C
+ movdqu 0x80($ctx),$tx
+ pand @Xi[1],$E
+ movdqu $A,0x00($ctx)
+ paddd $t3,$D
+ movdqu $B,0x20($ctx)
+ paddd $tx,$E
+ movdqu $C,0x40($ctx)
+ movdqu $D,0x60($ctx)
+ movdqu $E,0x80($ctx)
+
+ movdqa @Xi[0],(%rbx) # save counters
+ movdqa 0x60($Tbl),$tx # pbswap_mask
+ movdqa -0x20($Tbl),$K # K_00_19
+ dec $num
+ jnz .Loop
+
+ mov `$REG_SZ*17+8`(%rsp),$num
+ lea $REG_SZ($ctx),$ctx
+ lea `16*$REG_SZ/4`($inp),$inp
+ dec $num
+ jnz .Loop_grande
+
+.Ldone:
+	mov	`$REG_SZ*17`(%rsp),%rax	# original %rsp
+___
+$code.=<<___ if ($win64);
+ movaps -0xb8(%rax),%xmm6
+ movaps -0xa8(%rax),%xmm7
+ movaps -0x98(%rax),%xmm8
+ movaps -0x88(%rax),%xmm9
+ movaps -0x78(%rax),%xmm10
+ movaps -0x68(%rax),%xmm11
+ movaps -0x58(%rax),%xmm12
+ movaps -0x48(%rax),%xmm13
+ movaps -0x38(%rax),%xmm14
+ movaps -0x28(%rax),%xmm15
+___
+$code.=<<___;
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue:
+ ret
+.size sha1_multi_block,.-sha1_multi_block
+___
+ {{{
+my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
+my @MSG0=map("%xmm$_",(4..7));
+my @MSG1=map("%xmm$_",(11..14));
+
+$code.=<<___;
+.type sha1_multi_block_shaext,\@function,3
+.align 32
+sha1_multi_block_shaext:
+_shaext_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,-0x78(%rax)
+ movaps %xmm11,-0x68(%rax)
+ movaps %xmm12,-0x58(%rax)
+ movaps %xmm13,-0x48(%rax)
+ movaps %xmm14,-0x38(%rax)
+ movaps %xmm15,-0x28(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`,%rsp
+ shl \$1,$num # we process pair at a time
+ and \$-256,%rsp
+ lea 0x40($ctx),$ctx # size optimization
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody_shaext:
+ lea `$REG_SZ*16`(%rsp),%rbx
+ movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap
+
+.Loop_grande_shaext:
+	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
+ xor $num,$num
+___
+for($i=0;$i<2;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldone_shaext
+
+ movq 0x00-0x40($ctx),$ABCD0 # a1.a0
+	movq	0x20-0x40($ctx),@MSG0[0]	# b1.b0
+	movq	0x40-0x40($ctx),@MSG0[1]	# c1.c0
+	movq	0x60-0x40($ctx),@MSG0[2]	# d1.d0
+	movq	0x80-0x40($ctx),@MSG0[3]	# e1.e0
+
+ punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0
+ punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0
+
+ movdqa $ABCD0,$ABCD1
+ punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0
+ punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1
+
+ pshufd \$0b00111111,@MSG0[3],$E0
+ pshufd \$0b01111111,@MSG0[3],$E1
+ pshufd \$0b00011011,$ABCD0,$ABCD0
+ pshufd \$0b00011011,$ABCD1,$ABCD1
+ jmp .Loop_shaext
+
+.align 32
+.Loop_shaext:
+ movdqu 0x00(@ptr[0]),@MSG0[0]
+ movdqu 0x00(@ptr[1]),@MSG1[0]
+ movdqu 0x10(@ptr[0]),@MSG0[1]
+ movdqu 0x10(@ptr[1]),@MSG1[1]
+ movdqu 0x20(@ptr[0]),@MSG0[2]
+ pshufb $BSWAP,@MSG0[0]
+ movdqu 0x20(@ptr[1]),@MSG1[2]
+ pshufb $BSWAP,@MSG1[0]
+ movdqu 0x30(@ptr[0]),@MSG0[3]
+ lea 0x40(@ptr[0]),@ptr[0]
+ pshufb $BSWAP,@MSG0[1]
+ movdqu 0x30(@ptr[1]),@MSG1[3]
+ lea 0x40(@ptr[1]),@ptr[1]
+ pshufb $BSWAP,@MSG1[1]
+
+ movdqa $E0,0x50(%rsp) # offload
+ paddd @MSG0[0],$E0
+ movdqa $E1,0x70(%rsp)
+ paddd @MSG1[0],$E1
+ movdqa $ABCD0,0x40(%rsp) # offload
+ movdqa $ABCD0,$E0_
+ movdqa $ABCD1,0x60(%rsp)
+ movdqa $ABCD1,$E1_
+ sha1rnds4 \$0,$E0,$ABCD0 # 0-3
+ sha1nexte @MSG0[1],$E0_
+ sha1rnds4 \$0,$E1,$ABCD1 # 0-3
+ sha1nexte @MSG1[1],$E1_
+ pshufb $BSWAP,@MSG0[2]
+ prefetcht0 127(@ptr[0])
+ sha1msg1 @MSG0[1],@MSG0[0]
+ pshufb $BSWAP,@MSG1[2]
+ prefetcht0 127(@ptr[1])
+ sha1msg1 @MSG1[1],@MSG1[0]
+
+ pshufb $BSWAP,@MSG0[3]
+ movdqa $ABCD0,$E0
+ pshufb $BSWAP,@MSG1[3]
+ movdqa $ABCD1,$E1
+ sha1rnds4 \$0,$E0_,$ABCD0 # 4-7
+ sha1nexte @MSG0[2],$E0
+ sha1rnds4 \$0,$E1_,$ABCD1 # 4-7
+ sha1nexte @MSG1[2],$E1
+ pxor @MSG0[2],@MSG0[0]
+ sha1msg1 @MSG0[2],@MSG0[1]
+ pxor @MSG1[2],@MSG1[0]
+ sha1msg1 @MSG1[2],@MSG1[1]
+___
+for($i=2;$i<20-4;$i++) {
+$code.=<<___;
+ movdqa $ABCD0,$E0_
+ movdqa $ABCD1,$E1_
+ sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11
+ sha1nexte @MSG0[3],$E0_
+ sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11
+ sha1nexte @MSG1[3],$E1_
+ sha1msg2 @MSG0[3],@MSG0[0]
+ sha1msg2 @MSG1[3],@MSG1[0]
+ pxor @MSG0[3],@MSG0[1]
+ sha1msg1 @MSG0[3],@MSG0[2]
+ pxor @MSG1[3],@MSG1[1]
+ sha1msg1 @MSG1[3],@MSG1[2]
+___
+ ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1);
+ push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
+}
+$code.=<<___;
+ movdqa $ABCD0,$E0_
+ movdqa $ABCD1,$E1_
+ sha1rnds4 \$3,$E0,$ABCD0 # 64-67
+ sha1nexte @MSG0[3],$E0_
+ sha1rnds4 \$3,$E1,$ABCD1 # 64-67
+ sha1nexte @MSG1[3],$E1_
+ sha1msg2 @MSG0[3],@MSG0[0]
+ sha1msg2 @MSG1[3],@MSG1[0]
+ pxor @MSG0[3],@MSG0[1]
+ pxor @MSG1[3],@MSG1[1]
+
+ mov \$1,%ecx
+ pxor @MSG0[2],@MSG0[2] # zero
+ cmp 4*0(%rbx),%ecx # examine counters
+ cmovge %rsp,@ptr[0] # cancel input
+
+ movdqa $ABCD0,$E0
+ movdqa $ABCD1,$E1
+ sha1rnds4 \$3,$E0_,$ABCD0 # 68-71
+ sha1nexte @MSG0[0],$E0
+ sha1rnds4 \$3,$E1_,$ABCD1 # 68-71
+ sha1nexte @MSG1[0],$E1
+ sha1msg2 @MSG0[0],@MSG0[1]
+ sha1msg2 @MSG1[0],@MSG1[1]
+
+ cmp 4*1(%rbx),%ecx
+ cmovge %rsp,@ptr[1]
+ movq (%rbx),@MSG0[0] # pull counters
+
+ movdqa $ABCD0,$E0_
+ movdqa $ABCD1,$E1_
+ sha1rnds4 \$3,$E0,$ABCD0 # 72-75
+ sha1nexte @MSG0[1],$E0_
+ sha1rnds4 \$3,$E1,$ABCD1 # 72-75
+ sha1nexte @MSG1[1],$E1_
+
+ pshufd \$0x00,@MSG0[0],@MSG1[2]
+ pshufd \$0x55,@MSG0[0],@MSG1[3]
+ movdqa @MSG0[0],@MSG0[1]
+ pcmpgtd @MSG0[2],@MSG1[2]
+ pcmpgtd @MSG0[2],@MSG1[3]
+
+ movdqa $ABCD0,$E0
+ movdqa $ABCD1,$E1
+ sha1rnds4 \$3,$E0_,$ABCD0 # 76-79
+ sha1nexte $MSG0[2],$E0
+ sha1rnds4 \$3,$E1_,$ABCD1 # 76-79
+ sha1nexte $MSG0[2],$E1
+
+ pcmpgtd @MSG0[2],@MSG0[1] # counter mask
+ pand @MSG1[2],$ABCD0
+ pand @MSG1[2],$E0
+ pand @MSG1[3],$ABCD1
+ pand @MSG1[3],$E1
+ paddd @MSG0[1],@MSG0[0] # counters--
+
+ paddd 0x40(%rsp),$ABCD0
+ paddd 0x50(%rsp),$E0
+ paddd 0x60(%rsp),$ABCD1
+ paddd 0x70(%rsp),$E1
+
+ movq @MSG0[0],(%rbx) # save counters
+ dec $num
+ jnz .Loop_shaext
+
+ mov `$REG_SZ*17+8`(%rsp),$num
+
+ pshufd \$0b00011011,$ABCD0,$ABCD0
+ pshufd \$0b00011011,$ABCD1,$ABCD1
+
+ movdqa $ABCD0,@MSG0[0]
+ punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0
+ punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0
+ punpckhdq $E1,$E0 # e1.e0.xx.xx
+ movq $ABCD0,0x00-0x40($ctx) # a1.a0
+ psrldq \$8,$ABCD0
+	movq	@MSG0[0],0x40-0x40($ctx)	# c1.c0
+	psrldq	\$8,@MSG0[0]
+	movq	$ABCD0,0x20-0x40($ctx)		# b1.b0
+	psrldq	\$8,$E0
+	movq	@MSG0[0],0x60-0x40($ctx)	# d1.d0
+ movq $E0,0x80-0x40($ctx) # e1.e0
+
+ lea `$REG_SZ/2`($ctx),$ctx
+ lea `16*2`($inp),$inp
+ dec $num
+ jnz .Loop_grande_shaext
+
+.Ldone_shaext:
+ #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+___
+$code.=<<___ if ($win64);
+ movaps -0xb8(%rax),%xmm6
+ movaps -0xa8(%rax),%xmm7
+ movaps -0x98(%rax),%xmm8
+ movaps -0x88(%rax),%xmm9
+ movaps -0x78(%rax),%xmm10
+ movaps -0x68(%rax),%xmm11
+ movaps -0x58(%rax),%xmm12
+ movaps -0x48(%rax),%xmm13
+ movaps -0x38(%rax),%xmm14
+ movaps -0x28(%rax),%xmm15
+___
+$code.=<<___;
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue_shaext:
+ ret
+.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
+___
+ }}}
+
+ if ($avx) {{{
+sub BODY_00_19_avx {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $k=$i+2;
+my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
+my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
+
+$code.=<<___ if ($i==0 && $REG_SZ==16);
+ vmovd (@ptr[0]),@Xi[0]
+ lea `16*4`(@ptr[0]),@ptr[0]
+ vmovd (@ptr[1]),@Xi[2] # borrow Xi[2]
+ lea `16*4`(@ptr[1]),@ptr[1]
+ vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
+ lea `16*4`(@ptr[2]),@ptr[2]
+ vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2]
+ lea `16*4`(@ptr[3]),@ptr[3]
+ vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
+ vpunpckldq @Xi[2],@Xi[0],@Xi[0]
+ vmovd `4*$j-16*4`($ptr_n),$t3
+ vpshufb $tx,@Xi[0],@Xi[0]
+___
+$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input
+ vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
+ vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
+___
+$code.=<<___ if ($i==0 && $REG_SZ==32);
+ vmovd (@ptr[0]),@Xi[0]
+ lea `16*4`(@ptr[0]),@ptr[0]
+ vmovd (@ptr[4]),@Xi[2] # borrow Xi[2]
+ lea `16*4`(@ptr[4]),@ptr[4]
+ vmovd (@ptr[1]),$t2
+ lea `16*4`(@ptr[1]),@ptr[1]
+ vmovd (@ptr[5]),$t1
+ lea `16*4`(@ptr[5]),@ptr[5]
+ vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
+ lea `16*4`(@ptr[2]),@ptr[2]
+ vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2]
+ lea `16*4`(@ptr[6]),@ptr[6]
+ vpinsrd \$1,(@ptr[3]),$t2,$t2
+ lea `16*4`(@ptr[3]),@ptr[3]
+ vpunpckldq $t2,@Xi[0],@Xi[0]
+ vpinsrd \$1,(@ptr[7]),$t1,$t1
+ lea `16*4`(@ptr[7]),@ptr[7]
+ vpunpckldq $t1,@Xi[2],@Xi[2]
+ vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
+ vinserti128 @Xi[2],@Xi[0],@Xi[0]
+ vmovd `4*$j-16*4`($ptr_n),$t3
+ vpshufb $tx,@Xi[0],@Xi[0]
+___
+$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input
+ vmovd `4*$j-16*4`(@ptr[1]),$t2
+ vmovd `4*$j-16*4`(@ptr[5]),$t1
+ vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
+ vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
+ vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
+ vpunpckldq $t2,@Xi[1],@Xi[1]
+ vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
+ vpunpckldq $t1,$t3,$t3
+___
+$code.=<<___ if ($i<14);
+ vpaddd $K,$e,$e # e+=K_00_19
+ vpslld \$5,$a,$t2
+ vpandn $d,$b,$t1
+ vpand $c,$b,$t0
+
+ vmovdqa @Xi[0],`&Xi_off($i)`
+ vpaddd @Xi[0],$e,$e # e+=X[i]
+ $vpack $t3,@Xi[1],@Xi[1]
+ vpsrld \$27,$a,$t3
+ vpxor $t1,$t0,$t0 # Ch(b,c,d)
+ vmovd `4*$k-16*4`(@ptr[0]),@Xi[2]
+
+ vpslld \$30,$b,$t1
+ vpor $t3,$t2,$t2 # rol(a,5)
+ vmovd `4*$k-16*4`($ptr_n),$t3
+ vpaddd $t0,$e,$e # e+=Ch(b,c,d)
+
+ vpsrld \$2,$b,$b
+ vpaddd $t2,$e,$e # e+=rol(a,5)
+ vpshufb $tx,@Xi[1],@Xi[1]
+ vpor $t1,$b,$b # b=rol(b,30)
+___
+$code.=<<___ if ($i==14);
+ vpaddd $K,$e,$e # e+=K_00_19
+ prefetcht0 63(@ptr[0])
+ vpslld \$5,$a,$t2
+ vpandn $d,$b,$t1
+ vpand $c,$b,$t0
+
+ vmovdqa @Xi[0],`&Xi_off($i)`
+ vpaddd @Xi[0],$e,$e # e+=X[i]
+ $vpack $t3,@Xi[1],@Xi[1]
+ vpsrld \$27,$a,$t3
+ prefetcht0 63(@ptr[1])
+ vpxor $t1,$t0,$t0 # Ch(b,c,d)
+
+ vpslld \$30,$b,$t1
+ vpor $t3,$t2,$t2 # rol(a,5)
+ prefetcht0 63(@ptr[2])
+ vpaddd $t0,$e,$e # e+=Ch(b,c,d)
+
+ vpsrld \$2,$b,$b
+ vpaddd $t2,$e,$e # e+=rol(a,5)
+ prefetcht0 63(@ptr[3])
+ vpshufb $tx,@Xi[1],@Xi[1]
+ vpor $t1,$b,$b # b=rol(b,30)
+___
+$code.=<<___ if ($i>=13 && $i<15);
+ vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
+___
+$code.=<<___ if ($i>=15); # apply Xupdate
+ vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
+ vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
+
+ vpaddd $K,$e,$e # e+=K_00_19
+ vpslld \$5,$a,$t2
+ vpandn $d,$b,$t1
+ `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
+ vpand $c,$b,$t0
+
+ vmovdqa @Xi[0],`&Xi_off($i)`
+ vpaddd @Xi[0],$e,$e # e+=X[i]
+ vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
+ vpsrld \$27,$a,$t3
+ vpxor $t1,$t0,$t0 # Ch(b,c,d)
+ vpxor @Xi[3],@Xi[1],@Xi[1]
+ `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
+
+ vpslld \$30,$b,$t1
+ vpor $t3,$t2,$t2 # rol(a,5)
+ vpaddd $t0,$e,$e # e+=Ch(b,c,d)
+ `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
+ vpsrld \$31,@Xi[1],$tx
+ vpaddd @Xi[1],@Xi[1],@Xi[1]
+
+ vpsrld \$2,$b,$b
+ `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
+ vpaddd $t2,$e,$e # e+=rol(a,5)
+ vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
+ vpor $t1,$b,$b # b=rol(b,30)
+___
+push(@Xi,shift(@Xi));
+}
+
+sub BODY_20_39_avx {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___ if ($i<79);
+ vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
+ vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
+
+ vpslld \$5,$a,$t2
+ vpaddd $K,$e,$e # e+=K_20_39
+ vpxor $b,$d,$t0
+___
+$code.=<<___ if ($i<72);
+ vmovdqa @Xi[0],`&Xi_off($i)`
+___
+$code.=<<___ if ($i<79);
+ vpaddd @Xi[0],$e,$e # e+=X[i]
+ vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
+ vpsrld \$27,$a,$t3
+ vpxor $c,$t0,$t0 # Parity(b,c,d)
+ vpxor @Xi[3],@Xi[1],@Xi[1]
+
+ vpslld \$30,$b,$t1
+ vpor $t3,$t2,$t2 # rol(a,5)
+ vpaddd $t0,$e,$e # e+=Parity(b,c,d)
+ vpsrld \$31,@Xi[1],$tx
+ vpaddd @Xi[1],@Xi[1],@Xi[1]
+
+ vpsrld \$2,$b,$b
+ vpaddd $t2,$e,$e # e+=rol(a,5)
+ vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1)
+ vpor $t1,$b,$b # b=rol(b,30)
+___
+$code.=<<___ if ($i==79);
+ vpslld \$5,$a,$t2
+ vpaddd $K,$e,$e # e+=K_20_39
+ vpxor $b,$d,$t0
+
+ vpsrld \$27,$a,$t3
+ vpaddd @Xi[0],$e,$e # e+=X[i]
+ vpxor $c,$t0,$t0 # Parity(b,c,d)
+
+ vpslld \$30,$b,$t1
+ vpor $t3,$t2,$t2 # rol(a,5)
+ vpaddd $t0,$e,$e # e+=Parity(b,c,d)
+
+ vpsrld \$2,$b,$b
+ vpaddd $t2,$e,$e # e+=rol(a,5)
+ vpor $t1,$b,$b # b=rol(b,30)
+___
+push(@Xi,shift(@Xi));
+}
+
+sub BODY_40_59_avx {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___;
+ vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
+ vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
+
+ vpaddd $K,$e,$e # e+=K_40_59
+ vpslld \$5,$a,$t2
+ vpand $c,$d,$t1
+ vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
+
+ vpaddd $t1,$e,$e
+ vpsrld \$27,$a,$t3
+ vpxor $c,$d,$t0
+ vpxor @Xi[3],@Xi[1],@Xi[1]
+
+ vmovdqu @Xi[0],`&Xi_off($i)`
+ vpaddd @Xi[0],$e,$e # e+=X[i]
+ vpor $t3,$t2,$t2 # rol(a,5)
+ vpsrld \$31,@Xi[1],$tx
+ vpand $b,$t0,$t0
+ vpaddd @Xi[1],@Xi[1],@Xi[1]
+
+ vpslld \$30,$b,$t1
+ vpaddd $t0,$e,$e # e+=Maj(b,d,c)
+
+ vpsrld \$2,$b,$b
+ vpaddd $t2,$e,$e # e+=rol(a,5)
+ vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1)
+ vpor $t1,$b,$b # b=rol(b,30)
+___
+push(@Xi,shift(@Xi));
+}
+
+$code.=<<___;
+.type sha1_multi_block_avx,\@function,3
+.align 32
+sha1_multi_block_avx:
+_avx_shortcut:
+___
+$code.=<<___ if ($avx>1);
+ shr \$32,%rcx
+ cmp \$2,$num
+ jb .Lavx
+ test \$`1<<5`,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,-0x78(%rax)
+ movaps %xmm11,-0x68(%rax)
+ movaps %xmm12,-0x58(%rax)
+ movaps %xmm13,-0x48(%rax)
+ movaps %xmm14,-0x38(%rax)
+ movaps %xmm15,-0x28(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`, %rsp
+ and \$-256,%rsp
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody_avx:
+ lea K_XX_XX(%rip),$Tbl
+ lea `$REG_SZ*16`(%rsp),%rbx
+
+ vzeroupper
+.Loop_grande_avx:
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldone_avx
+
+ vmovdqu 0x00($ctx),$A # load context
+ lea 128(%rsp),%rax
+ vmovdqu 0x20($ctx),$B
+ vmovdqu 0x40($ctx),$C
+ vmovdqu 0x60($ctx),$D
+ vmovdqu 0x80($ctx),$E
+ vmovdqu 0x60($Tbl),$tx # pbswap_mask
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+___
+$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19
+for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39
+for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59
+for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79
+for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mov \$1,%ecx
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ cmp `4*$i`(%rbx),%ecx # examine counters
+ cmovge $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ vmovdqu (%rbx),$t0 # pull counters
+ vpxor $t2,$t2,$t2
+ vmovdqa $t0,$t1
+ vpcmpgtd $t2,$t1,$t1 # mask value
+ vpaddd $t1,$t0,$t0 # counters--
+
+ vpand $t1,$A,$A
+ vpand $t1,$B,$B
+ vpaddd 0x00($ctx),$A,$A
+ vpand $t1,$C,$C
+ vpaddd 0x20($ctx),$B,$B
+ vpand $t1,$D,$D
+ vpaddd 0x40($ctx),$C,$C
+ vpand $t1,$E,$E
+ vpaddd 0x60($ctx),$D,$D
+ vpaddd 0x80($ctx),$E,$E
+ vmovdqu $A,0x00($ctx)
+ vmovdqu $B,0x20($ctx)
+ vmovdqu $C,0x40($ctx)
+ vmovdqu $D,0x60($ctx)
+ vmovdqu $E,0x80($ctx)
+
+ vmovdqu $t0,(%rbx) # save counters
+ vmovdqu 0x60($Tbl),$tx # pbswap_mask
+ dec $num
+ jnz .Loop_avx
+
+ mov `$REG_SZ*17+8`(%rsp),$num
+ lea $REG_SZ($ctx),$ctx
+ lea `16*$REG_SZ/4`($inp),$inp
+ dec $num
+ jnz .Loop_grande_avx
+
+.Ldone_avx:
+	mov	`$REG_SZ*17`(%rsp),%rax	# original %rsp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xb8(%rax),%xmm6
+ movaps -0xa8(%rax),%xmm7
+ movaps -0x98(%rax),%xmm8
+ movaps -0x88(%rax),%xmm9
+ movaps -0x78(%rax),%xmm10
+ movaps -0x68(%rax),%xmm11
+ movaps -0x58(%rax),%xmm12
+ movaps -0x48(%rax),%xmm13
+ movaps -0x38(%rax),%xmm14
+ movaps -0x28(%rax),%xmm15
+___
+$code.=<<___;
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue_avx:
+ ret
+.size sha1_multi_block_avx,.-sha1_multi_block_avx
+___
+
+ if ($avx>1) {
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+$REG_SZ=32;
+
+@ptr=map("%r$_",(12..15,8..11));
+
+@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
+($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
+@Xi=map("%ymm$_",(10..14));
+$K="%ymm15";
+
+$code.=<<___;
+.type sha1_multi_block_avx2,\@function,3
+.align 32
+sha1_multi_block_avx2:
+_avx2_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`, %rsp
+ and \$-256,%rsp
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody_avx2:
+ lea K_XX_XX(%rip),$Tbl
+ shr \$1,$num
+
+ vzeroupper
+.Loop_grande_avx2:
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
+ xor $num,$num
+ lea `$REG_SZ*16`(%rsp),%rbx
+___
+for($i=0;$i<8;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ vmovdqu 0x00($ctx),$A # load context
+ lea 128(%rsp),%rax
+ vmovdqu 0x20($ctx),$B
+ lea 256+128(%rsp),%rbx
+ vmovdqu 0x40($ctx),$C
+ vmovdqu 0x60($ctx),$D
+ vmovdqu 0x80($ctx),$E
+ vmovdqu 0x60($Tbl),$tx # pbswap_mask
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+___
+$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19
+for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39
+for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59
+for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79
+for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mov \$1,%ecx
+ lea `$REG_SZ*16`(%rsp),%rbx
+___
+for($i=0;$i<8;$i++) {
+ $code.=<<___;
+ cmp `4*$i`(%rbx),%ecx # examine counters
+ cmovge $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ vmovdqu (%rbx),$t0 # pull counters
+ vpxor $t2,$t2,$t2
+ vmovdqa $t0,$t1
+ vpcmpgtd $t2,$t1,$t1 # mask value
+ vpaddd $t1,$t0,$t0 # counters--
+
+ vpand $t1,$A,$A
+ vpand $t1,$B,$B
+ vpaddd 0x00($ctx),$A,$A
+ vpand $t1,$C,$C
+ vpaddd 0x20($ctx),$B,$B
+ vpand $t1,$D,$D
+ vpaddd 0x40($ctx),$C,$C
+ vpand $t1,$E,$E
+ vpaddd 0x60($ctx),$D,$D
+ vpaddd 0x80($ctx),$E,$E
+ vmovdqu $A,0x00($ctx)
+ vmovdqu $B,0x20($ctx)
+ vmovdqu $C,0x40($ctx)
+ vmovdqu $D,0x60($ctx)
+ vmovdqu $E,0x80($ctx)
+
+ vmovdqu $t0,(%rbx) # save counters
+ lea 256+128(%rsp),%rbx
+ vmovdqu 0x60($Tbl),$tx # pbswap_mask
+ dec $num
+ jnz .Loop_avx2
+
+ #mov `$REG_SZ*17+8`(%rsp),$num
+ #lea $REG_SZ($ctx),$ctx
+ #lea `16*$REG_SZ/4`($inp),$inp
+ #dec $num
+ #jnz .Loop_grande_avx2
+
+.Ldone_avx2:
+	mov	`$REG_SZ*17`(%rsp),%rax	# original %rsp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue_avx2:
+ ret
+.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
+___
+ } }}}
+$code.=<<___;
+
+.align 256
+ .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+ .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+K_XX_XX:
+ .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+ .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+ .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+ .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+ .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+ .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
+ .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+ .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<.Lbody
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov `16*17`(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+
+ lea -24-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+___
+$code.=<<___ if ($avx>1);
+.type avx2_handler,\@abi-omnipotent
+.align 16
+avx2_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+	mov	`32*17`(%rax),%rax	# pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+ lea -56-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lin_prologue
+.size avx2_handler,.-avx2_handler
+___
+$code.=<<___;
+.section .pdata
+.align 4
+ .rva .LSEH_begin_sha1_multi_block
+ .rva .LSEH_end_sha1_multi_block
+ .rva .LSEH_info_sha1_multi_block
+ .rva .LSEH_begin_sha1_multi_block_shaext
+ .rva .LSEH_end_sha1_multi_block_shaext
+ .rva .LSEH_info_sha1_multi_block_shaext
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_sha1_multi_block_avx
+ .rva .LSEH_end_sha1_multi_block_avx
+ .rva .LSEH_info_sha1_multi_block_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_sha1_multi_block_avx2
+ .rva .LSEH_end_sha1_multi_block_avx2
+ .rva .LSEH_info_sha1_multi_block_avx2
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_sha1_multi_block:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbody,.Lepilogue # HandlerData[]
+.LSEH_info_sha1_multi_block_shaext:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_multi_block_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha1_multi_block_avx2:
+ .byte 9,0,0,0
+ .rva avx2_handler
+ .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
+___
+}
+####################################################################
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if ($dst>=8);
+ $rex|=0x01 if ($src>=8);
+ unshift @opcode,$rex|0x40 if ($rex);
+}
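+# E.g. (illustrative): rex(\@opcode,9,2) prepends 0x44 -- REX.R for the
+# high destination register -- while rex(\@opcode,3,2) prepends nothing.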
+
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ rex(\@opcode,$3,$2);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
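+# For instance (illustrative): "sha1rnds4 \$3,%xmm4,%xmm0" becomes
+# ".byte 15,58,204,196,3", i.e. opcode 0x0f,0x3a,0xcc, ModR/M
+# 0xc0|(4&7)|((0&7)<<3) = 0xc4, then the immediate; no REX byte is
+# needed since both registers are below %xmm8.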
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
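+# Likewise (illustrative): "sha1nexte %xmm5,%xmm0" is encoded as
+# ".byte 15,56,200,197", i.e. 0x0f,0x38,0xc8 with ModR/M 0xc5.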
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
+
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl
index 197bc6b50e95..340849389993 100755
--- a/crypto/sha/asm/sha1-mips.pl
+++ b/crypto/sha/asm/sha1-mips.pl
@@ -15,6 +15,10 @@
# compatible subroutine. There is room for minor optimization on
# little-endian platforms...
+# September 2012.
+#
+# Add MIPS32r2 code (>25% fewer instructions).
+
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
@@ -42,7 +46,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
@@ -95,6 +99,10 @@ sub BODY_00_14 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ wsbh @X[$i],@X[$i] # byte swap($i)
+ rotr @X[$i],@X[$i],16
+#else
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
@@ -104,8 +112,22 @@ $code.=<<___ if (!$big_endian);
or @X[$i],$t0
or $t1,$t2
or @X[$i],$t1
+#endif
___
$code.=<<___;
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ addu $e,$K # $i
+ xor $t0,$c,$d
+ rotr $t1,$a,27
+ lwl @X[$j],$j*4+$MSB($inp)
+ and $t0,$b
+ addu $e,$t1
+ lwr @X[$j],$j*4+$LSB($inp)
+ xor $t0,$d
+ addu $e,@X[$i]
+ rotr $b,$b,2
+ addu $e,$t0
+#else
lwl @X[$j],$j*4+$MSB($inp)
sll $t0,$a,5 # $i
addu $e,$K
@@ -121,6 +143,7 @@ $code.=<<___;
addu $e,@X[$i]
or $b,$t2
addu $e,$t0
+#endif
___
}
@@ -129,6 +152,10 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian && $i==15);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ wsbh @X[$i],@X[$i] # byte swap($i)
+ rotr @X[$i],@X[$i],16
+#else
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
@@ -138,8 +165,24 @@ $code.=<<___ if (!$big_endian && $i==15);
or @X[$i],$t0
or @X[$i],$t1
or @X[$i],$t2
+#endif
___
$code.=<<___;
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ addu $e,$K # $i
+ xor @X[$j%16],@X[($j+2)%16]
+ xor $t0,$c,$d
+ rotr $t1,$a,27
+ xor @X[$j%16],@X[($j+8)%16]
+ and $t0,$b
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ xor $t0,$d
+ addu $e,@X[$i%16]
+ rotr @X[$j%16],@X[$j%16],31
+ rotr $b,$b,2
+ addu $e,$t0
+#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
@@ -159,6 +202,7 @@ $code.=<<___;
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
+#endif
___
}
@@ -166,6 +210,20 @@ sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ xor @X[$j%16],@X[($j+2)%16]
+ addu $e,$K # $i
+ rotr $t1,$a,27
+ xor @X[$j%16],@X[($j+8)%16]
+ xor $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ xor $t0,$b
+ addu $e,@X[$i%16]
+ rotr @X[$j%16],@X[$j%16],31
+ rotr $b,$b,2
+ addu $e,$t0
+#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
@@ -184,8 +242,24 @@ $code.=<<___ if ($i<79);
or @X[$j%16],$t1
or $b,$t2
addu $e,$t0
+#endif
___
$code.=<<___ if ($i==79);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ lw @X[0],0($ctx)
+ addu $e,$K # $i
+ lw @X[1],4($ctx)
+ rotr $t1,$a,27
+ lw @X[2],8($ctx)
+ xor $t0,$c,$d
+ addu $e,$t1
+ lw @X[3],12($ctx)
+ xor $t0,$b
+ addu $e,@X[$i%16]
+ lw @X[4],16($ctx)
+ rotr $b,$b,2
+ addu $e,$t0
+#else
lw @X[0],0($ctx)
sll $t0,$a,5 # $i
addu $e,$K
@@ -203,6 +277,7 @@ $code.=<<___ if ($i==79);
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
+#endif
___
}
@@ -210,6 +285,22 @@ sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ addu $e,$K # $i
+ and $t0,$c,$d
+ xor @X[$j%16],@X[($j+2)%16]
+ rotr $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ xor $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ and $t0,$b
+ addu $e,@X[$i%16]
+ rotr @X[$j%16],@X[$j%16],31
+ rotr $b,$b,2
+ addu $e,$t0
+#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
@@ -230,6 +321,7 @@ $code.=<<___ if ($i<79);
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
+#endif
___
}
@@ -241,6 +333,10 @@ $code=<<___;
# include <openssl/fipssyms.h>
#endif
+#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
+#define _MIPS_ARCH_MIPS32R2
+#endif
+
.text
.set noat
diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl
index 2140dd2f8dd6..df5989610c4c 100755
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@@ -9,8 +9,7 @@
# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
-# with X vector in register bank. The module is big-endian [which is
-# not big deal as there're no little-endian targets left around].
+# with X vector in register bank.
#
# (*) this means that this module is inappropriate for PPC403? Does
# anybody know if pre-POWER3 can sustain unaligned load?
@@ -38,6 +37,10 @@ if ($flavour =~ /64/) {
$PUSH ="stw";
} else { die "nonsense $flavour"; }
+# Define endianness based on flavour,
+# e.g. a flavour such as linux64le selects little-endian.
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -68,14 +71,28 @@ $T ="r12";
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
+sub loadbe {
+my ($dst, $src, $temp_reg) = @_;
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ lwz $dst,$src
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $temp_reg,$src
+ rotlwi $dst,$temp_reg,8
+ rlwimi $dst,$temp_reg,24,0,7
+ rlwimi $dst,$temp_reg,24,16,23
+___
+}
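+# On little-endian targets the three instructions above perform the
+# classic 32-bit PowerPC byte swap; in C terms (illustrative only):
+#
+#	dst = (t<<8)|(t>>24);				/* rotlwi	    */
+#	dst = (dst&0x00ffffff)|((t<<24)&0xff000000);	/* rlwimi ,24,0,7   */
+#	dst = (dst&0xffff00ff)|((t>>8)&0x0000ff00);	/* rlwimi ,24,16,23 */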
+
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
-$code.=<<___ if ($i==0);
- lwz @X[$i],`$i*4`($inp)
-___
+
+ # Since the last value of $f is discarded, we can use
+ # it as a temp reg to swap byte-order when needed.
+ loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
+ loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
$code.=<<___ if ($i<15);
- lwz @X[$j],`$j*4`($inp)
add $f,$K,$e
rotlwi $e,$a,5
add $f,$f,@X[$i]
@@ -108,31 +125,31 @@ my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
add $f,$K,$e
+ xor $t0,$b,$d
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
- xor $t0,$b,$c
+ xor $t0,$t0,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
- add $f,$f,$e
+ add $f,$f,$t0
rotlwi $b,$b,30
- xor $t0,$t0,$d
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
- add $f,$f,$t0
+ add $f,$f,$e
rotlwi @X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
add $f,$K,$e
+ xor $t0,$b,$d
rotlwi $e,$a,5
lwz r16,0($ctx)
add $f,$f,@X[$i%16]
- xor $t0,$b,$c
+ xor $t0,$t0,$c
lwz r17,4($ctx)
- add $f,$f,$e
+ add $f,$f,$t0
rotlwi $b,$b,30
lwz r18,8($ctx)
- xor $t0,$t0,$d
lwz r19,12($ctx)
- add $f,$f,$t0
+ add $f,$f,$e
lwz r20,16($ctx)
___
}
@@ -316,6 +333,7 @@ $code.=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size .sha1_block_data_order,.-.sha1_block_data_order
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
diff --git a/crypto/sha/asm/sha1-sparcv9.pl b/crypto/sha/asm/sha1-sparcv9.pl
index 5c161cecd696..b5efcde5c139 100755
--- a/crypto/sha/asm/sha1-sparcv9.pl
+++ b/crypto/sha/asm/sha1-sparcv9.pl
@@ -5,6 +5,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPU: +8%
@@ -18,10 +20,10 @@
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# amount of active threads exceeds the number of physical cores.
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=112; }
+# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
+# faster than software. Multi-process benchmark saturates at 11x
+# single-process result on 8-core processor, or ~9GBps per 2.85GHz
+# socket.
$output=shift;
open STDOUT,">$output";
@@ -178,17 +180,102 @@ $code.=<<___;
___
}
-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
-___
-$code.=<<___;
+#endif
+
.section ".text",#alloc,#execinstr
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
.align 32
.globl sha1_block_data_order
sha1_block_data_order:
- save %sp,-$frame,%sp
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
+
+ andcc %g1, CFR_SHA1, %g0
+ be .Lsoftware
+ nop
+
+ ld [%o0 + 0x00], %f0 ! load context
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x0c], %f3
+ bne,pn %icc, .Lhwunaligned
+ ld [%o0 + 0x10], %f4
+
+.Lhw_loop:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x38], %f22
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ .word 0x81b02820 ! SHA1
+
+ bne,pt SIZE_T_CC, .Lhw_loop
+ nop
+
+.Lhwfinish:
+ st %f0, [%o0 + 0x00] ! store context
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ retl
+ st %f4, [%o0 + 0x10]
+
+.align 8
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x40], %f26
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ .word 0x81b02820 ! SHA1
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f26, %f26, %f10 ! %f10=%f26
+
+ ba .Lhwfinish
+ nop
+
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME,%sp
sllx $len,6,$len
add $inp,$len,$len
@@ -268,7 +355,7 @@ $code.=<<___;
add $E,@X[4],$E
st $E,[$ctx+16]
- bne `$bits==64?"%xcc":"%icc"`,.Lloop
+ bne SIZE_T_CC,.Lloop
andn $inp,7,$tmp0
ret
@@ -279,6 +366,62 @@ $code.=<<___;
.align 4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+# The purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to preserve the option of producing a "universal" binary and
+# let the programmer detect at run-time whether the current CPU is VIS
+# capable.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my $ref,$opf;
+my %visopf = ( "faligndata" => 0x048,
+ "for" => 0x07c );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+sub unalignaddr {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+ foreach ($rs1,$rs2,$rd) {
+ if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
+ else { return $ref; }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00300|$rd<<25|$rs1<<14|$rs2,
+ $ref;
+}
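
For illustration, here is roughly what the unvis helper above produces for an aligned-register case (register numbers chosen arbitrarily), computed from its opf table and the encoding formula:

    # unvis("faligndata","%f10","%f12","%f8"):
    #   0x81b00000 | 8<<25 | 10<<14 | 0x048<<5 | 12  ==  0x91b2890c
    # emitted line: .word 0x91b2890c !faligndata %f10,%f12,%f8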
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /ge;
+ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unalignaddr($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
close STDOUT;
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index f15c7ec39b20..9bb6b498190f 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -49,17 +49,37 @@
#
# Add AVX code path. See sha1-586.pl for further information.
+# May 2013.
+#
+# Add AVX2+BMI code path. The initial attempt (utilizing BMI instructions
+# and loading a pair of consecutive blocks into 256-bit %ymm registers)
+# did not provide an impressive performance improvement until a crucial
+# hint regarding the number of Xupdate iterations to pre-compute in
+# advance was provided by Ilya Albrekht of Intel Corp.
+
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
-# x86_64 SSSE3 AVX
-# P4 9.8 -
-# Opteron 6.6 -
-# Core2 6.7 6.1/+10% -
-# Atom 11.0 9.7/+13% -
-# Westmere 7.1 5.6/+27% -
-# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
+# x86_64 SSSE3 AVX[2]
+# P4 9.05 -
+# Opteron 6.26 -
+# Core2 6.55 6.05/+8% -
+# Westmere 6.73 5.30/+27% -
+# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
+# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
+# Haswell 5.45 4.15/+31% 3.57/+53%
+# Bulldozer 9.11 5.95/+53%
+# VIA Nano 9.32 7.15/+30%
+# Atom 10.3 9.17/+12%
+# Silvermont 13.1(*) 9.37/+40%
+#
+# (*) obviously suboptimal result, nothing was done about it,
+# because SSSE3 code is compiled unconditionally;
$flavour = shift;
$output = shift;
@@ -72,15 +92,27 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
- $1>=2.19);
-$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
- $1>=2.09);
-$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
- $1>=10);
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([2-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
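
For orientation, the graded $avx value computed above gates the code paths emitted below (a reading of the version checks, not part of the patch):

    # $avx == 0: x86_64 and SSSE3 paths only
    # $avx == 1: adds the AVX path      (gas >= 2.19, nasm >= 2.09, ml64 >= 10)
    # $avx == 2: adds the AVX2+BMI path (gas >= 2.22, nasm >= 2.10, ml64 >= 11)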
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -97,7 +129,7 @@ $num="%r10";
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
-@xi=("%edx","%ebp");
+@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
@@ -112,42 +144,40 @@ my $j=$i+1;
$code.=<<___ if ($i==0);
mov `4*$i`($inp),$xi[0]
bswap $xi[0]
- mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
- mov $c,$t0
mov `4*$j`($inp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*$i`(%rsp)
mov $a,$t2
- xor $d,$t0
bswap $xi[1]
+ xor $c,$t0
rol \$5,$t2
- lea 0x5a827999($xi[0],$e),$e
and $b,$t0
- mov $xi[1],`4*$j`(%rsp)
+ lea 0x5a827999($xi[0],$e),$e
add $t2,$e
xor $d,$t0
rol \$30,$b
add $t0,$e
___
$code.=<<___ if ($i>=15);
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ xor $c,$t0
rol \$5,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
lea 0x5a827999($xi[0],$e),$e
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ rol \$30,$b
xor $d,$t0
- rol \$1,$xi[1]
add $t2,$e
- rol \$30,$b
- mov $xi[1],`4*($j%16)`(%rsp)
+ rol \$1,$xi[1]
add $t0,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
sub BODY_20_39 {
@@ -155,62 +185,58 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $b,$t0
+ `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $b,$t0
+ xor $d,$t0
rol \$5,$t2
- lea $K($xi[0],$e),$e
xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ lea $K($xi[0],$e),$e
+ xor $c,$t0
add $t2,$e
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
add $t0,$e
rol \$1,$xi[1]
___
-$code.=<<___ if ($i<76);
- mov $xi[1],`4*($j%16)`(%rsp)
-___
$code.=<<___ if ($i==79);
- mov $c,$t0
+ mov $b,$t0
mov $a,$t2
- xor $b,$t0
+ xor $d,$t0
lea $K($xi[0],$e),$e
rol \$5,$t2
- xor $d,$t0
+ xor $c,$t0
add $t2,$e
rol \$30,$b
add $t0,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
- mov $c,$t1
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
+ mov $d,$t1
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- and $d,$t0
+ and $c,$t0
mov $a,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t1
lea 0x8f1bbcdc($xi[0],$e),$e
+ xor $c,$t1
rol \$5,$t2
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
add $t0,$e
- and $b,$t1
rol \$1,$xi[1]
- add $t1,$e
- rol \$30,$b
- mov $xi[1],`4*($j%16)`(%rsp)
+ and $b,$t1
add $t2,$e
+ rol \$30,$b
+ add $t1,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
$code.=<<___;
@@ -223,9 +249,19 @@ $code.=<<___;
sha1_block_data_order:
mov OPENSSL_ia32cap_P+0(%rip),%r9d
mov OPENSSL_ia32cap_P+4(%rip),%r8d
+ mov OPENSSL_ia32cap_P+8(%rip),%r10d
test \$`1<<9`,%r8d # check SSSE3 bit
jz .Lialu
___
+$code.=<<___ if ($shaext);
+ test \$`1<<29`,%r10d # check SHA bit
+ jnz _shaext_shortcut
+___
+$code.=<<___ if ($avx>1);
+ and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
+ cmp \$`1<<3|1<<5|1<<8`,%r10d
+ je _avx2_shortcut
+___
$code.=<<___ if ($avx);
and \$`1<<28`,%r8d # mask AVX bit
and \$`1<<30`,%r9d # mask "Intel CPU" bit
@@ -238,17 +274,18 @@ $code.=<<___;
.align 16
.Lialu:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
- mov %rsp,%r11
+ push %r14
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
- mov %r11,`16*4`(%rsp)
+ mov %rax,`16*4`(%rsp)
.Lprologue:
mov 0($ctx),$A
@@ -282,53 +319,187 @@ $code.=<<___;
jnz .Lloop
mov `16*4`(%rsp),%rsi
- mov (%rsi),%r13
- mov 8(%rsi),%r12
- mov 16(%rsi),%rbp
- mov 24(%rsi),%rbx
- lea 32(%rsi),%rsp
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue:
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
+if ($shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
+my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
+my @MSG=map("%xmm$_",(4..7));
+
+$code.=<<___;
+.type sha1_block_data_order_shaext,\@function,3
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+___
+$code.=<<___ if ($win64);
+ lea `-8-4*16`(%rsp),%rsp
+ movaps %xmm6,-8-4*16(%rax)
+ movaps %xmm7,-8-3*16(%rax)
+ movaps %xmm8,-8-2*16(%rax)
+ movaps %xmm9,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
+
+ movdqu ($inp),@MSG[0]
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ movdqu 0x10($inp),@MSG[1]
+ pshufd \$0b00011011,$E,$E # flip word order
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[1]
+ pshufb $BSWAP,@MSG[2]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[3]
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ dec $num
+ lea 0x40($inp),%rax # next input block
+ paddd @MSG[0],$E
+ cmovne %rax,$inp
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+for($i=0;$i<20-4;$i+=2) {
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
+ sha1nexte @MSG[1],$E_
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+ sha1msg2 @MSG[3],@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
+ sha1nexte @MSG[2],$E
+ pxor @MSG[3],@MSG[1]
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
+}
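
A note on the sha1rnds4 immediates: the instruction retires four rounds, and its immediate selects the 20-round group, i.e. imm = floor(round/20). Block k covers rounds 4k through 4k+3, so imm = floor(4k/20) = floor(k/5), which is what int($i/5) and int(($i+1)/5) compute above; eight passes of the loop cover rounds 0-63, and the four imm-3 blocks that follow handle rounds 64-79.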
+$code.=<<___;
+ movdqu ($inp),@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[1],$E_
+ movdqu 0x10($inp),@MSG[1]
+ pshufb $BSWAP,@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[2],$E
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[1]
+
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[3],$E_
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[2]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $E_SAVE,$E
+ pshufb $BSWAP,@MSG[3]
+
+ paddd $ABCD_SAVE,$ABCD
+ movdqa $E,$E_SAVE # offload $E
+
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-4*16(%rax),%xmm6
+ movaps -8-3*16(%rax),%xmm7
+ movaps -8-2*16(%rax),%xmm8
+ movaps -8-1*16(%rax),%xmm9
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+___
+}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
+my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
my $j=0;
+my $rx=0;
my $K_XX_XX="%r11";
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
+{ my $sn;
+sub align32() {
+ ++$sn;
+$code.=<<___;
+ jmp .Lalign32_$sn # see "Decoded ICache" in manual
+.align 32
+.Lalign32_$sn:
+___
+}
+}
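
Each align32() call emits a jump over the alignment padding, so the padding bytes are never executed; the generated text looks like this (label number illustrative):

    jmp .Lalign32_1
    .align 32
    .Lalign32_1: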
+
$code.=<<___;
.type sha1_block_data_order_ssse3,\@function,3
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
- lea `-64-($win64?5*16:0)`(%rsp),%rsp
+ push %r13 # redundant, done to share Win64 SE handler
+ push %r14
+ lea `-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,64+0(%rsp)
- movaps %xmm7,64+16(%rsp)
- movaps %xmm8,64+32(%rsp)
- movaps %xmm9,64+48(%rsp)
- movaps %xmm10,64+64(%rsp)
+ movaps %xmm6,-40-6*16(%rax)
+ movaps %xmm7,-40-5*16(%rax)
+ movaps %xmm8,-40-4*16(%rax)
+ movaps %xmm9,-40-3*16(%rax)
+ movaps %xmm10,-40-2*16(%rax)
+ movaps %xmm11,-40-1*16(%rax)
.Lprologue_ssse3:
___
$code.=<<___;
+ mov %rax,%r14 # original %rsp
+ and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
shl \$6,$num
add $inp,$num
- lea K_XX_XX(%rip),$K_XX_XX
+ lea K_XX_XX+64(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
@@ -336,19 +507,22 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
movdqa 64($K_XX_XX),@X[2] # pbswap mask
- movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqa -64($K_XX_XX),@Tx[1] # K_00_19
movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
movdqu 16($inp),@X[-3&7]
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @X[2],@X[-4&7] # byte swap
- add \$64,$inp
pshufb @X[2],@X[-3&7]
pshufb @X[2],@X[-2&7]
- pshufb @X[2],@X[-1&7]
+ add \$64,$inp
paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @X[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@@ -373,61 +547,61 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[0],@X[-3&7]);
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
+
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
-
+ eval(shift(@insns)); # ror
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
- &movdqa (@Tx[0],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
- &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -435,12 +609,13 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
- eval(shift(@insns));
- &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@@ -451,27 +626,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
- eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if ($Xi==8);
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
- eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ eval(shift(@insns));
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
- &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
}
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@@ -479,29 +657,31 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &psrld (@Tx[0],30);
eval(shift(@insns));
- eval(shift(@insns)); # rol
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -522,10 +702,11 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@@ -538,7 +719,7 @@ sub Xuplast_ssse3_80()
unshift(@Tx,pop(@Tx));
&movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
- &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19
&movdqu (@X[-4&7],"0($inp)"); # load input
&movdqu (@X[-3&7],"16($inp)");
&movdqu (@X[-2&7],"32($inp)");
@@ -557,9 +738,12 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
@@ -568,6 +752,8 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }
@@ -583,51 +769,66 @@ sub Xtail_ssse3()
foreach (@insns) { eval; }
}
-sub body_00_19 () {
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
- '&xor ($c,$d);',
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&xor ($c,$d);', # restore $c
- '&xor (@T[0],$d);',
- '&add ($e,$a);',
- '&$_ror ($b,$j?7:2);', # $b>>>2
- '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&$_ror ($b,$j?7:2)', # $b>>>2
+ '&xor (@T[0],$d)',
+ '&mov (@T[1],$a)', # $b for next round
+
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&xor ($b,$c)', # $c^$d for next round
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&and (@T[1],$b)', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c)', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_20_39 () {
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
- '&xor (@T[0],$d);', # ($b^$d)
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&xor (@T[0],$c);', # ($b^$d^$c)
- '&add ($e,$a);',
- '&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c)
+ '&mov (@T[1],$a)', # $b for next round
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round
+
+ '&$_ror ($b,7)', # $b>>>2
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_40_59 () {
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&mov (@T[1],$c);',
- '&xor ($c,$d);',
- '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
- '&and (@T[1],$d);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[1]);',
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&add ($e,@T[0]);',
- '&xor ($c,$d);', # restore $c
- '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40)', # restore $c
+
+ '&$_ror ($b,7)', # $b>>>2
+ '&mov (@T[1],$a)', # $b for next round
+ '&xor (@T[0],$c)',
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59)', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59)', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
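
The rewritten bodies lean on two boolean identities: Ch(b,c,d) = ((c^d)&b)^d and Maj(b,c,d) = ((b^c)&(c^d))^c, which is what allows each round to pre-compute part of the next round's F value. A minimal standalone Perl check (illustrative, not part of the module) that these forms match the canonical SHA-1 functions:

    for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
        my $ch  = ($b & $c) | (~$b & 1 & $d);          # canonical Ch(b,c,d)
        my $maj = ($b & $c) | ($b & $d) | ($c & $d);   # canonical Maj(b,c,d)
        die "Ch mismatch"  unless ((($c ^ $d) & $b) ^ $d) == $ch;
        die "Maj mismatch" unless ((($b ^ $c) & ($c ^ $d)) ^ $c) == $maj;
    }}}
    print "identities hold\n";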
$code.=<<___;
@@ -668,8 +869,11 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
+ mov $C,@T[1]
mov $D,12($ctx)
+ xor $D,@T[1]
mov $E,16($ctx)
+ and @T[1],@T[0]
jmp .Loop_ssse3
.align 16
@@ -694,31 +898,34 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps 64+0(%rsp),%xmm6
- movaps 64+16(%rsp),%xmm7
- movaps 64+32(%rsp),%xmm8
- movaps 64+48(%rsp),%xmm9
- movaps 64+64(%rsp),%xmm10
+ movaps -40-6*16(%r14),%xmm6
+ movaps -40-5*16(%r14),%xmm7
+ movaps -40-4*16(%r14),%xmm8
+ movaps -40-3*16(%r14),%xmm9
+ movaps -40-2*16(%r14),%xmm10
+ movaps -40-1*16(%r14),%xmm11
___
$code.=<<___;
- lea `64+($win64?5*16:0)`(%rsp),%rsi
- mov 0(%rsi),%r12
- mov 8(%rsi),%rbp
- mov 16(%rsi),%rbx
- lea 24(%rsi),%rsp
+ lea (%r14),%rsi
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_ssse3:
ret
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
if ($avx) {
-my $Xi=4;
-my @X=map("%xmm$_",(4..7,0..3));
-my @Tx=map("%xmm$_",(8..10));
-my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
-my @T=("%esi","%edi");
-my $j=0;
-my $K_XX_XX="%r11";
+$Xi=4; # reset variables
+@X=map("%xmm$_",(4..7,0..3));
+@Tx=map("%xmm$_",(8..10));
+$j=0;
+$rx=0;
+
+my $done_avx_label=".Ldone_avx";
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
@@ -728,28 +935,34 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
- lea `-64-($win64?5*16:0)`(%rsp),%rsp
+ push %r13 # redundant, done to share Win64 SE handler
+ push %r14
+ lea `-64-($win64?6*16:0)`(%rsp),%rsp
+ vzeroupper
___
$code.=<<___ if ($win64);
- movaps %xmm6,64+0(%rsp)
- movaps %xmm7,64+16(%rsp)
- movaps %xmm8,64+32(%rsp)
- movaps %xmm9,64+48(%rsp)
- movaps %xmm10,64+64(%rsp)
+ vmovaps %xmm6,-40-6*16(%rax)
+ vmovaps %xmm7,-40-5*16(%rax)
+ vmovaps %xmm8,-40-4*16(%rax)
+ vmovaps %xmm9,-40-3*16(%rax)
+ vmovaps %xmm10,-40-2*16(%rax)
+ vmovaps %xmm11,-40-1*16(%rax)
.Lprologue_avx:
___
$code.=<<___;
+ mov %rax,%r14 # original %rsp
+ and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
- vzeroupper
shl \$6,$num
add $inp,$num
- lea K_XX_XX(%rip),$K_XX_XX
+ lea K_XX_XX+64(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
@@ -757,9 +970,12 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
- vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqa -64($K_XX_XX),$Kx # K_00_19
vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
vmovdqu 16($inp),@X[-3&7]
vmovdqu 32($inp),@X[-2&7]
@@ -769,9 +985,9 @@ $code.=<<___;
vpshufb @X[2],@X[-3&7],@X[-3&7]
vpshufb @X[2],@X[-2&7],@X[-2&7]
vpshufb @X[2],@X[-1&7],@X[-1&7]
- vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
- vpaddd @Tx[1],@X[-3&7],@X[1]
- vpaddd @Tx[1],@X[-2&7],@X[2]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
vmovdqa @X[1],16(%rsp)
vmovdqa @X[2],32(%rsp)
@@ -790,10 +1006,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
@@ -843,7 +1059,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
&vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
eval(shift(@insns));
eval(shift(@insns));
- &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
@@ -851,13 +1067,12 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
- push(@Tx,shift(@Tx));
}
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
@@ -870,12 +1085,8 @@ sub Xupdate_avx_32_79()
&vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
- if ($Xi%5) {
- &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
- } else { # ... or load next one
- &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
- }
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
eval(shift(@insns)); # ror
eval(shift(@insns));
@@ -905,7 +1116,6 @@ sub Xupdate_avx_32_79()
&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -916,7 +1126,6 @@ sub Xupdate_avx_32_79()
foreach (@insns) { eval; } # remaining instructions
$Xi++; push(@X,shift(@X)); # "rotate" X[]
- push(@Tx,shift(@Tx));
}
sub Xuplast_avx_80()
@@ -926,23 +1135,21 @@ sub Xuplast_avx_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
foreach (@insns) { eval; } # remaining instructions
&cmp ($inp,$num);
- &je (".Ldone_avx");
-
- unshift(@Tx,pop(@Tx));
+ &je ($done_avx_label);
&vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
- &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19
&vmovdqu(@X[-4&7],"0($inp)"); # load input
&vmovdqu(@X[-3&7],"16($inp)");
&vmovdqu(@X[-2&7],"32($inp)");
@@ -964,7 +1171,7 @@ sub Xloop_avx()
&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -1024,12 +1231,15 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
+ mov $C,@T[1]
mov $D,12($ctx)
+ xor $D,@T[1]
mov $E,16($ctx)
+ and @T[1],@T[0]
jmp .Loop_avx
.align 16
-.Ldone_avx:
+$done_avx_label:
___
$j=$saved_j; @V=@saved_V;
@@ -1052,31 +1262,520 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps 64+0(%rsp),%xmm6
- movaps 64+16(%rsp),%xmm7
- movaps 64+32(%rsp),%xmm8
- movaps 64+48(%rsp),%xmm9
- movaps 64+64(%rsp),%xmm10
+ movaps -40-6*16(%r14),%xmm6
+ movaps -40-5*16(%r14),%xmm7
+ movaps -40-4*16(%r14),%xmm8
+ movaps -40-3*16(%r14),%xmm9
+ movaps -40-2*16(%r14),%xmm10
+ movaps -40-1*16(%r14),%xmm11
___
$code.=<<___;
- lea `64+($win64?5*16:0)`(%rsp),%rsi
- mov 0(%rsi),%r12
- mov 8(%rsi),%rbp
- mov 16(%rsi),%rbx
- lea 24(%rsi),%rsp
+ lea (%r14),%rsi
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx:
ret
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
+
+if ($avx>1) {
+use integer;
+$Xi=4; # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+@Tx=map("%ymm$_",(8..10));
+$Kx="%ymm11";
+$j=0;
+
+my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
+my ($a5,$t0)=("%r12d","%edi");
+
+my ($A,$F,$B,$C,$D,$E)=@ROTX;
+my $rx=0;
+my $frame="%r13";
+
+$code.=<<___;
+.type sha1_block_data_order_avx2,\@function,3
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -6*16(%rsp),%rsp
+ vmovaps %xmm6,-40-6*16(%rax)
+ vmovaps %xmm7,-40-5*16(%rax)
+ vmovaps %xmm8,-40-4*16(%rax)
+ vmovaps %xmm9,-40-3*16(%rax)
+ vmovaps %xmm10,-40-2*16(%rax)
+ vmovaps %xmm11,-40-1*16(%rax)
+.Lprologue_avx2:
+___
+$code.=<<___;
+ mov %rax,%r14 # original %rsp
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ lea -640(%rsp),%rsp
+ shl \$6,$num
+ lea 64($inp),$frame
+ and \$-128,%rsp
+ add $inp,$num
+ lea K_XX_XX+64(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ cmp $num,$frame
+ cmovae $inp,$frame # next or same block
+ mov 4($ctx),$F
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov 16($ctx),$E
+ vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
+
+ vmovdqu ($inp),%xmm0
+ vmovdqu 16($inp),%xmm1
+ vmovdqu 32($inp),%xmm2
+ vmovdqu 48($inp),%xmm3
+ lea 64($inp),$inp
+ vinserti128 \$1,($frame),@X[-4&7],@X[-4&7]
+ vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7]
+ vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vmovdqu -64($K_XX_XX),$Kx # K_00_19
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqu @X[1],32(%rsp)
+ vpaddd $Kx,@X[-1&7],@X[3]
+ vmovdqu @X[2],64(%rsp)
+ vmovdqu @X[3],96(%rsp)
+___
+for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31
+ use integer;
+
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ &vpsrld (@Tx[0],@X[0],31);
+ &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+$code.=<<___;
+ lea 128(%rsp),$frame
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorx \$2,$F,$B
+ andn $D,$F,$t0
+ and $C,$F
+ xor $t0,$F
+___
+sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path
+ # at start $f=(b&c)^(~b&d), $b>>>=2
+ return &bodyx_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+ '&andn ($t0,$a,$c)', # ~b&d for next round
+
+ '&add ($e,$f)', # e+=(b&c)^(~b&d)
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2)', # b>>>2 for next round
+ '&and ($a,$b)', # b&c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
+
+sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path
+ # on entry $f=b^c^d, $b>>>=2
+ return &bodyx_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+
+ '&lea ($e,"($e,$f)")', # e+=b^c^d
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round
+ '&xor ($a,$b) if ($j<79)', # b^c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
+
+sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
+ # on entry $f=((b^c)&(c^d)), $b>>>=2
+ $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+ '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c
+ '&mov ($t0,$b) if ($j<59)', # count on zero latency
+ '&xor ($t0,$c) if ($j<59)', # c^d for next round
+
+ '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2)', # b>>>2 in next round
+ '&xor ($a,$b)', # b^c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round
+ '&xor ($a,$c) if ($j==59);'. # f=b^c^d for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
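
The BMI bodies above also rely on andn yielding ~b&d in a single instruction, and on (b&c) and (~b&d) being disjoint, so combining them with xor equals the canonical or in Ch. A one-off Perl check of the disjointness (illustrative, not part of the module):

    for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
        die unless (($b & $c) ^ ((~$b & 1) & $d)) == (($b & $c) | ((~$b & 1) & $d));
    }}}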
+
+sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++;
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx2_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ #&vpslld (@X[0],@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++;
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xloop_avx2()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+ &align32();
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+
+ &align32();
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+
+$code.=<<___;
+ lea 128($inp),$frame
+ lea 128($inp),%rdi # borrow $t0
+ cmp $num,$frame
+ cmovae $inp,$frame # next or previous block
+
+ # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+ add 0($ctx),@ROTX[0] # update context
+ add 4($ctx),@ROTX[1]
+ add 8($ctx),@ROTX[3]
+ mov @ROTX[0],0($ctx)
+ add 12($ctx),@ROTX[4]
+ mov @ROTX[1],4($ctx)
+ mov @ROTX[0],$A # A=d
+ add 16($ctx),@ROTX[5]
+ mov @ROTX[3],$a5
+ mov @ROTX[3],8($ctx)
+ mov @ROTX[4],$D # D=b
+ #xchg @ROTX[5],$F # F=c, C=f
+ mov @ROTX[4],12($ctx)
+ mov @ROTX[1],$F # F=e
+ mov @ROTX[5],16($ctx)
+ #mov $F,16($ctx)
+ mov @ROTX[5],$E # E=c
+ mov $a5,$C # C=f
+ #xchg $F,$E # E=c, F=e
+
+ cmp $num,$inp
+ je .Ldone_avx2
+___
+
+$Xi=4; # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+
+$code.=<<___;
+ vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
+ cmp $num,%rdi # borrowed $t0
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7]
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7]
+ vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
+ vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
+ vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ lea 128+16(%rsp),$frame
+ rorx \$2,$F,$B
+ andn $D,$F,$t0
+ and $C,$F
+ xor $t0,$F
+ sub \$-128,$inp
+___
+ $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E);
+
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19
+ &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vpshufb (@X[-3&7],@X[-3&7],@X[2]);
+ &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ("0(%rsp)",@Tx[0]);
+ &vpshufb (@X[-2&7],@X[-2&7],@X[2]);
+ &vpaddd (@Tx[1],@X[-3&7],$Kx);
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ("32(%rsp)",@Tx[1]);
+ &vpshufb (@X[-1&7],@X[-1&7],@X[2]);
+ &vpaddd (@X[2],@X[-2&7],$Kx);
+
+ &Xloop_avx2 (\&bodyx_40_59);
+ &align32 ();
+ &vmovdqu ("64(%rsp)",@X[2]);
+ &vpaddd (@X[3],@X[-1&7],$Kx);
+ &Xloop_avx2 (\&bodyx_40_59);
+ &vmovdqu ("96(%rsp)",@X[3]);
+ &Xloop_avx2 (\&bodyx_40_59);
+ &Xupdate_avx2_16_31(\&bodyx_40_59);
+
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xloop_avx2 (\&bodyx_20_39);
+
+$code.=<<___;
+ lea 128(%rsp),$frame
+
+ # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+ add 0($ctx),@ROTX[0] # update context
+ add 4($ctx),@ROTX[1]
+ add 8($ctx),@ROTX[3]
+ mov @ROTX[0],0($ctx)
+ add 12($ctx),@ROTX[4]
+ mov @ROTX[1],4($ctx)
+ mov @ROTX[0],$A # A=d
+ add 16($ctx),@ROTX[5]
+ mov @ROTX[3],$a5
+ mov @ROTX[3],8($ctx)
+ mov @ROTX[4],$D # D=b
+ #xchg @ROTX[5],$F # F=c, C=f
+ mov @ROTX[4],12($ctx)
+ mov @ROTX[1],$F # F=e
+ mov @ROTX[5],16($ctx)
+ #mov $F,16($ctx)
+ mov @ROTX[5],$E # E=c
+ mov $a5,$C # C=f
+ #xchg $F,$E # E=c, F=e
+
+ cmp $num,$inp
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -40-6*16(%r14),%xmm6
+ movaps -40-5*16(%r14),%xmm7
+ movaps -40-4*16(%r14),%xmm8
+ movaps -40-3*16(%r14),%xmm9
+ movaps -40-2*16(%r14),%xmm10
+ movaps -40-1*16(%r14),%xmm11
+___
+$code.=<<___;
+ lea (%r14),%rsi
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lepilogue_avx2:
+ ret
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+___
+}
}
$code.=<<___;
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
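
Each .long row above is 16 bytes, so doubling every row lets the AVX2 path load a 32-byte row with a single vmovdqu, broadcasting the same constant to both 128-bit lanes. The resulting offsets: K_00_19 at 0, K_20_39 at 32, K_40_59 at 64, K_60_79 at 96, the dword pbswap mask at 128, and the trailing .byte mask at 160, matching the K_XX_XX+0xa0 load in the SHAEXT path and the 2*16*($Xi/5)-64 displacements taken relative to K_XX_XX+64.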
}}}
$code.=<<___;
@@ -1122,20 +1821,58 @@ se_handler:
jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
- lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
+ mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
+___
+
+$code.=<<___ if ($shaext);
+.type shaext_handler,\@abi-omnipotent
+.align 16
+shaext_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ lea .Lepilogue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ lea -8-4*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ jmp .Lcommon_seh_tail
+.size shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
@@ -1168,18 +1905,23 @@ ssse3_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 64(%rax),%rsi
+ mov 232($context),%rax # pull context->R14
+
+ lea -40-6*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
- mov \$10,%ecx
+ mov \$12,%ecx
.long 0xa548f3fc # cld; rep movsq
- lea `24+64+5*16`(%rax),%rax # adjust stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -1226,6 +1968,13 @@ ssse3_handler:
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_sha1_block_data_order_shaext
+ .rva .LSEH_end_sha1_block_data_order_shaext
+ .rva .LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
@@ -1235,12 +1984,24 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_sha1_block_data_order_avx
.rva .LSEH_info_sha1_block_data_order_avx
___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_sha1_block_data_order_avx2
+ .rva .LSEH_end_sha1_block_data_order_avx2
+ .rva .LSEH_info_sha1_block_data_order_avx2
+___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_sha1_block_data_order_shaext:
+ .byte 9,0,0,0
+ .rva shaext_handler
+___
+$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
@@ -1252,10 +2013,55 @@ $code.=<<___ if ($avx);
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha1_block_data_order_avx2:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
+___
}
####################################################################
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ my $rex=0;
+ $rex|=0x04 if ($2>=8);
+ $rex|=0x01 if ($1>=8);
+ unshift @opcode,0x40|$rex if ($rex);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
+
+ print $_,"\n";
+}
close STDOUT;
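
A worked example of the hand-encoders above, which let the module assemble on toolchains that predate the SHA extensions (operands chosen arbitrarily):

    # sha1rnds4('$3,%xmm1,%xmm2')         -> .byte 0x0f,0x3a,0xcc,0xd1,3
    #   ModR/M = 0xc0 | (1&7) | ((2&7)<<3) = 0xd1, immediate 3
    # sha1op38('sha1nexte','%xmm4,%xmm0') -> .byte 0x0f,0x38,0xc8,0xc4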
diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl
index 928ec53123bf..6462e45ba75b 100755
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -9,20 +9,55 @@
#
# SHA256 block transform for x86. September 2007.
#
-# Performance in clock cycles per processed byte (less is better):
+# Performance improvement over compiler-generated code varies from
+# 10% to 40% [see below]. Not very impressive on some µ-archs, but
+# it's 5 times smaller and optimizes the number of writes.
#
-# Pentium PIII P4 AMD K8 Core2
-# gcc 46 36 41 27 26
-# icc 57 33 38 25 23
-# x86 asm 40 30 33 20 18
-# x86_64 asm(*) - - 21 16 16
+# May 2012.
#
-# (*) x86_64 assembler performance is presented for reference
-# purposes.
+# Optimization including two of Pavel Semjanov's ideas, alternative
+# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
+# ~7% on Pentium, ~40% on Atom. As the fully unrolled loop body is
+# almost 15x larger (8KB vs. 560B), it's used only for longer inputs.
+# But not on P4, where it kills performance, nor on Sandy Bridge,
+# where the folded loop is approximately as fast...
#
-# Performance improvement over compiler generated code varies from
-# 10% to 40% [see above]. Not very impressive on some µ-archs, but
-# it's 5 times smaller and optimizies amount of writes.
+# June 2012.
+#
+# Add AMD XOP-specific code path, >30% improvement on Bulldozer over
+# May version, >60% over original. Add AVX+shrd code path, >25%
+# improvement on Sandy Bridge over May version, 60% over original.
+#
+# May 2013.
+#
+# Replace AMD XOP code path with SSSE3 to cover more processors.
+# (Biggest improvement coefficient is on upcoming Atom Silvermont,
+# not shown.) Add AVX+BMI code path.
+#
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+#
+# Performance in clock cycles per processed byte (less is better):
+#
+# gcc icc x86 asm(*) SIMD x86_64 asm(**)
+# Pentium 46 57 40/38 - -
+# PIII 36 33 27/24 - -
+# P4 41 38 28 - 17.3
+# AMD K8 27 25 19/15.5 - 14.9
+# Core2 26 23 18/15.6 14.3 13.8
+# Westmere 27 - 19/15.7 13.4 12.3
+# Sandy Bridge 25 - 15.9 12.4 11.6
+# Ivy Bridge 24 - 15.0 11.4 10.3
+# Haswell 22 - 13.9 9.46 7.80
+# Bulldozer 36 - 27/22 17.0 13.6
+# VIA Nano 36 - 25/22 16.8 16.5
+# Atom 50 - 30/25 21.9 18.9
+# Silvermont 40 - 34/31 22.9 20.6
+#
+# (*) numbers after slash are for unrolled loop, where applicable;
+# (**) x86_64 assembly performance is presented for reference
+# purposes, results are best-available;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
@@ -30,72 +65,122 @@ require "x86asm.pl";
&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
+$xmm=$avx=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if ($xmm && !$avx && $ARGV[0] eq "win32n" &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.03) + ($1>=2.10);
+}
+
+if ($xmm && !$avx && $ARGV[0] eq "win32" &&
+ `ml 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=$xmm; ### set to zero if compiling for 1.0.1
+
+$unroll_after = 64*4; # If pre-evicted from L1P cache, the first spin of
+ # the fully unrolled loop was measured to run about
+ # 3-4x slower. If the slowdown coefficient is N and
+ # the unrolled loop is m times faster, you break
+ # even at (N-1)/(m-1) blocks. That then needs to be
+ # adjusted for the probability of the code being
+ # evicted, code size/cache size = 1/4. Typical m
+ # is 1.15...
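
Plugging in the numbers from the comment: with N ~= 3.5 and m ~= 1.15, break-even is (3.5-1)/(1.15-1) ~= 17 blocks; weighted by the ~1/4 eviction probability that drops to roughly 4 blocks, i.e. 64*4 bytes, the value chosen for $unroll_after.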
+
$A="eax";
$E="edx";
$T="ebx";
-$Aoff=&DWP(0,"esp");
-$Boff=&DWP(4,"esp");
-$Coff=&DWP(8,"esp");
-$Doff=&DWP(12,"esp");
-$Eoff=&DWP(16,"esp");
-$Foff=&DWP(20,"esp");
-$Goff=&DWP(24,"esp");
-$Hoff=&DWP(28,"esp");
-$Xoff=&DWP(32,"esp");
+$Aoff=&DWP(4,"esp");
+$Boff=&DWP(8,"esp");
+$Coff=&DWP(12,"esp");
+$Doff=&DWP(16,"esp");
+$Eoff=&DWP(20,"esp");
+$Foff=&DWP(24,"esp");
+$Goff=&DWP(28,"esp");
+$Hoff=&DWP(32,"esp");
+$Xoff=&DWP(36,"esp");
$K256="ebp";
+sub BODY_16_63() {
+ &mov ($T,"ecx"); # "ecx" is preloaded
+ &mov ("esi",&DWP(4*(9+15+16-14),"esp"));
+ &ror ("ecx",18-7);
+ &mov ("edi","esi");
+ &ror ("esi",19-17);
+ &xor ("ecx",$T);
+ &shr ($T,3);
+ &ror ("ecx",7);
+ &xor ("esi","edi");
+ &xor ($T,"ecx"); # T = sigma0(X[-15])
+ &ror ("esi",17);
+ &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16]
+ &shr ("edi",10);
+ &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7]
+ #&xor ("edi","esi") # sigma1(X[-2])
+ # &add ($T,"edi"); # T += sigma1(X[-2])
+ # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
+
+ &BODY_00_15(1);
+}
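
For reference, the recurrence BODY_16_63 implements is W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] (mod 2^32); the staggered ror amounts in the code (18-7 then 7, and 19-17 then 17) apply the two rotations of each sigma incrementally in a single register. A minimal standalone Perl sketch of the sigmas (assumes a 64-bit perl; illustrative, not part of the module):

    sub sigma0 { my $x = shift;    # ROTR^7 ^ ROTR^18 ^ SHR^3
        (($x>>7 | $x<<25) ^ ($x>>18 | $x<<14) ^ ($x>>3)) & 0xffffffff;
    }
    sub sigma1 { my $x = shift;    # ROTR^17 ^ ROTR^19 ^ SHR^10
        (($x>>17 | $x<<15) ^ ($x>>19 | $x<<13) ^ ($x>>10)) & 0xffffffff;
    }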
sub BODY_00_15() {
my $in_16_63=shift;
&mov ("ecx",$E);
- &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
- &ror ("ecx",25-11);
+ &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2])
&mov ("esi",$Foff);
+ &ror ("ecx",25-11);
+ &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
+ &mov ("edi",$Goff);
&xor ("ecx",$E);
+ &xor ("esi","edi");
+ &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63);
+ &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0]
&ror ("ecx",11-6);
- &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
- &xor ("ecx",$E);
- &ror ("ecx",6); # Sigma1(e)
- &mov ("edi",$Goff);
- &add ($T,"ecx"); # T += Sigma1(e)
-
- &xor ("esi","edi");
- &mov ($Eoff,$E); # modulo-scheduled
+ &and ("esi",$E);
+ &mov ($Eoff,$E); # modulo-scheduled
+ &xor ($E,"ecx");
+ &add ($T,$Hoff); # T += h
+ &xor ("esi","edi"); # Ch(e,f,g)
+ &ror ($E,6); # Sigma1(e)
&mov ("ecx",$A);
- &and ("esi",$E);
- &mov ($E,$Doff); # e becomes d, which is e in next iteration
- &xor ("esi","edi"); # Ch(e,f,g)
- &mov ("edi",$A);
- &add ($T,"esi"); # T += Ch(e,f,g)
+ &add ($T,"esi"); # T += Ch(e,f,g)
&ror ("ecx",22-13);
- &add ($T,$Hoff); # T += h
+ &add ($T,$E); # T += Sigma1(e)
+ &mov ("edi",$Boff);
&xor ("ecx",$A);
+ &mov ($Aoff,$A); # modulo-scheduled
+ &lea ("esp",&DWP(-4,"esp"));
&ror ("ecx",13-2);
- &mov ("esi",$Boff);
- &xor ("ecx",$A);
- &ror ("ecx",2); # Sigma0(a)
- &add ($E,$T); # d += T
- &mov ("edi",$Coff);
-
- &add ($T,"ecx"); # T += Sigma0(a)
- &mov ($Aoff,$A); # modulo-scheduled
-
- &mov ("ecx",$A);
- &sub ("esp",4);
- &or ($A,"esi"); # a becomes h, which is a in next iteration
- &and ("ecx","esi");
- &and ($A,"edi");
&mov ("esi",&DWP(0,$K256));
- &or ($A,"ecx"); # h=Maj(a,b,c)
+ &xor ("ecx",$A);
+ &mov ($E,$Eoff); # e in next iteration, d in this one
+ &xor ($A,"edi"); # a ^= b
+ &ror ("ecx",2); # Sigma0(a)
+ &add ($T,"esi"); # T+= K[i]
+ &mov (&DWP(0,"esp"),$A); # (b^c) in next round
+ &add ($E,$T); # d += T
+ &and ($A,&DWP(4,"esp")); # a &= (b^c)
+ &add ($T,"ecx"); # T += Sigma0(a)
+ &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b)
+ &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T
&add ($K256,4);
- &add ($A,$T); # h += T
- &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
- &add ($E,"esi"); # d += K256[i]
- &add ($A,"esi"); # h += K256[i]
+ &add ($A,$T); # h += T
}
+&external_label("OPENSSL_ia32cap_P") if (!$i386);
+
&function_begin("sha256_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
@@ -116,26 +201,59 @@ sub BODY_00_15() {
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
+ if (!$i386 && $xmm) {
+ &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
+ &mov ("ecx",&DWP(0,"edx"));
+ &mov ("ebx",&DWP(4,"edx"));
+ &test ("ecx",1<<20); # check for P4
+ &jnz (&label("loop"));
+ &mov ("edx",&DWP(8,"edx")) if ($xmm);
+ &test ("ecx",1<<24); # check for FXSR
+ &jz ($unroll_after?&label("no_xmm"):&label("loop"));
+ &and ("ecx",1<<30); # mask "Intel CPU" bit
+ &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits
+ &test ("edx",1<<29) if ($shaext); # check for SHA
+ &jnz (&label("shaext")) if ($shaext);
+ &or ("ecx","ebx");
+ &and ("ecx",1<<28|1<<30);
+ &cmp ("ecx",1<<28|1<<30);
+ if ($xmm) {
+ &je (&label("AVX")) if ($avx);
+ &test ("ebx",1<<9); # check for SSSE3
+ &jnz (&label("SSSE3"));
+ } else {
+ &je (&label("loop_shrd"));
+ }
+ if ($unroll_after) {
+&set_label("no_xmm");
+ &sub ("eax","edi");
+ &cmp ("eax",$unroll_after);
+ &jae (&label("unrolled"));
+ } }
+ &jmp (&label("loop"));
+
+sub COMPACT_LOOP() {
+my $suffix=shift;
-&set_label("loop",16);
+&set_label("loop$suffix",$suffix?32:16);
# copy input block to stack reversing byte and dword order
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
- &mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
+ &mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("ebx");
- &bswap ("ecx");
- &bswap ("edx");
&push ("eax");
+ &bswap ("ecx");
&push ("ebx");
+ &bswap ("edx");
&push ("ecx");
&push ("edx");
}
&add ("edi",64);
- &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
- &mov (&DWP(4*(8+16)+4,"esp"),"edi");
+ &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H
+ &mov (&DWP(4*(9+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&mov ($A,&DWP(0,"esi"));
@@ -144,8 +262,10 @@ sub BODY_00_15() {
&mov ("edi",&DWP(12,"esi"));
# &mov ($Aoff,$A);
&mov ($Boff,"ebx");
+ &xor ("ebx","ecx");
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
+ &mov (&DWP(0,"esp"),"ebx"); # magic
&mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
@@ -155,59 +275,41 @@ sub BODY_00_15() {
&mov ($Goff,"ecx");
&mov ($Hoff,"edi");
-&set_label("00_15",16);
- &mov ($T,&DWP(4*(8+15),"esp"));
+&set_label("00_15$suffix",16);
&BODY_00_15();
&cmp ("esi",0xc19bf174);
- &jne (&label("00_15"));
-
- &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
-&set_label("16_63",16);
- &mov ("esi",$T);
- &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
- &ror ("esi",18-7);
- &mov ("edi","ecx");
- &xor ("esi",$T);
- &ror ("esi",7);
- &shr ($T,3);
-
- &ror ("edi",19-17);
- &xor ($T,"esi"); # T = sigma0(X[-15])
- &xor ("edi","ecx");
- &ror ("edi",17);
- &shr ("ecx",10);
- &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
- &xor ("edi","ecx"); # sigma1(X[-2])
-
- &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
- # &add ($T,"edi"); # T += sigma1(X[-2])
- # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
+ &jne (&label("00_15$suffix"));
- &BODY_00_15(1);
+ &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1)
+ &jmp (&label("16_63$suffix"));
+
+&set_label("16_63$suffix",16);
+
+ &BODY_16_63();
&cmp ("esi",0xc67178f2);
- &jne (&label("16_63"));
+ &jne (&label("16_63$suffix"));
- &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
+ &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx
# &mov ($A,$Aoff);
&mov ("ebx",$Boff);
- &mov ("ecx",$Coff);
- &mov ("edi",$Doff);
+ # &mov ("edi",$Coff);
+ &mov ("ecx",$Doff);
&add ($A,&DWP(0,"esi"));
&add ("ebx",&DWP(4,"esi"));
- &add ("ecx",&DWP(8,"esi"));
- &add ("edi",&DWP(12,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
&mov (&DWP(0,"esi"),$A);
&mov (&DWP(4,"esi"),"ebx");
- &mov (&DWP(8,"esi"),"ecx");
- &mov (&DWP(12,"esi"),"edi");
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
# &mov ($E,$Eoff);
&mov ("eax",$Foff);
&mov ("ebx",$Goff);
&mov ("ecx",$Hoff);
- &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
+ &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp
&add ($E,&DWP(16,"esi"));
&add ("eax",&DWP(20,"esi"));
&add ("ebx",&DWP(24,"esi"));
@@ -217,33 +319,963 @@ sub BODY_00_15() {
&mov (&DWP(24,"esi"),"ebx");
&mov (&DWP(28,"esi"),"ecx");
- &add ("esp",4*(8+16+64)); # destroy frame
+ &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame
&sub ($K256,4*64); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
- &jb (&label("loop"));
-
+ &jb (&label("loop$suffix"));
+}
+ &COMPACT_LOOP();
+ &mov ("esp",&DWP(12,"esp")); # restore sp
+&function_end_A();
+ if (!$i386 && !$xmm) {
+ # ~20% improvement on Sandy Bridge
+	local *ror = sub { &shrd($_[0],@_) };
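+	# (shrd $r,$r,$n is bit-for-bit equivalent to ror $r,$n -- the
+	#  double shift of a register with itself is a rotate -- but the
+	#  shrd form was evidently measured faster on Sandy Bridge.)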
+ &COMPACT_LOOP("_shrd");
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
+ }
&set_label("K256",64); # Yes! I keep it in the code segment!
- &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
- &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
- &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
- &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
- &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
- &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
- &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
- &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
- &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
- &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
- &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
- &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
- &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
- &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
- &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
- &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
-&function_end_B("sha256_block_data_order");
+@K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
+ 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+ 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
+ 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
+ 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
+ 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
+ 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
+ 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
+ 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
+ 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
+ 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
+ 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
+ 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
+ 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
+ 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
+ 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
+&data_word(@K256);
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask
&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
+($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets
+sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
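+# off() maps a state variable to its current stack slot: instead of
+# moving eight words every round, the window rotates with $i, so e.g.
+# &off($e) is &DWP(16,"esp") at $i==0, and once $i==1 the same slot
+# answers to &off($f).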
+
+if (!$i386 && $unroll_after) {
+my @AH=($A,$K256);
+
+&set_label("unrolled",16);
+ &lea ("esp",&DWP(-96,"esp"));
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("ebx",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"ebx");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("ebx",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"ebx");
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &jmp (&label("grand_loop"));
+
+&set_label("grand_loop",16);
+ # copy input block to stack reversing byte order
+ for($i=0;$i<5;$i++) {
+ &mov ("ebx",&DWP(12*$i+0,"edi"));
+ &mov ("ecx",&DWP(12*$i+4,"edi"));
+ &bswap ("ebx");
+ &mov ("esi",&DWP(12*$i+8,"edi"));
+ &bswap ("ecx");
+ &mov (&DWP(32+12*$i+0,"esp"),"ebx");
+ &bswap ("esi");
+ &mov (&DWP(32+12*$i+4,"esp"),"ecx");
+ &mov (&DWP(32+12*$i+8,"esp"),"esi");
+ }
+ &mov ("ebx",&DWP($i*12,"edi"));
+ &add ("edi",64);
+ &bswap ("ebx");
+ &mov (&DWP(96+4,"esp"),"edi");
+ &mov (&DWP(32+12*$i,"esp"),"ebx");
+
+ my ($t1,$t2) = ("ecx","esi");
+
+ for ($i=0;$i<64;$i++) {
+
+ if ($i>=16) {
+ &mov ($T,$t1); # $t1 is preloaded
+ # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp"));
+ &ror ($t1,18-7);
+ &mov ("edi",$t2);
+ &ror ($t2,19-17);
+ &xor ($t1,$T);
+ &shr ($T,3);
+ &ror ($t1,7);
+ &xor ($t2,"edi");
+ &xor ($T,$t1); # T = sigma0(X[-15])
+ &ror ($t2,17);
+ &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16]
+ &shr ("edi",10);
+ &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7]
+ #&xor ("edi",$t2) # sigma1(X[-2])
+ # &add ($T,"edi"); # T += sigma1(X[-2])
+ # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
+ }
+ &mov ($t1,$E);
+ &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2])
+ &mov ($t2,&off($f));
+ &ror ($E,25-11);
+ &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2])
+ &mov ("edi",&off($g));
+ &xor ($E,$t1);
+ &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i]
+ &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0]
+ &xor ($t2,"edi");
+ &ror ($E,11-6);
+ &and ($t2,$t1);
+ &mov (&off($e),$t1); # save $E, modulo-scheduled
+ &xor ($E,$t1);
+ &add ($T,&off($h)); # T += h
+ &xor ("edi",$t2); # Ch(e,f,g)
+ &ror ($E,6); # Sigma1(e)
+ &mov ($t1,$AH[0]);
+ &add ($T,"edi"); # T += Ch(e,f,g)
+
+ &ror ($t1,22-13);
+ &mov ($t2,$AH[0]);
+ &mov ("edi",&off($b));
+ &xor ($t1,$AH[0]);
+ &mov (&off($a),$AH[0]); # save $A, modulo-scheduled
+ &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round
+ &ror ($t1,13-2);
+ &and ($AH[1],$AH[0]); # (b^c) &= (a^b)
+	&lea	($E,&DWP($K256[$i],$T,$E));	# T += Sigma1(e)+K[i]
+ &xor ($t1,$t2);
+ &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b)
+ &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63);
+ &ror ($t1,2); # Sigma0(a)
+
+ &add ($AH[1],$E); # h += T
+ &add ($E,&off($d)); # d += T
+ &add ($AH[1],$t1); # h += Sigma0(a)
+ &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63);
+
+ @AH = reverse(@AH); # rotate(a,h)
+ ($t1,$t2) = ($t2,$t1); # rotate(t1,t2)
+ }
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ebx",&DWP(24,"esp"));
+ &mov ("ecx",&DWP(28,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ebx",&DWP(24,"esi"));
+ &add ("ecx",&DWP(28,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(24,"esi"),"ebx");
+ &mov (&DWP(28,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ebx");
+ &mov (&DWP(28,"esp"),"ecx");
+
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_loop"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+&function_end_A();
+}
+ if (!$i386 && $xmm) {{{
+if ($shaext) {
+######################################################################
+# Intel SHA Extensions implementation of SHA256 update function.
+#
+my ($ctx,$inp,$end)=("esi","edi","eax");
+my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
+my @MSG=map("xmm$_",(3..6));
+
+sub sha256op38 {
+ my ($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); }
+}
+sub sha256rnds2 { sha256op38(0xcb,@_); }
+sub sha256msg1 { sha256op38(0xcc,@_); }
+sub sha256msg2 { sha256op38(0xcd,@_); }
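+# The raw 0x0f,0x38 byte form lets this assemble even where the SHA
+# extension mnemonics are unknown; e.g. &sha256rnds2("xmm2","xmm1")
+# emits 0x0f,0x38,0xcb,0xd1, i.e. "sha256rnds2 xmm2,xmm1" with ModR/M
+# byte 0xd1 = 0xc0|(2<<3)|1.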
+
+&set_label("shaext",32);
+ &sub ("esp",32);
+
+ &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA
+ &lea ($K256,&DWP(0x80,$K256));
+ &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE
+ &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
+
+ &pshufd ($Wi,$ABEF,0x1b); # ABCD
+ &pshufd ($ABEF,$ABEF,0xb1); # CDAB
+ &pshufd ($CDGH,$CDGH,0x1b); # EFGH
+ &palignr ($ABEF,$CDGH,8); # ABEF
+ &punpcklqdq ($CDGH,$Wi); # CDGH
+ &jmp (&label("loop_shaext"));
+
+&set_label("loop_shaext",16);
+ &movdqu (@MSG[0],&QWP(0,$inp));
+ &movdqu (@MSG[1],&QWP(0x10,$inp));
+ &movdqu (@MSG[2],&QWP(0x20,$inp));
+ &pshufb (@MSG[0],$TMP);
+ &movdqu (@MSG[3],&QWP(0x30,$inp));
+ &movdqa (&QWP(16,"esp"),$CDGH); # offload
+
+ &movdqa ($Wi,&QWP(0*16-0x80,$K256));
+ &paddd ($Wi,@MSG[0]);
+ &pshufb (@MSG[1],$TMP);
+ &sha256rnds2 ($CDGH,$ABEF); # 0-3
+ &pshufd ($Wi,$Wi,0x0e);
+ &nop ();
+ &movdqa (&QWP(0,"esp"),$ABEF); # offload
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(1*16-0x80,$K256));
+ &paddd ($Wi,@MSG[1]);
+ &pshufb (@MSG[2],$TMP);
+ &sha256rnds2 ($CDGH,$ABEF); # 4-7
+ &pshufd ($Wi,$Wi,0x0e);
+ &lea ($inp,&DWP(0x40,$inp));
+ &sha256msg1 (@MSG[0],@MSG[1]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(2*16-0x80,$K256));
+ &paddd ($Wi,@MSG[2]);
+ &pshufb (@MSG[3],$TMP);
+ &sha256rnds2 ($CDGH,$ABEF); # 8-11
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[3]);
+ &palignr ($TMP,@MSG[2],4);
+ &nop ();
+ &paddd (@MSG[0],$TMP);
+ &sha256msg1 (@MSG[1],@MSG[2]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(3*16-0x80,$K256));
+ &paddd ($Wi,@MSG[3]);
+ &sha256msg2 (@MSG[0],@MSG[3]);
+ &sha256rnds2 ($CDGH,$ABEF); # 12-15
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[0]);
+ &palignr ($TMP,@MSG[3],4);
+ &nop ();
+ &paddd (@MSG[1],$TMP);
+ &sha256msg1 (@MSG[2],@MSG[3]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+for($i=4;$i<16-3;$i++) {
+ &movdqa ($Wi,&QWP($i*16-0x80,$K256));
+ &paddd ($Wi,@MSG[0]);
+ &sha256msg2 (@MSG[1],@MSG[0]);
+ &sha256rnds2 ($CDGH,$ABEF); # 16-19...
+ &pshufd ($Wi,$Wi,0x0e);
+ &movdqa ($TMP,@MSG[1]);
+ &palignr ($TMP,@MSG[0],4);
+ &nop ();
+ &paddd (@MSG[2],$TMP);
+ &sha256msg1 (@MSG[3],@MSG[0]);
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ push(@MSG,shift(@MSG));
+}
+ &movdqa ($Wi,&QWP(13*16-0x80,$K256));
+ &paddd ($Wi,@MSG[0]);
+ &sha256msg2 (@MSG[1],@MSG[0]);
+ &sha256rnds2 ($CDGH,$ABEF); # 52-55
+ &pshufd ($Wi,$Wi,0x0e);
+	&movdqa		($TMP,@MSG[1]);
+ &palignr ($TMP,@MSG[0],4);
+ &sha256rnds2 ($ABEF,$CDGH);
+ &paddd (@MSG[2],$TMP);
+
+ &movdqa ($Wi,&QWP(14*16-0x80,$K256));
+ &paddd ($Wi,@MSG[1]);
+ &sha256rnds2 ($CDGH,$ABEF); # 56-59
+ &pshufd ($Wi,$Wi,0x0e);
+ &sha256msg2 (@MSG[2],@MSG[1]);
+ &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &movdqa ($Wi,&QWP(15*16-0x80,$K256));
+ &paddd ($Wi,@MSG[2]);
+ &nop ();
+ &sha256rnds2 ($CDGH,$ABEF); # 60-63
+ &pshufd ($Wi,$Wi,0x0e);
+ &cmp ($end,$inp);
+ &nop ();
+ &sha256rnds2 ($ABEF,$CDGH);
+
+ &paddd ($CDGH,&QWP(16,"esp"));
+ &paddd ($ABEF,&QWP(0,"esp"));
+ &jnz (&label("loop_shaext"));
+
+ &pshufd ($CDGH,$CDGH,0xb1); # DCHG
+ &pshufd ($TMP,$ABEF,0x1b); # FEBA
+ &pshufd ($ABEF,$ABEF,0xb1); # BAFE
+ &punpckhqdq ($ABEF,$CDGH); # DCBA
+ &palignr ($CDGH,$TMP,8); # HGFE
+
+ &mov ("esp",&DWP(32+12,"esp"));
+ &movdqu (&QWP(0,$ctx),$ABEF);
+ &movdqu (&QWP(16,$ctx),$CDGH);
+&function_end_A();
+}
+
+my @X = map("xmm$_",(0..3));
+my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
+my @AH = ($A,$T);
+
+&set_label("SSSE3",32);
+ &lea ("esp",&DWP(-96,"esp"));
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"edi");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("edi",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &movdqa ($t3,&QWP(256,$K256));
+ &jmp (&label("grand_ssse3"));
+
+&set_label("grand_ssse3",16);
+ # load input, reverse byte order, add K256[0..15], save to stack
+ &movdqu (@X[0],&QWP(0,"edi"));
+ &movdqu (@X[1],&QWP(16,"edi"));
+ &movdqu (@X[2],&QWP(32,"edi"));
+ &movdqu (@X[3],&QWP(48,"edi"));
+ &add ("edi",64);
+ &pshufb (@X[0],$t3);
+ &mov (&DWP(96+4,"esp"),"edi");
+ &pshufb (@X[1],$t3);
+ &movdqa ($t0,&QWP(0,$K256));
+ &pshufb (@X[2],$t3);
+ &movdqa ($t1,&QWP(16,$K256));
+ &paddd ($t0,@X[0]);
+ &pshufb (@X[3],$t3);
+ &movdqa ($t2,&QWP(32,$K256));
+ &paddd ($t1,@X[1]);
+ &movdqa ($t3,&QWP(48,$K256));
+ &movdqa (&QWP(32+0,"esp"),$t0);
+ &paddd ($t2,@X[2]);
+ &movdqa (&QWP(32+16,"esp"),$t1);
+ &paddd ($t3,@X[3]);
+ &movdqa (&QWP(32+32,"esp"),$t2);
+ &movdqa (&QWP(32+48,"esp"),$t3);
+ &jmp (&label("ssse3_00_47"));
+
+&set_label("ssse3_00_47",16);
+ &add ($K256,64);
+
+sub SSSE3_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 120 instructions
+
+ eval(shift(@insns));
+ &movdqa ($t0,@X[1]);
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &movdqa ($t3,@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &palignr ($t0,@X[0],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &palignr ($t3,@X[2],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t1,$t0);
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &movdqa ($t2,$t0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t0,3);
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &paddd (@X[0],$t3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t2,7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &pshufd ($t3,@X[3],0b11111010); # X[14..15]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslld ($t1,32-18);
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t2,18-7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslld ($t1,18-7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t2,$t3);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t0,$t1); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t3,10);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrlq ($t2,17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrlq ($t2,19-17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufd ($t3,$t3,0b10000000);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &psrldq ($t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ &pshufd ($t3,@X[0],0b01010000); # X[16..17]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t2,$t3);
+ eval(shift(@insns)); # @
+ &psrld ($t3,10);
+ eval(shift(@insns));
+ &psrlq ($t2,17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrlq ($t2,19-17);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufd ($t3,$t3,0b00001000);
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &movdqa ($t2,&QWP(16*$j,$K256));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslldq ($t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # @
+ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd ($t2,@X[0]);
+ eval(shift(@insns)); # @
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &movdqa (&QWP(32+16*$j,"esp"),$t2);
+}
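+# Note the scheduling: the four scalar round bodies in @insns (about
+# 120 instructions) are dealt out a few at a time between the SIMD
+# message-schedule steps above, so the integer and XMM streams can
+# issue side by side rather than in two serial phases.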
+
+sub body_00_15 () {
+ (
+ '&mov ("ecx",$E);',
+ '&ror ($E,25-11);',
+ '&mov ("esi",&off($f));',
+ '&xor ($E,"ecx");',
+ '&mov ("edi",&off($g));',
+ '&xor ("esi","edi");',
+ '&ror ($E,11-6);',
+ '&and ("esi","ecx");',
+ '&mov (&off($e),"ecx");', # save $E, modulo-scheduled
+ '&xor ($E,"ecx");',
+ '&xor ("edi","esi");', # Ch(e,f,g)
+ '&ror ($E,6);', # T = Sigma1(e)
+ '&mov ("ecx",$AH[0]);',
+ '&add ($E,"edi");', # T += Ch(e,f,g)
+ '&mov ("edi",&off($b));',
+ '&mov ("esi",$AH[0]);',
+
+ '&ror ("ecx",22-13);',
+ '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled
+ '&xor ("ecx",$AH[0]);',
+ '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round
+ '&add ($E,&off($h));', # T += h
+ '&ror ("ecx",13-2);',
+ '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b)
+ '&xor ("ecx","esi");',
+ '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i]
+ '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b)
+ '&ror ("ecx",2);', # Sigma0(a)
+
+ '&add ($AH[1],$E);', # h += T
+ '&add ($E,&off($d));', # d += T
+ '&add ($AH[1],"ecx");'. # h += Sigma0(a)
+
+ '@AH = reverse(@AH); $i++;' # rotate(a,h)
+ );
+}
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &SSSE3_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmp (&DWP(16*$j,$K256),0x00010203);
+ &jne (&label("ssse3_00_47"));
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ecx",&DWP(24,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ecx",&DWP(24,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(28,"esp"));
+ &mov (&DWP(24,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &add ("edi",&DWP(28,"esi"));
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esi"),"edi");
+ &mov (&DWP(28,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+
+ &movdqa ($t3,&QWP(64,$K256));
+ &sub ($K256,3*64); # rewind K
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_ssse3"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+&function_end_A();
+ if ($avx) {
+&set_label("AVX",32);
+ if ($avx>1) {
+ &and ("edx",1<<8|1<<3); # check for BMI2+BMI1
+ &cmp ("edx",1<<8|1<<3);
+ &je (&label("AVX_BMI"));
+ }
+ &lea ("esp",&DWP(-96,"esp"));
+ &vzeroall ();
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"edi");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("edi",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &vmovdqa ($t3,&QWP(256,$K256));
+ &jmp (&label("grand_avx"));
+
+&set_label("grand_avx",32);
+ # load input, reverse byte order, add K256[0..15], save to stack
+ &vmovdqu (@X[0],&QWP(0,"edi"));
+ &vmovdqu (@X[1],&QWP(16,"edi"));
+ &vmovdqu (@X[2],&QWP(32,"edi"));
+ &vmovdqu (@X[3],&QWP(48,"edi"));
+ &add ("edi",64);
+ &vpshufb (@X[0],@X[0],$t3);
+ &mov (&DWP(96+4,"esp"),"edi");
+ &vpshufb (@X[1],@X[1],$t3);
+ &vpshufb (@X[2],@X[2],$t3);
+ &vpaddd ($t0,@X[0],&QWP(0,$K256));
+ &vpshufb (@X[3],@X[3],$t3);
+ &vpaddd ($t1,@X[1],&QWP(16,$K256));
+ &vpaddd ($t2,@X[2],&QWP(32,$K256));
+ &vpaddd ($t3,@X[3],&QWP(48,$K256));
+ &vmovdqa (&QWP(32+0,"esp"),$t0);
+ &vmovdqa (&QWP(32+16,"esp"),$t1);
+ &vmovdqa (&QWP(32+32,"esp"),$t2);
+ &vmovdqa (&QWP(32+48,"esp"),$t3);
+ &jmp (&label("avx_00_47"));
+
+&set_label("avx_00_47",16);
+ &add ($K256,64);
+
+sub Xupdate_AVX () {
+ (
+ '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4]
+ '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12]
+ '&vpsrld ($t2,$t0,7);',
+	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
+ '&vpsrld ($t3,$t0,3);',
+ '&vpslld ($t1,$t0,14);',
+ '&vpxor ($t0,$t3,$t2);',
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
+ '&vpsrld ($t2,$t2,18-7);',
+ '&vpxor ($t0,$t0,$t1);',
+ '&vpslld ($t1,$t1,25-14);',
+ '&vpxor ($t0,$t0,$t2);',
+ '&vpsrld ($t2,$t3,10);',
+ '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4])
+ '&vpsrlq ($t1,$t3,17);',
+ '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
+ '&vpxor ($t2,$t2,$t1);',
+ '&vpsrlq ($t3,$t3,19);',
+	'&vpxor	($t2,$t2,$t3);',	# sigma1(X[14..15])
+ '&vpshufd ($t3,$t2,0b10000100);',
+ '&vpsrldq ($t3,$t3,8);',
+ '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15])
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
+ '&vpsrld ($t2,$t3,10);',
+ '&vpsrlq ($t1,$t3,17);',
+ '&vpxor ($t2,$t2,$t1);',
+ '&vpsrlq ($t3,$t3,19);',
+	'&vpxor	($t2,$t2,$t3);',	# sigma1(X[16..17])
+ '&vpshufd ($t3,$t2,0b11101000);',
+ '&vpslldq ($t3,$t3,8);',
+ '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17])
+ );
+}
+
+local *ror = sub { &shrd($_[0],@_) };
+sub AVX_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 120 instructions
+my $insn;
+
+ foreach (Xupdate_AVX()) { # 31 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval($insn = shift(@insns));
+	eval(shift(@insns)) if ($insn =~ /rorx/ && $insns[0] =~ /rorx/);
+ }
+ &vpaddd ($t2,@X[0],&QWP(16*$j,$K256));
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (&QWP(32+16*$j,"esp"),$t2);
+}
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmp (&DWP(16*$j,$K256),0x00010203);
+ &jne (&label("avx_00_47"));
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ecx",&DWP(24,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ecx",&DWP(24,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(28,"esp"));
+ &mov (&DWP(24,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &add ("edi",&DWP(28,"esi"));
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esi"),"edi");
+ &mov (&DWP(28,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+
+ &vmovdqa ($t3,&QWP(64,$K256));
+ &sub ($K256,3*64); # rewind K
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_avx"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+ &vzeroall ();
+&function_end_A();
+ if ($avx>1) {
+sub bodyx_00_15 () { # +10%
+ (
+ '&rorx ("ecx",$E,6)',
+ '&rorx ("esi",$E,11)',
+ '&mov (&off($e),$E)', # save $E, modulo-scheduled
+ '&rorx ("edi",$E,25)',
+ '&xor ("ecx","esi")',
+ '&andn ("esi",$E,&off($g))',
+ '&xor ("ecx","edi")', # Sigma1(e)
+ '&and ($E,&off($f))',
+ '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled
+ '&or ($E,"esi")', # T = Ch(e,f,g)
+
+ '&rorx ("edi",$AH[0],2)',
+ '&rorx ("esi",$AH[0],13)',
+ '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e)
+ '&rorx ("ecx",$AH[0],22)',
+ '&xor ("esi","edi")',
+ '&mov ("edi",&off($b))',
+ '&xor ("ecx","esi")', # Sigma0(a)
+
+ '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round
+ '&add ($E,&off($h))', # T += h
+ '&and ($AH[1],$AH[0])', # (b^c) &= (a^b)
+ '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i]
+ '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b)
+
+ '&add ("ecx",$E)', # h += T
+ '&add ($E,&off($d))', # d += T
+ '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a)
+
+ '@AH = reverse(@AH); $i++;' # rotate(a,h)
+ );
+}
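+# bodyx_00_15 leans on BMI1/2: rorx rotates into a third register
+# without touching flags, and andn yields ~e&g directly, so Ch(e,f,g)
+# is formed as (e&f)|(~e&g) -- or instead of xor is legitimate because
+# the two terms are bitwise disjoint.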
+
+&set_label("AVX_BMI",32);
+ &lea ("esp",&DWP(-96,"esp"));
+ &vzeroall ();
+ # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+ &mov ($AH[0],&DWP(0,"esi"));
+ &mov ($AH[1],&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edi",&DWP(12,"esi"));
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"ecx"); # magic
+ &mov (&DWP(8,"esp"),"ecx");
+ &mov (&DWP(12,"esp"),"edi");
+ &mov ($E,&DWP(16,"esi"));
+ &mov ("edi",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("esi",&DWP(28,"esi"));
+ #&mov (&DWP(16,"esp"),$E);
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esp"),"esi");
+ &vmovdqa ($t3,&QWP(256,$K256));
+ &jmp (&label("grand_avx_bmi"));
+
+&set_label("grand_avx_bmi",32);
+ # load input, reverse byte order, add K256[0..15], save to stack
+ &vmovdqu (@X[0],&QWP(0,"edi"));
+ &vmovdqu (@X[1],&QWP(16,"edi"));
+ &vmovdqu (@X[2],&QWP(32,"edi"));
+ &vmovdqu (@X[3],&QWP(48,"edi"));
+ &add ("edi",64);
+ &vpshufb (@X[0],@X[0],$t3);
+ &mov (&DWP(96+4,"esp"),"edi");
+ &vpshufb (@X[1],@X[1],$t3);
+ &vpshufb (@X[2],@X[2],$t3);
+ &vpaddd ($t0,@X[0],&QWP(0,$K256));
+ &vpshufb (@X[3],@X[3],$t3);
+ &vpaddd ($t1,@X[1],&QWP(16,$K256));
+ &vpaddd ($t2,@X[2],&QWP(32,$K256));
+ &vpaddd ($t3,@X[3],&QWP(48,$K256));
+ &vmovdqa (&QWP(32+0,"esp"),$t0);
+ &vmovdqa (&QWP(32+16,"esp"),$t1);
+ &vmovdqa (&QWP(32+32,"esp"),$t2);
+ &vmovdqa (&QWP(32+48,"esp"),$t3);
+ &jmp (&label("avx_bmi_00_47"));
+
+&set_label("avx_bmi_00_47",16);
+ &add ($K256,64);
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_00_47($j,\&bodyx_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmp (&DWP(16*$j,$K256),0x00010203);
+ &jne (&label("avx_bmi_00_47"));
+
+ for ($i=0; $i<16; ) {
+ foreach(bodyx_00_15()) { eval; }
+ }
+
+ &mov ("esi",&DWP(96,"esp")); #ctx
+ #&mov ($AH[0],&DWP(0,"esp"));
+ &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp"));
+ #&mov ("edi", &DWP(8,"esp"));
+ &mov ("ecx",&DWP(12,"esp"));
+ &add ($AH[0],&DWP(0,"esi"));
+ &add ($AH[1],&DWP(4,"esi"));
+ &add ("edi",&DWP(8,"esi"));
+ &add ("ecx",&DWP(12,"esi"));
+ &mov (&DWP(0,"esi"),$AH[0]);
+ &mov (&DWP(4,"esi"),$AH[1]);
+ &mov (&DWP(8,"esi"),"edi");
+ &mov (&DWP(12,"esi"),"ecx");
+ #&mov (&DWP(0,"esp"),$AH[0]);
+ &mov (&DWP(4,"esp"),$AH[1]);
+ &xor ($AH[1],"edi"); # magic
+ &mov (&DWP(8,"esp"),"edi");
+ &mov (&DWP(12,"esp"),"ecx");
+ #&mov ($E,&DWP(16,"esp"));
+ &mov ("edi",&DWP(20,"esp"));
+ &mov ("ecx",&DWP(24,"esp"));
+ &add ($E,&DWP(16,"esi"));
+ &add ("edi",&DWP(20,"esi"));
+ &add ("ecx",&DWP(24,"esi"));
+ &mov (&DWP(16,"esi"),$E);
+ &mov (&DWP(20,"esi"),"edi");
+ &mov (&DWP(20,"esp"),"edi");
+ &mov ("edi",&DWP(28,"esp"));
+ &mov (&DWP(24,"esi"),"ecx");
+ #&mov (&DWP(16,"esp"),$E);
+ &add ("edi",&DWP(28,"esi"));
+ &mov (&DWP(24,"esp"),"ecx");
+ &mov (&DWP(28,"esi"),"edi");
+ &mov (&DWP(28,"esp"),"edi");
+ &mov ("edi",&DWP(96+4,"esp")); # inp
+
+ &vmovdqa ($t3,&QWP(64,$K256));
+ &sub ($K256,3*64); # rewind K
+ &cmp ("edi",&DWP(96+8,"esp")); # are we done yet?
+ &jb (&label("grand_avx_bmi"));
+
+ &mov ("esp",&DWP(96+12,"esp")); # restore sp
+ &vzeroall ();
+&function_end_A();
+ }
+ }
+ }}}
+&function_end_B("sha256_block_data_order");
+
&asm_finish();
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 9c84e8d93c30..4fee74d832d1 100755
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -1,10 +1,12 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
@@ -21,15 +23,27 @@
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~17 cycles per processed byte.
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that the latter performs sub-optimally; nothing was
+# done about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
-$inp="r1"; $t3="r1";
+$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
-$T1="r3";
+$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
@@ -52,80 +66,111 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
- ldr $T1,[$inp],#4
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ rev $t1,$t1
#else
- ldrb $T1,[$inp,#3] @ $i
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
- ldrb $t1,[$inp,#1]
- ldrb $t0,[$inp],#4
- orr $T1,$T1,$t2,lsl#8
- orr $T1,$T1,$t1,lsl#16
- orr $T1,$T1,$t0,lsl#24
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
- mov $t0,$e,ror#$Sigma1[0]
ldr $t2,[$Ktbl],#4 @ *K256++
- eor $t0,$t0,$e,ror#$Sigma1[1]
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
-#if $i>=16
- add $T1,$T1,$t3 @ from BODY_16_xx
-#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev $T1,$T1
-#endif
-#if $i==15
- str $inp,[sp,#17*4] @ leave room for $t3
-#endif
- eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
- str $T1,[sp,#`$i%16`*4]
- add $T1,$T1,$t0
+ add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
- add $T1,$T1,$h
- mov $h,$a,ror#$Sigma0[0]
- add $T1,$T1,$t1
- eor $h,$h,$a,ror#$Sigma0[1]
- add $T1,$T1,$t2
- eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
-#if $i>=15
- ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
- orr $t0,$a,$b
- and $t1,$a,$b
- and $t0,$t0,$c
- add $h,$h,$T1
- orr $t0,$t0,$t1 @ Maj(a,b,c)
- add $d,$d,$T1
- add $h,$h,$t0
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
+ ($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
- ldr $t2,[sp,#`($i+14)%16`*4]
- mov $t0,$t3,ror#$sigma0[0]
- ldr $T1,[sp,#`($i+0)%16`*4]
- eor $t0,$t0,$t3,ror#$sigma0[1]
- ldr $t1,[sp,#`($i+9)%16`*4]
- eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
- mov $t3,$t2,ror#$sigma1[0]
- add $T1,$T1,$t0
- eor $t3,$t3,$t2,ror#$sigma1[1]
- add $T1,$T1,$t1
- eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
- @ add $T1,$T1,$t3
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
.text
+#if __ARM_ARCH__<7
.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
.type K256,%object
.align 5
@@ -147,46 +192,73 @@ K256:
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- sub $Ktbl,r3,#256 @ K256
+ sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
- and $t2,$t2,#0xff
- cmp $t2,#0xf2
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
- ldr $T1,[sp,#16*4] @ pull ctx
- ldr $t0,[$T1,#0]
- ldr $t1,[$T1,#4]
- ldr $t2,[$T1,#8]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
add $A,$A,$t0
- ldr $t0,[$T1,#12]
+ ldr $t0,[$t3,#12]
add $B,$B,$t1
- ldr $t1,[$T1,#16]
+ ldr $t1,[$t3,#16]
add $C,$C,$t2
- ldr $t2,[$T1,#20]
+ ldr $t2,[$t3,#20]
add $D,$D,$t0
- ldr $t0,[$T1,#24]
+ ldr $t0,[$t3,#24]
add $E,$E,$t1
- ldr $t1,[$T1,#28]
+ ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
- stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
@@ -200,12 +272,442 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha256_block_data_order,.-sha256_block_data_order
-.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
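+# E.g. &vext_8($T0,@X[0],@X[1],4) expands to "vext.8 q8,q0,q1,#4";
+# &Dlo("q1") and &Dhi("q1") name the 64-bit halves d2 and d3 for the
+# D-register forms used below.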
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adr $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+	# Emit the instruction as raw little-endian bytes, since
+	# ARMv7 instructions are always encoded little-endian. The
+	# correct solution is to use the .inst directive, but older
+	# assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
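+# For instance "sha256h q0,q2,q12" assembles, via the table above, to
+# the word 0xf3040c68 and is emitted as INST(0x68,0x0c,0x04,0xf3),
+# which a little-endian core decodes back to the same instruction.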
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl
new file mode 100755
index 000000000000..adf2ddccd18b
--- /dev/null
+++ b/crypto/sha/asm/sha256-mb-x86_64.pl
@@ -0,0 +1,1560 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Multi-buffer SHA256 procedure processes n buffers in parallel by
+# placing each buffer's data into a designated lane of a SIMD register.
+# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
+# AVX2-capable processors such as Haswell.
+#
+# this +aesni(i) sha256 aesni-sha256 gain(iv)
+# -------------------------------------------------------------------
+# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
+# Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
+# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
+# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
+# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
+# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
+#
+# (i) multi-block CBC encrypt with 128-bit key;
+# (ii)	(HASH+AES)/n does not apply to Westmere for n>3, nor to Atom,
+#	because of lower AES-NI instruction throughput; nor is there an
+#	AES-NI-SHA256 stitch for these processors;
+# (iii) "this" is for n=8, when we gather twice as much data, result
+# for n=4 is 20.3+4.44=24.7;
+# (iv) presented improvement coefficients are asymptotic limits and
+# in real-life application are somewhat lower, e.g. for 2KB
+# fragments they range from 75% to 130% (on Haswell);
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=0;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void sha256_multi_block (
+# struct { unsigned int A[8];
+# unsigned int B[8];
+# unsigned int C[8];
+# unsigned int D[8];
+# unsigned int E[8];
+# unsigned int F[8];
+# unsigned int G[8];
+# unsigned int H[8]; } *ctx,
+# struct { void *ptr; int blocks; } inp[8],
+# int num); /* 1 or 2 */
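+#
+# A hypothetical caller (HASH_CTX/HASH_DESC are illustrative names,
+# not part of the API) would keep the state struct-of-arrays style,
+# one lane per input buffer:
+#
+#	HASH_CTX  ctx;			/* A[0..3] hold the lane IVs, etc. */
+#	HASH_DESC inp[4] = { {buf0,blocks0}, {buf1,blocks1},
+#			     {buf2,blocks2}, {buf3,blocks3} };
+#	sha256_multi_block(&ctx, inp, 1);	/* num=1: lanes 0-3 only */
+#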
+#
+$ctx="%rdi"; # 1st arg
+$inp="%rsi"; # 2nd arg
+$num="%edx"; # 3rd arg
+@ptr=map("%r$_",(8..11));
+$Tbl="%rbp";
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
+($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
+
+$REG_SZ=16;
+
+sub Xi_off {
+my $off = shift;
+
+ $off %= 16; $off *= $REG_SZ;
+ $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
+}
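+# Xi_off() biases every displacement by -128 so the schedule slots are
+# reachable with one-byte offsets; e.g. Xi_off(3) yields "48-128(%rax)".
+# The %rbx-based window is only reached once $REG_SZ outgrows 16, i.e.
+# in the wider-lane (AVX2) code paths further down.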
+
+sub ROUND_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+
+$code.=<<___ if ($i<15);
+ movd `4*$i`(@ptr[0]),$Xi
+ movd `4*$i`(@ptr[1]),$t1
+ movd `4*$i`(@ptr[2]),$t2
+ movd `4*$i`(@ptr[3]),$t3
+ punpckldq $t2,$Xi
+ punpckldq $t3,$t1
+ punpckldq $t1,$Xi
+___
+$code.=<<___ if ($i==15);
+ movd `4*$i`(@ptr[0]),$Xi
+ lea `16*4`(@ptr[0]),@ptr[0]
+ movd `4*$i`(@ptr[1]),$t1
+ lea `16*4`(@ptr[1]),@ptr[1]
+ movd `4*$i`(@ptr[2]),$t2
+ lea `16*4`(@ptr[2]),@ptr[2]
+ movd `4*$i`(@ptr[3]),$t3
+ lea `16*4`(@ptr[3]),@ptr[3]
+ punpckldq $t2,$Xi
+ punpckldq $t3,$t1
+ punpckldq $t1,$Xi
+___
+$code.=<<___;
+ movdqa $e,$sigma
+ `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
+ movdqa $e,$t3
+ `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
+ psrld \$6,$sigma
+ movdqa $e,$t2
+ pslld \$7,$t3
+ movdqa $Xi,`&Xi_off($i)`
+ paddd $h,$Xi # Xi+=h
+
+ psrld \$11,$t2
+ pxor $t3,$sigma
+ pslld \$21-7,$t3
+ paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
+ pxor $t2,$sigma
+
+ psrld \$25-11,$t2
+ movdqa $e,$t1
+ `"prefetcht0 63(@ptr[0])" if ($i==15)`
+ pxor $t3,$sigma
+ movdqa $e,$axb # borrow $axb
+ pslld \$26-21,$t3
+ pandn $g,$t1
+ pand $f,$axb
+ pxor $t2,$sigma
+
+ `"prefetcht0 63(@ptr[1])" if ($i==15)`
+ movdqa $a,$t2
+ pxor $t3,$sigma # Sigma1(e)
+ movdqa $a,$t3
+ psrld \$2,$t2
+ paddd $sigma,$Xi # Xi+=Sigma1(e)
+ pxor $axb,$t1 # Ch(e,f,g)
+ movdqa $b,$axb
+ movdqa $a,$sigma
+ pslld \$10,$t3
+ pxor $a,$axb # a^b, b^c in next round
+
+ `"prefetcht0 63(@ptr[2])" if ($i==15)`
+ psrld \$13,$sigma
+ pxor $t3,$t2
+ paddd $t1,$Xi # Xi+=Ch(e,f,g)
+ pslld \$19-10,$t3
+ pand $axb,$bxc
+ pxor $sigma,$t2
+
+ `"prefetcht0 63(@ptr[3])" if ($i==15)`
+ psrld \$22-13,$sigma
+ pxor $t3,$t2
+ movdqa $b,$h
+ pslld \$30-19,$t3
+ pxor $t2,$sigma
+ pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
+ paddd $Xi,$d # d+=Xi
+ pxor $t3,$sigma # Sigma0(a)
+
+ paddd $Xi,$h # h+=Xi
+ paddd $sigma,$h # h+=Sigma0(a)
+___
+$code.=<<___ if (($i%8)==7);
+ lea `32*8`($Tbl),$Tbl
+___
+ ($axb,$bxc)=($bxc,$axb);
+}
+
+sub ROUND_16_XX {
+my $i=shift;
+
+$code.=<<___;
+ movdqa `&Xi_off($i+1)`,$Xn
+ paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
+
+ movdqa $Xn,$sigma
+ movdqa $Xn,$t2
+ psrld \$3,$sigma
+ movdqa $Xn,$t3
+
+ psrld \$7,$t2
+ movdqa `&Xi_off($i+14)`,$t1
+ pslld \$14,$t3
+ pxor $t2,$sigma
+ psrld \$18-7,$t2
+ movdqa $t1,$axb # borrow $axb
+ pxor $t3,$sigma
+ pslld \$25-14,$t3
+ pxor $t2,$sigma
+ psrld \$10,$t1
+ movdqa $axb,$t2
+
+ psrld \$17,$axb
+ pxor $t3,$sigma # sigma0(X[i+1])
+ pslld \$13,$t2
+	paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
+ pxor $axb,$t1
+ psrld \$19-17,$axb
+ pxor $t2,$t1
+ pslld \$15-13,$t2
+ pxor $axb,$t1
+	pxor	$t2,$t1				# sigma1(X[i+14])
+ paddd $t1,$Xi # Xi+=sigma1(X[i+14])
+___
+ &ROUND_00_15($i,@_);
+ ($Xi,$Xn)=($Xn,$Xi);
+}
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl sha256_multi_block
+.type sha256_multi_block,\@function,3
+.align 32
+sha256_multi_block:
+ mov OPENSSL_ia32cap_P+4(%rip),%rcx
+ bt \$61,%rcx # check SHA bit
+ jc _shaext_shortcut
+___
+$code.=<<___ if ($avx);
+ test \$`1<<28`,%ecx
+ jnz _avx_shortcut
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,-0x78(%rax)
+ movaps %xmm11,-0x68(%rax)
+ movaps %xmm12,-0x58(%rax)
+ movaps %xmm13,-0x48(%rax)
+ movaps %xmm14,-0x38(%rax)
+ movaps %xmm15,-0x28(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`, %rsp
+ and \$-256,%rsp
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody:
+ lea K256+128(%rip),$Tbl
+ lea `$REG_SZ*16`(%rsp),%rbx
+ lea 0x80($ctx),$ctx # size optimization
+
+.Loop_grande:
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldone
+
+ movdqu 0x00-0x80($ctx),$A # load context
+ lea 128(%rsp),%rax
+ movdqu 0x20-0x80($ctx),$B
+ movdqu 0x40-0x80($ctx),$C
+ movdqu 0x60-0x80($ctx),$D
+ movdqu 0x80-0x80($ctx),$E
+ movdqu 0xa0-0x80($ctx),$F
+ movdqu 0xc0-0x80($ctx),$G
+ movdqu 0xe0-0x80($ctx),$H
+ movdqu .Lpbswap(%rip),$Xn
+ jmp .Loop
+
+.align 32
+.Loop:
+ movdqa $C,$bxc
+ pxor $B,$bxc # magic seed
+___
+for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ movdqu `&Xi_off($i)`,$Xi
+ mov \$3,%ecx
+ jmp .Loop_16_xx
+.align 32
+.Loop_16_xx:
+___
+for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ dec %ecx
+ jnz .Loop_16_xx
+
+ mov \$1,%ecx
+ lea K256+128(%rip),$Tbl
+
+ movdqa (%rbx),$sigma # pull counters
+ cmp 4*0(%rbx),%ecx # examine counters
+ pxor $t1,$t1
+ cmovge $Tbl,@ptr[0] # cancel input
+ cmp 4*1(%rbx),%ecx
+ movdqa $sigma,$Xn
+ cmovge $Tbl,@ptr[1]
+ cmp 4*2(%rbx),%ecx
+ pcmpgtd $t1,$Xn # mask value
+ cmovge $Tbl,@ptr[2]
+ cmp 4*3(%rbx),%ecx
+ paddd $Xn,$sigma # counters--
+ cmovge $Tbl,@ptr[3]
+
+ movdqu 0x00-0x80($ctx),$t1
+ pand $Xn,$A
+ movdqu 0x20-0x80($ctx),$t2
+ pand $Xn,$B
+ movdqu 0x40-0x80($ctx),$t3
+ pand $Xn,$C
+ movdqu 0x60-0x80($ctx),$Xi
+ pand $Xn,$D
+ paddd $t1,$A
+ movdqu 0x80-0x80($ctx),$t1
+ pand $Xn,$E
+ paddd $t2,$B
+ movdqu 0xa0-0x80($ctx),$t2
+ pand $Xn,$F
+ paddd $t3,$C
+ movdqu 0xc0-0x80($ctx),$t3
+ pand $Xn,$G
+ paddd $Xi,$D
+ movdqu 0xe0-0x80($ctx),$Xi
+ pand $Xn,$H
+ paddd $t1,$E
+ paddd $t2,$F
+ movdqu $A,0x00-0x80($ctx)
+ paddd $t3,$G
+ movdqu $B,0x20-0x80($ctx)
+ paddd $Xi,$H
+ movdqu $C,0x40-0x80($ctx)
+ movdqu $D,0x60-0x80($ctx)
+ movdqu $E,0x80-0x80($ctx)
+ movdqu $F,0xa0-0x80($ctx)
+ movdqu $G,0xc0-0x80($ctx)
+ movdqu $H,0xe0-0x80($ctx)
+
+ movdqa $sigma,(%rbx) # save counters
+ movdqa .Lpbswap(%rip),$Xn
+ dec $num
+ jnz .Loop
+
+ mov `$REG_SZ*17+8`(%rsp),$num
+ lea $REG_SZ($ctx),$ctx
+ lea `16*$REG_SZ/4`($inp),$inp
+ dec $num
+ jnz .Loop_grande
+
+.Ldone:
+	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
+___
+$code.=<<___ if ($win64);
+ movaps -0xb8(%rax),%xmm6
+ movaps -0xa8(%rax),%xmm7
+ movaps -0x98(%rax),%xmm8
+ movaps -0x88(%rax),%xmm9
+ movaps -0x78(%rax),%xmm10
+ movaps -0x68(%rax),%xmm11
+ movaps -0x58(%rax),%xmm12
+ movaps -0x48(%rax),%xmm13
+ movaps -0x38(%rax),%xmm14
+ movaps -0x28(%rax),%xmm15
+___
+$code.=<<___;
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue:
+ ret
+.size sha256_multi_block,.-sha256_multi_block
+___
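+# The per-lane termination logic above, sketched in C (one 32-bit
+# element per lane; this mirrors the pcmpgtd/paddd/pand sequence):
+#
+#	mask     = (counter > 0) ? 0xffffffff : 0;	/* pcmpgtd   */
+#	counter += mask;				/* counter-- */
+#	A        = (A & mask) + ctx->A;		/* exhausted lanes  */
+#						/* keep saved state */
+#
+# Lanes whose block count is exhausted thus stop contributing, while
+# cmovge/cmovle redirect their input pointer at the K256 table (or,
+# in the SHA-NI path, at the stack), both of which are safe to read.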
+ {{{
+my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
+my @MSG0=map("%xmm$_",(4..7));
+my @MSG1=map("%xmm$_",(8..11));
+
+$code.=<<___;
+.type sha256_multi_block_shaext,\@function,3
+.align 32
+sha256_multi_block_shaext:
+_shaext_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,-0x78(%rax)
+ movaps %xmm11,-0x68(%rax)
+ movaps %xmm12,-0x58(%rax)
+ movaps %xmm13,-0x48(%rax)
+ movaps %xmm14,-0x38(%rax)
+ movaps %xmm15,-0x28(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`,%rsp
+	shl	\$1,$num			# we process a pair at a time
+ and \$-256,%rsp
+ lea 0x80($ctx),$ctx # size optimization
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody_shaext:
+ lea `$REG_SZ*16`(%rsp),%rbx
+ lea K256_shaext+0x80(%rip),$Tbl
+
+.Loop_grande_shaext:
+	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
+ xor $num,$num
+___
+for($i=0;$i<2;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldone_shaext
+
+ movq 0x00-0x80($ctx),$ABEF0 # A1.A0
+ movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
+ movq 0x40-0x80($ctx),$CDGH0 # C1.C0
+ movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
+ movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
+ movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
+ movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
+ movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
+
+ punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
+ punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
+ punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
+ punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
+ movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
+
+ movdqa $ABEF0,$ABEF1
+ movdqa $CDGH0,$CDGH1
+ punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
+ punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
+ punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
+ punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
+
+ pshufd \$0b00011011,$ABEF0,$ABEF0
+ pshufd \$0b00011011,$CDGH0,$CDGH0
+ pshufd \$0b00011011,$ABEF1,$ABEF1
+ pshufd \$0b00011011,$CDGH1,$CDGH1
+ jmp .Loop_shaext
+
+.align 32
+.Loop_shaext:
+ movdqu 0x00(@ptr[0]),@MSG0[0]
+ movdqu 0x00(@ptr[1]),@MSG1[0]
+ movdqu 0x10(@ptr[0]),@MSG0[1]
+ movdqu 0x10(@ptr[1]),@MSG1[1]
+ movdqu 0x20(@ptr[0]),@MSG0[2]
+ pshufb $TMPx,@MSG0[0]
+ movdqu 0x20(@ptr[1]),@MSG1[2]
+ pshufb $TMPx,@MSG1[0]
+ movdqu 0x30(@ptr[0]),@MSG0[3]
+ lea 0x40(@ptr[0]),@ptr[0]
+ movdqu 0x30(@ptr[1]),@MSG1[3]
+ lea 0x40(@ptr[1]),@ptr[1]
+
+ movdqa 0*16-0x80($Tbl),$Wi
+ pshufb $TMPx,@MSG0[1]
+ paddd @MSG0[0],$Wi
+ pxor $ABEF0,@MSG0[0] # black magic
+ movdqa $Wi,$TMP0
+ movdqa 0*16-0x80($Tbl),$TMP1
+ pshufb $TMPx,@MSG1[1]
+ paddd @MSG1[0],$TMP1
+ movdqa $CDGH0,0x50(%rsp) # offload
+ sha256rnds2 $ABEF0,$CDGH0 # 0-3
+ pxor $ABEF1,@MSG1[0] # black magic
+ movdqa $TMP1,$Wi
+ movdqa $CDGH1,0x70(%rsp)
+ sha256rnds2 $ABEF1,$CDGH1 # 0-3
+ pshufd \$0x0e,$TMP0,$Wi
+ pxor $ABEF0,@MSG0[0] # black magic
+ movdqa $ABEF0,0x40(%rsp) # offload
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ pxor $ABEF1,@MSG1[0] # black magic
+ movdqa $ABEF1,0x60(%rsp)
+ movdqa 1*16-0x80($Tbl),$TMP0
+ paddd @MSG0[1],$TMP0
+ pshufb $TMPx,@MSG0[2]
+ sha256rnds2 $CDGH1,$ABEF1
+
+ movdqa $TMP0,$Wi
+ movdqa 1*16-0x80($Tbl),$TMP1
+ paddd @MSG1[1],$TMP1
+ sha256rnds2 $ABEF0,$CDGH0 # 4-7
+ movdqa $TMP1,$Wi
+ prefetcht0 127(@ptr[0])
+ pshufb $TMPx,@MSG0[3]
+ pshufb $TMPx,@MSG1[2]
+ prefetcht0 127(@ptr[1])
+ sha256rnds2 $ABEF1,$CDGH1 # 4-7
+ pshufd \$0x0e,$TMP0,$Wi
+ pshufb $TMPx,@MSG1[3]
+ sha256msg1 @MSG0[1],@MSG0[0]
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ movdqa 2*16-0x80($Tbl),$TMP0
+ paddd @MSG0[2],$TMP0
+ sha256rnds2 $CDGH1,$ABEF1
+
+ movdqa $TMP0,$Wi
+ movdqa 2*16-0x80($Tbl),$TMP1
+ paddd @MSG1[2],$TMP1
+ sha256rnds2 $ABEF0,$CDGH0 # 8-11
+ sha256msg1 @MSG1[1],@MSG1[0]
+ movdqa $TMP1,$Wi
+ movdqa @MSG0[3],$TMPx
+ sha256rnds2 $ABEF1,$CDGH1 # 8-11
+ pshufd \$0x0e,$TMP0,$Wi
+ palignr \$4,@MSG0[2],$TMPx
+ paddd $TMPx,@MSG0[0]
+ movdqa @MSG1[3],$TMPx
+ palignr \$4,@MSG1[2],$TMPx
+ sha256msg1 @MSG0[2],@MSG0[1]
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ movdqa 3*16-0x80($Tbl),$TMP0
+ paddd @MSG0[3],$TMP0
+ sha256rnds2 $CDGH1,$ABEF1
+ sha256msg1 @MSG1[2],@MSG1[1]
+
+ movdqa $TMP0,$Wi
+ movdqa 3*16-0x80($Tbl),$TMP1
+ paddd $TMPx,@MSG1[0]
+ paddd @MSG1[3],$TMP1
+ sha256msg2 @MSG0[3],@MSG0[0]
+ sha256rnds2 $ABEF0,$CDGH0 # 12-15
+ movdqa $TMP1,$Wi
+ movdqa @MSG0[0],$TMPx
+ palignr \$4,@MSG0[3],$TMPx
+ sha256rnds2 $ABEF1,$CDGH1 # 12-15
+ sha256msg2 @MSG1[3],@MSG1[0]
+ pshufd \$0x0e,$TMP0,$Wi
+ paddd $TMPx,@MSG0[1]
+ movdqa @MSG1[0],$TMPx
+ palignr \$4,@MSG1[3],$TMPx
+ sha256msg1 @MSG0[3],@MSG0[2]
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ movdqa 4*16-0x80($Tbl),$TMP0
+ paddd @MSG0[0],$TMP0
+ sha256rnds2 $CDGH1,$ABEF1
+ sha256msg1 @MSG1[3],@MSG1[2]
+___
+for($i=4;$i<16-3;$i++) {
+$code.=<<___;
+ movdqa $TMP0,$Wi
+ movdqa $i*16-0x80($Tbl),$TMP1
+ paddd $TMPx,@MSG1[1]
+ paddd @MSG1[0],$TMP1
+ sha256msg2 @MSG0[0],@MSG0[1]
+ sha256rnds2 $ABEF0,$CDGH0 # 16-19...
+ movdqa $TMP1,$Wi
+ movdqa @MSG0[1],$TMPx
+ palignr \$4,@MSG0[0],$TMPx
+ sha256rnds2 $ABEF1,$CDGH1 # 16-19...
+ sha256msg2 @MSG1[0],@MSG1[1]
+ pshufd \$0x0e,$TMP0,$Wi
+ paddd $TMPx,@MSG0[2]
+ movdqa @MSG1[1],$TMPx
+ palignr \$4,@MSG1[0],$TMPx
+ sha256msg1 @MSG0[0],@MSG0[3]
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ movdqa `($i+1)*16`-0x80($Tbl),$TMP0
+ paddd @MSG0[1],$TMP0
+ sha256rnds2 $CDGH1,$ABEF1
+ sha256msg1 @MSG1[0],@MSG1[3]
+___
+ push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
+}
+$code.=<<___;
+ movdqa $TMP0,$Wi
+ movdqa 13*16-0x80($Tbl),$TMP1
+ paddd $TMPx,@MSG1[1]
+ paddd @MSG1[0],$TMP1
+ sha256msg2 @MSG0[0],@MSG0[1]
+ sha256rnds2 $ABEF0,$CDGH0 # 52-55
+ movdqa $TMP1,$Wi
+ movdqa @MSG0[1],$TMPx
+ palignr \$4,@MSG0[0],$TMPx
+ sha256rnds2 $ABEF1,$CDGH1 # 52-55
+ sha256msg2 @MSG1[0],@MSG1[1]
+ pshufd \$0x0e,$TMP0,$Wi
+ paddd $TMPx,@MSG0[2]
+ movdqa @MSG1[1],$TMPx
+ palignr \$4,@MSG1[0],$TMPx
+ nop
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ movdqa 14*16-0x80($Tbl),$TMP0
+ paddd @MSG0[1],$TMP0
+ sha256rnds2 $CDGH1,$ABEF1
+
+ movdqa $TMP0,$Wi
+ movdqa 14*16-0x80($Tbl),$TMP1
+ paddd $TMPx,@MSG1[2]
+ paddd @MSG1[1],$TMP1
+ sha256msg2 @MSG0[1],@MSG0[2]
+ nop
+ sha256rnds2 $ABEF0,$CDGH0 # 56-59
+ movdqa $TMP1,$Wi
+ mov \$1,%ecx
+ pxor @MSG0[1],@MSG0[1] # zero
+ sha256rnds2 $ABEF1,$CDGH1 # 56-59
+ sha256msg2 @MSG1[1],@MSG1[2]
+ pshufd \$0x0e,$TMP0,$Wi
+ movdqa 15*16-0x80($Tbl),$TMP0
+ paddd @MSG0[2],$TMP0
+ movq (%rbx),@MSG0[2] # pull counters
+ nop
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ movdqa 15*16-0x80($Tbl),$TMP1
+ paddd @MSG1[2],$TMP1
+ sha256rnds2 $CDGH1,$ABEF1
+
+ movdqa $TMP0,$Wi
+ cmp 4*0(%rbx),%ecx # examine counters
+ cmovge %rsp,@ptr[0] # cancel input
+ cmp 4*1(%rbx),%ecx
+ cmovge %rsp,@ptr[1]
+ pshufd \$0x00,@MSG0[2],@MSG1[0]
+ sha256rnds2 $ABEF0,$CDGH0 # 60-63
+ movdqa $TMP1,$Wi
+ pshufd \$0x55,@MSG0[2],@MSG1[1]
+ movdqa @MSG0[2],@MSG1[2]
+ sha256rnds2 $ABEF1,$CDGH1 # 60-63
+ pshufd \$0x0e,$TMP0,$Wi
+ pcmpgtd @MSG0[1],@MSG1[0]
+ pcmpgtd @MSG0[1],@MSG1[1]
+ sha256rnds2 $CDGH0,$ABEF0
+ pshufd \$0x0e,$TMP1,$Wi
+ pcmpgtd @MSG0[1],@MSG1[2] # counter mask
+ movdqa K256_shaext-0x10(%rip),$TMPx
+ sha256rnds2 $CDGH1,$ABEF1
+
+ pand @MSG1[0],$CDGH0
+ pand @MSG1[1],$CDGH1
+ pand @MSG1[0],$ABEF0
+ pand @MSG1[1],$ABEF1
+ paddd @MSG0[2],@MSG1[2] # counters--
+
+ paddd 0x50(%rsp),$CDGH0
+ paddd 0x70(%rsp),$CDGH1
+ paddd 0x40(%rsp),$ABEF0
+ paddd 0x60(%rsp),$ABEF1
+
+ movq @MSG1[2],(%rbx) # save counters
+ dec $num
+ jnz .Loop_shaext
+
+ mov `$REG_SZ*17+8`(%rsp),$num
+
+ pshufd \$0b00011011,$ABEF0,$ABEF0
+ pshufd \$0b00011011,$CDGH0,$CDGH0
+ pshufd \$0b00011011,$ABEF1,$ABEF1
+ pshufd \$0b00011011,$CDGH1,$CDGH1
+
+ movdqa $ABEF0,@MSG0[0]
+ movdqa $CDGH0,@MSG0[1]
+ punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
+ punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
+ punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
+ punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
+
+ movq $ABEF0,0x00-0x80($ctx) # A1.A0
+ psrldq \$8,$ABEF0
+ movq @MSG0[0],0x80-0x80($ctx) # E1.E0
+ psrldq \$8,@MSG0[0]
+ movq $ABEF0,0x20-0x80($ctx) # B1.B0
+ movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
+
+ movq $CDGH0,0x40-0x80($ctx) # C1.C0
+ psrldq \$8,$CDGH0
+ movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
+ psrldq \$8,@MSG0[1]
+ movq $CDGH0,0x60-0x80($ctx) # D1.D0
+ movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
+
+ lea `$REG_SZ/2`($ctx),$ctx
+ lea `16*2`($inp),$inp
+ dec $num
+ jnz .Loop_grande_shaext
+
+.Ldone_shaext:
+ #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+___
+$code.=<<___ if ($win64);
+ movaps -0xb8(%rax),%xmm6
+ movaps -0xa8(%rax),%xmm7
+ movaps -0x98(%rax),%xmm8
+ movaps -0x88(%rax),%xmm9
+ movaps -0x78(%rax),%xmm10
+ movaps -0x68(%rax),%xmm11
+ movaps -0x58(%rax),%xmm12
+ movaps -0x48(%rax),%xmm13
+ movaps -0x38(%rax),%xmm14
+ movaps -0x28(%rax),%xmm15
+___
+$code.=<<___;
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue_shaext:
+ ret
+.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
+___
+ }}}
+ if ($avx) {{{
+sub ROUND_00_15_avx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+
+$code.=<<___ if ($i<15 && $REG_SZ==16);
+ vmovd `4*$i`(@ptr[0]),$Xi
+ vmovd `4*$i`(@ptr[1]),$t1
+ vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
+ vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
+ vpunpckldq $t1,$Xi,$Xi
+ vpshufb $Xn,$Xi,$Xi
+___
+$code.=<<___ if ($i==15 && $REG_SZ==16);
+ vmovd `4*$i`(@ptr[0]),$Xi
+ lea `16*4`(@ptr[0]),@ptr[0]
+ vmovd `4*$i`(@ptr[1]),$t1
+ lea `16*4`(@ptr[1]),@ptr[1]
+ vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
+ lea `16*4`(@ptr[2]),@ptr[2]
+ vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
+ lea `16*4`(@ptr[3]),@ptr[3]
+ vpunpckldq $t1,$Xi,$Xi
+ vpshufb $Xn,$Xi,$Xi
+___
+$code.=<<___ if ($i<15 && $REG_SZ==32);
+ vmovd `4*$i`(@ptr[0]),$Xi
+ vmovd `4*$i`(@ptr[4]),$t1
+ vmovd `4*$i`(@ptr[1]),$t2
+ vmovd `4*$i`(@ptr[5]),$t3
+ vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
+ vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
+ vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
+ vpunpckldq $t2,$Xi,$Xi
+ vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
+ vpunpckldq $t3,$t1,$t1
+ vinserti128 $t1,$Xi,$Xi
+ vpshufb $Xn,$Xi,$Xi
+___
+$code.=<<___ if ($i==15 && $REG_SZ==32);
+ vmovd `4*$i`(@ptr[0]),$Xi
+ lea `16*4`(@ptr[0]),@ptr[0]
+ vmovd `4*$i`(@ptr[4]),$t1
+ lea `16*4`(@ptr[4]),@ptr[4]
+ vmovd `4*$i`(@ptr[1]),$t2
+ lea `16*4`(@ptr[1]),@ptr[1]
+ vmovd `4*$i`(@ptr[5]),$t3
+ lea `16*4`(@ptr[5]),@ptr[5]
+ vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
+ lea `16*4`(@ptr[2]),@ptr[2]
+ vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
+ lea `16*4`(@ptr[6]),@ptr[6]
+ vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
+ lea `16*4`(@ptr[3]),@ptr[3]
+ vpunpckldq $t2,$Xi,$Xi
+ vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
+ lea `16*4`(@ptr[7]),@ptr[7]
+ vpunpckldq $t3,$t1,$t1
+ vinserti128 $t1,$Xi,$Xi
+ vpshufb $Xn,$Xi,$Xi
+___
+$code.=<<___;
+ vpsrld \$6,$e,$sigma
+ vpslld \$26,$e,$t3
+ vmovdqu $Xi,`&Xi_off($i)`
+ vpaddd $h,$Xi,$Xi # Xi+=h
+
+ vpsrld \$11,$e,$t2
+ vpxor $t3,$sigma,$sigma
+ vpslld \$21,$e,$t3
+ vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
+ vpxor $t2,$sigma,$sigma
+
+ vpsrld \$25,$e,$t2
+ vpxor $t3,$sigma,$sigma
+ `"prefetcht0 63(@ptr[0])" if ($i==15)`
+ vpslld \$7,$e,$t3
+ vpandn $g,$e,$t1
+ vpand $f,$e,$axb # borrow $axb
+ `"prefetcht0 63(@ptr[1])" if ($i==15)`
+ vpxor $t2,$sigma,$sigma
+
+ vpsrld \$2,$a,$h # borrow $h
+ vpxor $t3,$sigma,$sigma # Sigma1(e)
+ `"prefetcht0 63(@ptr[2])" if ($i==15)`
+ vpslld \$30,$a,$t2
+ vpxor $axb,$t1,$t1 # Ch(e,f,g)
+ vpxor $a,$b,$axb # a^b, b^c in next round
+ `"prefetcht0 63(@ptr[3])" if ($i==15)`
+ vpxor $t2,$h,$h
+ vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
+
+ vpsrld \$13,$a,$t2
+ `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
+ vpslld \$19,$a,$t3
+ vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
+ vpand $axb,$bxc,$bxc
+ `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
+ vpxor $t2,$h,$sigma
+
+ vpsrld \$22,$a,$t2
+ vpxor $t3,$sigma,$sigma
+ `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
+ vpslld \$10,$a,$t3
+ vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
+ vpaddd $Xi,$d,$d # d+=Xi
+ `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
+ vpxor $t2,$sigma,$sigma
+ vpxor $t3,$sigma,$sigma # Sigma0(a)
+
+ vpaddd $Xi,$h,$h # h+=Xi
+ vpaddd $sigma,$h,$h # h+=Sigma0(a)
+___
+$code.=<<___ if (($i%8)==7);
+ add \$`32*8`,$Tbl
+___
+ ($axb,$bxc)=($bxc,$axb);
+}
+
+sub ROUND_16_XX_avx {
+my $i=shift;
+
+$code.=<<___;
+ vmovdqu `&Xi_off($i+1)`,$Xn
+ vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
+
+ vpsrld \$3,$Xn,$sigma
+ vpsrld \$7,$Xn,$t2
+ vpslld \$25,$Xn,$t3
+ vpxor $t2,$sigma,$sigma
+ vpsrld \$18,$Xn,$t2
+ vpxor $t3,$sigma,$sigma
+ vpslld \$14,$Xn,$t3
+ vmovdqu `&Xi_off($i+14)`,$t1
+ vpsrld \$10,$t1,$axb # borrow $axb
+
+ vpxor $t2,$sigma,$sigma
+ vpsrld \$17,$t1,$t2
+ vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
+ vpslld \$15,$t1,$t3
+	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
+ vpxor $t2,$axb,$sigma
+ vpsrld \$19,$t1,$t2
+ vpxor $t3,$sigma,$sigma
+ vpslld \$13,$t1,$t3
+ vpxor $t2,$sigma,$sigma
+	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
+ vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
+___
+ &ROUND_00_15_avx($i,@_);
+ ($Xi,$Xn)=($Xn,$Xi);
+}
+
+$code.=<<___;
+.type sha256_multi_block_avx,\@function,3
+.align 32
+sha256_multi_block_avx:
+_avx_shortcut:
+___
+$code.=<<___ if ($avx>1);
+ shr \$32,%rcx
+ cmp \$2,$num
+ jb .Lavx
+ test \$`1<<5`,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,-0x78(%rax)
+ movaps %xmm11,-0x68(%rax)
+ movaps %xmm12,-0x58(%rax)
+ movaps %xmm13,-0x48(%rax)
+ movaps %xmm14,-0x38(%rax)
+ movaps %xmm15,-0x28(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`, %rsp
+ and \$-256,%rsp
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody_avx:
+ lea K256+128(%rip),$Tbl
+ lea `$REG_SZ*16`(%rsp),%rbx
+ lea 0x80($ctx),$ctx # size optimization
+
+.Loop_grande_avx:
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldone_avx
+
+ vmovdqu 0x00-0x80($ctx),$A # load context
+ lea 128(%rsp),%rax
+ vmovdqu 0x20-0x80($ctx),$B
+ vmovdqu 0x40-0x80($ctx),$C
+ vmovdqu 0x60-0x80($ctx),$D
+ vmovdqu 0x80-0x80($ctx),$E
+ vmovdqu 0xa0-0x80($ctx),$F
+ vmovdqu 0xc0-0x80($ctx),$G
+ vmovdqu 0xe0-0x80($ctx),$H
+ vmovdqu .Lpbswap(%rip),$Xn
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ vpxor $B,$C,$bxc # magic seed
+___
+for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ vmovdqu `&Xi_off($i)`,$Xi
+ mov \$3,%ecx
+ jmp .Loop_16_xx_avx
+.align 32
+.Loop_16_xx_avx:
+___
+for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ dec %ecx
+ jnz .Loop_16_xx_avx
+
+ mov \$1,%ecx
+ lea K256+128(%rip),$Tbl
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ cmp `4*$i`(%rbx),%ecx # examine counters
+ cmovge $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ vmovdqa (%rbx),$sigma # pull counters
+ vpxor $t1,$t1,$t1
+ vmovdqa $sigma,$Xn
+ vpcmpgtd $t1,$Xn,$Xn # mask value
+ vpaddd $Xn,$sigma,$sigma # counters--
+
+ vmovdqu 0x00-0x80($ctx),$t1
+ vpand $Xn,$A,$A
+ vmovdqu 0x20-0x80($ctx),$t2
+ vpand $Xn,$B,$B
+ vmovdqu 0x40-0x80($ctx),$t3
+ vpand $Xn,$C,$C
+ vmovdqu 0x60-0x80($ctx),$Xi
+ vpand $Xn,$D,$D
+ vpaddd $t1,$A,$A
+ vmovdqu 0x80-0x80($ctx),$t1
+ vpand $Xn,$E,$E
+ vpaddd $t2,$B,$B
+ vmovdqu 0xa0-0x80($ctx),$t2
+ vpand $Xn,$F,$F
+ vpaddd $t3,$C,$C
+ vmovdqu 0xc0-0x80($ctx),$t3
+ vpand $Xn,$G,$G
+ vpaddd $Xi,$D,$D
+ vmovdqu 0xe0-0x80($ctx),$Xi
+ vpand $Xn,$H,$H
+ vpaddd $t1,$E,$E
+ vpaddd $t2,$F,$F
+ vmovdqu $A,0x00-0x80($ctx)
+ vpaddd $t3,$G,$G
+ vmovdqu $B,0x20-0x80($ctx)
+ vpaddd $Xi,$H,$H
+ vmovdqu $C,0x40-0x80($ctx)
+ vmovdqu $D,0x60-0x80($ctx)
+ vmovdqu $E,0x80-0x80($ctx)
+ vmovdqu $F,0xa0-0x80($ctx)
+ vmovdqu $G,0xc0-0x80($ctx)
+ vmovdqu $H,0xe0-0x80($ctx)
+
+ vmovdqu $sigma,(%rbx) # save counters
+ vmovdqu .Lpbswap(%rip),$Xn
+ dec $num
+ jnz .Loop_avx
+
+ mov `$REG_SZ*17+8`(%rsp),$num
+ lea $REG_SZ($ctx),$ctx
+ lea `16*$REG_SZ/4`($inp),$inp
+ dec $num
+ jnz .Loop_grande_avx
+
+.Ldone_avx:
+	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xb8(%rax),%xmm6
+ movaps -0xa8(%rax),%xmm7
+ movaps -0x98(%rax),%xmm8
+ movaps -0x88(%rax),%xmm9
+ movaps -0x78(%rax),%xmm10
+ movaps -0x68(%rax),%xmm11
+ movaps -0x58(%rax),%xmm12
+ movaps -0x48(%rax),%xmm13
+ movaps -0x38(%rax),%xmm14
+ movaps -0x28(%rax),%xmm15
+___
+$code.=<<___;
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue_avx:
+ ret
+.size sha256_multi_block_avx,.-sha256_multi_block_avx
+___
+ if ($avx>1) {
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+$REG_SZ=32;
+@ptr=map("%r$_",(12..15,8..11));
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
+($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
+
+$code.=<<___;
+.type sha256_multi_block_avx2,\@function,3
+.align 32
+sha256_multi_block_avx2:
+_avx2_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ sub \$`$REG_SZ*18`, %rsp
+ and \$-256,%rsp
+ mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.Lbody_avx2:
+ lea K256+128(%rip),$Tbl
+ lea 0x80($ctx),$ctx # size optimization
+
+.Loop_grande_avx2:
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
+ xor $num,$num
+ lea `$REG_SZ*16`(%rsp),%rbx
+___
+for($i=0;$i<8;$i++) {
+ $code.=<<___;
+ mov `16*$i+0`($inp),@ptr[$i] # input pointer
+ mov `16*$i+8`($inp),%ecx # number of blocks
+ cmp $num,%ecx
+ cmovg %ecx,$num # find maximum
+ test %ecx,%ecx
+ mov %ecx,`4*$i`(%rbx) # initialize counters
+ cmovle $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ vmovdqu 0x00-0x80($ctx),$A # load context
+ lea 128(%rsp),%rax
+ vmovdqu 0x20-0x80($ctx),$B
+ lea 256+128(%rsp),%rbx
+ vmovdqu 0x40-0x80($ctx),$C
+ vmovdqu 0x60-0x80($ctx),$D
+ vmovdqu 0x80-0x80($ctx),$E
+ vmovdqu 0xa0-0x80($ctx),$F
+ vmovdqu 0xc0-0x80($ctx),$G
+ vmovdqu 0xe0-0x80($ctx),$H
+ vmovdqu .Lpbswap(%rip),$Xn
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+ vpxor $B,$C,$bxc # magic seed
+___
+for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ vmovdqu `&Xi_off($i)`,$Xi
+ mov \$3,%ecx
+ jmp .Loop_16_xx_avx2
+.align 32
+.Loop_16_xx_avx2:
+___
+for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ dec %ecx
+ jnz .Loop_16_xx_avx2
+
+ mov \$1,%ecx
+ lea `$REG_SZ*16`(%rsp),%rbx
+ lea K256+128(%rip),$Tbl
+___
+for($i=0;$i<8;$i++) {
+ $code.=<<___;
+ cmp `4*$i`(%rbx),%ecx # examine counters
+ cmovge $Tbl,@ptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ vmovdqa (%rbx),$sigma # pull counters
+ vpxor $t1,$t1,$t1
+ vmovdqa $sigma,$Xn
+ vpcmpgtd $t1,$Xn,$Xn # mask value
+ vpaddd $Xn,$sigma,$sigma # counters--
+
+ vmovdqu 0x00-0x80($ctx),$t1
+ vpand $Xn,$A,$A
+ vmovdqu 0x20-0x80($ctx),$t2
+ vpand $Xn,$B,$B
+ vmovdqu 0x40-0x80($ctx),$t3
+ vpand $Xn,$C,$C
+ vmovdqu 0x60-0x80($ctx),$Xi
+ vpand $Xn,$D,$D
+ vpaddd $t1,$A,$A
+ vmovdqu 0x80-0x80($ctx),$t1
+ vpand $Xn,$E,$E
+ vpaddd $t2,$B,$B
+ vmovdqu 0xa0-0x80($ctx),$t2
+ vpand $Xn,$F,$F
+ vpaddd $t3,$C,$C
+ vmovdqu 0xc0-0x80($ctx),$t3
+ vpand $Xn,$G,$G
+ vpaddd $Xi,$D,$D
+ vmovdqu 0xe0-0x80($ctx),$Xi
+ vpand $Xn,$H,$H
+ vpaddd $t1,$E,$E
+ vpaddd $t2,$F,$F
+ vmovdqu $A,0x00-0x80($ctx)
+ vpaddd $t3,$G,$G
+ vmovdqu $B,0x20-0x80($ctx)
+ vpaddd $Xi,$H,$H
+ vmovdqu $C,0x40-0x80($ctx)
+ vmovdqu $D,0x60-0x80($ctx)
+ vmovdqu $E,0x80-0x80($ctx)
+ vmovdqu $F,0xa0-0x80($ctx)
+ vmovdqu $G,0xc0-0x80($ctx)
+ vmovdqu $H,0xe0-0x80($ctx)
+
+ vmovdqu $sigma,(%rbx) # save counters
+ lea 256+128(%rsp),%rbx
+ vmovdqu .Lpbswap(%rip),$Xn
+ dec $num
+ jnz .Loop_avx2
+
+ #mov `$REG_SZ*17+8`(%rsp),$num
+ #lea $REG_SZ($ctx),$ctx
+ #lea `16*$REG_SZ/4`($inp),$inp
+ #dec $num
+ #jnz .Loop_grande_avx2
+
+.Ldone_avx2:
+	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lepilogue_avx2:
+ ret
+.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
+___
+ } }}}
+$code.=<<___;
+.align 256
+K256:
+___
+sub TABLE {
+ foreach (@_) {
+ $code.=<<___;
+ .long $_,$_,$_,$_
+ .long $_,$_,$_,$_
+___
+ }
+}
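+# Each 32-bit constant is emitted eight times so that one aligned load
+# at `32*($i%8)-128`($Tbl) broadcasts K[round] to all four (xmm) or
+# all eight (ymm, AVX2) lanes at once.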
+&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
+ 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+ 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
+ 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
+ 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
+ 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
+ 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
+ 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
+ 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
+ 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
+ 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
+ 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
+ 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
+ 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
+ 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
+ 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
+$code.=<<___;
+.Lpbswap:
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
+K256_shaext:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
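+# The handlers below rely on HandlerData[] (two RVAs, see the
+# .LSEH_info_* entries at the bottom) bracketing each function body:
+# if context->Rip falls inside [body,epilogue), the original %rsp is
+# recovered from the custom frame and the non-volatile registers are
+# restored from it; otherwise the default prologue/epilogue state is
+# reported to RtlVirtualUnwind.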
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<.Lbody
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov `16*17`(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+
+ lea -24-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+___
+$code.=<<___ if ($avx>1);
+.type avx2_handler,\@abi-omnipotent
+.align 16
+avx2_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+	mov	`32*17`(%rax),%rax		# pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+ lea -56-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lin_prologue
+.size avx2_handler,.-avx2_handler
+___
+$code.=<<___;
+.section .pdata
+.align 4
+ .rva .LSEH_begin_sha256_multi_block
+ .rva .LSEH_end_sha256_multi_block
+ .rva .LSEH_info_sha256_multi_block
+ .rva .LSEH_begin_sha256_multi_block_shaext
+ .rva .LSEH_end_sha256_multi_block_shaext
+ .rva .LSEH_info_sha256_multi_block_shaext
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_sha256_multi_block_avx
+ .rva .LSEH_end_sha256_multi_block_avx
+ .rva .LSEH_info_sha256_multi_block_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_sha256_multi_block_avx2
+ .rva .LSEH_end_sha256_multi_block_avx2
+ .rva .LSEH_info_sha256_multi_block_avx2
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_sha256_multi_block:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbody,.Lepilogue # HandlerData[]
+.LSEH_info_sha256_multi_block_shaext:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha256_multi_block_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha256_multi_block_avx2:
+ .byte 9,0,0,0
+ .rva avx2_handler
+ .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
+___
+}
+####################################################################
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if ($dst>=8);
+ $rex|=0x01 if ($src>=8);
+ unshift @opcode,$rex|0x40 if ($rex);
+}
+
+sub sha256op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha256rnds2" => 0xcb,
+ "sha256msg1" => 0xcc,
+ "sha256msg2" => 0xcd );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
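+# A worked example of the hand-assembly above (no SHA-NI-aware
+# assembler required): "sha256rnds2 %xmm4,%xmm7" matches $1=4, $2=7,
+# needs no REX byte (both registers are below 8), and produces
+# ModR/M = 0xc0|4|(7<<3) = 0xfc, i.e. ".byte 0x0f,0x38,0xcb,0xfc".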
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
+
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl
index 7eab6a5b88b2..e96ec00314a4 100755
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -9,17 +9,31 @@
#
# SHA512 block transform for x86. September 2007.
#
+# May 2013.
+#
+# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
+#
# Performance in clock cycles per processed byte (less is better):
#
-# Pentium PIII P4 AMD K8 Core2
-# gcc 100 75 116 54 66
-# icc 97 77 95 55 57
-# x86 asm 61 56 82 36 40
-# SSE2 asm - - 38 24 20
-# x86_64 asm(*) - - 30 10.0 10.5
+# gcc icc x86 asm SIMD(*) x86_64(**)
+# Pentium 100 97 61 - -
+# PIII 75 77 56 - -
+# P4 116 95 82 34.6 30.8
+# AMD K8 54 55 36 20.7 9.57
+# Core2 66 57 40 15.9 9.97
+# Westmere 70 - 38 12.2 9.58
+# Sandy Bridge 58 - 35 11.9 11.2
+# Ivy Bridge 50 - 33 11.5 8.17
+# Haswell 46 - 29 11.3 7.66
+# Bulldozer 121 - 50 14.0 13.5
+# VIA Nano 91 - 52 33 14.7
+# Atom 126 - 68 48(***) 14.7
+# Silvermont 97 - 58 42(***) 17.5
#
-# (*) x86_64 assembler performance is presented for reference
-# purposes.
+# (*)	whichever is best applicable.
+# (**)	x86_64 assembler performance is presented for reference
+#	purposes, the results are for integer-only code.
+# (***)	paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
@@ -66,72 +80,77 @@ $Hsse2=&QWP(56,"esp");
$A="mm0"; # B-D and
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on an on-demand basis...
+$BxC="mm2"; # ... except for B^C
sub BODY_00_15_sse2 {
- my $prefetch=shift;
+ my $phase=shift;
- &movq ("mm5",$Fsse2); # load f
- &movq ("mm6",$Gsse2); # load g
- &movq ("mm7",$Hsse2); # load h
+ #&movq ("mm5",$Fsse2); # load f
+ #&movq ("mm6",$Gsse2); # load g
&movq ("mm1",$E); # %mm1 is sliding right
- &movq ("mm2",$E); # %mm2 is sliding left
+ &pxor ("mm5","mm6"); # f^=g
&psrlq ("mm1",14);
- &movq ($Esse2,$E); # modulo-scheduled save e
- &psllq ("mm2",23);
+ &movq ($Esse2,$E); # modulo-scheduled save e
+ &pand ("mm5",$E); # f&=e
+ &psllq ($E,23); # $E is sliding left
+ &movq ($A,"mm3") if ($phase<2);
+	&movq	(&QWP(8*9,"esp"),"mm7");	# save X[i]
&movq ("mm3","mm1"); # %mm3 is T1
- &psrlq ("mm1",4);
- &pxor ("mm3","mm2");
- &psllq ("mm2",23);
+ &psrlq ("mm1",4);
+ &pxor ("mm5","mm6"); # Ch(e,f,g)
+ &pxor ("mm3",$E);
+ &psllq ($E,23);
&pxor ("mm3","mm1");
- &psrlq ("mm1",23);
- &pxor ("mm3","mm2");
- &psllq ("mm2",4);
+ &movq ($Asse2,$A); # modulo-scheduled save a
+ &paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g)
+ &pxor ("mm3",$E);
+ &psrlq ("mm1",23);
+ &paddq ("mm7",$Hsse2); # X[i]+=h
&pxor ("mm3","mm1");
- &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
- &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
-
- &pxor ("mm5","mm6"); # f^=g
+ &psllq ($E,4);
+ &paddq ("mm7",QWP(0,$K512)); # X[i]+=K512[i]
+ &pxor ("mm3",$E); # T1=Sigma1_512(e)
+
+ &movq ($E,$Dsse2); # e = load d, e in next round
+ &paddq ("mm3","mm7"); # T1+=X[i]
+ &movq ("mm5",$A); # %mm5 is sliding right
+ &psrlq ("mm5",28);
+ &paddq ($E,"mm3"); # d += T1
+ &movq ("mm6",$A); # %mm6 is sliding left
+ &movq ("mm7","mm5");
+ &psllq ("mm6",25);
&movq ("mm1",$Bsse2); # load b
- &pand ("mm5",$E); # f&=e
- &movq ("mm2",$Csse2); # load c
- &pxor ("mm5","mm6"); # f^=g
- &movq ($E,$Dsse2); # e = load d
- &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
- &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
- &paddq ("mm3","mm7"); # T1+=h
-
- &movq ("mm5",$A); # %mm5 is sliding right
- &movq ("mm6",$A); # %mm6 is sliding left
- &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
- &psrlq ("mm5",28);
- &paddq ($E,"mm3"); # e += T1
- &psllq ("mm6",25);
- &movq ("mm7","mm5"); # %mm7 is T2
- &psrlq ("mm5",6);
- &pxor ("mm7","mm6");
- &psllq ("mm6",5);
- &pxor ("mm7","mm5");
- &psrlq ("mm5",5);
- &pxor ("mm7","mm6");
- &psllq ("mm6",6);
- &pxor ("mm7","mm5");
+ &psrlq ("mm5",6);
+ &pxor ("mm7","mm6");
&sub ("esp",8);
- &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
-
- &movq ("mm5",$A); # %mm5=a
- &por ($A,"mm2"); # a=a|c
- &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
- &pand ("mm5","mm2"); # %mm5=a&c
- &pand ($A,"mm1"); # a=(a|c)&b
- &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
- &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
- &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
- &movq ($A,"mm3"); # a=T1
-
- &mov (&LB("edx"),&BP(0,$K512));
- &paddq ($A,"mm7"); # a+=T2
- &add ($K512,8);
+ &psllq ("mm6",5);
+ &pxor ("mm7","mm5");
+ &pxor ($A,"mm1"); # a^b, b^c in next round
+ &psrlq ("mm5",5);
+ &pxor ("mm7","mm6");
+ &pand ($BxC,$A); # (b^c)&(a^b)
+ &psllq ("mm6",6);
+ &pxor ("mm7","mm5");
+ &pxor ($BxC,"mm1"); # [h=]Maj(a,b,c)
+ &pxor ("mm6","mm7"); # Sigma0_512(a)
+ &movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch
+ &movq ("mm5",$Fsse2) if ($phase==0); # load f
+
+ if ($phase>1) {
+ &paddq ($BxC,"mm6"); # h+=Sigma0(a)
+ &add ($K512,8);
+ #&paddq ($BxC,"mm3"); # h+=T1
+
+ ($A,$BxC) = ($BxC,$A); # rotate registers
+ } else {
+ &paddq ("mm3",$BxC); # T1+=Maj(a,b,c)
+ &movq ($BxC,$A);
+ &add ($K512,8);
+ &paddq ("mm3","mm6"); # T1+=Sigma0(a)
+ &movq ("mm6",$Gsse2) if ($phase==0); # load g
+ #&movq ($A,"mm3"); # h=T1
+ }
}
sub BODY_00_15_x86 {
@@ -284,110 +303,357 @@ sub BODY_00_15_x86 {
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
- &bt (&DWP(0,"edx"),26);
- &jnc (&label("loop_x86"));
+ &mov ("ecx",&DWP(0,"edx"));
+ &test ("ecx",1<<26);
+ &jz (&label("loop_x86"));
+
+ &mov ("edx",&DWP(4,"edx"));
# load ctx->h[0-7]
&movq ($A,&QWP(0,"esi"));
+ &and ("ecx",1<<24); # XMM registers availability
&movq ("mm1",&QWP(8,"esi"));
- &movq ("mm2",&QWP(16,"esi"));
+ &and ("edx",1<<9); # SSSE3 bit
+ &movq ($BxC,&QWP(16,"esi"));
+ &or ("ecx","edx");
&movq ("mm3",&QWP(24,"esi"));
&movq ($E,&QWP(32,"esi"));
&movq ("mm5",&QWP(40,"esi"));
&movq ("mm6",&QWP(48,"esi"));
&movq ("mm7",&QWP(56,"esi"));
+ &cmp ("ecx",1<<24|1<<9);
+ &je (&label("SSSE3"));
&sub ("esp",8*10);
+ &jmp (&label("loop_sse2"));
&set_label("loop_sse2",16);
- # &movq ($Asse2,$A);
+ #&movq ($Asse2,$A);
&movq ($Bsse2,"mm1");
- &movq ($Csse2,"mm2");
+ &movq ($Csse2,$BxC);
&movq ($Dsse2,"mm3");
- # &movq ($Esse2,$E);
+ #&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&movq ($Gsse2,"mm6");
+ &pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
+ &movq ("mm3",$A); # magic
- &mov ("ecx",&DWP(0,"edi"));
- &mov ("edx",&DWP(4,"edi"));
+ &mov ("eax",&DWP(0,"edi"));
+ &mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
- &bswap ("ecx");
- &bswap ("edx");
- &mov (&DWP(8*9+4,"esp"),"ecx");
- &mov (&DWP(8*9+0,"esp"),"edx");
+ &mov ("edx",15); # counter
+ &bswap ("eax");
+ &bswap ("ebx");
+ &jmp (&label("00_14_sse2"));
&set_label("00_14_sse2",16);
+ &movd ("mm1","eax");
&mov ("eax",&DWP(0,"edi"));
+ &movd ("mm7","ebx");
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("eax");
&bswap ("ebx");
- &mov (&DWP(8*8+4,"esp"),"eax");
- &mov (&DWP(8*8+0,"esp"),"ebx");
+ &punpckldq("mm7","mm1");
&BODY_00_15_sse2();
- &cmp (&LB("edx"),0x35);
- &jne (&label("00_14_sse2"));
+ &dec ("edx");
+ &jnz (&label("00_14_sse2"));
+
+ &movd ("mm1","eax");
+ &movd ("mm7","ebx");
+ &punpckldq("mm7","mm1");
&BODY_00_15_sse2(1);
+ &pxor ($A,$A); # A is in %mm3
+ &mov ("edx",32); # counter
+ &jmp (&label("16_79_sse2"));
+
&set_label("16_79_sse2",16);
- #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
- #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
- &movq ("mm1","mm2");
+ for ($j=0;$j<2;$j++) { # 2x unroll
+ #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
+ &movq ("mm5",&QWP(8*(9+16-14),"esp"));
+ &movq ("mm1","mm7");
+ &psrlq ("mm7",1);
+ &movq ("mm6","mm5");
+ &psrlq ("mm5",6);
+ &psllq ("mm1",56);
+ &paddq ($A,"mm3"); # from BODY_00_15
+ &movq ("mm3","mm7");
+ &psrlq ("mm7",7-1);
+ &pxor ("mm3","mm1");
+ &psllq ("mm1",63-56);
+ &pxor ("mm3","mm7");
+ &psrlq ("mm7",8-7);
+ &pxor ("mm3","mm1");
+ &movq ("mm1","mm5");
+ &psrlq ("mm5",19-6);
+ &pxor ("mm7","mm3"); # sigma0
+
+ &psllq ("mm6",3);
+ &pxor ("mm1","mm5");
+ &paddq ("mm7",&QWP(8*(9+16),"esp"));
+ &pxor ("mm1","mm6");
+ &psrlq ("mm5",61-19);
+ &paddq ("mm7",&QWP(8*(9+16-9),"esp"));
+ &pxor ("mm1","mm5");
+ &psllq ("mm6",45-3);
+ &movq ("mm5",$Fsse2); # load f
+ &pxor ("mm1","mm6"); # sigma1
+ &movq ("mm6",$Gsse2); # load g
- &psrlq ("mm2",1);
- &movq ("mm7","mm6");
- &psrlq ("mm6",6);
- &movq ("mm3","mm2");
+ &paddq ("mm7","mm1"); # X[i]
+ #&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15
- &psrlq ("mm2",7-1);
- &movq ("mm5","mm6");
- &psrlq ("mm6",19-6);
- &pxor ("mm3","mm2");
+ &BODY_00_15_sse2(2);
+ }
+ &dec ("edx");
+ &jnz (&label("16_79_sse2"));
- &psrlq ("mm2",8-7);
- &pxor ("mm5","mm6");
- &psrlq ("mm6",61-19);
- &pxor ("mm3","mm2");
+ #&movq ($A,$Asse2);
+ &paddq ($A,"mm3"); # from BODY_00_15
+ &movq ("mm1",$Bsse2);
+ #&movq ($BxC,$Csse2);
+ &movq ("mm3",$Dsse2);
+ #&movq ($E,$Esse2);
+ &movq ("mm5",$Fsse2);
+ &movq ("mm6",$Gsse2);
+ &movq ("mm7",$Hsse2);
- &movq ("mm2",&QWP(8*(9+16),"esp"));
+ &pxor ($BxC,"mm1"); # de-magic
+ &paddq ($A,&QWP(0,"esi"));
+ &paddq ("mm1",&QWP(8,"esi"));
+ &paddq ($BxC,&QWP(16,"esi"));
+ &paddq ("mm3",&QWP(24,"esi"));
+ &paddq ($E,&QWP(32,"esi"));
+ &paddq ("mm5",&QWP(40,"esi"));
+ &paddq ("mm6",&QWP(48,"esi"));
+ &paddq ("mm7",&QWP(56,"esi"));
- &psllq ("mm1",56);
- &pxor ("mm5","mm6");
- &psllq ("mm7",3);
- &pxor ("mm3","mm1");
+ &mov ("eax",8*80);
+ &movq (&QWP(0,"esi"),$A);
+ &movq (&QWP(8,"esi"),"mm1");
+ &movq (&QWP(16,"esi"),$BxC);
+ &movq (&QWP(24,"esi"),"mm3");
+ &movq (&QWP(32,"esi"),$E);
+ &movq (&QWP(40,"esi"),"mm5");
+ &movq (&QWP(48,"esi"),"mm6");
+ &movq (&QWP(56,"esi"),"mm7");
- &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
+ &lea ("esp",&DWP(0,"esp","eax")); # destroy frame
+ &sub ($K512,"eax"); # rewind K
- &psllq ("mm1",63-56);
- &pxor ("mm5","mm7");
- &psllq ("mm7",45-3);
- &pxor ("mm3","mm1");
- &pxor ("mm5","mm7");
+ &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
+ &jb (&label("loop_sse2"));
- &paddq ("mm3","mm5");
- &paddq ("mm3","mm2");
- &movq (&QWP(8*9,"esp"),"mm3");
+ &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
+ &emms ();
+&function_end_A();
- &BODY_00_15_sse2(1);
+&set_label("SSSE3",32);
+{ my ($cnt,$frame)=("ecx","edx");
+ my @X=map("xmm$_",(0..7));
+ my $j;
+ my $i=0;
+
+ &lea ($frame,&DWP(-64,"esp"));
+ &sub ("esp",256);
+
+ # fixed stack frame layout
+ #
+ # +0 A B C D E F G H # backing store
+ # +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area
+ # +192 # XMM off-load ring buffer
+ # +256 # saved parameters
+
+ &movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask
+ &movdqu (@X[0],&QWP(0,"edi"));
+ &pshufb (@X[0],@X[1]);
+ for ($j=0;$j<8;$j++) {
+ &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
+ &movdqa (@X[3],&QWP(16*($j%8),$K512));
+ &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
+ &movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input
+ &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
+ &paddq (@X[3],@X[0]);
+ &pshufb (@X[1],@X[2]) if ($j<7);
+ &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i]
+
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ #&jmp (&label("loop_ssse3"));
+ &nop ();
- &cmp (&LB("edx"),0x17);
- &jne (&label("16_79_sse2"));
+&set_label("loop_ssse3",32);
+ &movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1]
+ &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3]
+ &lea ($K512,&DWP(16*8,$K512));
+
+ #&movq ($Asse2,$A); # off-load A-H
+ &movq ($Bsse2,"mm1");
+ &mov ("ebx","edi");
+ &movq ($Csse2,$BxC);
+ &lea ("edi",&DWP(128,"edi")); # advance input
+ &movq ($Dsse2,"mm3");
+ &cmp ("edi","eax");
+ #&movq ($Esse2,$E);
+ &movq ($Fsse2,"mm5");
+ &cmovb ("ebx","edi");
+ &movq ($Gsse2,"mm6");
+ &mov ("ecx",4); # loop counter
+ &pxor ($BxC,"mm1"); # magic
+ &movq ($Hsse2,"mm7");
+ &pxor ("mm3","mm3"); # magic
+
+ &jmp (&label("00_47_ssse3"));
+
+sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2
+ (
+ '&movq ("mm1",$E)', # %mm1 is sliding right
+ '&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
+ '&pxor ("mm5","mm6")', # f^=g
+ '&psrlq ("mm1",14)',
+ '&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e
+ '&pand ("mm5",$E)', # f&=e
+ '&psllq ($E,23)', # $E is sliding left
+ '&paddq ($A,"mm3")', # [h+=Maj(a,b,c)]
+ '&movq ("mm3","mm1")', # %mm3 is T1
+ '&psrlq("mm1",4)',
+ '&pxor ("mm5","mm6")', # Ch(e,f,g)
+ '&pxor ("mm3",$E)',
+ '&psllq($E,23)',
+ '&pxor ("mm3","mm1")',
+ '&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a
+ '&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g)
+ '&pxor ("mm3",$E)',
+ '&psrlq("mm1",23)',
+ '&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h
+ '&pxor ("mm3","mm1")',
+ '&psllq($E,4)',
+ '&pxor ("mm3",$E)', # T1=Sigma1_512(e)
+
+ '&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round
+ '&paddq ("mm3","mm7")', # T1+=X[i]
+ '&movq ("mm5",$A)', # %mm5 is sliding right
+ '&psrlq("mm5",28)',
+ '&paddq ($E,"mm3")', # d += T1
+ '&movq ("mm6",$A)', # %mm6 is sliding left
+ '&movq ("mm7","mm5")',
+ '&psllq("mm6",25)',
+ '&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b
+ '&psrlq("mm5",6)',
+ '&pxor ("mm7","mm6")',
+ '&psllq("mm6",5)',
+ '&pxor ("mm7","mm5")',
+ '&pxor ($A,"mm1")', # a^b, b^c in next round
+ '&psrlq("mm5",5)',
+ '&pxor ("mm7","mm6")',
+ '&pand ($BxC,$A)', # (b^c)&(a^b)
+ '&psllq("mm6",6)',
+ '&pxor ("mm7","mm5")',
+ '&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c)
+ '&pxor ("mm6","mm7")', # Sigma0_512(a)
+ '&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f
+ '&paddq ($BxC,"mm6")', # h+=Sigma0(a)
+ '&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g
+
+ '($A,$BxC) = ($BxC,$A); $i--;'
+ );
+}
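+# (The quoted strings above are eval'ed one at a time, e.g.
+#
+#	my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+#	eval(shift(@insns));	# issue one MMX round instruction
+#
+# so the scalar MMX rounds are interleaved with the XMM message
+# schedule below and the two hide each other's latencies.)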
- # &movq ($A,$Asse2);
+&set_label("00_47_ssse3",32);
+
+ for(;$j<16;$j++) {
+ my ($t0,$t2,$t1)=@X[2..4];
+ my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+
+ &movdqa ($t2,@X[5]);
+ &movdqa (@X[1],$t0); # restore @X[1]
+ &palignr ($t0,@X[0],8); # X[1..2]
+ &movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4]
+ &palignr ($t2,@X[4],8); # X[9..10]
+
+ &movdqa ($t1,$t0);
+ &psrlq ($t0,7);
+ &paddq (@X[0],$t2); # X[0..1] += X[9..10]
+ &movdqa ($t2,$t1);
+ &psrlq ($t1,1);
+ &psllq ($t2,64-8);
+ &pxor ($t0,$t1);
+ &psrlq ($t1,8-1);
+ &pxor ($t0,$t2);
+ &psllq ($t2,8-1);
+ &pxor ($t0,$t1);
+ &movdqa ($t1,@X[7]);
+ &pxor ($t0,$t2); # sigma0(X[1..2])
+ &movdqa ($t2,@X[7]);
+ &psrlq ($t1,6);
+ &paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2])
+
+ &movdqa ($t0,@X[7]);
+ &psrlq ($t2,19);
+ &psllq ($t0,64-61);
+ &pxor ($t1,$t2);
+ &psrlq ($t2,61-19);
+ &pxor ($t1,$t0);
+ &psllq ($t0,61-19);
+ &pxor ($t1,$t2);
+ &movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
+	&pxor	($t1,$t0);			# sigma1(X[14..15])
+ &movdqa ($t0,&QWP(16*($j%8),$K512));
+ eval(shift(@insns));
+	&paddq	(@X[0],$t1);			# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddq ($t0,@X[0]);
+ foreach(@insns) { eval; }
+ &movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]
+
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &lea ($K512,&DWP(16*8,$K512));
+ &dec ("ecx");
+ &jnz (&label("00_47_ssse3"));
+
+ &movdqa (@X[1],&QWP(0,$K512)); # byte swap mask
+ &lea ($K512,&DWP(-80*8,$K512)); # rewind
+ &movdqu (@X[0],&QWP(0,"ebx"));
+ &pshufb (@X[0],@X[1]);
+
+ for ($j=0;$j<8;$j++) { # load next or same block
+ my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+
+ &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
+ &movdqa (@X[3],&QWP(16*($j%8),$K512));
+ &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
+ &movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input
+ &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
+ &paddq (@X[3],@X[0]);
+ &pshufb (@X[1],@X[2]) if ($j<7);
+ foreach(@insns) { eval; }
+ &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]
+
+ push(@X,shift(@X)); # rotate(@X)
+ }
+
+ #&movq ($A,$Asse2); # load A-H
&movq ("mm1",$Bsse2);
- &movq ("mm2",$Csse2);
+ &paddq ($A,"mm3"); # from BODY_00_15
+ #&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
- # &movq ($E,$Esse2);
- &movq ("mm5",$Fsse2);
- &movq ("mm6",$Gsse2);
+ #&movq ($E,$Esse2);
+ #&movq ("mm5",$Fsse2);
+ #&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
+ &pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
- &paddq ("mm2",&QWP(16,"esi"));
+ &paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
@@ -396,21 +662,19 @@ if ($sse2) {
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
- &movq (&QWP(16,"esi"),"mm2");
+ &movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
- &add ("esp",8*80); # destroy frame
- &sub ($K512,8*80); # rewind K
-
- &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
- &jb (&label("loop_sse2"));
+	&cmp	("edi","eax");			# are we done yet?
+ &jb (&label("loop_ssse3"));
+ &mov ("esp",&DWP(64+12,$frame)); # restore sp
&emms ();
- &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
+}
&function_end_A();
}
&set_label("loop_x86",16);
@@ -638,6 +902,9 @@ if ($sse2) {
&data_word(0xfc657e2a,0x597f299c); # u64
&data_word(0x3ad6faec,0x5fcb6fab); # u64
&data_word(0x4a475817,0x6c44198c); # u64
+
+ &data_word(0x04050607,0x00010203); # byte swap
+ &data_word(0x0c0d0e0f,0x08090a0b); # mask
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7faf37b14790..fb7dc506aca1 100755
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -26,7 +26,24 @@
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
-# one byte in 25.5 cycles or 47% faster than integer-only code.
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is a disappointing result.
+# Technical writers asserted that the 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed, and for NEON-only sequences IPC(*) was found to
+# be limited to 1:-( 0.33 and 0.66 were measured for sequences with
+# ILPs(*) of 1 and 2 respectively. This in turn means that you can
+# even find yourself striving, as I did here, to achieve IPC
+# comparable to that delivered by Cortex A8 [for reference, it's
+# 0.5 for ILP of 1, and 1 for higher ILPs].
+#
+# (*) ILP, instruction-level parallelism, how many instructions
+# *can* execute at the same time. IPC, instructions per cycle,
+# indicates how many instructions actually execute.
# Byte order [in]dependence. =========================================
#
@@ -220,16 +237,20 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
+#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
+#else
+.skip 32
+#endif
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
@@ -457,40 +478,40 @@ $code.=<<___ if ($i<16 || $i&1);
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
+#if $i>0
+ vadd.i64 $a,$Maj @ h+=Maj from the past
+#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
+ vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
- vadd.i64 $T1,$K,$h
- veor $Ch,$f,$g
- veor $t0,$t1
- vand $Ch,$e
- veor $t0,$t2 @ Sigma1(e)
- veor $Ch,$g @ Ch(e,f,g)
- vadd.i64 $T1,$t0
+ veor $t1,$t0
+ vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
- vadd.i64 $T1,$Ch
+ veor $t2,$t1 @ Sigma1(e)
+ vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
- vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
+ vadd.i64 $T1,$t2
+ vshr.u64 $t2,$a,#@Sigma0[2]
+ vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
+ veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
- vadd.i64 $T1,@X[$i%16]
- vorr $Maj,$a,$c
- vand $Ch,$a,$c
veor $h,$t0,$t1
- vand $Maj,$b
+ vadd.i64 $T1,$K
+ vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
- vorr $Maj,$Ch @ Maj(a,b,c)
- vadd.i64 $h,$T1
vadd.i64 $d,$T1
- vadd.i64 $h,$Maj
+ vadd.i64 $Maj,$T1
+ @ vadd.i64 $h,$Maj
___
}
@@ -508,6 +529,7 @@ $i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
+ vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
@@ -533,7 +555,8 @@ ___
}
$code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
.fpu neon
.align 4
@@ -554,6 +577,7 @@ for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
+ vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
@@ -565,7 +589,7 @@ $code.=<<___;
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
- bx lr
+ ret @ bx lr
#endif
___
}
@@ -573,10 +597,13 @@ $code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
+#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
new file mode 100755
index 000000000000..f7b36b986a61
--- /dev/null
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -0,0 +1,422 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+# SHA256-hw SHA256(*) SHA512
+# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+# Denver 2.01 10.5 (+26%) 6.70 (+8%)
+# X-Gene 20.0 (+100%) 12.8 (+300%(***))
+#
+# (*) Software SHA256 results are of lesser relevance, presented
+# mostly for informational purposes.
+# (**) The result is a trade-off: it's possible to improve it by
+# 10% (or by 1 cycle per round), but at the cost of 20% loss
+# on Cortex-A53 (or by 4 cycles per round).
+# (***) Super-impressive coefficients over gcc-generated code are
+#	an indication of some compiler "pathology"; most notably, code
+#	generated with -mgeneral-regs-only is significantly faster
+# and the gap is only 40-90%.
+
+$flavour=shift;
+$output=shift;
+open STDOUT,">$output";
+
+if ($output =~ /512/) {
+ $BITS=512;
+ $SZ=8;
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=(1, 8, 7);
+ @sigma1=(19,61, 6);
+ $rounds=80;
+ $reg_t="x";
+} else {
+ $BITS=256;
+ $SZ=4;
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 7,18, 3);
+ @sigma1=(17,19,10);
+ $rounds=64;
+ $reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+ $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___ if ($i<16);
+#ifndef __ARMEB__
+ rev @X[$i],@X[$i] // $i
+#endif
+___
+$code.=<<___ if ($i<13 && ($i&1));
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___ if ($i==13);
+ ldp @X[14],@X[15],[$inp]
+___
+$code.=<<___ if ($i>=14);
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___ if ($i>0 && $i<16);
+ add $a,$a,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=11);
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-n-logical operations such as
+# 'eor x,y,z,ror#n', they were found to negatively affect performance
+# on Apple A7. The reason seems to be that they require even 'y' to
+# be available earlier. This means that such a merged instruction is
+# not necessarily the best choice on the critical path... On the other
+# hand Cortex-A5x handles merged instructions much better than disjoint
+# rotate and logical... See the (**) footnote above.
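[Editor's note] For readers unfamiliar with the trade-off described above,
the two equivalent formulations are (illustrative only):

#	merged: one instruction, but 'x1' must be ready as early as 'x2'
#		eor	x0,x1,x2,ror#14
#	disjoint: one instruction longer, but 'x1' joins the chain later
#		ror	x3,x2,#14
#		eor	x0,x1,x3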
+$code.=<<___ if ($i<15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+ and $t1,$f,$e
+ bic $t2,$g,$e
+ add $h,$h,@X[$i&15] // h+=X[i]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+ add $h,$h,$t0 // h+=Sigma1(e)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ add $d,$d,$h // d+=h
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ //add $h,$h,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
+ and $t1,$f,$e
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
+ bic $t2,$g,$e
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,@X[$i&15] // h+=X[i]
+ eor $t0,$t0,$e,ror#$Sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
+ eor $T0,$T0,$a,ror#$Sigma0[1]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
+ add $h,$h,$t0 // h+=Sigma1(e)
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
+ add @X[$j],@X[$j],@X[($j+9)&15]
+ add $d,$d,$h // d+=h
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ add @X[$j],@X[$j],$T1
+ add $h,$h,$t1 // h+=Sigma0(a)
+ add @X[$j],@X[$j],$T2
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.globl $func
+.type $func,%function
+.align 6
+$func:
+___
+$code.=<<___ if ($SZ==4);
+ ldr x16,.LOPENSSL_armcap_P
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA256
+ b.ne .Lv8_entry
+___
+$code.=<<___;
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*$SZ
+
+ ldp $A,$B,[$ctx] // load context
+ ldp $C,$D,[$ctx,#2*$SZ]
+ ldp $E,$F,[$ctx,#4*$SZ]
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
+ ldp $G,$H,[$ctx,#6*$SZ]
+ adr $Ktbl,K$BITS
+ stp $ctx,$num,[x29,#96]
+
+.Loop:
+ ldp @X[0],@X[1],[$inp],#2*$SZ
+ ldr $t2,[$Ktbl],#$SZ // *K++
+ eor $t3,$B,$C // magic seed
+ str $inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ cbnz $t2,.Loop_16_xx
+
+ ldp $ctx,$num,[x29,#96]
+ ldr $inp,[x29,#112]
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
+
+ ldp @X[0],@X[1],[$ctx]
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
+ add $inp,$inp,#14*$SZ // advance input pointer
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
+ add $A,$A,@X[0]
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
+ add $B,$B,@X[1]
+ add $C,$C,@X[2]
+ add $D,$D,@X[3]
+ stp $A,$B,[$ctx]
+ add $E,$E,@X[4]
+ add $F,$F,@X[5]
+ stp $C,$D,[$ctx,#2*$SZ]
+ add $G,$G,@X[6]
+ add $H,$H,@X[7]
+ cmp $inp,$num
+ stp $E,$F,[$ctx,#4*$SZ]
+ stp $G,$H,[$ctx,#6*$SZ]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*$SZ
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size $func,.-$func
+
+.align 6
+.type K$BITS,%object
+K$BITS:
+___
+$code.=<<___ if ($SZ==8);
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ .quad 0 // terminator
+___
+$code.=<<___ if ($SZ==4);
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0	// terminator
+___
+$code.=<<___;
+.size K$BITS,.-K$BITS
+.align 3
+.LOPENSSL_armcap_P:
+ .quad OPENSSL_armcap_P-.
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+.type sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1.32 {$ABCD,$EFGH},[$ctx]
+ adr $Ktbl,K256
+
+.Loop_hw:
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
+ sub $num,$num,#1
+ ld1.32 {$W0},[$Ktbl],#16
+ rev32 @MSG[0],@MSG[0]
+ rev32 @MSG[1],@MSG[1]
+ rev32 @MSG[2],@MSG[2]
+ rev32 @MSG[3],@MSG[3]
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
+ orr $EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ ld1.32 {$W0},[$Ktbl],#16
+ add.i32 $W1,$W1,@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ ld1.32 {$W1},[$Ktbl]
+ add.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ add.i32 $W1,$W1,@MSG[3]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
+
+ cbnz $num,.Loop_hw
+
+ st1.32 {$ABCD,$EFGH},[$ctx]
+
+ ldr x29,[sp],#16
+ ret
+.size sha256_block_armv8,.-sha256_block_armv8
+___
+}
+
+$code.=<<___;
+.comm OPENSSL_armcap_P,4,4
+___
+
+{ my %opcode = (
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
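[Editor's note] The helper above packs register numbers into fixed opcode
templates (Rd in bits 0-4, Rn in bits 5-9, Rm in bits 16-20) and emits a
raw .inst word, so the module assembles even with binutils that predate
the ARMv8 Crypto Extensions. A minimal standalone check (not patch code):

#!/usr/bin/env perl
my %opcode = ( "sha256h" => 0x5e004000 );   # template taken from the patch
my ($rd,$rn,$rm) = (0,1,2);                 # sha256h v0,v1,v2
printf ".inst\t0x%08x\t//sha256h v0,v1,v2\n",
       $opcode{"sha256h"}|$rd|($rn<<5)|($rm<<16);   # prints 0x5e024020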
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
+
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-ia64.pl b/crypto/sha/asm/sha512-ia64.pl
index 1c6ce56522ed..59f889a09594 100755
--- a/crypto/sha/asm/sha512-ia64.pl
+++ b/crypto/sha/asm/sha512-ia64.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -48,16 +48,22 @@
# because on Itanium 1 stall on MM result is accompanied by
# pipeline flush, which takes 6 cycles:-(
#
-# Resulting performance numbers for 900MHz Itanium 2 system:
+# June 2012
#
-# The 'numbers' are in 1000s of bytes per second processed.
-# type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
-# sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k
-# sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k
-# sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k
+# Improve performance by 15-20%. Note about "rules of engagement"
+# above. Contemporary cores are equipped with additional shifter,
+# so that they should perform even better than below, presumably
+# by ~10%.
#
-# (*) SHA1 numbers are for HP-UX compiler and are presented purely
-# for reference purposes. I bet it can improved too...
+######################################################################
+# Current performance in cycles per processed byte for Itanium 2
+# pre-9000 series [little-endian] system:
+#
+# SHA1(*) 5.7
+# SHA256 12.6
+# SHA512 6.7
+#
+# (*) SHA1 result is presented purely for reference purposes.
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.
@@ -106,8 +112,8 @@ if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
$code=<<___;
-.ident \"$output, version 1.1\"
-.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
+.ident \"$output, version 2.0\"
+.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@openssl.org>\"
.explicit
.text
@@ -115,26 +121,25 @@ pfssave=r2;
lcsave=r3;
prsave=r14;
K=r15;
-A=r16; B=r17; C=r18; D=r19;
-E=r20; F=r21; G=r22; H=r23;
+A_=r16; B_=r17; C_=r18; D_=r19;
+E_=r20; F_=r21; G_=r22; H_=r23;
T1=r24; T2=r25;
s0=r26; s1=r27; t0=r28; t1=r29;
Ktbl=r30;
ctx=r31; // 1st arg
-input=r48; // 2nd arg
-num=r49; // 3rd arg
-sgm0=r50; sgm1=r51; // small constants
-A_=r54; B_=r55; C_=r56; D_=r57;
-E_=r58; F_=r59; G_=r60; H_=r61;
+input=r56; // 2nd arg
+num=r57; // 3rd arg
+sgm0=r58; sgm1=r59; // small constants
// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
.global $func#
.proc $func#
.align 32
+.skip 16
$func:
.prologue
.save ar.pfs,pfssave
-{ .mmi; alloc pfssave=ar.pfs,3,27,0,16
+{ .mmi; alloc pfssave=ar.pfs,3,25,0,24
$ADDP ctx=0,r32 // 1st arg
.save ar.lc,lcsave
mov lcsave=ar.lc }
@@ -145,11 +150,9 @@ $func:
.body
{ .mib; add r8=0*$SZ,ctx
- add r9=1*$SZ,ctx
- brp.loop.imp .L_first16,.L_first16_end-16 }
+ add r9=1*$SZ,ctx }
{ .mib; add r10=2*$SZ,ctx
- add r11=3*$SZ,ctx
- brp.loop.imp .L_rest,.L_rest_end-16 };;
+ add r11=3*$SZ,ctx };;
// load A-H
.Lpic_point:
@@ -164,7 +167,7 @@ $func:
add Ktbl=($TABLE#-.Lpic_point),Ktbl }
{ .mmi; $LDW G_=[r10]
$LDW H_=[r11]
- cmp.ne p0,p16=0,r0 };; // used in sha256_block
+ cmp.ne p0,p16=0,r0 };;
___
$code.=<<___ if ($BITS==64);
{ .mii; and r8=7,input
@@ -179,50 +182,26 @@ $code.=<<___ if ($BITS==64);
___
$code.=<<___;
.L_outer:
-.rotr X[16]
-{ .mmi; mov A=A_
- mov B=B_
+.rotr R[8],X[16]
+A=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7]
+{ .mmi; ld1 X[15]=[input],$SZ // eliminated in sha512
+ mov A=A_
mov ar.lc=14 }
-{ .mmi; mov C=C_
- mov D=D_
- mov E=E_ }
-{ .mmi; mov F=F_
- mov G=G_
- mov ar.ec=2 }
-{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit
+{ .mmi; mov B=B_
+ mov C=C_
+ mov D=D_ }
+{ .mmi; mov E=E_
+ mov F=F_
+ mov ar.ec=2 };;
+{ .mmi; mov G=G_
mov H=H_
- mov sgm1=$sigma1[2] };;
-
-___
-$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
-.align 32
-.L_first16:
-{ .mmi; add r9=1-$SZ,input
- add r10=2-$SZ,input
- add r11=3-$SZ,input };;
-{ .mmi; ld1 r9=[r9]
- ld1 r10=[r10]
- dep.z $t1=E,32,32 }
-{ .mmi; $LDW K=[Ktbl],$SZ
- ld1 r11=[r11]
- zxt4 E=E };;
-{ .mii; or $t1=$t1,E
- dep X[15]=X[15],r9,8,8
- dep r11=r10,r11,8,8 };;
-{ .mmi; and T1=F,E
- and T2=A,B
- dep X[15]=X[15],r11,16,16 }
-{ .mmi; andcm r8=G,E
- and r9=A,C
- mux2 $t0=A,0x44 };; // copy lower half to upper
-{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
- xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
- _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
-{ .mib; and r10=B,C
- xor T2=T2,r9 };;
+ mov sgm1=$sigma1[2] }
+{ .mib; mov r8=0
+ add r9=1-$SZ,input
+ brp.loop.imp .L_first16,.L_first16_end-16 };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
-// in 64-bit mode I load whole X[16] at once and take care of alignment...
+// in the sha512 case I load the whole X[16] at once and take care of alignment...
{ .mmi; add r8=1*$SZ,input
add r9=2*$SZ,input
add r10=3*$SZ,input };;
@@ -248,7 +227,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
$LDW X[ 2]=[r8],4*$SZ
(p15) br.cond.dpnt.many .L7byte };;
{ .mmb; $LDW X[ 1]=[r9],4*$SZ
- $LDW X[ 0]=[r10],4*$SZ
+ $LDW X[ 0]=[r10],4*$SZ }
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L1byte:
{ .mmi; $LDW X[13]=[r9],4*$SZ
@@ -281,7 +262,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 3]=X[ 3],X[ 2],56 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
shrp X[ 1]=X[ 1],X[ 0],56 }
-{ .mib; shrp X[ 0]=X[ 0],T1,56
+{ .mib; shrp X[ 0]=X[ 0],T1,56 }
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L2byte:
{ .mmi; $LDW X[11]=[input],4*$SZ
@@ -313,7 +296,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 2]=X[ 2],X[ 1],48 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
shrp X[ 0]=X[ 0],T1,48 }
-{ .mfb; br.many .L_first16 };;
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
+ br.many .L_first16 };;
.L3byte:
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
@@ -341,7 +326,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 3]=X[ 3],X[ 2],40 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
shrp X[ 1]=X[ 1],X[ 0],40 }
-{ .mib; shrp X[ 0]=X[ 0],T1,40
+{ .mib; shrp X[ 0]=X[ 0],T1,40 }
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L4byte:
{ .mmi; $LDW X[ 7]=[input],4*$SZ
@@ -369,7 +356,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 2]=X[ 2],X[ 1],32 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
shrp X[ 0]=X[ 0],T1,32 }
-{ .mfb; br.many .L_first16 };;
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
+ br.many .L_first16 };;
.L5byte:
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
@@ -393,7 +382,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 3]=X[ 3],X[ 2],24 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
shrp X[ 1]=X[ 1],X[ 0],24 }
-{ .mib; shrp X[ 0]=X[ 0],T1,24
+{ .mib; shrp X[ 0]=X[ 0],T1,24 }
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L6byte:
{ .mmi; $LDW X[ 3]=[input],4*$SZ
@@ -417,7 +408,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 2]=X[ 2],X[ 1],16 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
shrp X[ 0]=X[ 0],T1,16 }
-{ .mfb; br.many .L_first16 };;
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev // eliminated on big-endian
+ br.many .L_first16 };;
.L7byte:
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
@@ -437,128 +430,146 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
shrp X[ 3]=X[ 3],X[ 2],8 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
shrp X[ 1]=X[ 1],X[ 0],8 }
-{ .mib; shrp X[ 0]=X[ 0],T1,8
- br.many .L_first16 };;
+{ .mib; shrp X[ 0]=X[ 0],T1,8 }
+{ .mib; mov r8=0
+ mux1 X[15]=X[15],\@rev };; // eliminated on big-endian
.align 32
.L_first16:
{ .mmi; $LDW K=[Ktbl],$SZ
- and T1=F,E
- and T2=A,B }
-{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++
+	add	A=A,r8	// H+=Sigma0(a) from the past
+ _rotr r10=$t1,$Sigma1[0] } // ROTR(e,14)
+{ .mmi; and T1=F,E
andcm r8=G,E
- and r9=A,C };;
-{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
- and r10=B,C
- _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
-{ .mmi; xor T2=T2,r9
- mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
+ (p16) mux1 X[14]=X[14],\@rev };; // eliminated on big-endian
+{ .mmi; and T2=A,B
+ and r9=A,C
+ _rotr r11=$t1,$Sigma1[1] } // ROTR(e,41)
+{ .mmi; xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
+ and r8=B,C };;
+___
+$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
+.align 32
+.L_first16:
+{ .mmi;	add	A=A,r8	// H+=Sigma0(a) from the past
+ add r10=2-$SZ,input
+ add r11=3-$SZ,input };;
+{ .mmi; ld1 r9=[r9]
+ ld1 r10=[r10]
+ dep.z $t1=E,32,32 }
+{ .mmi; ld1 r11=[r11]
+ $LDW K=[Ktbl],$SZ
+ zxt4 E=E };;
+{ .mii; or $t1=$t1,E
+ dep X[15]=X[15],r9,8,8
+ mux2 $t0=A,0x44 };; // copy lower half to upper
+{ .mmi; and T1=F,E
+ andcm r8=G,E
+ dep r11=r10,r11,8,8 };;
+{ .mmi; and T2=A,B
+ and r9=A,C
+ dep X[15]=X[15],r11,16,16 };;
+{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
+ xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
+ _rotr r10=$t1,$Sigma1[0] } // ROTR(e,14)
+{ .mmi; and r8=B,C
+ _rotr r11=$t1,$Sigma1[1] };; // ROTR(e,18)
___
$code.=<<___;
-{ .mib; add T1=T1,H // T1=Ch(e,f,g)+h
- _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
-{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
- mov H=G };;
-{ .mib; xor r11=r8,r11
- _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
-{ .mib; mov G=F
- mov F=E };;
-{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
- _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
-{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
- mov E=D };;
-{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
- _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
-{ .mib; mov D=C
- mov C=B };;
-{ .mib; add T1=T1,X[15] // T1+=X[i]
- _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
-{ .mib; xor r10=r10,r11
- mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
-{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
- mov B=A
- add A=T1,T2 };;
-{ .mib; add E=E,T1
- add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
- br.ctop.sptk .L_first16 };;
+{ .mmi; add T1=T1,H // T1=Ch(e,f,g)+h
+ xor r10=r10,r11
+ _rotr r11=$t1,$Sigma1[2] } // ROTR(e,41)
+{ .mmi; xor T2=T2,r9
+ add K=K,X[15] };;
+{ .mmi; add T1=T1,K // T1+=K[i]+X[i]
+ xor T2=T2,r8 // T2=((a & b) ^ (a & c) ^ (b & c))
+ _rotr r8=$t0,$Sigma0[0] } // ROTR(a,28)
+{ .mmi; xor r11=r11,r10 // Sigma1(e)
+ _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
+{ .mmi;	add	T1=T1,r11	// T1+=Sigma1(e)
+ xor r8=r8,r9
+ _rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39)
+{ .mmi; xor r8=r8,r9 // Sigma0(a)
+ add D=D,T1
+ mux2 H=X[15],0x44 } // mov H=X[15] in sha512
+{ .mib; (p16) add r9=1-$SZ,input // not used in sha512
+ add X[15]=T1,T2 // H=T1+Maj(a,b,c)
+ br.ctop.sptk .L_first16 };;
.L_first16_end:
-{ .mii; mov ar.lc=$rounds-17
- mov ar.ec=1 };;
+{ .mib; mov ar.lc=$rounds-17
+ brp.loop.imp .L_rest,.L_rest_end-16 }
+{ .mib; mov ar.ec=1
+ br.many .L_rest };;
.align 32
.L_rest:
-.rotr X[16]
-{ .mib; $LDW K=[Ktbl],$SZ
+{ .mmi; $LDW K=[Ktbl],$SZ
+ add A=A,r8 // H+=Sigma0(a) from the past
_rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
-{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
- $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
+{ .mmi; add X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
+ $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
{ .mib; and T1=F,E
_rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
{ .mib; andcm r10=G,E
- $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
+ $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
+// The pair of .mmi bundles splits on Itanium 1 and prevents the pipeline
+// flush upon $SHRU output usage
{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
xor r9=r8,r9
- _rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
-{ .mib; and T2=A,B
- _rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
-{ .mib; and r8=A,C };;
+ _rotr r10=X[15-14],$sigma1[0] }// ROTR(s1,19)
+{ .mmi; and T2=A,B
+ and r8=A,C
+ _rotr r11=X[15-14],$sigma1[1] };;// ROTR(s1,61)
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
-// I adhere to mmi; in order to hold Itanium 1 back and avoid 6 cycle
-// pipeline flush in last bundle. Note that even on Itanium2 the
-// latter stalls for one clock cycle...
-{ .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
- dep.z $t1=E,32,32 }
-{ .mmi; xor r10=r11,r10
- zxt4 E=E };;
-{ .mmi; or $t1=$t1,E
- xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
- mux2 $t0=A,0x44 };; // copy lower half to upper
+{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
+ dep.z $t1=E,32,32 }
+{ .mib; xor r10=r11,r10
+ zxt4 E=E };;
+{ .mii; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
+ shrp r9=E,$t1,32+$Sigma1[0] // ROTR(e,14)
+ mux2 $t0=A,0x44 };; // copy lower half to upper
+// The pair of .mmi bundles splits on Itanium 1 and prevents the pipeline
+// flush upon mux2 output usage
{ .mmi; xor T2=T2,r8
- _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
+ shrp r8=E,$t1,32+$Sigma1[1]} // ROTR(e,18)
{ .mmi; and r10=B,C
add T1=T1,H // T1=Ch(e,f,g)+h
- $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
+ or $t1=$t1,E };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
- _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
+ _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mib; xor r10=r11,r10
- xor T2=T2,r8 };;
+ xor T2=T2,r8 };;
{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
- add T1=T1,H }
+ _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
{ .mib; and r10=B,C
- $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
+ add T1=T1,H };; // T1+=H
___
$code.=<<___;
-{ .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
- mov H=G
- _rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18)
-{ .mmi; xor r11=r8,r9
- $ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF])
- _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
-{ .mmi; mov G=F
- mov F=E };;
-{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
- _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
-{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
- mov E=D };;
-{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
- _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
-{ .mib; mov D=C
- mov C=B };;
-{ .mmi; add T1=T1,X[15] // T1+=X[i]
- xor r10=r10,r11
- _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
-{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
- mov B=A
- add A=T1,T2 };;
-{ .mib; add E=E,T1
- add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
- br.ctop.sptk .L_rest };;
+{ .mib; xor r9=r9,r8
+ _rotr r8=$t1,$Sigma1[2] } // ROTR(e,41)
+{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
+ add X[15]=X[15],s0 };; // X[i]+=sigma0(X[i+1])
+{ .mmi; xor r9=r9,r8 // Sigma1(e)
+	add	X[15]=X[15],s1	// X[i]+=sigma1(X[i+14])
+ _rotr r8=$t0,$Sigma0[0] };; // ROTR(a,28)
+{ .mmi; add K=K,X[15]
+ add T1=T1,r9 // T1+=Sigma1(e)
+ _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
+{ .mmi; add T1=T1,K // T1+=K[i]+X[i]
+ xor r8=r8,r9
+ _rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39)
+{ .mib; add D=D,T1
+ mux2 H=X[15],0x44 } // mov H=X[15] in sha512
+{ .mib; xor r8=r8,r9 // Sigma0(a)
+ add X[15]=T1,T2 // H=T1+Maj(a,b,c)
+ br.ctop.sptk .L_rest };;
.L_rest_end:
+{ .mmi; add A=A,r8 };; // H+=Sigma0(a) from the past
{ .mmi; add A_=A_,A
add B_=B_,B
add C_=C_,C }
@@ -590,17 +601,19 @@ $code.=<<___;
.endp $func#
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
-if ($BITS==64) {
- $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
- $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
- $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
+foreach(split($/,$code)) {
+ s/\`([^\`]*)\`/eval $1/gem;
+ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
+ if ($BITS==64) {
+ s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/gm;
+ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
+ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
if (!$big_endian);
- $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
-}
+ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
+ }
-print $code;
+ print $_,"\n";
+}
print<<___ if ($BITS==32);
.align 64
diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl
index 6807a2c7220b..b468cfb4569e 100755
--- a/crypto/sha/asm/sha512-mips.pl
+++ b/crypto/sha/asm/sha512-mips.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -17,6 +17,10 @@
# ~17%, but it comes for free, because it's the same instruction sequence.
# Improvement coefficients are for aligned input.
+# September 2012.
+#
+# Add MIPS[32|64]R2 code (>25% fewer instructions).
+
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
@@ -45,7 +49,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
@@ -83,6 +87,7 @@ if ($output =~ /512/) {
$SLL="dsll"; # shift left logical
$SRL="dsrl"; # shift right logical
$ADDU="daddu";
+ $ROTR="drotr";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@@ -97,6 +102,7 @@ if ($output =~ /512/) {
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
$ADDU="addu";
+ $ROTR="rotr";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@@ -124,6 +130,10 @@ $code.=<<___ if ($i<15);
${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ wsbh @X[0],@X[0] # byte swap($i)
+ rotr @X[0],@X[0],16
+#else
srl $tmp0,@X[0],24 # byte swap($i)
srl $tmp1,@X[0],8
andi $tmp2,@X[0],0xFF00
@@ -133,8 +143,13 @@ $code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
or @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
+#endif
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
+#if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh @X[0],@X[0] # byte swap($i)
+ dshd @X[0],@X[0]
+#else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
@@ -153,8 +168,31 @@ $code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
dsrl $tmp1,@X[0],32
dsll @X[0],32
or @X[0],$tmp1
+#endif
___
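[Editor's note] The R2 fast paths added above work because wsbh swaps the
two bytes inside each 16-bit halfword and the rotate by 16 then swaps the
halfwords, which together is a full 32-bit byte reversal; dsbh+dshd is the
analogous 64-bit pair. A standalone check of the 32-bit case (not patch
code):

#!/usr/bin/env perl
use strict; use warnings;
sub wsbh   { my $x = shift; ((($x & 0x00ff00ff) << 8) | (($x >> 8) & 0x00ff00ff)) & 0xffffffff }
sub rotr16 { my $x = shift; (($x >> 16) | ($x << 16)) & 0xffffffff }
die unless rotr16(wsbh(0x01020304)) == 0x04030201;
print "ok\n";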
$code.=<<___;
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ xor $tmp2,$f,$g # $i
+ $ROTR $tmp0,$e,@Sigma1[0]
+ $ADDU $T1,$X[0],$h
+ $ROTR $tmp1,$e,@Sigma1[1]
+ and $tmp2,$e
+ $ROTR $h,$e,@Sigma1[2]
+ xor $tmp0,$tmp1
+ $ROTR $tmp1,$a,@Sigma0[0]
+ xor $tmp2,$g # Ch(e,f,g)
+ xor $tmp0,$h # Sigma1(e)
+
+ $ROTR $h,$a,@Sigma0[1]
+ $ADDU $T1,$tmp2
+ $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
+ xor $h,$tmp1
+ $ROTR $tmp1,$a,@Sigma0[2]
+ $ADDU $T1,$tmp0
+ and $tmp0,$b,$c
+ xor $h,$tmp1 # Sigma0(a)
+ xor $tmp1,$b,$c
+#else
$ADDU $T1,$X[0],$h # $i
$SRL $h,$e,@Sigma1[0]
xor $tmp2,$f,$g
@@ -184,16 +222,15 @@ $code.=<<___;
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
xor $h,$tmp0
- $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
+ and $tmp0,$b,$c
xor $h,$tmp1 # Sigma0(a)
-
- or $tmp0,$a,$b
- and $tmp1,$a,$b
- and $tmp0,$c
- or $tmp1,$tmp0 # Maj(a,b,c)
+ xor $tmp1,$b,$c
+#endif
+ $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
+ $ADDU $h,$tmp0
+ and $tmp1,$a
$ADDU $T1,$tmp2 # +=K[$i]
- $ADDU $h,$tmp1
-
+ $ADDU $h,$tmp1 # +=Maj(a,b,c)
$ADDU $d,$T1
$ADDU $h,$T1
___
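[Editor's note] The rewritten Maj sequence above relies on the identity
Maj(a,b,c) = (b&c) | (a&(b^c)); the two terms never have a bit set in the
same position, so the final or can be carried out with the two plain ADDUs
seen above, with no carry propagation. A standalone check (not patch code):

#!/usr/bin/env perl
use strict; use warnings;
for (1..1000) {
	my ($a,$b,$c) = map { int(rand(0xffffffff)) } 1..3;
	my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);
	die unless $maj == (($b & $c) + ($a & ($b ^ $c)));   # disjoint, so + == |
}
print "ok\n";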
@@ -207,6 +244,20 @@ my $i=@_[0];
my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___;
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
+ $ROTR $tmp0,@X[1],@sigma0[1]
+ $ADDU @X[0],@X[9] # +=X[i+9]
+ xor $tmp2,$tmp0
+ $ROTR $tmp0,@X[1],@sigma0[2]
+
+ $SRL $tmp3,@X[14],@sigma1[0]
+ $ROTR $tmp1,@X[14],@sigma1[1]
+ xor $tmp2,$tmp0 # sigma0(X[i+1])
+ $ROTR $tmp0,@X[14],@sigma1[2]
+ xor $tmp3,$tmp1
+ $ADDU @X[0],$tmp2
+#else
$SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
$ADDU @X[0],@X[9] # +=X[i+9]
$SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
@@ -227,7 +278,7 @@ $code.=<<___;
xor $tmp3,$tmp0
$SRL $tmp0,@X[14],@sigma1[2]
xor $tmp3,$tmp1
-
+#endif
xor $tmp3,$tmp0 # sigma1(X[i+14])
$ADDU @X[0],$tmp3
___
@@ -242,9 +293,13 @@ $code.=<<___;
# include <openssl/fipssyms.h>
#endif
+#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
+#define _MIPS_ARCH_MIPS32R2
+#endif
+
.text
.set noat
-#if !defined(__vxworks) || defined(__pic__)
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option pic2
#endif
diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl
index 6b44a68e599e..734f3c1ca0f0 100755
--- a/crypto/sha/asm/sha512-ppc.pl
+++ b/crypto/sha/asm/sha512-ppc.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -9,8 +9,7 @@
# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise straightforward implementation
-# with X vector in register bank. The module is big-endian [which is
-# not big deal as there're no little-endian targets left around].
+# with X vector in register bank.
# sha256 | sha512
# -m64 -m32 | -m64 -m32
@@ -56,6 +55,8 @@ if ($flavour =~ /64/) {
$PUSH="stw";
} else { die "nonsense $flavour"; }
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
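[Editor's note] On the 32-bit little-endian flavour $LITTLE_ENDIAN is 4 and
the 64-bit state words are accessed as 32-bit halves, so XORing a word
offset with it selects the opposite half and the same load/store sequence
serves both byte orders. Illustration (not patch code):

#!/usr/bin/env perl
my $LE = 4;                                         # $LITTLE_ENDIAN on 32-bit LE
printf "%2d -> %2d\n", $_, $_ ^ $LE for (0,4,8,12); # 0->4, 4->0, 8->12, 12->8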
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -64,7 +65,7 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
if ($output =~ /512/) {
- $func="sha512_block_data_order";
+ $func="sha512_block_ppc";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@@ -76,7 +77,7 @@ if ($output =~ /512/) {
$ROR="rotrdi";
$SHR="srdi";
} else {
- $func="sha256_block_data_order";
+ $func="sha256_block_ppc";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@@ -110,7 +111,7 @@ $B ="r9";
$C ="r10";
$D ="r11";
$E ="r12";
-$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
+$F =$t1; $t1 = "r0"; # stay away from "r13";
$G ="r14";
$H ="r15";
@@ -118,24 +119,23 @@ $H ="r15";
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
-$inp="r31"; # reassigned $inp! aliases with @X[15]
+$inp="r31" if($SZ==4 || $SIZE_T==8); # reassigned $inp! aliases with @X[15]
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
- $LD $T,`$i*$SZ`($Tbl)
$ROR $a0,$e,$Sigma1[0]
$ROR $a1,$e,$Sigma1[1]
and $t0,$f,$e
- andc $t1,$g,$e
- add $T,$T,$h
xor $a0,$a0,$a1
+ add $h,$h,$t1
+ andc $t1,$g,$e
$ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
or $t0,$t0,$t1 ; Ch(e,f,g)
- add $T,$T,@X[$i]
+ add $h,$h,@X[$i%16]
xor $a0,$a0,$a1 ; Sigma1(e)
- add $T,$T,$t0
- add $T,$T,$a0
+ add $h,$h,$t0
+ add $h,$h,$a0
$ROR $a0,$a,$Sigma0[0]
$ROR $a1,$a,$Sigma0[1]
@@ -146,9 +146,14 @@ $code.=<<___;
xor $t0,$t0,$t1
and $t1,$b,$c
xor $a0,$a0,$a1 ; Sigma0(a)
- add $d,$d,$T
+ add $d,$d,$h
xor $t0,$t0,$t1 ; Maj(a,b,c)
- add $h,$T,$a0
+___
+$code.=<<___ if ($i<15);
+ $LD $t1,`($i+1)*$SZ`($Tbl)
+___
+$code.=<<___;
+ add $h,$h,$a0
add $h,$h,$t0
___
@@ -169,10 +174,11 @@ $code.=<<___;
add @X[$i],@X[$i],@X[($i+9)%16]
xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
+ $LD $t1,`$i*$SZ`($Tbl)
add @X[$i],@X[$i],$a0
add @X[$i],@X[$i],$t0
___
-&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
+&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
@@ -188,8 +194,6 @@ $func:
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
- $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
- $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
@@ -209,7 +213,10 @@ $func:
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
+___
+if ($SZ==4 || $SIZE_T==8) {
+$code.=<<___;
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
$LD $B,`1*$SZ`($ctx)
@@ -219,7 +226,16 @@ $func:
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
+___
+} else {
+ for ($i=16;$i<32;$i++) {
+ $code.=<<___;
+ lwz r$i,`$LITTLE_ENDIAN^(4*($i-16))`($ctx)
+___
+ }
+}
+$code.=<<___;
bl LPICmeup
LPICedup:
andi. r0,$inp,3
@@ -255,6 +271,9 @@ Lunaligned:
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
+___
+if ($SZ==4 || $SIZE_T==8) {
+$code.=<<___;
addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
@@ -268,7 +287,26 @@ Lmemcpy:
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
+___
+} else {
+$code.=<<___;
+ addi r12,$sp,$LOCALS ; aligned spot below the frame
+Lmemcpy:
+ lbz r8,0($inp)
+ lbz r9,1($inp)
+ lbz r10,2($inp)
+ lbz r11,3($inp)
+ addi $inp,$inp,4
+ stb r8,0(r12)
+ stb r9,1(r12)
+ stb r10,2(r12)
+ stb r11,3(r12)
+ addi r12,r12,4
+ bdnz Lmemcpy
+___
+}
+$code.=<<___;
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$LOCALS ; fictitious inp pointer
@@ -283,8 +321,6 @@ Lmemcpy:
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
- $POP $toc,`$FRAME-$SIZE_T*20`($sp)
- $POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
@@ -309,27 +345,48 @@ Ldone:
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
+___
+if ($SZ==4 || $SIZE_T==8) {
+$code.=<<___;
.align 4
Lsha2_block_private:
+ $LD $t1,0($Tbl)
___
for($i=0;$i<16;$i++) {
-$code.=<<___ if ($SZ==4);
+$code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN);
lwz @X[$i],`$i*$SZ`($inp)
___
+$code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN);
+ lwz $a0,`$i*$SZ`($inp)
+ rotlwi @X[$i],$a0,8
+ rlwimi @X[$i],$a0,24,0,7
+ rlwimi @X[$i],$a0,24,16,23
+___
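[Editor's note] The rotlwi/rlwimi triple above is the classic PowerPC
32-bit byte-swap idiom: the rotate by 8 already deposits two of the four
bytes, and the two insert-under-mask instructions patch the remaining two
from the value rotated by 24. A standalone check (not patch code):

#!/usr/bin/env perl
use strict; use warnings;
sub rol32 { my ($x,$n) = @_; (($x << $n) | ($x >> (32-$n))) & 0xffffffff }
sub bswap_ppc {
	my $x = shift;
	my $t = rol32($x,8);                                   # rotlwi t,x,8
	$t = ($t & 0x00ffffff) | (rol32($x,24) & 0xff000000);  # rlwimi t,x,24,0,7
	$t = ($t & 0xffff00ff) | (rol32($x,24) & 0x0000ff00);  # rlwimi t,x,24,16,23
	return $t;
}
die unless bswap_ppc(0x01020304) == 0x04030201;
print "ok\n";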
# 64-bit loads are split to 2x32-bit ones, as CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
-$code.=<<___ if ($SZ==8);
+$code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN);
lwz $t0,`$i*$SZ`($inp)
lwz @X[$i],`$i*$SZ+4`($inp)
insrdi @X[$i],$t0,32,0
___
+$code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN);
+ lwz $a0,`$i*$SZ`($inp)
+ lwz $a1,`$i*$SZ+4`($inp)
+ rotlwi $t0,$a0,8
+ rotlwi @X[$i],$a1,8
+ rlwimi $t0,$a0,24,0,7
+ rlwimi @X[$i],$a1,24,0,7
+ rlwimi $t0,$a0,24,16,23
+ rlwimi @X[$i],$a1,24,16,23
+ insrdi @X[$i],$t0,32,0
+___
&ROUND_00_15($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
- li $T,`$rounds/16-1`
- mtctr $T
+ li $t0,`$rounds/16-1`
+ mtctr $t0
.align 4
Lrounds:
addi $Tbl,$Tbl,`16*$SZ`
@@ -377,7 +434,282 @@ $code.=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
+.size $func,.-$func
+___
+} else {
+########################################################################
+# SHA512 for PPC32, X vector is off-loaded to stack...
+#
+# | sha512
+# | -m32
+# ----------------------+-----------------------
+# PPC74x0,gcc-4.0.1 | +48%
+# POWER6,gcc-4.4.6 | +124%(*)
+# POWER7,gcc-4.4.6 | +79%(*)
+# e300,gcc-4.1.0 | +167%
+#
+# (*) ~1/3 of -m64 result [and ~20% better than -m32 code generated
+# by xlc-12.1]
+
+my $XOFF=$LOCALS;
+
+my @V=map("r$_",(16..31)); # A..H
+
+my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
+my ($x0,$x1)=("r3","r4"); # zaps $ctx and $inp
+
+sub ROUND_00_15_ppc32 {
+my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+ $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
+
+$code.=<<___;
+ lwz $t2,`$SZ*($i%16)+($LITTLE_ENDIAN^4)`($Tbl)
+ xor $a0,$flo,$glo
+ lwz $t3,`$SZ*($i%16)+($LITTLE_ENDIAN^0)`($Tbl)
+ xor $a1,$fhi,$ghi
+ addc $hlo,$hlo,$t0 ; h+=x[i]
+ stw $t0,`$XOFF+0+$SZ*($i%16)`($sp) ; save x[i]
+
+ srwi $s0,$elo,$Sigma1[0]
+ srwi $s1,$ehi,$Sigma1[0]
+ and $a0,$a0,$elo
+ adde $hhi,$hhi,$t1
+ and $a1,$a1,$ehi
+ stw $t1,`$XOFF+4+$SZ*($i%16)`($sp)
+ srwi $t0,$elo,$Sigma1[1]
+ srwi $t1,$ehi,$Sigma1[1]
+ addc $hlo,$hlo,$t2 ; h+=K512[i]
+ insrwi $s0,$ehi,$Sigma1[0],0
+ insrwi $s1,$elo,$Sigma1[0],0
+ xor $a0,$a0,$glo ; Ch(e,f,g)
+ adde $hhi,$hhi,$t3
+ xor $a1,$a1,$ghi
+ insrwi $t0,$ehi,$Sigma1[1],0
+ insrwi $t1,$elo,$Sigma1[1],0
+ addc $hlo,$hlo,$a0 ; h+=Ch(e,f,g)
+ srwi $t2,$ehi,$Sigma1[2]-32
+ srwi $t3,$elo,$Sigma1[2]-32
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ insrwi $t2,$elo,$Sigma1[2]-32,0
+ insrwi $t3,$ehi,$Sigma1[2]-32,0
+ xor $a0,$alo,$blo ; a^b, b^c in next round
+ adde $hhi,$hhi,$a1
+ xor $a1,$ahi,$bhi
+ xor $s0,$s0,$t2 ; Sigma1(e)
+ xor $s1,$s1,$t3
+
+ srwi $t0,$alo,$Sigma0[0]
+ and $a2,$a2,$a0
+ addc $hlo,$hlo,$s0 ; h+=Sigma1(e)
+ and $a3,$a3,$a1
+ srwi $t1,$ahi,$Sigma0[0]
+ srwi $s0,$ahi,$Sigma0[1]-32
+ adde $hhi,$hhi,$s1
+ srwi $s1,$alo,$Sigma0[1]-32
+ insrwi $t0,$ahi,$Sigma0[0],0
+ insrwi $t1,$alo,$Sigma0[0],0
+ xor $a2,$a2,$blo ; Maj(a,b,c)
+ addc $dlo,$dlo,$hlo ; d+=h
+ xor $a3,$a3,$bhi
+ insrwi $s0,$alo,$Sigma0[1]-32,0
+ insrwi $s1,$ahi,$Sigma0[1]-32,0
+ adde $dhi,$dhi,$hhi
+ srwi $t2,$ahi,$Sigma0[2]-32
+ srwi $t3,$alo,$Sigma0[2]-32
+ xor $s0,$s0,$t0
+ addc $hlo,$hlo,$a2 ; h+=Maj(a,b,c)
+ xor $s1,$s1,$t1
+ insrwi $t2,$alo,$Sigma0[2]-32,0
+ insrwi $t3,$ahi,$Sigma0[2]-32,0
+ adde $hhi,$hhi,$a3
+___
+$code.=<<___ if ($i>=15);
+ lwz $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
+ lwz $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
+___
+$code.=<<___ if ($i<15 && !$LITTLE_ENDIAN);
+ lwz $t1,`$SZ*($i+1)+0`($inp)
+ lwz $t0,`$SZ*($i+1)+4`($inp)
___
+$code.=<<___ if ($i<15 && $LITTLE_ENDIAN);
+ lwz $a2,`$SZ*($i+1)+0`($inp)
+ lwz $a3,`$SZ*($i+1)+4`($inp)
+ rotlwi $t1,$a2,8
+ rotlwi $t0,$a3,8
+ rlwimi $t1,$a2,24,0,7
+ rlwimi $t0,$a3,24,0,7
+ rlwimi $t1,$a2,24,16,23
+ rlwimi $t0,$a3,24,16,23
+___
+$code.=<<___;
+ xor $s0,$s0,$t2 ; Sigma0(a)
+ xor $s1,$s1,$t3
+ addc $hlo,$hlo,$s0 ; h+=Sigma0(a)
+ adde $hhi,$hhi,$s1
+___
+$code.=<<___ if ($i==15);
+ lwz $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
+ lwz $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
+___
+}
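[Editor's note] Every 64-bit addition in this PPC32 round function is a
two-instruction carry chain: addc adds the low halves and sets the carry
bit, adde adds the high halves plus that carry; likewise each 64-bit rotate
is assembled from srwi/insrwi pairs over the two halves. A minimal model of
the addc/adde pair (not patch code):

#!/usr/bin/env perl
use strict; use warnings;
sub add64 {
	my ($ahi,$alo,$bhi,$blo) = @_;
	my $lo = $alo + $blo;                               # addc: low halves, carry out
	my $hi = ($ahi + $bhi + ($lo >> 32)) & 0xffffffff;  # adde: high halves + carry in
	return ($hi, $lo & 0xffffffff);
}
my ($hi,$lo) = add64(0,0xffffffff, 0,1);
die unless $hi == 1 && $lo == 0;
print "ok\n";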
+sub ROUND_16_xx_ppc32 {
+my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+ $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
+
+$code.=<<___;
+ srwi $s0,$t0,$sigma0[0]
+ srwi $s1,$t1,$sigma0[0]
+ srwi $t2,$t0,$sigma0[1]
+ srwi $t3,$t1,$sigma0[1]
+ insrwi $s0,$t1,$sigma0[0],0
+ insrwi $s1,$t0,$sigma0[0],0
+ srwi $a0,$t0,$sigma0[2]
+ insrwi $t2,$t1,$sigma0[1],0
+ insrwi $t3,$t0,$sigma0[1],0
+ insrwi $a0,$t1,$sigma0[2],0
+ xor $s0,$s0,$t2
+ lwz $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
+ srwi $a1,$t1,$sigma0[2]
+ xor $s1,$s1,$t3
+ lwz $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
+ xor $a0,$a0,$s0
+ srwi $s0,$t2,$sigma1[0]
+ xor $a1,$a1,$s1
+ srwi $s1,$t3,$sigma1[0]
+ addc $x0,$x0,$a0 ; x[i]+=sigma0(x[i+1])
+ srwi $a0,$t3,$sigma1[1]-32
+ insrwi $s0,$t3,$sigma1[0],0
+ insrwi $s1,$t2,$sigma1[0],0
+ adde $x1,$x1,$a1
+ srwi $a1,$t2,$sigma1[1]-32
+
+ insrwi $a0,$t2,$sigma1[1]-32,0
+ srwi $t2,$t2,$sigma1[2]
+ insrwi $a1,$t3,$sigma1[1]-32,0
+ insrwi $t2,$t3,$sigma1[2],0
+ xor $s0,$s0,$a0
+ lwz $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
+ srwi $t3,$t3,$sigma1[2]
+ xor $s1,$s1,$a1
+ lwz $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
+ xor $s0,$s0,$t2
+ addc $x0,$x0,$a0 ; x[i]+=x[i+9]
+ xor $s1,$s1,$t3
+ adde $x1,$x1,$a1
+ addc $x0,$x0,$s0 ; x[i]+=sigma1(x[i+14])
+ adde $x1,$x1,$s1
+___
+ ($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
+ &ROUND_00_15_ppc32(@_);
+}
+
+$code.=<<___;
+.align 4
+Lsha2_block_private:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ lwz $t1,0($inp)
+ xor $a2,@V[3],@V[5] ; B^C, magic seed
+ lwz $t0,4($inp)
+ xor $a3,@V[2],@V[4]
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $a1,0($inp)
+ xor $a2,@V[3],@V[5] ; B^C, magic seed
+ lwz $a0,4($inp)
+ xor $a3,@V[2],@V[4]
+ rotlwi $t1,$a1,8
+ rotlwi $t0,$a0,8
+ rlwimi $t1,$a1,24,0,7
+ rlwimi $t0,$a0,24,0,7
+ rlwimi $t1,$a1,24,16,23
+ rlwimi $t0,$a0,24,16,23
+___
+for($i=0;$i<16;$i++) {
+ &ROUND_00_15_ppc32($i,@V);
+ unshift(@V,pop(@V)); unshift(@V,pop(@V));
+ ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
+}
+$code.=<<___;
+ li $a0,`$rounds/16-1`
+ mtctr $a0
+.align 4
+Lrounds:
+ addi $Tbl,$Tbl,`16*$SZ`
+___
+for(;$i<32;$i++) {
+ &ROUND_16_xx_ppc32($i,@V);
+ unshift(@V,pop(@V)); unshift(@V,pop(@V));
+ ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
+}
+$code.=<<___;
+ bdnz- Lrounds
+
+ $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
+ $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
+ $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
+ subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
+
+ lwz $t0,`$LITTLE_ENDIAN^0`($ctx)
+ lwz $t1,`$LITTLE_ENDIAN^4`($ctx)
+ lwz $t2,`$LITTLE_ENDIAN^8`($ctx)
+ lwz $t3,`$LITTLE_ENDIAN^12`($ctx)
+ lwz $a0,`$LITTLE_ENDIAN^16`($ctx)
+ lwz $a1,`$LITTLE_ENDIAN^20`($ctx)
+ lwz $a2,`$LITTLE_ENDIAN^24`($ctx)
+ addc @V[1],@V[1],$t1
+ lwz $a3,`$LITTLE_ENDIAN^28`($ctx)
+ adde @V[0],@V[0],$t0
+ lwz $t0,`$LITTLE_ENDIAN^32`($ctx)
+ addc @V[3],@V[3],$t3
+ lwz $t1,`$LITTLE_ENDIAN^36`($ctx)
+ adde @V[2],@V[2],$t2
+ lwz $t2,`$LITTLE_ENDIAN^40`($ctx)
+ addc @V[5],@V[5],$a1
+ lwz $t3,`$LITTLE_ENDIAN^44`($ctx)
+ adde @V[4],@V[4],$a0
+ lwz $a0,`$LITTLE_ENDIAN^48`($ctx)
+ addc @V[7],@V[7],$a3
+ lwz $a1,`$LITTLE_ENDIAN^52`($ctx)
+ adde @V[6],@V[6],$a2
+ lwz $a2,`$LITTLE_ENDIAN^56`($ctx)
+ addc @V[9],@V[9],$t1
+ lwz $a3,`$LITTLE_ENDIAN^60`($ctx)
+ adde @V[8],@V[8],$t0
+ stw @V[0],`$LITTLE_ENDIAN^0`($ctx)
+ stw @V[1],`$LITTLE_ENDIAN^4`($ctx)
+ addc @V[11],@V[11],$t3
+ stw @V[2],`$LITTLE_ENDIAN^8`($ctx)
+ stw @V[3],`$LITTLE_ENDIAN^12`($ctx)
+ adde @V[10],@V[10],$t2
+ stw @V[4],`$LITTLE_ENDIAN^16`($ctx)
+ stw @V[5],`$LITTLE_ENDIAN^20`($ctx)
+ addc @V[13],@V[13],$a1
+ stw @V[6],`$LITTLE_ENDIAN^24`($ctx)
+ stw @V[7],`$LITTLE_ENDIAN^28`($ctx)
+ adde @V[12],@V[12],$a0
+ stw @V[8],`$LITTLE_ENDIAN^32`($ctx)
+ stw @V[9],`$LITTLE_ENDIAN^36`($ctx)
+ addc @V[15],@V[15],$a3
+ stw @V[10],`$LITTLE_ENDIAN^40`($ctx)
+ stw @V[11],`$LITTLE_ENDIAN^44`($ctx)
+ adde @V[14],@V[14],$a2
+ stw @V[12],`$LITTLE_ENDIAN^48`($ctx)
+ stw @V[13],`$LITTLE_ENDIAN^52`($ctx)
+ stw @V[14],`$LITTLE_ENDIAN^56`($ctx)
+ stw @V[15],`$LITTLE_ENDIAN^60`($ctx)
+
+ addi $inp,$inp,`16*$SZ` ; advance inp
+ $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
+ $UCMP $inp,$num
+ bne Lsha2_block_private
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size $func,.-$func
+___
+}
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
@@ -395,46 +727,46 @@ LPICmeup:
.space `64-9*4`
___
$code.=<<___ if ($SZ==8);
- .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
- .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
- .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
- .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
- .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
- .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
- .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
- .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
- .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
- .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
- .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
- .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
- .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
- .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
- .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
- .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
- .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
- .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
- .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
- .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
- .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
- .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
- .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
- .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
- .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
- .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
- .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
- .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
- .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
- .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
- .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
- .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
- .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
- .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
- .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
- .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
- .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
- .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
- .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
- .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl
index 585740789e63..5a9c15d1d34a 100755
--- a/crypto/sha/asm/sha512-sparcv9.pl
+++ b/crypto/sha/asm/sha512-sparcv9.pl
@@ -5,6 +5,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================
# SHA256 performance improvement over compiler generated code varies
@@ -41,11 +43,11 @@
# loads are always slower than one 64-bit load. Once again this
# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
# 2x32-bit loads can be as fast as 1x64-bit ones.
-
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=112; }
+#
+# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
+# which is 9.3x/11.1x faster than software. Multi-process benchmark
+# saturates at 11.5x single-process result on 8-core processor, or
+# ~11/16GBps per 2.85GHz socket.
$output=shift;
open STDOUT,">$output";
@@ -170,6 +172,7 @@ $code.=<<___ if ($i==0);
ld [$inp+16],%l4
ld [$inp+20],%l5
ld [$inp+24],%l6
+ cmp $tmp31,0
ld [$inp+28],%l7
___
$code.=<<___ if ($i<15);
@@ -182,29 +185,29 @@ $code.=<<___ if ($i<15);
or @pair[1],$tmp2,$tmp2
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
add $h,$tmp2,$T1
- $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
+ $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
- brnz,a $tmp31,.+8
+ bnz,a,pn %icc,.+8
ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
- ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
- ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
sllx @pair[0],$tmp0,$tmp1
- ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
- ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
or @pair[1],$tmp2,$tmp2
- ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
add $h,$tmp2,$T1
- $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
- ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
- ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
- ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
+ $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
@@ -340,9 +343,9 @@ $code.=<<___;
or %l3,$tmp0,$tmp0
srlx $tmp0,@sigma0[0],$T1
- ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx $tmp0,`64-@sigma0[2]`,$tmp1
- ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
srlx $tmp0,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
@@ -354,9 +357,9 @@ $code.=<<___;
or %l7,$tmp2,$tmp2
srlx $tmp2,@sigma1[0],$tmp1
- ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
sllx $tmp2,`64-@sigma1[2]`,$tmp0
- ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
srlx $tmp2,@sigma1[1],$tmp2
xor $tmp0,$tmp1,$tmp1
sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
@@ -365,27 +368,30 @@ $code.=<<___;
xor $tmp0,$tmp1,$tmp1
sllx %l4,32,$tmp0
xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
- ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
or %l5,$tmp0,$tmp0
- ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
sllx %l0,32,$tmp2
add $tmp1,$T1,$T1
- ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
or %l1,$tmp2,$tmp2
add $tmp0,$T1,$T1 ! +=X[$i+9]
- ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
add $tmp2,$T1,$T1 ! +=X[$i]
- $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
+ $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
&BODY_00_15(@_);
} if ($SZ==8);
-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
-___
-$code.=<<___;
+#endif
+
.section ".text",#alloc,#execinstr
.align 64
@@ -457,9 +463,203 @@ ___
}
$code.=<<___;
.size K${label},.-K${label}
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
.globl sha${label}_block_data_order
+.align 32
sha${label}_block_data_order:
- save %sp,`-$frame-$locals`,%sp
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
+
+ andcc %g1, CFR_SHA${label}, %g0
+ be .Lsoftware
+ nop
+___
+$code.=<<___ if ($SZ==8); # SHA512
+ ldd [%o0 + 0x00], %f0 ! load context
+ ldd [%o0 + 0x08], %f2
+ ldd [%o0 + 0x10], %f4
+ ldd [%o0 + 0x18], %f6
+ ldd [%o0 + 0x20], %f8
+ ldd [%o0 + 0x28], %f10
+ andcc %o1, 0x7, %g0
+ ldd [%o0 + 0x30], %f12
+ bne,pn %icc, .Lhwunaligned
+ ldd [%o0 + 0x38], %f14
+
+.Lhwaligned_loop:
+ ldd [%o1 + 0x00], %f16
+ ldd [%o1 + 0x08], %f18
+ ldd [%o1 + 0x10], %f20
+ ldd [%o1 + 0x18], %f22
+ ldd [%o1 + 0x20], %f24
+ ldd [%o1 + 0x28], %f26
+ ldd [%o1 + 0x30], %f28
+ ldd [%o1 + 0x38], %f30
+ ldd [%o1 + 0x40], %f32
+ ldd [%o1 + 0x48], %f34
+ ldd [%o1 + 0x50], %f36
+ ldd [%o1 + 0x58], %f38
+ ldd [%o1 + 0x60], %f40
+ ldd [%o1 + 0x68], %f42
+ ldd [%o1 + 0x70], %f44
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x78], %f46
+ add %o1, 0x80, %o1
+ prefetch [%o1 + 63], 20
+ prefetch [%o1 + 64+63], 20
+
+ .word 0x81b02860 ! SHA512
+
+ bne,pt SIZE_T_CC, .Lhwaligned_loop
+ nop
+
+.Lhwfinish:
+ std %f0, [%o0 + 0x00] ! store context
+ std %f2, [%o0 + 0x08]
+ std %f4, [%o0 + 0x10]
+ std %f6, [%o0 + 0x18]
+ std %f8, [%o0 + 0x20]
+ std %f10, [%o0 + 0x28]
+ std %f12, [%o0 + 0x30]
+ retl
+ std %f14, [%o0 + 0x38]
+
+.align 16
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f18
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f20
+ ldd [%o1 + 0x10], %f22
+ ldd [%o1 + 0x18], %f24
+ ldd [%o1 + 0x20], %f26
+ ldd [%o1 + 0x28], %f28
+ ldd [%o1 + 0x30], %f30
+ ldd [%o1 + 0x38], %f32
+ ldd [%o1 + 0x40], %f34
+ ldd [%o1 + 0x48], %f36
+ ldd [%o1 + 0x50], %f38
+ ldd [%o1 + 0x58], %f40
+ ldd [%o1 + 0x60], %f42
+ ldd [%o1 + 0x68], %f44
+ ldd [%o1 + 0x70], %f46
+ ldd [%o1 + 0x78], %f48
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x80], %f50
+ add %o1, 0x80, %o1
+ prefetch [%o1 + 63], 20
+ prefetch [%o1 + 64+63], 20
+
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+ faligndata %f26, %f28, %f24
+ faligndata %f28, %f30, %f26
+ faligndata %f30, %f32, %f28
+ faligndata %f32, %f34, %f30
+ faligndata %f34, %f36, %f32
+ faligndata %f36, %f38, %f34
+ faligndata %f38, %f40, %f36
+ faligndata %f40, %f42, %f38
+ faligndata %f42, %f44, %f40
+ faligndata %f44, %f46, %f42
+ faligndata %f46, %f48, %f44
+ faligndata %f48, %f50, %f46
+
+ .word 0x81b02860 ! SHA512
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f50, %f50, %f18 ! %f18=%f50
+
+ ba .Lhwfinish
+ nop
+___
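[Editor's note] The .word constants above (and in the SHA256 path below)
are the raw encodings of the SPARC T4 SHA opcodes, emitted as data so the
module still assembles with pre-T4 toolchains; the subcc/bne pair retires
one whole block per iteration. Summarized from the loads in this patch
(not patch code):

#   .word 0x81b02860   ! SHA512: state in %f0-%f14, message block in %f16-%f46
#   .word 0x81b02840   ! SHA256: state in %f0-%f7,  message block in %f8-%f22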
+$code.=<<___ if ($SZ==4); # SHA256
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ ld [%o0 + 0x0c], %f3
+ ld [%o0 + 0x10], %f4
+ ld [%o0 + 0x14], %f5
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x18], %f6
+ bne,pn %icc, .Lhwunaligned
+ ld [%o0 + 0x1c], %f7
+
+.Lhwloop:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x38], %f22
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ .word 0x81b02840 ! SHA256
+
+ bne,pt SIZE_T_CC, .Lhwloop
+ nop
+
+.Lhwfinish:
+ st %f0, [%o0 + 0x00] ! store context
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ st %f4, [%o0 + 0x10]
+ st %f5, [%o0 + 0x14]
+ st %f6, [%o0 + 0x18]
+ retl
+ st %f7, [%o0 + 0x1c]
+
+.align 8
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x40], %f26
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ .word 0x81b02840 ! SHA256
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f26, %f26, %f10 ! %f10=%f26
+
+ ba .Lhwfinish
+ nop
+___
+$code.=<<___;
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME-$locals,%sp
and $inp,`$align-1`,$tmp31
sllx $len,`log(16*$SZ)/log(2)`,$len
andn $inp,`$align-1`,$inp
@@ -578,7 +778,7 @@ ___
$code.=<<___;
add $inp,`16*$SZ`,$inp ! advance inp
cmp $inp,$len
- bne `$bits==64?"%xcc":"%icc"`,.Lloop
+ bne SIZE_T_CC,.Lloop
sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
ret
@@ -589,6 +789,62 @@ $code.=<<___;
.align 4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to reserve the option of producing a "universal" binary and
+# let the programmer detect at run-time whether the current CPU is VIS-capable.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = ( "faligndata" => 0x048,
+ "for" => 0x07c );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+sub unalignaddr {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+ foreach ($rs1,$rs2,$rd) {
+ if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
+ else { return $ref; }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00300|$rd<<25|$rs1<<14|$rs2,
+ $ref;
+}
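
The encodings these helpers emit can be checked by hand; here is a minimal standalone Perl sketch (not part of the patch) that reproduces both formulas for one instance each:

    my %visopf = ( "faligndata" => 0x048, "for" => 0x07c );
    sub encode_vis {
        my ($mnemonic,$rs1,$rs2,$rd) = @_;   # plain register numbers
        return 0x81b00000 | $rd<<25 | $rs1<<14 | $visopf{$mnemonic}<<5 | $rs2;
    }
    # faligndata %f18,%f20,%f16 comes out as 0xa1b48914
    printf ".word\t0x%08x\n", encode_vis("faligndata",18,20,16);
    # alignaddr %o1,%g0,%o1: %o registers are biased by 8, so rs1=rd=9
    printf ".word\t0x%08x\n", 0x81b00300 | 9<<25 | 9<<14 | 0;   # 0x93b24300

Note the upper-bank re-encoding in unvis: an even register such as %f32 maps to (32|32>>5)&31 == 1, i.e. bit 0 of the field selects registers %f32..%f62.
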
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /ge;
+ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unalignaddr($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
close STDOUT;
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 8d5167855781..b7b44b441136 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
@@ -39,6 +39,64 @@
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
+#
+# May 2012.
+#
+# Optimization including one of Pavel Semjanov's ideas, an alternative
+# Maj, resulted in >=5% improvement on most CPUs; on P4 it is +20% for
+# SHA256 but unfortunately -2% for SHA512 [which nobody should care
+# about that much].
+#
+# June 2012.
+#
+# Add SIMD code paths, see below for improvement coefficients. An SSSE3
+# code path was not attempted for SHA512, because the estimated
+# improvement, noticeably less than 9%, is not high enough to justify
+# the effort, at least not on pre-AVX processors. [Obviously with an
+# exclusion for VIA Nano, but it has a SHA512 instruction that is
+# faster and should be used instead.] For reference, the corresponding
+# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
+# fact that higher coefficients are observed on VIA Nano and Bulldozer
+# has more to do with specifics of their architecture [which is a
+# topic for separate discussion].
+#
+# November 2012.
+#
+# Add AVX2 code path. Two consecutive input blocks are loaded to
+# 256-bit %ymm registers, with data from first block to least
+# significant 128-bit halves and data from second to most significant.
+# The data is then processed with same SIMD instruction sequence as
+# for AVX, but with %ymm as operands. Side effect is increased stack
+# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
+# code size increase.
+#
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
+######################################################################
+# Current performance in cycles per processed byte (less is better):
+#
+# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
+#
+# AMD K8 14.9 - - 9.57 -
+# P4 17.3 - - 30.8 -
+# Core 2 15.6 13.8(+13%) - 9.97 -
+# Westmere 14.8 12.3(+19%) - 9.58 -
+# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
+# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
+# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
+# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
+# VIA Nano 23.0 16.5(+39%) - 14.7 -
+# Atom 23.0 18.9(+22%) - 14.7 -
+# Silvermont 27.4 20.6(+33%) - 17.5 -
+#
+# (*) whichever best applicable;
+# (**) switch from ror to shrd stands for fair share of improvement;
+# (***) execution time is fully determined by remaining integer-only
+# part, body_00_15; reducing the amount of SIMD instructions
+# below certain limit makes no difference/sense; to conserve
+# space SHA256 XOP code path is therefore omitted;
$flavour = shift;
$output = shift;
@@ -51,6 +109,28 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
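
Each probe above grades the toolchain by summing booleans, so $avx ends up 0 (no AVX support), 1 (AVX) or 2 (AVX plus AVX2). A trivial illustration with an assumed assembler version:

    my $ver = 2.20;                          # say, GNU as 2.20
    my $avx = ($ver>=2.19) + ($ver>=2.22);   # == 1: AVX yes, AVX2 no
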
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -60,7 +140,7 @@ if ($output =~ /512/) {
$SZ=8;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
"%r8", "%r9", "%r10","%r11");
- ($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
+ ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@@ -72,7 +152,7 @@ if ($output =~ /512/) {
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
- ($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
+ ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@@ -80,8 +160,7 @@ if ($output =~ /512/) {
$rounds=64;
}
-$ctx="%rdi"; # 1st arg
-$round="%rdi"; # zaps $ctx
+$ctx="%rdi"; # 1st arg, zapped by $a3
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
@@ -94,47 +173,51 @@ $framesz="16*$SZ+4*8";
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+ my $STRIDE=$SZ;
+ $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
$code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
- mov $T1,`$SZ*($i&0xf)`(%rsp)
- ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $e,$a0
+ ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $g,$a2 # f^g
- ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
- add $h,$T1 # T1+=h
+ mov $T1,`$SZ*($i&0xf)`(%rsp)
xor $a,$a1
-
- add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
- mov $b,$h
+
+ ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
+ add $h,$T1 # T1+=h
+ xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
xor $e,$a0
- xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
+ add $a2,$T1 # T1+=Ch(e,f,g)
- xor $c,$h # b^c
+ mov $a,$a2
+ add ($Tbl),$T1 # T1+=K[round]
xor $a,$a1
- add $a2,$T1 # T1+=Ch(e,f,g)
- mov $b,$a2
+ xor $b,$a2 # a^b, b^c in next round
ror \$$Sigma1[0],$a0 # Sigma1(e)
- and $a,$h # h=(b^c)&a
- and $c,$a2 # b&c
+ mov $b,$h
+ and $a2,$a3
ror \$$Sigma0[0],$a1 # Sigma0(a)
add $a0,$T1 # T1+=Sigma1(e)
- add $a2,$h # h+=b&c (completes +=Maj(a,b,c)
+ xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
add $T1,$d # d+=T1
add $T1,$h # h+=T1
- lea 1($round),$round # round++
- add $a1,$h # h+=Sigma0(a)
+ lea $STRIDE($Tbl),$Tbl # round++
+___
+$code.=<<___ if ($i<15);
+ add $a1,$h # h+=Sigma0(a)
___
+ ($a2,$a3) = ($a3,$a2);
}
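
The $STRIDE bookkeeping above pairs with the K table layout further down: every 16-byte group of constants is stored twice, evidently so that a 32-byte AVX2 load sees the same K in both 128-bit lanes, and the scalar rounds therefore skip an extra 16 bytes at the end of each group. A standalone sketch of the resulting walk for SHA256 ($SZ==4):

    my $SZ = 4; my $off = 0;
    for my $i (0..7) {
        my $stride = $SZ + (($i%(16/$SZ)) == (16/$SZ-1) ? 16 : 0);
        printf "round %d: K at offset %d, advance %d\n", $i, $off, $stride;
        $off += $stride;   # rounds 0..3 read 0,4,8,12, then hop over the copy
    }
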
sub ROUND_16_XX()
@@ -142,29 +225,28 @@ sub ROUND_16_XX()
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
- mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
- mov $a0,$T1
- mov $a1,$a2
-
- ror \$`$sigma0[1]-$sigma0[0]`,$T1
- xor $a0,$T1
- shr \$$sigma0[2],$a0
-
- ror \$$sigma0[0],$T1
- xor $T1,$a0 # sigma0(X[(i+1)&0xf])
- mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
+ mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
+ mov $a0,$T1
+ ror \$`$sigma0[1]-$sigma0[0]`,$a0
+ add $a1,$a # modulo-scheduled h+=Sigma0(a)
+ mov $a2,$a1
ror \$`$sigma1[1]-$sigma1[0]`,$a2
+
+ xor $T1,$a0
+ shr \$$sigma0[2],$T1
+ ror \$$sigma0[0],$a0
xor $a1,$a2
shr \$$sigma1[2],$a1
ror \$$sigma1[0],$a2
- add $a0,$T1
- xor $a2,$a1 # sigma1(X[(i+14)&0xf])
+ xor $a0,$T1 # sigma0(X[(i+1)&0xf])
+ xor $a1,$a2 # sigma1(X[(i+14)&0xf])
+ add `$SZ*(($i+9)&0xf)`(%rsp),$T1
add `$SZ*($i&0xf)`(%rsp),$T1
mov $e,$a0
- add $a1,$T1
+ add $a2,$T1
mov $a,$a1
___
&ROUND_00_15(@_);
@@ -173,10 +255,43 @@ ___
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl $func
-.type $func,\@function,4
+.type $func,\@function,3
.align 16
$func:
+___
+$code.=<<___ if ($SZ==4 || $avx);
+ lea OPENSSL_ia32cap_P(%rip),%r11
+ mov 0(%r11),%r9d
+ mov 4(%r11),%r10d
+ mov 8(%r11),%r11d
+___
+$code.=<<___ if ($SZ==4 && $shaext);
+ test \$`1<<29`,%r11d # check for SHA
+ jnz _shaext_shortcut
+___
+$code.=<<___ if ($avx && $SZ==8);
+ test \$`1<<11`,%r10d # check for XOP
+ jnz .Lxop_shortcut
+___
+$code.=<<___ if ($avx>1);
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
+ je .Lavx2_shortcut
+___
+$code.=<<___ if ($avx);
+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
+ and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
+ or %r9d,%r10d
+ cmp \$`1<<28|1<<9|1<<30`,%r10d
+ je .Lavx_shortcut
+___
+$code.=<<___ if ($SZ==4);
+ test \$`1<<9`,%r10d
+ jnz .Lssse3_shortcut
+___
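
The AVX2 gate above requires BMI1, AVX2 and BMI2 together: bits 3, 5 and 8 of OPENSSL_ia32cap_P[2], the CPUID leaf-7 EBX image. A standalone sketch of the mask test (assumed sample values):

    my $mask = 1<<8 | 1<<5 | 1<<3;   # BMI2|AVX2|BMI1 == 0x128
    for my $cap (0x128, 0x028) {     # with and without BMI2
        printf "cap=0x%03x -> %s\n", $cap,
            ($cap & $mask) == $mask ? "take .Lavx2_shortcut" : "fall through";
    }
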
+$code.=<<___;
push %rbx
push %rbp
push %r12
@@ -194,8 +309,6 @@ $func:
mov %r11,$_rsp # save copy of %rsp
.Lprologue:
- lea $TABLE(%rip),$Tbl
-
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
@@ -208,7 +321,9 @@ $func:
.align 16
.Lloop:
- xor $round,$round
+ mov $B,$a3
+ lea $TABLE(%rip),$Tbl
+ xor $C,$a3 # magic
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
@@ -229,10 +344,11 @@ ___
}
$code.=<<___;
- cmp \$$rounds,$round
- jb .Lrounds_16_xx
+ cmpb \$0,`$SZ-1`($Tbl)
+ jnz .Lrounds_16_xx
mov $_ctx,$ctx
+ add $a1,$A # modulo-scheduled h+=Sigma0(a)
lea 16*$SZ($inp),$inp
add $SZ*0($ctx),$A
@@ -275,21 +391,45 @@ $code.=<<___;
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+ .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+ .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+ .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+ .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
} else {
$code.=<<___;
@@ -297,47 +437,1714 @@ $code.=<<___;
.type $TABLE,\@object
$TABLE:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+ .quad 0x0001020304050607,0x08090a0b0c0d0e0f
+ .quad 0x0001020304050607,0x08090a0b0c0d0e0f
+ .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+}
+
+######################################################################
+# SIMD code paths
+#
+if ($SZ==4 && $shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA256 update function.
+#
+my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
+
+my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
+my @MSG=map("%xmm$_",(3..6));
+
+$code.=<<___;
+.type sha256_block_data_order_shaext,\@function,3
+.align 64
+sha256_block_data_order_shaext:
+_shaext_shortcut:
+___
+$code.=<<___ if ($win64);
+ mov %rsp,%rax
+ lea `-8-5*16`(%rsp),%rsp
+ movaps %xmm6,-8-5*16(%rax)
+ movaps %xmm7,-8-4*16(%rax)
+ movaps %xmm8,-8-3*16(%rax)
+ movaps %xmm9,-8-2*16(%rax)
+ movaps %xmm10,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ lea K256+0x80(%rip),$Tbl
+ movdqu ($ctx),$ABEF # DCBA
+ movdqu 16($ctx),$CDGH # HGFE
+ movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
+
+ pshufd \$0x1b,$ABEF,$Wi # ABCD
+ pshufd \$0xb1,$ABEF,$ABEF # CDAB
+ pshufd \$0x1b,$CDGH,$CDGH # EFGH
+ movdqa $TMP,$BSWAP # offload
+ palignr \$8,$CDGH,$ABEF # ABEF
+ punpcklqdq $Wi,$CDGH # CDGH
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu ($inp),@MSG[0]
+ movdqu 0x10($inp),@MSG[1]
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $TMP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+
+ movdqa 0*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ pshufb $TMP,@MSG[1]
+ movdqa $CDGH,$CDGH_SAVE # offload
+ sha256rnds2 $ABEF,$CDGH # 0-3
+ pshufd \$0x0e,$Wi,$Wi
+ nop
+ movdqa $ABEF,$ABEF_SAVE # offload
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 1*32-0x80($Tbl),$Wi
+ paddd @MSG[1],$Wi
+ pshufb $TMP,@MSG[2]
+ sha256rnds2 $ABEF,$CDGH # 4-7
+ pshufd \$0x0e,$Wi,$Wi
+ lea 0x40($inp),$inp
+ sha256msg1 @MSG[1],@MSG[0]
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 2*32-0x80($Tbl),$Wi
+ paddd @MSG[2],$Wi
+ pshufb $TMP,@MSG[3]
+ sha256rnds2 $ABEF,$CDGH # 8-11
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[3],$TMP
+ palignr \$4,@MSG[2],$TMP
+ nop
+ paddd $TMP,@MSG[0]
+ sha256msg1 @MSG[2],@MSG[1]
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 3*32-0x80($Tbl),$Wi
+ paddd @MSG[3],$Wi
+ sha256msg2 @MSG[3],@MSG[0]
+ sha256rnds2 $ABEF,$CDGH # 12-15
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[0],$TMP
+ palignr \$4,@MSG[3],$TMP
+ nop
+ paddd $TMP,@MSG[1]
+ sha256msg1 @MSG[3],@MSG[2]
+ sha256rnds2 $CDGH,$ABEF
+___
+for($i=4;$i<16-3;$i++) {
+$code.=<<___;
+ movdqa $i*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ sha256msg2 @MSG[0],@MSG[1]
+ sha256rnds2 $ABEF,$CDGH # 16-19...
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[1],$TMP
+ palignr \$4,@MSG[0],$TMP
+ nop
+ paddd $TMP,@MSG[2]
+ sha256msg1 @MSG[0],@MSG[3]
+ sha256rnds2 $CDGH,$ABEF
+___
+ push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ movdqa 13*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ sha256msg2 @MSG[0],@MSG[1]
+ sha256rnds2 $ABEF,$CDGH # 52-55
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[1],$TMP
+ palignr \$4,@MSG[0],$TMP
+ sha256rnds2 $CDGH,$ABEF
+ paddd $TMP,@MSG[2]
+
+ movdqa 14*32-0x80($Tbl),$Wi
+ paddd @MSG[1],$Wi
+ sha256rnds2 $ABEF,$CDGH # 56-59
+ pshufd \$0x0e,$Wi,$Wi
+ sha256msg2 @MSG[1],@MSG[2]
+ movdqa $BSWAP,$TMP
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 15*32-0x80($Tbl),$Wi
+ paddd @MSG[2],$Wi
+ nop
+ sha256rnds2 $ABEF,$CDGH # 60-63
+ pshufd \$0x0e,$Wi,$Wi
+ dec $num
+ nop
+ sha256rnds2 $CDGH,$ABEF
+
+ paddd $CDGH_SAVE,$CDGH
+ paddd $ABEF_SAVE,$ABEF
+ jnz .Loop_shaext
+
+ pshufd \$0xb1,$CDGH,$CDGH # DCHG
+ pshufd \$0x1b,$ABEF,$TMP # FEBA
+ pshufd \$0xb1,$ABEF,$ABEF # BAFE
+ punpckhqdq $CDGH,$ABEF # DCBA
+ palignr \$8,$TMP,$CDGH # HGFE
+
+ movdqu $ABEF,($ctx)
+ movdqu $CDGH,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-5*16(%rax),%xmm6
+ movaps -8-4*16(%rax),%xmm7
+ movaps -8-3*16(%rax),%xmm8
+ movaps -8-2*16(%rax),%xmm9
+ movaps -8-1*16(%rax),%xmm10
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
+___
+}}}
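
A note for reading the loop above: $Wi is %xmm0, the implicit round-key operand of sha256rnds2, and each sha256rnds2 retires two rounds, so one 16-byte K group drives four rounds via the pshufd-\$0x0e shuffle that moves the upper WK qword down. The recurring pattern, annotated (a sketch of lines already in the code above):

    movdqa      $i*32-0x80($Tbl),$Wi    # K[4i..4i+3]; $Wi is %xmm0
    paddd       @MSG[0],$Wi             # WK = K + W for these four rounds
    sha256rnds2 $ABEF,$CDGH             # rounds 4i,4i+1 (low qword of %xmm0)
    pshufd      \$0x0e,$Wi,$Wi          # move high WK qword down
    sha256rnds2 $CDGH,$ABEF             # rounds 4i+2,4i+3
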
+{{{
+
+my $a4=$T1;
+my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
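
The thunk turns any unknown sub call into one instruction line appended to $code: the last Perl argument is emitted first, so a call written in (dst,src) order comes out in AT&T (src,dst) order, and bare numbers gain a '$' immediate prefix. A standalone sketch (hypothetical asm package, same body as above):

    package asm;
    our ($AUTOLOAD,$code);
    sub AUTOLOAD {
        my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
        my $arg = pop;
        $arg = "\$$arg" if ($arg*1 eq $arg);   # bare number -> immediate
        $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    }
    package main;
    asm::ror("%r13d",5);      # appends "ror \$5,%r13d"
    asm::mov("%eax","%ebx");  # appends "mov %ebx,%eax" -- AT&T src,dst
    print $asm::code;
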
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+ '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
+ '&mov ($a,$a1)',
+ '&mov ($a4,$f)',
+
+ '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
+ '&xor ($a0,$e)',
+ '&xor ($a4,$g)', # f^g
+
+ '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
+ '&xor ($a1,$a)',
+ '&and ($a4,$e)', # (f^g)&e
+
+ '&xor ($a0,$e)',
+ '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
+ '&mov ($a2,$a)',
+
+ '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
+ '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
+ '&xor ($a2,$b)', # a^b, b^c in next round
+
+ '&add ($h,$a4)', # h+=Ch(e,f,g)
+ '&ror ($a0,$Sigma1[0])', # Sigma1(e)
+ '&and ($a3,$a2)', # (b^c)&(a^b)
+
+ '&xor ($a1,$a)',
+ '&add ($h,$a0)', # h+=Sigma1(e)
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
+
+ '&ror ($a1,$Sigma0[0])', # Sigma0(a)
+ '&add ($d,$h)', # d+=h
+ '&add ($h,$a3)', # h+=Maj(a,b,c)
+
+ '&mov ($a0,$d)',
+ '&add ($a1,$h);'. # h+=Sigma0(a)
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+ );
+}
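
The "magic" xors and the $a2/$a3 swap at the end implement the alternative Maj: the b^c computed in one round is reused as a^b in the next, and Maj(a,b,c) is obtained as ((b^c)&(a^b))^b, i.e. Ch(a^b,c,b). The identity is easy to verify exhaustively (standalone sketch):

    for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
        my $maj = ($a&$b) ^ ($a&$c) ^ ($b&$c);
        my $alt = (($b^$c) & ($a^$b)) ^ $b;    # Ch(a^b,c,b)
        die "mismatch at a=$a b=$b c=$c\n" if $maj != $alt;
    }}}
    print "Maj(a,b,c) == ((b^c)&(a^b))^b for all bit values\n";
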
+
+######################################################################
+# SSSE3 code path
+#
+if ($SZ==4) { # SHA256 only
+my @X = map("%xmm$_",(0..3));
+my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
+
+$code.=<<___;
+.type ${func}_ssse3,\@function,3
+.align 64
+${func}_ssse3:
+.Lssse3_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ shl \$4,%rdx # num*16
+ sub \$`$framesz+$win64*16*4`,%rsp
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
+ and \$-64,%rsp # align stack frame
+ mov $ctx,$_ctx # save ctx, 1st arg
+ mov $inp,$_inp # save inp, 2nd arg
+ mov %rdx,$_end # save end pointer, "3rd" arg
+ mov %r11,$_rsp # save copy of %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,16*$SZ+32(%rsp)
+ movaps %xmm7,16*$SZ+48(%rsp)
+ movaps %xmm8,16*$SZ+64(%rsp)
+ movaps %xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___;
+.Lprologue_ssse3:
+
+ mov $SZ*0($ctx),$A
+ mov $SZ*1($ctx),$B
+ mov $SZ*2($ctx),$C
+ mov $SZ*3($ctx),$D
+ mov $SZ*4($ctx),$E
+ mov $SZ*5($ctx),$F
+ mov $SZ*6($ctx),$G
+ mov $SZ*7($ctx),$H
+___
+
+$code.=<<___;
+ #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+ #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
+ jmp .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+ movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ movdqu 0x00($inp),@X[0]
+ movdqu 0x10($inp),@X[1]
+ movdqu 0x20($inp),@X[2]
+ pshufb $t3,@X[0]
+ movdqu 0x30($inp),@X[3]
+ lea $TABLE(%rip),$Tbl
+ pshufb $t3,@X[1]
+ movdqa 0x00($Tbl),$t0
+ movdqa 0x20($Tbl),$t1
+ pshufb $t3,@X[2]
+ paddd @X[0],$t0
+ movdqa 0x40($Tbl),$t2
+ pshufb $t3,@X[3]
+ movdqa 0x60($Tbl),$t3
+ paddd @X[1],$t1
+ paddd @X[2],$t2
+ paddd @X[3],$t3
+ movdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ movdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ movdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ movdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
+___
+sub Xupdate_256_SSSE3 () {
+ (
+ '&movdqa ($t0,@X[1]);',
+ '&movdqa ($t3,@X[3])',
+ '&palignr ($t0,@X[0],$SZ)', # X[1..4]
+ '&palignr ($t3,@X[2],$SZ);', # X[9..12]
+ '&movdqa ($t1,$t0)',
+ '&movdqa ($t2,$t0);',
+ '&psrld ($t0,$sigma0[2])',
+ '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
+ '&psrld ($t2,$sigma0[0])',
+ '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
+ '&pslld ($t1,8*$SZ-$sigma0[1]);'.
+ '&pxor ($t0,$t2)',
+ '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
+ '&pxor ($t0,$t1)',
+ '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
+ '&pxor ($t0,$t2);',
+ '&movdqa ($t2,$t3)',
+ '&pxor ($t0,$t1);', # sigma0(X[1..4])
+ '&psrld ($t3,$sigma1[2])',
+ '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
+ '&psrlq ($t2,$sigma1[0])',
+ '&pxor ($t3,$t2);',
+ '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
+ '&pxor ($t3,$t2)',
+ '&pshufb ($t3,$t4)', # sigma1(X[14..15])
+ '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
+ '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
+ '&movdqa ($t2,$t3);',
+ '&psrld ($t3,$sigma1[2])',
+ '&psrlq ($t2,$sigma1[0])',
+ '&pxor ($t3,$t2);',
+ '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
+ '&pxor ($t3,$t2);',
+ '&movdqa ($t2,16*2*$j."($Tbl)")',
+ '&pshufb ($t3,$t5)',
+ '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
+ );
+}
+
+sub SSSE3_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ if (0) {
+ foreach (Xupdate_256_SSSE3()) { # 36 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ } else { # squeeze extra 4% on Westmere and 19% on Atom
+ eval(shift(@insns)); #@
+ &movdqa ($t0,@X[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t3,@X[3]);
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &palignr ($t0,@X[0],$SZ); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &palignr ($t3,@X[2],$SZ); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &movdqa ($t1,$t0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t2,$t0);
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &psrld ($t0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[0],$t3); # X[0..3] += X[9..12]
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &psrld ($t2,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufd ($t3,@X[3],0b11111010); # X[14..15]
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &pslld ($t1,8*$SZ-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t0,$t2);
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &psrld ($t2,$sigma0[1]-$sigma0[0]);
+ eval(shift(@insns));
+ &pxor ($t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pslld ($t1,$sigma0[1]-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t0,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &movdqa ($t2,$t3);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t0,$t1); # sigma0(X[1..4])
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t3,$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &psrlq ($t2,$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t3,$t2);
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &psrlq ($t2,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t3,$t2);
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ #&pshufb ($t3,$t4); # sigma1(X[14..15])
+ &pshufd ($t3,$t3,0b10000000);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq ($t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufd ($t3,@X[0],0b01010000); # X[16..17]
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &movdqa ($t2,$t3);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld ($t3,$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ &psrlq ($t2,$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t3,$t2);
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &psrlq ($t2,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor ($t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); #@
+ #&pshufb ($t3,$t5);
+ &pshufd ($t3,$t3,0b00001000);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa ($t2,16*2*$j."($Tbl)");
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ &pslldq ($t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns)); #@
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &paddd ($t2,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &movdqa (16*$j."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &SSSE3_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lssse3_00_47");
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+$code.=<<___;
+ mov $_ctx,$ctx
+ mov $a1,$A
+
+ add $SZ*0($ctx),$A
+ lea 16*$SZ($inp),$inp
+ add $SZ*1($ctx),$B
+ add $SZ*2($ctx),$C
+ add $SZ*3($ctx),$D
+ add $SZ*4($ctx),$E
+ add $SZ*5($ctx),$F
+ add $SZ*6($ctx),$G
+ add $SZ*7($ctx),$H
+
+ cmp $_end,$inp
+
+ mov $A,$SZ*0($ctx)
+ mov $B,$SZ*1($ctx)
+ mov $C,$SZ*2($ctx)
+ mov $D,$SZ*3($ctx)
+ mov $E,$SZ*4($ctx)
+ mov $F,$SZ*5($ctx)
+ mov $G,$SZ*6($ctx)
+ mov $H,$SZ*7($ctx)
+ jb .Lloop_ssse3
+
+ mov $_rsp,%rsi
+___
+$code.=<<___ if ($win64);
+ movaps 16*$SZ+32(%rsp),%xmm6
+ movaps 16*$SZ+48(%rsp),%xmm7
+ movaps 16*$SZ+64(%rsp),%xmm8
+ movaps 16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size ${func}_ssse3,.-${func}_ssse3
+___
+}
+
+if ($avx) {{
+######################################################################
+# XOP code path
+#
+if ($SZ==8) { # SHA512 only
+$code.=<<___;
+.type ${func}_xop,\@function,3
+.align 64
+${func}_xop:
+.Lxop_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ shl \$4,%rdx # num*16
+ sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
+ and \$-64,%rsp # align stack frame
+ mov $ctx,$_ctx # save ctx, 1st arg
+ mov $inp,$_inp # save inp, 2nd arg
+ mov %rdx,$_end # save end pointer, "3rd" arg
+ mov %r11,$_rsp # save copy of %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,16*$SZ+32(%rsp)
+ movaps %xmm7,16*$SZ+48(%rsp)
+ movaps %xmm8,16*$SZ+64(%rsp)
+ movaps %xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___ if ($win64 && $SZ>4);
+ movaps %xmm10,16*$SZ+96(%rsp)
+ movaps %xmm11,16*$SZ+112(%rsp)
+___
+$code.=<<___;
+.Lprologue_xop:
+
+ vzeroupper
+ mov $SZ*0($ctx),$A
+ mov $SZ*1($ctx),$B
+ mov $SZ*2($ctx),$C
+ mov $SZ*3($ctx),$D
+ mov $SZ*4($ctx),$E
+ mov $SZ*5($ctx),$F
+ mov $SZ*6($ctx),$G
+ mov $SZ*7($ctx),$H
+ jmp .Lloop_xop
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+$code.=<<___;
+.align 16
+.Lloop_xop:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00($inp),@X[0]
+ vmovdqu 0x10($inp),@X[1]
+ vmovdqu 0x20($inp),@X[2]
+ vmovdqu 0x30($inp),@X[3]
+ vpshufb $t3,@X[0],@X[0]
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[1],@X[1]
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
+___
+sub XOP_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t0,$t0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t2,@X[3],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq ($t3,$t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t2,@X[0],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpslldq ($t3,$t3,8); # 22 instructions
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &XOP_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lxop_00_47");
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ } else { # SHA512
+ my @X = map("%xmm$_",(0..7));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
+
+$code.=<<___;
+.align 16
+.Lloop_xop:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00($inp),@X[0]
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
+ vmovdqu 0x10($inp),@X[1]
+ vmovdqu 0x20($inp),@X[2]
+ vpshufb $t3,@X[0],@X[0]
+ vmovdqu 0x30($inp),@X[3]
+ vpshufb $t3,@X[1],@X[1]
+ vmovdqu 0x40($inp),@X[4]
+ vpshufb $t3,@X[2],@X[2]
+ vmovdqu 0x50($inp),@X[5]
+ vpshufb $t3,@X[3],@X[3]
+ vmovdqu 0x60($inp),@X[6]
+ vpshufb $t3,@X[4],@X[4]
+ vmovdqu 0x70($inp),@X[7]
+ vpshufb $t3,@X[5],@X[5]
+ vpaddq -0x80($Tbl),@X[0],$t0
+ vpshufb $t3,@X[6],@X[6]
+ vpaddq -0x60($Tbl),@X[1],$t1
+ vpshufb $t3,@X[7],@X[7]
+ vpaddq -0x40($Tbl),@X[2],$t2
+ vpaddq -0x20($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ vpaddq 0x00($Tbl),@X[4],$t0
+ vmovdqa $t1,0x10(%rsp)
+ vpaddq 0x20($Tbl),@X[5],$t1
+ vmovdqa $t2,0x20(%rsp)
+ vpaddq 0x40($Tbl),@X[6],$t2
+ vmovdqa $t3,0x30(%rsp)
+ vpaddq 0x60($Tbl),@X[7],$t3
+ vmovdqa $t0,0x40(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x50(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x60(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x70(%rsp)
+ mov $E,$a0
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ add \$`16*2*$SZ`,$Tbl
+___
+sub XOP_512_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body); # 52 instructions
+
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrlq ($t0,$t0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrlq ($t2,@X[7],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<8; $j++) {
+ &XOP_512_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
+ &jne (".Lxop_00_47");
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+}
+$code.=<<___;
+ mov $_ctx,$ctx
+ mov $a1,$A
+
+ add $SZ*0($ctx),$A
+ lea 16*$SZ($inp),$inp
+ add $SZ*1($ctx),$B
+ add $SZ*2($ctx),$C
+ add $SZ*3($ctx),$D
+ add $SZ*4($ctx),$E
+ add $SZ*5($ctx),$F
+ add $SZ*6($ctx),$G
+ add $SZ*7($ctx),$H
+
+ cmp $_end,$inp
+
+ mov $A,$SZ*0($ctx)
+ mov $B,$SZ*1($ctx)
+ mov $C,$SZ*2($ctx)
+ mov $D,$SZ*3($ctx)
+ mov $E,$SZ*4($ctx)
+ mov $F,$SZ*5($ctx)
+ mov $G,$SZ*6($ctx)
+ mov $H,$SZ*7($ctx)
+ jb .Lloop_xop
+
+ mov $_rsp,%rsi
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*$SZ+32(%rsp),%xmm6
+ movaps 16*$SZ+48(%rsp),%xmm7
+ movaps 16*$SZ+64(%rsp),%xmm8
+ movaps 16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___ if ($win64 && $SZ>4);
+ movaps 16*$SZ+96(%rsp),%xmm10
+ movaps 16*$SZ+112(%rsp),%xmm11
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_xop:
+ ret
+.size ${func}_xop,.-${func}_xop
+___
+}
+######################################################################
+# AVX+shrd code path
+#
+local *ror = sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type ${func}_avx,\@function,3
+.align 64
+${func}_avx:
+.Lavx_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ shl \$4,%rdx # num*16
+ sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
+ and \$-64,%rsp # align stack frame
+ mov $ctx,$_ctx # save ctx, 1st arg
+ mov $inp,$_inp # save inp, 2nd arg
+ mov %rdx,$_end # save end pointer, "3rd" arg
+ mov %r11,$_rsp # save copy of %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,16*$SZ+32(%rsp)
+ movaps %xmm7,16*$SZ+48(%rsp)
+ movaps %xmm8,16*$SZ+64(%rsp)
+ movaps %xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___ if ($win64 && $SZ>4);
+ movaps %xmm10,16*$SZ+96(%rsp)
+ movaps %xmm11,16*$SZ+112(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx:
+
+ vzeroupper
+ mov $SZ*0($ctx),$A
+ mov $SZ*1($ctx),$B
+ mov $SZ*2($ctx),$C
+ mov $SZ*3($ctx),$D
+ mov $SZ*4($ctx),$E
+ mov $SZ*5($ctx),$F
+ mov $SZ*6($ctx),$G
+ mov $SZ*7($ctx),$H
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
+
+$code.=<<___;
+ vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+ vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00($inp),@X[0]
+ vmovdqu 0x10($inp),@X[1]
+ vmovdqu 0x20($inp),@X[2]
+ vmovdqu 0x30($inp),@X[3]
+ vpshufb $t3,@X[0],@X[0]
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[1],@X[1]
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ sub \$`-16*2*$SZ`,$Tbl # size optimization
+___
+sub Xupdate_256_AVX () {
+ (
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
+ '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
+ '&vpsrld ($t2,$t0,$sigma0[0]);',
+ '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
+ '&vpsrld ($t3,$t0,$sigma0[2])',
+ '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
+ '&vpxor ($t0,$t3,$t2)',
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
+ '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t1)',
+ '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t2)',
+ '&vpsrld ($t2,$t3,$sigma1[2]);',
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
+ '&vpsrlq ($t3,$t3,$sigma1[0]);',
+ '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
+ '&vpxor ($t2,$t2,$t3);',
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3)',
+ '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
+ '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
+ '&vpsrld ($t2,$t3,$sigma1[2])',
+ '&vpsrlq ($t3,$t3,$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3);',
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3)',
+ '&vpshufb ($t2,$t2,$t5)',
+ '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
+ );
+}
+
+sub AVX_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ foreach (Xupdate_256_AVX()) { # 29 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lavx_00_47");
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ } else { # SHA512
+ my @X = map("%xmm$_",(0..7));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
+
+$code.=<<___;
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00($inp),@X[0]
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
+ vmovdqu 0x10($inp),@X[1]
+ vmovdqu 0x20($inp),@X[2]
+ vpshufb $t3,@X[0],@X[0]
+ vmovdqu 0x30($inp),@X[3]
+ vpshufb $t3,@X[1],@X[1]
+ vmovdqu 0x40($inp),@X[4]
+ vpshufb $t3,@X[2],@X[2]
+ vmovdqu 0x50($inp),@X[5]
+ vpshufb $t3,@X[3],@X[3]
+ vmovdqu 0x60($inp),@X[6]
+ vpshufb $t3,@X[4],@X[4]
+ vmovdqu 0x70($inp),@X[7]
+ vpshufb $t3,@X[5],@X[5]
+ vpaddq -0x80($Tbl),@X[0],$t0
+ vpshufb $t3,@X[6],@X[6]
+ vpaddq -0x60($Tbl),@X[1],$t1
+ vpshufb $t3,@X[7],@X[7]
+ vpaddq -0x40($Tbl),@X[2],$t2
+ vpaddq -0x20($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ vpaddq 0x00($Tbl),@X[4],$t0
+ vmovdqa $t1,0x10(%rsp)
+ vpaddq 0x20($Tbl),@X[5],$t1
+ vmovdqa $t2,0x20(%rsp)
+ vpaddq 0x40($Tbl),@X[6],$t2
+ vmovdqa $t3,0x30(%rsp)
+ vpaddq 0x60($Tbl),@X[7],$t3
+ vmovdqa $t0,0x40(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x50(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x60(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x70(%rsp)
+ mov $E,$a0
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ add \$`16*2*$SZ`,$Tbl
+___
+sub Xupdate_512_AVX () {
+ (
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
+ '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
+ '&vpsrlq ($t2,$t0,$sigma0[0])',
+ '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
+ '&vpsrlq ($t3,$t0,$sigma0[2])',
+ '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
+ '&vpxor ($t0,$t3,$t2)',
+ '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t1)',
+ '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t2)',
+ '&vpsrlq ($t3,@X[7],$sigma1[2]);',
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
+ '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
+ '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
+ '&vpsrlq ($t1,@X[7],$sigma1[0]);',
+ '&vpxor ($t3,$t3,$t2)',
+ '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
+ '&vpxor ($t3,$t3,$t1)',
+ '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
+ '&vpxor ($t3,$t3,$t2)',
+ '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
+ '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
+ );
+}
+
+sub AVX_512_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body); # 52 instructions
+
+ foreach (Xupdate_512_AVX()) { # 23 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<8; $j++) {
+ &AVX_512_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
+ &jne (".Lavx_00_47");
+
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+}
+$code.=<<___;
+ mov $_ctx,$ctx
+ mov $a1,$A
+
+ add $SZ*0($ctx),$A
+ lea 16*$SZ($inp),$inp
+ add $SZ*1($ctx),$B
+ add $SZ*2($ctx),$C
+ add $SZ*3($ctx),$D
+ add $SZ*4($ctx),$E
+ add $SZ*5($ctx),$F
+ add $SZ*6($ctx),$G
+ add $SZ*7($ctx),$H
+
+ cmp $_end,$inp
+
+ mov $A,$SZ*0($ctx)
+ mov $B,$SZ*1($ctx)
+ mov $C,$SZ*2($ctx)
+ mov $D,$SZ*3($ctx)
+ mov $E,$SZ*4($ctx)
+ mov $F,$SZ*5($ctx)
+ mov $G,$SZ*6($ctx)
+ mov $H,$SZ*7($ctx)
+ jb .Lloop_avx
+
+ mov $_rsp,%rsi
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*$SZ+32(%rsp),%xmm6
+ movaps 16*$SZ+48(%rsp),%xmm7
+ movaps 16*$SZ+64(%rsp),%xmm8
+ movaps 16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___ if ($win64 && $SZ>4);
+ movaps 16*$SZ+96(%rsp),%xmm10
+ movaps 16*$SZ+112(%rsp),%xmm11
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size ${func}_avx,.-${func}_avx
+___
+
+if ($avx>1) {{
+######################################################################
+# AVX2+BMI code path
+#
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $PUSH8=8*2*$SZ;
+use integer;
+
+sub bodyx_00_15 () {
+ # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+ '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
+ '&and ($a4,$e)', # f&e
+ '&rorx ($a0,$e,$Sigma1[2])',
+ '&rorx ($a2,$e,$Sigma1[1])',
+
+ '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
+ '&lea ($h,"($h,$a4)")',
+ '&andn ($a4,$e,$g)', # ~e&g
+ '&xor ($a0,$a2)',
+
+ '&rorx ($a1,$e,$Sigma1[0])',
+ '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
+ '&xor ($a0,$a1)', # Sigma1(e)
+ '&mov ($a2,$a)',
+
+ '&rorx ($a4,$a,$Sigma0[2])',
+ '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
+ '&xor ($a2,$b)', # a^b, b^c in next round
+ '&rorx ($a1,$a,$Sigma0[1])',
+
+ '&rorx ($a0,$a,$Sigma0[0])',
+ '&lea ($d,"($d,$h)")', # d+=h
+ '&and ($a3,$a2)', # (b^c)&(a^b)
+ '&xor ($a1,$a4)',
+
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
+ '&xor ($a1,$a0)', # Sigma0(a)
+ '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
+ '&mov ($a4,$e)', # copy of f in future
+
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+ );
+ # and at the finish one has to do $a+=$a1
+}
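
bodyx_00_15 spells each Sigma out as three independent rorx results xored together; since rorx writes no flags, the three rotates carry no dependency on each other or on the neighbouring adds. A standalone sketch of the same computation for SHA256's Sigma1 (rotation set 6,11,25):

    sub ror32 { my ($x,$n) = @_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff }
    my $e = 0x510e527f;   # sample input
    printf "Sigma1 = 0x%08x\n", ror32($e,6) ^ ror32($e,11) ^ ror32($e,25);
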
+
+$code.=<<___;
+.type ${func}_avx2,\@function,3
+.align 64
+${func}_avx2:
+.Lavx2_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
+ shl \$4,%rdx # num*16
+ and \$-256*$SZ,%rsp # align stack frame
+ lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
+ add \$`2*$SZ*($rounds-8)`,%rsp
+ mov $ctx,$_ctx # save ctx, 1st arg
+ mov $inp,$_inp # save inp, 2nd arg
+ mov %rdx,$_end # save end pointer, "3rd" arg
+ mov %r11,$_rsp # save copy of %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,16*$SZ+32(%rsp)
+ movaps %xmm7,16*$SZ+48(%rsp)
+ movaps %xmm8,16*$SZ+64(%rsp)
+ movaps %xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___ if ($win64 && $SZ>4);
+ movaps %xmm10,16*$SZ+96(%rsp)
+ movaps %xmm11,16*$SZ+112(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx2:
+
+ vzeroupper
+ sub \$-16*$SZ,$inp # inp++, size optimization
+ mov $SZ*0($ctx),$A
+ mov $inp,%r12 # borrow $T1
+ mov $SZ*1($ctx),$B
+ cmp %rdx,$inp # $_end
+ mov $SZ*2($ctx),$C
+ cmove %rsp,%r12 # next block or random data
+ mov $SZ*3($ctx),$D
+ mov $SZ*4($ctx),$E
+ mov $SZ*5($ctx),$F
+ mov $SZ*6($ctx),$G
+ mov $SZ*7($ctx),$H
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%ymm$_",(0..3));
+ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
+
+$code.=<<___;
+ vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+ vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu -16*$SZ+0($inp),%xmm0
+ vmovdqu -16*$SZ+16($inp),%xmm1
+ vmovdqu -16*$SZ+32($inp),%xmm2
+ vmovdqu -16*$SZ+48($inp),%xmm3
+ #mov $inp,$_inp # offload $inp
+ vinserti128 \$1,(%r12),@X[0],@X[0]
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
+ vpshufb $t3,@X[0],@X[0]
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
+ vpshufb $t3,@X[1],@X[1]
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
+
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ xor $a1,$a1
+ vmovdqa $t1,0x20(%rsp)
+ lea -$PUSH8(%rsp),%rsp
+ mov $B,$a3
+ vmovdqa $t2,0x00(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x20(%rsp)
+ mov $F,$a4
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+___
+
+sub AVX2_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
+my $base = "+2*$PUSH8(%rsp)";
+
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
+ foreach (Xupdate_256_AVX()) { # 29 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX2_256_00_47($j,\&bodyx_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
+ &cmpb (($SZ-1)."($Tbl)",0);
+ &jne (".Lavx2_00_47");
+
+ for ($i=0; $i<16; ) {
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
+ foreach(bodyx_00_15()) { eval; }
+ }
+ } else { # SHA512
+ my @X = map("%ymm$_",(0..7));
+ my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
+
+$code.=<<___;
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqu -16*$SZ($inp),%xmm0
+ vmovdqu -16*$SZ+16($inp),%xmm1
+ vmovdqu -16*$SZ+32($inp),%xmm2
+ lea $TABLE+0x80(%rip),$Tbl # size optimization
+ vmovdqu -16*$SZ+48($inp),%xmm3
+ vmovdqu -16*$SZ+64($inp),%xmm4
+ vmovdqu -16*$SZ+80($inp),%xmm5
+ vmovdqu -16*$SZ+96($inp),%xmm6
+ vmovdqu -16*$SZ+112($inp),%xmm7
+ #mov $inp,$_inp # offload $inp
+ vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
+ vinserti128 \$1,(%r12),@X[0],@X[0]
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
+ vpshufb $t2,@X[0],@X[0]
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
+ vpshufb $t2,@X[1],@X[1]
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
+ vpshufb $t2,@X[2],@X[2]
+ vinserti128 \$1,64(%r12),@X[4],@X[4]
+ vpshufb $t2,@X[3],@X[3]
+ vinserti128 \$1,80(%r12),@X[5],@X[5]
+ vpshufb $t2,@X[4],@X[4]
+ vinserti128 \$1,96(%r12),@X[6],@X[6]
+ vpshufb $t2,@X[5],@X[5]
+ vinserti128 \$1,112(%r12),@X[7],@X[7]
+
+ vpaddq -0x80($Tbl),@X[0],$t0
+ vpshufb $t2,@X[6],@X[6]
+ vpaddq -0x60($Tbl),@X[1],$t1
+ vpshufb $t2,@X[7],@X[7]
+ vpaddq -0x40($Tbl),@X[2],$t2
+ vpaddq -0x20($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ vpaddq 0x00($Tbl),@X[4],$t0
+ vmovdqa $t1,0x20(%rsp)
+ vpaddq 0x20($Tbl),@X[5],$t1
+ vmovdqa $t2,0x40(%rsp)
+ vpaddq 0x40($Tbl),@X[6],$t2
+ vmovdqa $t3,0x60(%rsp)
+ lea -$PUSH8(%rsp),%rsp
+ vpaddq 0x60($Tbl),@X[7],$t3
+ vmovdqa $t0,0x00(%rsp)
+ xor $a1,$a1
+ vmovdqa $t1,0x20(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x40(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x60(%rsp)
+ mov $F,$a4
+ add \$16*2*$SZ,$Tbl
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
___
+
+sub AVX2_512_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body); # 48 instructions
+my $base = "+2*$PUSH8(%rsp)";
+
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
+ foreach (Xupdate_512_AVX()) { # 23 instructions
+ eval;
+ if ($_ !~ /\;$/) {
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ }
+ &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
+}
+
+ for ($i=0,$j=0; $j<8; $j++) {
+ &AVX2_512_00_47($j,\&bodyx_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
+ &cmpb (($SZ-1-0x80)."($Tbl)",0);
+ &jne (".Lavx2_00_47");
+
+ for ($i=0; $i<16; ) {
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
+ foreach(bodyx_00_15()) { eval; }
+ }
}
+$code.=<<___;
+ mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
+ add $a1,$A
+ #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
+ lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
+
+ add $SZ*0($ctx),$A
+ add $SZ*1($ctx),$B
+ add $SZ*2($ctx),$C
+ add $SZ*3($ctx),$D
+ add $SZ*4($ctx),$E
+ add $SZ*5($ctx),$F
+ add $SZ*6($ctx),$G
+ add $SZ*7($ctx),$H
+
+ mov $A,$SZ*0($ctx)
+ mov $B,$SZ*1($ctx)
+ mov $C,$SZ*2($ctx)
+ mov $D,$SZ*3($ctx)
+ mov $E,$SZ*4($ctx)
+ mov $F,$SZ*5($ctx)
+ mov $G,$SZ*6($ctx)
+ mov $H,$SZ*7($ctx)
+
+ cmp `$PUSH8+2*8`($Tbl),$inp # $_end
+ je .Ldone_avx2
+
+ xor $a1,$a1
+ mov $B,$a3
+ xor $C,$a3 # magic
+ mov $F,$a4
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+___
+ for ($i=0; $i<8; ) {
+ my $base="+16($Tbl)";
+ foreach(bodyx_00_15()) { eval; }
+ }
+$code.=<<___;
+ lea -$PUSH8($Tbl),$Tbl
+ cmp %rsp,$Tbl
+ jae .Lower_avx2
+
+ mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
+ add $a1,$A
+ #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
+ lea `2*$SZ*($rounds-8)`(%rsp),%rsp
+
+ add $SZ*0($ctx),$A
+ add $SZ*1($ctx),$B
+ add $SZ*2($ctx),$C
+ add $SZ*3($ctx),$D
+ add $SZ*4($ctx),$E
+ add $SZ*5($ctx),$F
+ lea `2*16*$SZ`($inp),$inp # inp+=2
+ add $SZ*6($ctx),$G
+ mov $inp,%r12
+ add $SZ*7($ctx),$H
+ cmp $_end,$inp
+
+ mov $A,$SZ*0($ctx)
+ cmove %rsp,%r12 # next block or stale data
+ mov $B,$SZ*1($ctx)
+ mov $C,$SZ*2($ctx)
+ mov $D,$SZ*3($ctx)
+ mov $E,$SZ*4($ctx)
+ mov $F,$SZ*5($ctx)
+ mov $G,$SZ*6($ctx)
+ mov $H,$SZ*7($ctx)
+
+ jbe .Loop_avx2
+ lea (%rsp),$Tbl
+
+.Ldone_avx2:
+ lea ($Tbl),%rsp
+ mov $_rsp,%rsi
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps 16*$SZ+32(%rsp),%xmm6
+ movaps 16*$SZ+48(%rsp),%xmm7
+ movaps 16*$SZ+64(%rsp),%xmm8
+ movaps 16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___ if ($win64 && $SZ>4);
+ movaps 16*$SZ+96(%rsp),%xmm10
+ movaps 16*$SZ+112(%rsp),%xmm11
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx2:
+ ret
+.size ${func}_avx2,.-${func}_avx2
+___
+}}
+}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
@@ -366,16 +2173,32 @@ se_handler:
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
+ mov 8($disp),%rsi # disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
+___
+$code.=<<___ if ($avx>1);
+ lea .Lavx2_shortcut(%rip),%r10
+ cmp %r10,%rbx # context->Rip<avx2_shortcut
+ jb .Lnot_in_avx2
+ and \$-256*$SZ,%rax
+ add \$`2*$SZ*($rounds-8)`,%rax
+.Lnot_in_avx2:
+___
+$code.=<<___;
+ mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
lea 48(%rax),%rax
@@ -392,6 +2215,15 @@ se_handler:
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lin_prologue # non-AVX code
+
+ lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$`$SZ==4?8:12`,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
@@ -431,21 +2263,136 @@ se_handler:
pop %rsi
ret
.size se_handler,.-se_handler
+___
+
+$code.=<<___ if ($SZ==4 && $shaext);
+.type shaext_handler,\@abi-omnipotent
+.align 16
+shaext_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+ lea .Lprologue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ lea .Lepilogue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ lea -8-5*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$10,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lin_prologue
+.size shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
-
+___
+$code.=<<___ if ($SZ==4 && $shaext);
+ .rva .LSEH_begin_${func}_shaext
+ .rva .LSEH_end_${func}_shaext
+ .rva .LSEH_info_${func}_shaext
+___
+$code.=<<___ if ($SZ==4);
+ .rva .LSEH_begin_${func}_ssse3
+ .rva .LSEH_end_${func}_ssse3
+ .rva .LSEH_info_${func}_ssse3
+___
+$code.=<<___ if ($avx && $SZ==8);
+ .rva .LSEH_begin_${func}_xop
+ .rva .LSEH_end_${func}_xop
+ .rva .LSEH_info_${func}_xop
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_${func}_avx
+ .rva .LSEH_end_${func}_avx
+ .rva .LSEH_info_${func}_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_${func}_avx2
+ .rva .LSEH_end_${func}_avx2
+ .rva .LSEH_info_${func}_avx2
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_$func:
.byte 9,0,0,0
.rva se_handler
+ .rva .Lprologue,.Lepilogue # HandlerData[]
+___
+$code.=<<___ if ($SZ==4 && $shaext);
+.LSEH_info_${func}_shaext:
+ .byte 9,0,0,0
+ .rva shaext_handler
+___
+$code.=<<___ if ($SZ==4);
+.LSEH_info_${func}_ssse3:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx && $SZ==8);
+.LSEH_info_${func}_xop:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_${func}_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_${func}_avx2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+sub sha256op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha256rnds2" => 0xcb,
+ "sha256msg1" => 0xcc,
+ "sha256msg2" => 0xcd );
+
+    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
+ my @opcode=(0x0f,0x38);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+	return $instr."\t".$_[0];
+ }
+}
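
sha256op38() hand-assembles the SHA-NI instructions for toolchains that predate them: the two-byte 0F 38 escape, the opcode from %opcodelet, then a register-direct ModR/M byte. In the AT&T operand order matched above, $1 is the source register (the r/m field) and $2 the destination (the reg field). The same byte construction as a C sketch (an illustration, not part of the build):

    #include <stdint.h>
    #include <stdio.h>

    /* Emit "sha256rnds2 %xmmS,%xmmD" as raw bytes, mirroring
     * sha256op38() (a sketch): mod=11 register-direct, r/m=source,
     * reg=destination. */
    static void emit_sha256rnds2(unsigned src, unsigned dst)
    {
        uint8_t modrm = 0xc0 | (src & 7) | ((dst & 7) << 3);
        printf(".byte 0x0f,0x38,0xcb,0x%02x\n", modrm);
    }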
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
+
+ print $_,"\n";
+}
close STDOUT;
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
new file mode 100755
index 000000000000..47189502c6cc
--- /dev/null
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -0,0 +1,424 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 for PowerISA v2.07.
+#
+# Accurate performance measurements are problematic, because it is
+# always a virtualized setup with a possibly throttled processor.
+# Relative comparison is therefore more informative. This module is
+# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
+# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
+# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
+# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
+# result is the degree of computational resource utilization. POWER8
+# is a "massively multi-threaded chip", and the difference between
+# single-process and maximum multi-process benchmark results shows
+# that utilization is a whopping 94%. For sha512-ppc.pl we get a
+# [not unimpressive] 84%, and for sha1-ppc.pl 73%. 100% means that
+# the multi-process result equals the single-process one, given that
+# all threads end up on the same physical core.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$LENDIAN=($flavour=~/le/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+if ($output =~ /512/) {
+ $bits=512;
+ $SZ=8;
+ $sz="d";
+ $rounds=80;
+} else {
+ $bits=256;
+ $SZ=4;
+ $sz="w";
+ $rounds=64;
+}
+
+$func="sha${bits}_block_p8";
+$FRAME=8*$SIZE_T;
+
+$sp ="r1";
+$toc="r2";
+$ctx="r3";
+$inp="r4";
+$num="r5";
+$Tbl="r6";
+$idx="r7";
+$lrsave="r8";
+$offload="r11";
+$vrsave="r12";
+($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
+@X=map("v$_",(8..23));
+($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+
+sub ROUND {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)%16;
+
+$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
+ lvx_u @X[$i+1],0,$inp ; load X[i] in advance
+ addi $inp,$inp,16
+___
+$code.=<<___ if ($i<16 && ($i%(16/$SZ)));
+ vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
+___
+$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
+ vperm @X[$i],@X[$i],@X[$i],$lemask
+___
+$code.=<<___;
+ `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
+ vsel $Func,$g,$f,$e ; Ch(e,f,g)
+ vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
+ vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
+ vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
+ `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
+ vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
+ vxor $Func,$a,$b
+ `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
+ vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
+ vsel $Func,$b,$c,$Func ; Maj(a,b,c)
+ vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
+ vaddu${sz}m $d,$d,$h ; d+=h
+ vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
+ `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
+ lvx $Ki,$idx,$Tbl ; load next K[i]
+ addi $idx,$idx,16
+ vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
+ `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
+___
+}
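
The POWER8 vshasigma instructions evaluate the four sigma functions directly, and vsel supplies Ch (mask e selects f over g) and, via the a^b mask, Maj. One SHA-512 round in scalar C for comparison (a FIPS 180-4 sketch; the vector code also pre-adds the next round constant into g, which this reference omits):

    #include <stdint.h>

    /* One scalar SHA-512 round, as a reading aid for ROUND() above
     * (a sketch): K is the round constant, W the schedule word. */
    static uint64_t rotr(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

    static void sha512_round(uint64_t s[8], uint64_t K, uint64_t W)
    {
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint64_t S1  = rotr(e, 14) ^ rotr(e, 18) ^ rotr(e, 41);
        uint64_t Ch  = (e & f) ^ (~e & g);            /* vsel $Func,$g,$f,$e */
        uint64_t S0  = rotr(a, 28) ^ rotr(a, 34) ^ rotr(a, 39);
        uint64_t Maj = (a & b) ^ (a & c) ^ (b & c);   /* vsel with a^b mask */
        uint64_t T1  = h + S1 + Ch + K + W;
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + S0 + Maj;
    }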
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl $func
+.align 6
+$func:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ mflr $lrsave
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ mfspr $vrsave,256
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r11,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ mtspr 256,r11
+
+ bl LPICmeup
+ addi $offload,$sp,$FRAME+15
+___
+$code.=<<___ if ($LENDIAN);
+ li $idx,8
+ lvsl $lemask,0,$idx
+ vspltisb $Ki,0x0f
+ vxor $lemask,$lemask,$Ki
+___
+$code.=<<___ if ($SZ==4);
+ lvx_4w $A,$x00,$ctx
+ lvx_4w $E,$x10,$ctx
+ vsldoi $B,$A,$A,4 # unpack
+ vsldoi $C,$A,$A,8
+ vsldoi $D,$A,$A,12
+ vsldoi $F,$E,$E,4
+ vsldoi $G,$E,$E,8
+ vsldoi $H,$E,$E,12
+___
+$code.=<<___ if ($SZ==8);
+ lvx_u $A,$x00,$ctx
+ lvx_u $C,$x10,$ctx
+ lvx_u $E,$x20,$ctx
+ vsldoi $B,$A,$A,8 # unpack
+ lvx_u $G,$x30,$ctx
+ vsldoi $D,$C,$C,8
+ vsldoi $F,$E,$E,8
+ vsldoi $H,$G,$G,8
+___
+$code.=<<___;
+ li r0,`($rounds-16)/16` # inner loop counter
+ b Loop
+.align 5
+Loop:
+ lvx $Ki,$x00,$Tbl
+ li $idx,16
+ lvx_u @X[0],0,$inp
+ addi $inp,$inp,16
+ stvx $A,$x00,$offload # offload $A-$H
+ stvx $B,$x10,$offload
+ stvx $C,$x20,$offload
+ stvx $D,$x30,$offload
+ stvx $E,$x40,$offload
+ stvx $F,$x50,$offload
+ stvx $G,$x60,$offload
+ stvx $H,$x70,$offload
+ vaddu${sz}m $H,$H,$Ki # h+K[i]
+ lvx $Ki,$idx,$Tbl
+ addi $idx,$idx,16
+___
+for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mtctr r0
+ b L16_xx
+.align 5
+L16_xx:
+___
+for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bdnz L16_xx
+
+ lvx @X[2],$x00,$offload
+ subic. $num,$num,1
+ lvx @X[3],$x10,$offload
+ vaddu${sz}m $A,$A,@X[2]
+ lvx @X[4],$x20,$offload
+ vaddu${sz}m $B,$B,@X[3]
+ lvx @X[5],$x30,$offload
+ vaddu${sz}m $C,$C,@X[4]
+ lvx @X[6],$x40,$offload
+ vaddu${sz}m $D,$D,@X[5]
+ lvx @X[7],$x50,$offload
+ vaddu${sz}m $E,$E,@X[6]
+ lvx @X[8],$x60,$offload
+ vaddu${sz}m $F,$F,@X[7]
+ lvx @X[9],$x70,$offload
+ vaddu${sz}m $G,$G,@X[8]
+ vaddu${sz}m $H,$H,@X[9]
+ bne Loop
+___
+$code.=<<___ if ($SZ==4);
+ lvx @X[0],$idx,$Tbl
+ addi $idx,$idx,16
+ vperm $A,$A,$B,$Ki # pack the answer
+ lvx @X[1],$idx,$Tbl
+ vperm $E,$E,$F,$Ki
+ vperm $A,$A,$C,@X[0]
+ vperm $E,$E,$G,@X[0]
+ vperm $A,$A,$D,@X[1]
+ vperm $E,$E,$H,@X[1]
+ stvx_4w $A,$x00,$ctx
+ stvx_4w $E,$x10,$ctx
+___
+$code.=<<___ if ($SZ==8);
+ vperm $A,$A,$B,$Ki # pack the answer
+ vperm $C,$C,$D,$Ki
+ vperm $E,$E,$F,$Ki
+ vperm $G,$G,$H,$Ki
+ stvx_u $A,$x00,$ctx
+ stvx_u $C,$x10,$ctx
+ stvx_u $E,$x20,$ctx
+ stvx_u $G,$x30,$ctx
+___
+$code.=<<___;
+ li r10,`$FRAME+8*16+15`
+ mtlr $lrsave
+ li r11,`$FRAME+8*16+31`
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,6,3,0
+ .long 0
+.size $func,.-$func
+___
+
+# Ugly hack here, because PPC assembler syntax seems to vary too
+# much from platform to platform...
+$code.=<<___;
+.align 6
+LPICmeup:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
+ addi $Tbl,$Tbl,`64-8`
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
+___
+
+if ($SZ==8) {
+ local *table = sub {
+ foreach(@_) { $code.=".quad $_,$_\n"; }
+ };
+ table(
+ "0x428a2f98d728ae22","0x7137449123ef65cd",
+ "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
+ "0x3956c25bf348b538","0x59f111f1b605d019",
+ "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
+ "0xd807aa98a3030242","0x12835b0145706fbe",
+ "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
+ "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
+ "0x9bdc06a725c71235","0xc19bf174cf692694",
+ "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
+ "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
+ "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
+ "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
+ "0x983e5152ee66dfab","0xa831c66d2db43210",
+ "0xb00327c898fb213f","0xbf597fc7beef0ee4",
+ "0xc6e00bf33da88fc2","0xd5a79147930aa725",
+ "0x06ca6351e003826f","0x142929670a0e6e70",
+ "0x27b70a8546d22ffc","0x2e1b21385c26c926",
+ "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
+ "0x650a73548baf63de","0x766a0abb3c77b2a8",
+ "0x81c2c92e47edaee6","0x92722c851482353b",
+ "0xa2bfe8a14cf10364","0xa81a664bbc423001",
+ "0xc24b8b70d0f89791","0xc76c51a30654be30",
+ "0xd192e819d6ef5218","0xd69906245565a910",
+ "0xf40e35855771202a","0x106aa07032bbd1b8",
+ "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
+ "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
+ "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
+ "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
+ "0x748f82ee5defb2fc","0x78a5636f43172f60",
+ "0x84c87814a1f0ab72","0x8cc702081a6439ec",
+ "0x90befffa23631e28","0xa4506cebde82bde9",
+ "0xbef9a3f7b2c67915","0xc67178f2e372532b",
+ "0xca273eceea26619c","0xd186b8c721c0c207",
+ "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
+ "0x06f067aa72176fba","0x0a637dc5a2c898a6",
+ "0x113f9804bef90dae","0x1b710b35131c471b",
+ "0x28db77f523047d84","0x32caab7b40c72493",
+ "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
+ "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
+ "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
+$code.=<<___ if (!$LENDIAN);
+.quad 0x0001020304050607,0x1011121314151617
+___
+$code.=<<___ if ($LENDIAN); # quad-swapped
+.quad 0x1011121314151617,0x0001020304050607
+___
+} else {
+ local *table = sub {
+ foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
+ };
+ table(
+ "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
+ "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
+ "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
+ "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
+ "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
+ "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
+ "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
+ "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
+ "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
+ "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
+ "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
+ "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
+ "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
+ "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
+ "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
+ "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
+$code.=<<___ if (!$LENDIAN);
+.long 0x00010203,0x10111213,0x10111213,0x10111213
+.long 0x00010203,0x04050607,0x10111213,0x10111213
+.long 0x00010203,0x04050607,0x08090a0b,0x10111213
+___
+$code.=<<___ if ($LENDIAN); # word-swapped
+.long 0x10111213,0x10111213,0x10111213,0x00010203
+.long 0x10111213,0x10111213,0x04050607,0x00010203
+.long 0x10111213,0x08090a0b,0x04050607,0x00010203
+___
+}
+$code.=<<___;
+.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c
index de0aad8c9dfa..3bf66ae1987e 100644
--- a/crypto/sha/sha512.c
+++ b/crypto/sha/sha512.c
@@ -55,6 +55,7 @@ const char SHA512_version[] = "SHA-512" OPENSSL_VERSION_PTEXT;
# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
defined(__s390__) || defined(__s390x__) || \
+ defined(__aarch64__) || \
defined(SHA512_ASM)
# define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
# endif
@@ -353,6 +354,18 @@ static const SHA_LONG64 K512[80] = {
asm ("rotrdi %0,%1,%2" \
: "=r"(ret) \
: "r"(a),"K"(n)); ret; })
+# elif defined(__aarch64__)
+# define ROTR(a,n) ({ SHA_LONG64 ret; \
+ asm ("ror %0,%1,%2" \
+ : "=r"(ret) \
+ : "r"(a),"I"(n)); ret; })
+# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+ __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
+# define PULL64(x) ({ SHA_LONG64 ret; \
+ asm ("rev %0,%1" \
+ : "=r"(ret) \
+ : "r"(*((const SHA_LONG64 *)(&(x))))); ret; })
+# endif
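
The AArch64 macros above each compile to a single instruction: ror for the rotate and rev for the byte swap that PULL64 needs on little-endian targets. Their portable meaning, as a C sketch:

    #include <stdint.h>

    /* What the inline-asm macros compute (a sketch): a 64-bit rotate
     * right, and a load of 8 bytes interpreted big-endian. */
    static uint64_t rotr_ref(uint64_t a, unsigned n)
    {
        return (a >> n) | (a << (64 - n));
    }

    static uint64_t pull64_ref(const unsigned char *p)
    {
        uint64_t r = 0;
        for (int i = 0; i < 8; i++)
            r = (r << 8) | p[i];
        return r;
    }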
# endif
# elif defined(_MSC_VER)
# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
diff --git a/crypto/sparc_arch.h b/crypto/sparc_arch.h
new file mode 100644
index 000000000000..e30d322a4ae2
--- /dev/null
+++ b/crypto/sparc_arch.h
@@ -0,0 +1,101 @@
+#ifndef __SPARC_ARCH_H__
+# define __SPARC_ARCH_H__
+
+# define SPARCV9_TICK_PRIVILEGED (1<<0)
+# define SPARCV9_PREFER_FPU (1<<1)
+# define SPARCV9_VIS1 (1<<2)
+# define SPARCV9_VIS2 (1<<3)/* reserved */
+# define SPARCV9_FMADD (1<<4)/* reserved for SPARC64 V */
+# define SPARCV9_BLK (1<<5)/* VIS1 block copy */
+# define SPARCV9_VIS3 (1<<6)
+# define SPARCV9_RANDOM (1<<7)
+# define SPARCV9_64BIT_STACK (1<<8)
+
+/*
+ * OPENSSL_sparcv9cap_P[1] is a copy of the Compatibility Feature
+ * Register, %asr26, on SPARC-T4 and later. There is no SPARCV9_CFR
+ * bit in OPENSSL_sparcv9cap_P[0], as the %cfr copy is sufficient...
+ */
+# define CFR_AES 0x00000001/* Supports AES opcodes */
+# define CFR_DES 0x00000002/* Supports DES opcodes */
+# define CFR_KASUMI 0x00000004/* Supports KASUMI opcodes */
+# define CFR_CAMELLIA 0x00000008/* Supports CAMELLIA opcodes */
+# define CFR_MD5 0x00000010/* Supports MD5 opcodes */
+# define CFR_SHA1 0x00000020/* Supports SHA1 opcodes */
+# define CFR_SHA256 0x00000040/* Supports SHA256 opcodes */
+# define CFR_SHA512 0x00000080/* Supports SHA512 opcodes */
+# define CFR_MPMUL 0x00000100/* Supports MPMUL opcodes */
+# define CFR_MONTMUL 0x00000200/* Supports MONTMUL opcodes */
+# define CFR_MONTSQR 0x00000400/* Supports MONTSQR opcodes */
+# define CFR_CRC32C 0x00000800/* Supports CRC32C opcodes */
+
+# if defined(OPENSSL_PIC) && !defined(__PIC__)
+# define __PIC__
+# endif
+
+# if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
+# define __arch64__
+# endif
+
+# define SPARC_PIC_THUNK(reg) \
+ .align 32; \
+.Lpic_thunk: \
+ jmp %o7 + 8; \
+ add %o7, reg, reg;
+
+# define SPARC_PIC_THUNK_CALL(reg) \
+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
+ call .Lpic_thunk; \
+ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
+
+# if 1
+# define SPARC_SETUP_GOT_REG(reg) SPARC_PIC_THUNK_CALL(reg)
+# else
+# define SPARC_SETUP_GOT_REG(reg) \
+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
+ call .+8; \
+ or reg,%lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
+ add %o7, reg, reg
+# endif
+
+# if defined(__arch64__)
+
+# define SPARC_LOAD_ADDRESS(SYM, reg) \
+ setx SYM, %o7, reg;
+# define LDPTR ldx
+# define SIZE_T_CC %xcc
+# define STACK_FRAME 192
+# define STACK_BIAS 2047
+# define STACK_7thARG (STACK_BIAS+176)
+
+# else
+
+# define SPARC_LOAD_ADDRESS(SYM, reg) \
+ set SYM, reg;
+# define LDPTR ld
+# define SIZE_T_CC %icc
+# define STACK_FRAME 112
+# define STACK_BIAS 0
+# define STACK_7thARG 92
+# define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)
+
+# endif
+
+# ifdef __PIC__
+# undef SPARC_LOAD_ADDRESS
+# undef SPARC_LOAD_ADDRESS_LEAF
+# define SPARC_LOAD_ADDRESS(SYM, reg) \
+ SPARC_SETUP_GOT_REG(reg); \
+ sethi %hi(SYM), %o7; \
+ or %o7, %lo(SYM), %o7; \
+ LDPTR [reg + %o7], reg;
+# endif
+
+# ifndef SPARC_LOAD_ADDRESS_LEAF
+# define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) \
+ mov %o7, tmp; \
+ SPARC_LOAD_ADDRESS(SYM, reg) \
+ mov tmp, %o7;
+# endif
+
+#endif /* __SPARC_ARCH_H__ */
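
Consumers are expected to test these bits before selecting a code path: word [0] carries the SPARCV9_* feature flags, word [1] the %cfr copy with the CFR_* opcode flags. A usage sketch matching the test sparcv9cap.c applies below:

    #include "sparc_arch.h"

    extern unsigned int OPENSSL_sparcv9cap_P[2];

    /* Gate a T4 Montgomery-multiply path on the %cfr bits (a sketch). */
    static int can_use_t4_montmul(void)
    {
        return (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
               (CFR_MONTMUL | CFR_MONTSQR);
    }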
diff --git a/crypto/sparccpuid.S b/crypto/sparccpuid.S
index 0cc247e48971..eea2006fba18 100644
--- a/crypto/sparccpuid.S
+++ b/crypto/sparccpuid.S
@@ -251,6 +251,11 @@ _sparcv9_vis1_probe:
! UltraSPARC IIe 7
! UltraSPARC III 7
! UltraSPARC T1 24
+! SPARC T4 65(*)
+!
+! (*)	the result has less to do with VIS instruction latencies; rdtick
+!	itself appears that slow, but it does the trick in the sense that
+!	FP and VIS code paths are still slower than integer-only ones.
!
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
@@ -260,6 +265,8 @@ _sparcv9_vis1_probe:
.global _sparcv9_vis1_instrument
.align 8
_sparcv9_vis1_instrument:
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
.word 0x91410000 !rd %tick,%o0
.word 0x81b00d80 !fxor %f0,%f0,%f0
.word 0x85b08d82 !fxor %f2,%f2,%f2
@@ -314,6 +321,30 @@ _sparcv9_fmadd_probe:
.type _sparcv9_fmadd_probe,#function
.size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+.global _sparcv9_rdcfr
+.align 8
+_sparcv9_rdcfr:
+ retl
+ .word 0x91468000 !rd %asr26,%o0
+.type _sparcv9_rdcfr,#function
+.size _sparcv9_rdcfr,.-_sparcv9_rdcfr
+
+.global _sparcv9_vis3_probe
+.align 8
+_sparcv9_vis3_probe:
+ retl
+ .word 0x81b022a0 !xmulx %g0,%g0,%g0
+.type _sparcv9_vis3_probe,#function
+.size _sparcv9_vis3_probe,.-_sparcv9_vis3_probe
+
+.global _sparcv9_random
+.align 8
+_sparcv9_random:
+ retl
+ .word 0x91b002a0 !random %o0
+.type _sparcv9_random,#function
+.size	_sparcv9_random,.-_sparcv9_random
+
.global OPENSSL_cleanse
.align 32
OPENSSL_cleanse:
@@ -397,6 +428,102 @@ OPENSSL_cleanse:
.type OPENSSL_cleanse,#function
.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.global _sparcv9_vis1_instrument_bus
+.align 8
+_sparcv9_vis1_instrument_bus:
+ mov %o1,%o3 ! save cnt
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ mov %o4,%o5 ! lasttick = tick
+ set 0,%g4 ! diff
+
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+
+.Loop: .word 0x99410000 !rd %tick,%o4
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+ subcc %o1,1,%o1 ! --$cnt
+ bnz .Loop
+ add %o0,4,%o0 ! ++$out
+
+ retl
+ mov %o3,%o0
+.type _sparcv9_vis1_instrument_bus,#function
+.size _sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
+
+.global _sparcv9_vis1_instrument_bus2
+.align 8
+_sparcv9_vis1_instrument_bus2:
+ mov %o1,%o3 ! save cnt
+ sll %o1,2,%o1 ! cnt*=4
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ mov %o4,%o5 ! lasttick = tick
+ set 0,%g4 ! diff
+
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+ mov %g4,%g5 ! lastdiff=diff
+.Loop2:
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+
+ subcc %o2,1,%o2 ! --max
+ bz .Ldone2
+ nop
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+ cmp %g4,%g5
+ mov %g4,%g5 ! lastdiff=diff
+
+ .word 0x83408000 !rd %ccr,%g1
+ and %g1,4,%g1 ! isolate zero flag
+ xor %g1,4,%g1 ! flip zero flag
+
+ subcc %o1,%g1,%o1 ! conditional --$cnt
+ bnz .Loop2
+ add %o0,%g1,%o0 ! conditional ++$out
+
+.Ldone2:
+ srl %o1,2,%o1
+ retl
+ sub %o3,%o1,%o0
+.type _sparcv9_vis1_instrument_bus2,#function
+.size _sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
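
The bus2 variant keeps a sample only when the tick delta changed since the previous probe, and it does so branchlessly: the cmp result is pulled out of %ccr, the zero flag is isolated and flipped, and the resulting 0-or-4 conditionally advances both the output pointer and the remaining count. The same loop in C with an explicit branch (a sketch; tick stands in for the %tick read):

    #include <stddef.h>

    /* C rendering of _sparcv9_vis1_instrument_bus2 (a sketch): keep a
     * delta only when it differs from the previous one, bounded by
     * both the requested count and a maximum number of probes. */
    static size_t instrument_bus2_ref(unsigned int *out, size_t cnt,
                                      size_t max, unsigned long (*tick)(void))
    {
        unsigned long last = tick(), diff, lastdiff = 0;
        size_t n = 0;

        while (n < cnt && max-- > 0) {
            diff = tick() - last;
            last += diff;
            if (diff != lastdiff)         /* branchless in the assembly */
                out[n++] = (unsigned int)diff;
            lastdiff = diff;
        }
        return n;
    }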
+
.section ".init",#alloc,#execinstr
call OPENSSL_cpuid_setup
nop
diff --git a/crypto/sparcv9cap.c b/crypto/sparcv9cap.c
index d9f986f6b968..8bf2846929b1 100644
--- a/crypto/sparcv9cap.c
+++ b/crypto/sparcv9cap.c
@@ -4,30 +4,68 @@
#include <setjmp.h>
#include <signal.h>
#include <sys/time.h>
+#include <unistd.h>
#include <openssl/bn.h>
-#define SPARCV9_TICK_PRIVILEGED (1<<0)
-#define SPARCV9_PREFER_FPU (1<<1)
-#define SPARCV9_VIS1 (1<<2)
-#define SPARCV9_VIS2 (1<<3) /* reserved */
-#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
+#include "sparc_arch.h"
-static int OPENSSL_sparcv9cap_P = SPARCV9_TICK_PRIVILEGED;
+#if defined(__GNUC__) && defined(__linux)
+__attribute__ ((visibility("hidden")))
+#endif
+unsigned int OPENSSL_sparcv9cap_P[2] = { SPARCV9_TICK_PRIVILEGED, 0 };
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num)
{
+ int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
- if (num >= 8 && !(num & 1) &&
- (OPENSSL_sparcv9cap_P & (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
- (SPARCV9_PREFER_FPU | SPARCV9_VIS1))
- return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
- else
- return bn_mul_mont_int(rp, ap, bp, np, n0, num);
+ if (!(num & 1) && num >= 6) {
+ if ((num & 15) == 0 && num <= 64 &&
+ (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
+ (CFR_MONTMUL | CFR_MONTSQR)) {
+ typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp,
+ const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ static const bn_mul_mont_f funcs[4] = {
+ bn_mul_mont_t4_8, bn_mul_mont_t4_16,
+ bn_mul_mont_t4_24, bn_mul_mont_t4_32
+ };
+ bn_mul_mont_f worker = funcs[num / 16 - 1];
+
+ if ((*worker) (rp, ap, bp, np, n0))
+ return 1;
+            /* retry once; if it still fails, fall back to VIS3 */
+ if ((*worker) (rp, ap, bp, np, n0))
+ return 1;
+ return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
+ }
+ if ((OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3))
+ return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
+ else if (num >= 8 &&
+ (OPENSSL_sparcv9cap_P[0] &
+ (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
+ (SPARCV9_PREFER_FPU | SPARCV9_VIS1))
+ return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
+ }
+ return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
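
The T4 kernels report failure by returning zero, hence the single retry before dropping to the VIS3 path, which in turn yields to the FPU and plain integer versions. The shape of the dispatch, reduced to a sketch with hypothetical names:

    /* Dispatch pattern used above (a sketch, hypothetical types):
     * fixed-size kernels indexed by operand size, retried once,
     * with a graceful fallback when the kernel declines. */
    typedef int (*kernel_f)(unsigned long *r, const unsigned long *a);

    static int dispatch(const kernel_f table[4], int num,
                        unsigned long *r, const unsigned long *a,
                        kernel_f fallback)
    {
        if ((num & 15) == 0 && num <= 64) {
            kernel_f worker = table[num / 16 - 1];
            if (worker(r, a) || worker(r, a))   /* try, then retry once */
                return 1;
        }
        return fallback(r, a);
    }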
unsigned long _sparcv9_rdtick(void);
@@ -35,10 +73,15 @@ void _sparcv9_vis1_probe(void);
unsigned long _sparcv9_vis1_instrument(void);
void _sparcv9_vis2_probe(void);
void _sparcv9_fmadd_probe(void);
+unsigned long _sparcv9_rdcfr(void);
+void _sparcv9_vis3_probe(void);
+unsigned long _sparcv9_random(void);
+size_t _sparcv9_vis1_instrument_bus(unsigned int *, size_t);
+size_t _sparcv9_vis1_instrument_bus2(unsigned int *, size_t, size_t);
unsigned long OPENSSL_rdtsc(void)
{
- if (OPENSSL_sparcv9cap_P & SPARCV9_TICK_PRIVILEGED)
+ if (OPENSSL_sparcv9cap_P[0] & SPARCV9_TICK_PRIVILEGED)
#if defined(__sun) && defined(__SVR4)
return gethrtime();
#else
@@ -48,6 +91,24 @@ unsigned long OPENSSL_rdtsc(void)
return _sparcv9_rdtick();
}
+size_t OPENSSL_instrument_bus(unsigned int *out, size_t cnt)
+{
+ if ((OPENSSL_sparcv9cap_P[0] & (SPARCV9_TICK_PRIVILEGED | SPARCV9_BLK)) ==
+ SPARCV9_BLK)
+ return _sparcv9_vis1_instrument_bus(out, cnt);
+ else
+ return 0;
+}
+
+size_t OPENSSL_instrument_bus2(unsigned int *out, size_t cnt, size_t max)
+{
+ if ((OPENSSL_sparcv9cap_P[0] & (SPARCV9_TICK_PRIVILEGED | SPARCV9_BLK)) ==
+ SPARCV9_BLK)
+ return _sparcv9_vis1_instrument_bus2(out, cnt, max);
+ else
+ return 0;
+}
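
Both wrappers return the number of samples actually collected, and zero when the CPU lacks VIS1 block transfers or keeps %tick privileged, so callers must treat an empty result as "no entropy source here". A usage sketch:

    #include <stddef.h>

    size_t OPENSSL_instrument_bus(unsigned int *out, size_t cnt);

    /* Gather tick-delta samples for seeding (a sketch); a zero
     * return means the hardware support is absent. */
    static size_t sample_bus(unsigned int buf[256])
    {
        return OPENSSL_instrument_bus(buf, 256);
    }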
+
#if 0 && defined(__sun) && defined(__SVR4)
/*
* This code path is disabled, because of incompatibility of libdevinfo.so.1
@@ -74,17 +135,17 @@ static int walk_nodename(di_node_t node, di_node_name_t di_node_name)
if (!strcmp(name, "SUNW,UltraSPARC") ||
/* covers II,III,IV */
!strncmp(name, "SUNW,UltraSPARC-I", 17)) {
- OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU | SPARCV9_VIS1;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_PREFER_FPU | SPARCV9_VIS1;
/* %tick is privileged only on UltraSPARC-I/II, but not IIe */
if (name[14] != '\0' && name[17] != '\0' && name[18] != '\0')
- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
+ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
return DI_WALK_TERMINATE;
}
/* This is expected to catch remaining UltraSPARCs, such as T1 */
else if (!strncmp(name, "SUNW,UltraSPARC", 15)) {
- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
+ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
return DI_WALK_TERMINATE;
}
@@ -103,22 +164,22 @@ void OPENSSL_cpuid_setup(void)
trigger = 1;
if ((e = getenv("OPENSSL_sparcv9cap"))) {
- OPENSSL_sparcv9cap_P = strtoul(e, NULL, 0);
+ OPENSSL_sparcv9cap_P[0] = strtoul(e, NULL, 0);
return;
}
if (sysinfo(SI_MACHINE, si, sizeof(si)) > 0) {
if (strcmp(si, "sun4v"))
/* FPU is preferred for all CPUs, but US-T1/2 */
- OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_PREFER_FPU;
}
if (sysinfo(SI_ISALIST, si, sizeof(si)) > 0) {
if (strstr(si, "+vis"))
- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1 | SPARCV9_BLK;
if (strstr(si, "+vis2")) {
- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
+ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
return;
}
}
@@ -188,12 +249,14 @@ void OPENSSL_cpuid_setup(void)
trigger = 1;
if ((e = getenv("OPENSSL_sparcv9cap"))) {
- OPENSSL_sparcv9cap_P = strtoul(e, NULL, 0);
+ OPENSSL_sparcv9cap_P[0] = strtoul(e, NULL, 0);
+ if ((e = strchr(e, ':')))
+ OPENSSL_sparcv9cap_P[1] = strtoul(e + 1, NULL, 0);
return;
}
/* Initial value, fits UltraSPARC-I&II... */
- OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
+ OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
sigfillset(&all_masked);
sigdelset(&all_masked, SIGILL);
@@ -216,30 +279,68 @@ void OPENSSL_cpuid_setup(void)
if (sigsetjmp(common_jmp, 1) == 0) {
_sparcv9_rdtick();
- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
+ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
}
if (sigsetjmp(common_jmp, 1) == 0) {
_sparcv9_vis1_probe();
- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1 | SPARCV9_BLK;
        /* detect UltraSPARC-Tx, see sparccpuid.S for details... */
if (_sparcv9_vis1_instrument() >= 12)
- OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
+ OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
else {
_sparcv9_vis2_probe();
- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
}
}
if (sigsetjmp(common_jmp, 1) == 0) {
_sparcv9_fmadd_probe();
- OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
+ }
+
+ /*
+     * The VIS3 flag is tested independently from VIS1 (unlike VIS2),
+     * because VIS3 defines even integer instructions.
+ */
+ if (sigsetjmp(common_jmp, 1) == 0) {
+ _sparcv9_vis3_probe();
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
+ }
+# if 0 /* was planned at some point but never
+ * implemented in hardware */
+ if (sigsetjmp(common_jmp, 1) == 0) {
+ (void)_sparcv9_random();
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
+ }
+# endif
+
+ /*
+     * Pending a better solution, _sparcv9_rdcfr is masked by the
+     * VIS3 flag, because it goes into an uninterruptible endless
+     * loop on UltraSPARC II running Solaris. Things might be
+     * different on Linux...
+ */
+ if ((OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) &&
+ sigsetjmp(common_jmp, 1) == 0) {
+ OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
}
sigaction(SIGBUS, &bus_oact, NULL);
sigaction(SIGILL, &ill_oact, NULL);
sigprocmask(SIG_SETMASK, &oset, NULL);
+
+ if (sizeof(size_t) == 8)
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
+# ifdef __linux
+ else {
+ int ret = syscall(340);
+
+ if (ret >= 0 && ret & 1)
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
+ }
+# endif
}
#endif
diff --git a/crypto/srp/Makefile b/crypto/srp/Makefile
index 763953384b19..414af7bc66b1 100644
--- a/crypto/srp/Makefile
+++ b/crypto/srp/Makefile
@@ -37,6 +37,9 @@ lib: $(LIBOBJ)
$(RANLIB) $(LIB) || echo Never mind.
@touch lib
+files:
+ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+
links:
@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
diff --git a/crypto/srp/srptest.c b/crypto/srp/srptest.c
index 451c70e40f56..00e001bd6b83 100644
--- a/crypto/srp/srptest.c
+++ b/crypto/srp/srptest.c
@@ -148,6 +148,7 @@ int main(int argc, char **argv)
ERR_remove_thread_state(NULL);
ERR_free_strings();
CRYPTO_mem_leaks(bio_err);
+ BIO_free(bio_err);
return 0;
}
diff --git a/crypto/stack/safestack.h b/crypto/stack/safestack.h
index 519649b6ed0e..1d4f87eab34d 100644
--- a/crypto/stack/safestack.h
+++ b/crypto/stack/safestack.h
@@ -75,12 +75,12 @@ extern "C" {
# define CHECKED_STACK_OF(type, p) \
((_STACK*) (1 ? p : (STACK_OF(type)*)0))
+# define CHECKED_SK_COPY_FUNC(type, p) \
+ ((void *(*)(void *)) ((1 ? p : (type *(*)(const type *))0)))
+
# define CHECKED_SK_FREE_FUNC(type, p) \
((void (*)(void *)) ((1 ? p : (void (*)(type *))0)))
-# define CHECKED_SK_FREE_FUNC2(type, p) \
- ((void (*)(void *)) ((1 ? p : (void (*)(type))0)))
-
# define CHECKED_SK_CMP_FUNC(type, p) \
((int (*)(const void *, const void *)) \
((1 ? p : (int (*)(const type * const *, const type * const *))0)))
@@ -177,6 +177,8 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
(STACK_OF(type) *)sk_dup(CHECKED_STACK_OF(type, st))
# define SKM_sk_pop_free(type, st, free_func) \
sk_pop_free(CHECKED_STACK_OF(type, st), CHECKED_SK_FREE_FUNC(type, free_func))
+# define SKM_sk_deep_copy(type, st, copy_func, free_func) \
+ (STACK_OF(type) *)sk_deep_copy(CHECKED_STACK_OF(type, st), CHECKED_SK_COPY_FUNC(type, copy_func), CHECKED_SK_FREE_FUNC(type, free_func))
# define SKM_sk_shift(type, st) \
(type *)sk_shift(CHECKED_STACK_OF(type, st))
# define SKM_sk_pop(type, st) \
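
sk_TYPE_deep_copy duplicates both the stack and every element, pairing a per-element copy function with the matching free function so a partially built copy can be released on failure; CHECKED_SK_COPY_FUNC enforces the copy function's const-correct signature at compile time. A usage sketch with one of the instantiations below:

    #include <openssl/asn1.h>
    #include <openssl/objects.h>

    /* Deep-copy a stack of ASN1_OBJECT (a sketch): OBJ_dup clones
     * each element; ASN1_OBJECT_free releases the elements of a
     * partial copy if an allocation fails midway. */
    static STACK_OF(ASN1_OBJECT) *copy_objs(STACK_OF(ASN1_OBJECT) *src)
    {
        return sk_ASN1_OBJECT_deep_copy(src, OBJ_dup, ASN1_OBJECT_free);
    }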
@@ -226,6 +228,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ACCESS_DESCRIPTION_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ACCESS_DESCRIPTION, (st), (cmp))
# define sk_ACCESS_DESCRIPTION_dup(st) SKM_sk_dup(ACCESS_DESCRIPTION, st)
# define sk_ACCESS_DESCRIPTION_pop_free(st, free_func) SKM_sk_pop_free(ACCESS_DESCRIPTION, (st), (free_func))
+# define sk_ACCESS_DESCRIPTION_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ACCESS_DESCRIPTION, (st), (copy_func), (free_func))
# define sk_ACCESS_DESCRIPTION_shift(st) SKM_sk_shift(ACCESS_DESCRIPTION, (st))
# define sk_ACCESS_DESCRIPTION_pop(st) SKM_sk_pop(ACCESS_DESCRIPTION, (st))
# define sk_ACCESS_DESCRIPTION_sort(st) SKM_sk_sort(ACCESS_DESCRIPTION, (st))
@@ -247,6 +250,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASIdOrRange_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASIdOrRange, (st), (cmp))
# define sk_ASIdOrRange_dup(st) SKM_sk_dup(ASIdOrRange, st)
# define sk_ASIdOrRange_pop_free(st, free_func) SKM_sk_pop_free(ASIdOrRange, (st), (free_func))
+# define sk_ASIdOrRange_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASIdOrRange, (st), (copy_func), (free_func))
# define sk_ASIdOrRange_shift(st) SKM_sk_shift(ASIdOrRange, (st))
# define sk_ASIdOrRange_pop(st) SKM_sk_pop(ASIdOrRange, (st))
# define sk_ASIdOrRange_sort(st) SKM_sk_sort(ASIdOrRange, (st))
@@ -268,6 +272,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_GENERALSTRING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_GENERALSTRING, (st), (cmp))
# define sk_ASN1_GENERALSTRING_dup(st) SKM_sk_dup(ASN1_GENERALSTRING, st)
# define sk_ASN1_GENERALSTRING_pop_free(st, free_func) SKM_sk_pop_free(ASN1_GENERALSTRING, (st), (free_func))
+# define sk_ASN1_GENERALSTRING_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_GENERALSTRING, (st), (copy_func), (free_func))
# define sk_ASN1_GENERALSTRING_shift(st) SKM_sk_shift(ASN1_GENERALSTRING, (st))
# define sk_ASN1_GENERALSTRING_pop(st) SKM_sk_pop(ASN1_GENERALSTRING, (st))
# define sk_ASN1_GENERALSTRING_sort(st) SKM_sk_sort(ASN1_GENERALSTRING, (st))
@@ -289,6 +294,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_INTEGER_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_INTEGER, (st), (cmp))
# define sk_ASN1_INTEGER_dup(st) SKM_sk_dup(ASN1_INTEGER, st)
# define sk_ASN1_INTEGER_pop_free(st, free_func) SKM_sk_pop_free(ASN1_INTEGER, (st), (free_func))
+# define sk_ASN1_INTEGER_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_INTEGER, (st), (copy_func), (free_func))
# define sk_ASN1_INTEGER_shift(st) SKM_sk_shift(ASN1_INTEGER, (st))
# define sk_ASN1_INTEGER_pop(st) SKM_sk_pop(ASN1_INTEGER, (st))
# define sk_ASN1_INTEGER_sort(st) SKM_sk_sort(ASN1_INTEGER, (st))
@@ -310,6 +316,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_OBJECT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_OBJECT, (st), (cmp))
# define sk_ASN1_OBJECT_dup(st) SKM_sk_dup(ASN1_OBJECT, st)
# define sk_ASN1_OBJECT_pop_free(st, free_func) SKM_sk_pop_free(ASN1_OBJECT, (st), (free_func))
+# define sk_ASN1_OBJECT_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_OBJECT, (st), (copy_func), (free_func))
# define sk_ASN1_OBJECT_shift(st) SKM_sk_shift(ASN1_OBJECT, (st))
# define sk_ASN1_OBJECT_pop(st) SKM_sk_pop(ASN1_OBJECT, (st))
# define sk_ASN1_OBJECT_sort(st) SKM_sk_sort(ASN1_OBJECT, (st))
@@ -331,6 +338,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_STRING_TABLE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_STRING_TABLE, (st), (cmp))
# define sk_ASN1_STRING_TABLE_dup(st) SKM_sk_dup(ASN1_STRING_TABLE, st)
# define sk_ASN1_STRING_TABLE_pop_free(st, free_func) SKM_sk_pop_free(ASN1_STRING_TABLE, (st), (free_func))
+# define sk_ASN1_STRING_TABLE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_STRING_TABLE, (st), (copy_func), (free_func))
# define sk_ASN1_STRING_TABLE_shift(st) SKM_sk_shift(ASN1_STRING_TABLE, (st))
# define sk_ASN1_STRING_TABLE_pop(st) SKM_sk_pop(ASN1_STRING_TABLE, (st))
# define sk_ASN1_STRING_TABLE_sort(st) SKM_sk_sort(ASN1_STRING_TABLE, (st))
@@ -352,6 +360,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_TYPE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_TYPE, (st), (cmp))
# define sk_ASN1_TYPE_dup(st) SKM_sk_dup(ASN1_TYPE, st)
# define sk_ASN1_TYPE_pop_free(st, free_func) SKM_sk_pop_free(ASN1_TYPE, (st), (free_func))
+# define sk_ASN1_TYPE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_TYPE, (st), (copy_func), (free_func))
# define sk_ASN1_TYPE_shift(st) SKM_sk_shift(ASN1_TYPE, (st))
# define sk_ASN1_TYPE_pop(st) SKM_sk_pop(ASN1_TYPE, (st))
# define sk_ASN1_TYPE_sort(st) SKM_sk_sort(ASN1_TYPE, (st))
@@ -373,6 +382,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_UTF8STRING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_UTF8STRING, (st), (cmp))
# define sk_ASN1_UTF8STRING_dup(st) SKM_sk_dup(ASN1_UTF8STRING, st)
# define sk_ASN1_UTF8STRING_pop_free(st, free_func) SKM_sk_pop_free(ASN1_UTF8STRING, (st), (free_func))
+# define sk_ASN1_UTF8STRING_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_UTF8STRING, (st), (copy_func), (free_func))
# define sk_ASN1_UTF8STRING_shift(st) SKM_sk_shift(ASN1_UTF8STRING, (st))
# define sk_ASN1_UTF8STRING_pop(st) SKM_sk_pop(ASN1_UTF8STRING, (st))
# define sk_ASN1_UTF8STRING_sort(st) SKM_sk_sort(ASN1_UTF8STRING, (st))
@@ -394,6 +404,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ASN1_VALUE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_VALUE, (st), (cmp))
# define sk_ASN1_VALUE_dup(st) SKM_sk_dup(ASN1_VALUE, st)
# define sk_ASN1_VALUE_pop_free(st, free_func) SKM_sk_pop_free(ASN1_VALUE, (st), (free_func))
+# define sk_ASN1_VALUE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ASN1_VALUE, (st), (copy_func), (free_func))
# define sk_ASN1_VALUE_shift(st) SKM_sk_shift(ASN1_VALUE, (st))
# define sk_ASN1_VALUE_pop(st) SKM_sk_pop(ASN1_VALUE, (st))
# define sk_ASN1_VALUE_sort(st) SKM_sk_sort(ASN1_VALUE, (st))
@@ -415,6 +426,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_BIO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(BIO, (st), (cmp))
# define sk_BIO_dup(st) SKM_sk_dup(BIO, st)
# define sk_BIO_pop_free(st, free_func) SKM_sk_pop_free(BIO, (st), (free_func))
+# define sk_BIO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(BIO, (st), (copy_func), (free_func))
# define sk_BIO_shift(st) SKM_sk_shift(BIO, (st))
# define sk_BIO_pop(st) SKM_sk_pop(BIO, (st))
# define sk_BIO_sort(st) SKM_sk_sort(BIO, (st))
@@ -436,6 +448,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_BY_DIR_ENTRY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(BY_DIR_ENTRY, (st), (cmp))
# define sk_BY_DIR_ENTRY_dup(st) SKM_sk_dup(BY_DIR_ENTRY, st)
# define sk_BY_DIR_ENTRY_pop_free(st, free_func) SKM_sk_pop_free(BY_DIR_ENTRY, (st), (free_func))
+# define sk_BY_DIR_ENTRY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(BY_DIR_ENTRY, (st), (copy_func), (free_func))
# define sk_BY_DIR_ENTRY_shift(st) SKM_sk_shift(BY_DIR_ENTRY, (st))
# define sk_BY_DIR_ENTRY_pop(st) SKM_sk_pop(BY_DIR_ENTRY, (st))
# define sk_BY_DIR_ENTRY_sort(st) SKM_sk_sort(BY_DIR_ENTRY, (st))
@@ -457,6 +470,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_BY_DIR_HASH_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(BY_DIR_HASH, (st), (cmp))
# define sk_BY_DIR_HASH_dup(st) SKM_sk_dup(BY_DIR_HASH, st)
# define sk_BY_DIR_HASH_pop_free(st, free_func) SKM_sk_pop_free(BY_DIR_HASH, (st), (free_func))
+# define sk_BY_DIR_HASH_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(BY_DIR_HASH, (st), (copy_func), (free_func))
# define sk_BY_DIR_HASH_shift(st) SKM_sk_shift(BY_DIR_HASH, (st))
# define sk_BY_DIR_HASH_pop(st) SKM_sk_pop(BY_DIR_HASH, (st))
# define sk_BY_DIR_HASH_sort(st) SKM_sk_sort(BY_DIR_HASH, (st))
@@ -478,10 +492,33 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CMS_CertificateChoices_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_CertificateChoices, (st), (cmp))
# define sk_CMS_CertificateChoices_dup(st) SKM_sk_dup(CMS_CertificateChoices, st)
# define sk_CMS_CertificateChoices_pop_free(st, free_func) SKM_sk_pop_free(CMS_CertificateChoices, (st), (free_func))
+# define sk_CMS_CertificateChoices_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CMS_CertificateChoices, (st), (copy_func), (free_func))
# define sk_CMS_CertificateChoices_shift(st) SKM_sk_shift(CMS_CertificateChoices, (st))
# define sk_CMS_CertificateChoices_pop(st) SKM_sk_pop(CMS_CertificateChoices, (st))
# define sk_CMS_CertificateChoices_sort(st) SKM_sk_sort(CMS_CertificateChoices, (st))
# define sk_CMS_CertificateChoices_is_sorted(st) SKM_sk_is_sorted(CMS_CertificateChoices, (st))
+# define sk_CMS_RecipientEncryptedKey_new(cmp) SKM_sk_new(CMS_RecipientEncryptedKey, (cmp))
+# define sk_CMS_RecipientEncryptedKey_new_null() SKM_sk_new_null(CMS_RecipientEncryptedKey)
+# define sk_CMS_RecipientEncryptedKey_free(st) SKM_sk_free(CMS_RecipientEncryptedKey, (st))
+# define sk_CMS_RecipientEncryptedKey_num(st) SKM_sk_num(CMS_RecipientEncryptedKey, (st))
+# define sk_CMS_RecipientEncryptedKey_value(st, i) SKM_sk_value(CMS_RecipientEncryptedKey, (st), (i))
+# define sk_CMS_RecipientEncryptedKey_set(st, i, val) SKM_sk_set(CMS_RecipientEncryptedKey, (st), (i), (val))
+# define sk_CMS_RecipientEncryptedKey_zero(st) SKM_sk_zero(CMS_RecipientEncryptedKey, (st))
+# define sk_CMS_RecipientEncryptedKey_push(st, val) SKM_sk_push(CMS_RecipientEncryptedKey, (st), (val))
+# define sk_CMS_RecipientEncryptedKey_unshift(st, val) SKM_sk_unshift(CMS_RecipientEncryptedKey, (st), (val))
+# define sk_CMS_RecipientEncryptedKey_find(st, val) SKM_sk_find(CMS_RecipientEncryptedKey, (st), (val))
+# define sk_CMS_RecipientEncryptedKey_find_ex(st, val) SKM_sk_find_ex(CMS_RecipientEncryptedKey, (st), (val))
+# define sk_CMS_RecipientEncryptedKey_delete(st, i) SKM_sk_delete(CMS_RecipientEncryptedKey, (st), (i))
+# define sk_CMS_RecipientEncryptedKey_delete_ptr(st, ptr) SKM_sk_delete_ptr(CMS_RecipientEncryptedKey, (st), (ptr))
+# define sk_CMS_RecipientEncryptedKey_insert(st, val, i) SKM_sk_insert(CMS_RecipientEncryptedKey, (st), (val), (i))
+# define sk_CMS_RecipientEncryptedKey_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_RecipientEncryptedKey, (st), (cmp))
+# define sk_CMS_RecipientEncryptedKey_dup(st) SKM_sk_dup(CMS_RecipientEncryptedKey, st)
+# define sk_CMS_RecipientEncryptedKey_pop_free(st, free_func) SKM_sk_pop_free(CMS_RecipientEncryptedKey, (st), (free_func))
+# define sk_CMS_RecipientEncryptedKey_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CMS_RecipientEncryptedKey, (st), (copy_func), (free_func))
+# define sk_CMS_RecipientEncryptedKey_shift(st) SKM_sk_shift(CMS_RecipientEncryptedKey, (st))
+# define sk_CMS_RecipientEncryptedKey_pop(st) SKM_sk_pop(CMS_RecipientEncryptedKey, (st))
+# define sk_CMS_RecipientEncryptedKey_sort(st) SKM_sk_sort(CMS_RecipientEncryptedKey, (st))
+# define sk_CMS_RecipientEncryptedKey_is_sorted(st) SKM_sk_is_sorted(CMS_RecipientEncryptedKey, (st))
# define sk_CMS_RecipientInfo_new(cmp) SKM_sk_new(CMS_RecipientInfo, (cmp))
# define sk_CMS_RecipientInfo_new_null() SKM_sk_new_null(CMS_RecipientInfo)
# define sk_CMS_RecipientInfo_free(st) SKM_sk_free(CMS_RecipientInfo, (st))
@@ -499,6 +536,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CMS_RecipientInfo_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_RecipientInfo, (st), (cmp))
# define sk_CMS_RecipientInfo_dup(st) SKM_sk_dup(CMS_RecipientInfo, st)
# define sk_CMS_RecipientInfo_pop_free(st, free_func) SKM_sk_pop_free(CMS_RecipientInfo, (st), (free_func))
+# define sk_CMS_RecipientInfo_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CMS_RecipientInfo, (st), (copy_func), (free_func))
# define sk_CMS_RecipientInfo_shift(st) SKM_sk_shift(CMS_RecipientInfo, (st))
# define sk_CMS_RecipientInfo_pop(st) SKM_sk_pop(CMS_RecipientInfo, (st))
# define sk_CMS_RecipientInfo_sort(st) SKM_sk_sort(CMS_RecipientInfo, (st))
@@ -520,6 +558,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CMS_RevocationInfoChoice_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_RevocationInfoChoice, (st), (cmp))
# define sk_CMS_RevocationInfoChoice_dup(st) SKM_sk_dup(CMS_RevocationInfoChoice, st)
# define sk_CMS_RevocationInfoChoice_pop_free(st, free_func) SKM_sk_pop_free(CMS_RevocationInfoChoice, (st), (free_func))
+# define sk_CMS_RevocationInfoChoice_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CMS_RevocationInfoChoice, (st), (copy_func), (free_func))
# define sk_CMS_RevocationInfoChoice_shift(st) SKM_sk_shift(CMS_RevocationInfoChoice, (st))
# define sk_CMS_RevocationInfoChoice_pop(st) SKM_sk_pop(CMS_RevocationInfoChoice, (st))
# define sk_CMS_RevocationInfoChoice_sort(st) SKM_sk_sort(CMS_RevocationInfoChoice, (st))
@@ -541,6 +580,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CMS_SignerInfo_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_SignerInfo, (st), (cmp))
# define sk_CMS_SignerInfo_dup(st) SKM_sk_dup(CMS_SignerInfo, st)
# define sk_CMS_SignerInfo_pop_free(st, free_func) SKM_sk_pop_free(CMS_SignerInfo, (st), (free_func))
+# define sk_CMS_SignerInfo_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CMS_SignerInfo, (st), (copy_func), (free_func))
# define sk_CMS_SignerInfo_shift(st) SKM_sk_shift(CMS_SignerInfo, (st))
# define sk_CMS_SignerInfo_pop(st) SKM_sk_pop(CMS_SignerInfo, (st))
# define sk_CMS_SignerInfo_sort(st) SKM_sk_sort(CMS_SignerInfo, (st))
@@ -562,6 +602,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CONF_IMODULE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CONF_IMODULE, (st), (cmp))
# define sk_CONF_IMODULE_dup(st) SKM_sk_dup(CONF_IMODULE, st)
# define sk_CONF_IMODULE_pop_free(st, free_func) SKM_sk_pop_free(CONF_IMODULE, (st), (free_func))
+# define sk_CONF_IMODULE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CONF_IMODULE, (st), (copy_func), (free_func))
# define sk_CONF_IMODULE_shift(st) SKM_sk_shift(CONF_IMODULE, (st))
# define sk_CONF_IMODULE_pop(st) SKM_sk_pop(CONF_IMODULE, (st))
# define sk_CONF_IMODULE_sort(st) SKM_sk_sort(CONF_IMODULE, (st))
@@ -583,6 +624,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CONF_MODULE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CONF_MODULE, (st), (cmp))
# define sk_CONF_MODULE_dup(st) SKM_sk_dup(CONF_MODULE, st)
# define sk_CONF_MODULE_pop_free(st, free_func) SKM_sk_pop_free(CONF_MODULE, (st), (free_func))
+# define sk_CONF_MODULE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CONF_MODULE, (st), (copy_func), (free_func))
# define sk_CONF_MODULE_shift(st) SKM_sk_shift(CONF_MODULE, (st))
# define sk_CONF_MODULE_pop(st) SKM_sk_pop(CONF_MODULE, (st))
# define sk_CONF_MODULE_sort(st) SKM_sk_sort(CONF_MODULE, (st))
@@ -604,6 +646,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CONF_VALUE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CONF_VALUE, (st), (cmp))
# define sk_CONF_VALUE_dup(st) SKM_sk_dup(CONF_VALUE, st)
# define sk_CONF_VALUE_pop_free(st, free_func) SKM_sk_pop_free(CONF_VALUE, (st), (free_func))
+# define sk_CONF_VALUE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CONF_VALUE, (st), (copy_func), (free_func))
# define sk_CONF_VALUE_shift(st) SKM_sk_shift(CONF_VALUE, (st))
# define sk_CONF_VALUE_pop(st) SKM_sk_pop(CONF_VALUE, (st))
# define sk_CONF_VALUE_sort(st) SKM_sk_sort(CONF_VALUE, (st))
@@ -625,6 +668,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CRYPTO_EX_DATA_FUNCS_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CRYPTO_EX_DATA_FUNCS, (st), (cmp))
# define sk_CRYPTO_EX_DATA_FUNCS_dup(st) SKM_sk_dup(CRYPTO_EX_DATA_FUNCS, st)
# define sk_CRYPTO_EX_DATA_FUNCS_pop_free(st, free_func) SKM_sk_pop_free(CRYPTO_EX_DATA_FUNCS, (st), (free_func))
+# define sk_CRYPTO_EX_DATA_FUNCS_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CRYPTO_EX_DATA_FUNCS, (st), (copy_func), (free_func))
# define sk_CRYPTO_EX_DATA_FUNCS_shift(st) SKM_sk_shift(CRYPTO_EX_DATA_FUNCS, (st))
# define sk_CRYPTO_EX_DATA_FUNCS_pop(st) SKM_sk_pop(CRYPTO_EX_DATA_FUNCS, (st))
# define sk_CRYPTO_EX_DATA_FUNCS_sort(st) SKM_sk_sort(CRYPTO_EX_DATA_FUNCS, (st))
@@ -646,6 +690,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_CRYPTO_dynlock_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CRYPTO_dynlock, (st), (cmp))
# define sk_CRYPTO_dynlock_dup(st) SKM_sk_dup(CRYPTO_dynlock, st)
# define sk_CRYPTO_dynlock_pop_free(st, free_func) SKM_sk_pop_free(CRYPTO_dynlock, (st), (free_func))
+# define sk_CRYPTO_dynlock_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(CRYPTO_dynlock, (st), (copy_func), (free_func))
# define sk_CRYPTO_dynlock_shift(st) SKM_sk_shift(CRYPTO_dynlock, (st))
# define sk_CRYPTO_dynlock_pop(st) SKM_sk_pop(CRYPTO_dynlock, (st))
# define sk_CRYPTO_dynlock_sort(st) SKM_sk_sort(CRYPTO_dynlock, (st))
@@ -667,6 +712,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_DIST_POINT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(DIST_POINT, (st), (cmp))
# define sk_DIST_POINT_dup(st) SKM_sk_dup(DIST_POINT, st)
# define sk_DIST_POINT_pop_free(st, free_func) SKM_sk_pop_free(DIST_POINT, (st), (free_func))
+# define sk_DIST_POINT_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(DIST_POINT, (st), (copy_func), (free_func))
# define sk_DIST_POINT_shift(st) SKM_sk_shift(DIST_POINT, (st))
# define sk_DIST_POINT_pop(st) SKM_sk_pop(DIST_POINT, (st))
# define sk_DIST_POINT_sort(st) SKM_sk_sort(DIST_POINT, (st))
@@ -688,6 +734,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ENGINE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ENGINE, (st), (cmp))
# define sk_ENGINE_dup(st) SKM_sk_dup(ENGINE, st)
# define sk_ENGINE_pop_free(st, free_func) SKM_sk_pop_free(ENGINE, (st), (free_func))
+# define sk_ENGINE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ENGINE, (st), (copy_func), (free_func))
# define sk_ENGINE_shift(st) SKM_sk_shift(ENGINE, (st))
# define sk_ENGINE_pop(st) SKM_sk_pop(ENGINE, (st))
# define sk_ENGINE_sort(st) SKM_sk_sort(ENGINE, (st))
@@ -709,6 +756,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ENGINE_CLEANUP_ITEM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ENGINE_CLEANUP_ITEM, (st), (cmp))
# define sk_ENGINE_CLEANUP_ITEM_dup(st) SKM_sk_dup(ENGINE_CLEANUP_ITEM, st)
# define sk_ENGINE_CLEANUP_ITEM_pop_free(st, free_func) SKM_sk_pop_free(ENGINE_CLEANUP_ITEM, (st), (free_func))
+# define sk_ENGINE_CLEANUP_ITEM_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ENGINE_CLEANUP_ITEM, (st), (copy_func), (free_func))
# define sk_ENGINE_CLEANUP_ITEM_shift(st) SKM_sk_shift(ENGINE_CLEANUP_ITEM, (st))
# define sk_ENGINE_CLEANUP_ITEM_pop(st) SKM_sk_pop(ENGINE_CLEANUP_ITEM, (st))
# define sk_ENGINE_CLEANUP_ITEM_sort(st) SKM_sk_sort(ENGINE_CLEANUP_ITEM, (st))
@@ -730,6 +778,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_ESS_CERT_ID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ESS_CERT_ID, (st), (cmp))
# define sk_ESS_CERT_ID_dup(st) SKM_sk_dup(ESS_CERT_ID, st)
# define sk_ESS_CERT_ID_pop_free(st, free_func) SKM_sk_pop_free(ESS_CERT_ID, (st), (free_func))
+# define sk_ESS_CERT_ID_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(ESS_CERT_ID, (st), (copy_func), (free_func))
# define sk_ESS_CERT_ID_shift(st) SKM_sk_shift(ESS_CERT_ID, (st))
# define sk_ESS_CERT_ID_pop(st) SKM_sk_pop(ESS_CERT_ID, (st))
# define sk_ESS_CERT_ID_sort(st) SKM_sk_sort(ESS_CERT_ID, (st))
@@ -751,6 +800,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_EVP_MD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_MD, (st), (cmp))
# define sk_EVP_MD_dup(st) SKM_sk_dup(EVP_MD, st)
# define sk_EVP_MD_pop_free(st, free_func) SKM_sk_pop_free(EVP_MD, (st), (free_func))
+# define sk_EVP_MD_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(EVP_MD, (st), (copy_func), (free_func))
# define sk_EVP_MD_shift(st) SKM_sk_shift(EVP_MD, (st))
# define sk_EVP_MD_pop(st) SKM_sk_pop(EVP_MD, (st))
# define sk_EVP_MD_sort(st) SKM_sk_sort(EVP_MD, (st))
@@ -772,6 +822,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_EVP_PBE_CTL_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_PBE_CTL, (st), (cmp))
# define sk_EVP_PBE_CTL_dup(st) SKM_sk_dup(EVP_PBE_CTL, st)
# define sk_EVP_PBE_CTL_pop_free(st, free_func) SKM_sk_pop_free(EVP_PBE_CTL, (st), (free_func))
+# define sk_EVP_PBE_CTL_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(EVP_PBE_CTL, (st), (copy_func), (free_func))
# define sk_EVP_PBE_CTL_shift(st) SKM_sk_shift(EVP_PBE_CTL, (st))
# define sk_EVP_PBE_CTL_pop(st) SKM_sk_pop(EVP_PBE_CTL, (st))
# define sk_EVP_PBE_CTL_sort(st) SKM_sk_sort(EVP_PBE_CTL, (st))
@@ -793,6 +844,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_EVP_PKEY_ASN1_METHOD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_PKEY_ASN1_METHOD, (st), (cmp))
# define sk_EVP_PKEY_ASN1_METHOD_dup(st) SKM_sk_dup(EVP_PKEY_ASN1_METHOD, st)
# define sk_EVP_PKEY_ASN1_METHOD_pop_free(st, free_func) SKM_sk_pop_free(EVP_PKEY_ASN1_METHOD, (st), (free_func))
+# define sk_EVP_PKEY_ASN1_METHOD_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(EVP_PKEY_ASN1_METHOD, (st), (copy_func), (free_func))
# define sk_EVP_PKEY_ASN1_METHOD_shift(st) SKM_sk_shift(EVP_PKEY_ASN1_METHOD, (st))
# define sk_EVP_PKEY_ASN1_METHOD_pop(st) SKM_sk_pop(EVP_PKEY_ASN1_METHOD, (st))
# define sk_EVP_PKEY_ASN1_METHOD_sort(st) SKM_sk_sort(EVP_PKEY_ASN1_METHOD, (st))
@@ -814,6 +866,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_EVP_PKEY_METHOD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_PKEY_METHOD, (st), (cmp))
# define sk_EVP_PKEY_METHOD_dup(st) SKM_sk_dup(EVP_PKEY_METHOD, st)
# define sk_EVP_PKEY_METHOD_pop_free(st, free_func) SKM_sk_pop_free(EVP_PKEY_METHOD, (st), (free_func))
+# define sk_EVP_PKEY_METHOD_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(EVP_PKEY_METHOD, (st), (copy_func), (free_func))
# define sk_EVP_PKEY_METHOD_shift(st) SKM_sk_shift(EVP_PKEY_METHOD, (st))
# define sk_EVP_PKEY_METHOD_pop(st) SKM_sk_pop(EVP_PKEY_METHOD, (st))
# define sk_EVP_PKEY_METHOD_sort(st) SKM_sk_sort(EVP_PKEY_METHOD, (st))
@@ -835,6 +888,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_GENERAL_NAME_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(GENERAL_NAME, (st), (cmp))
# define sk_GENERAL_NAME_dup(st) SKM_sk_dup(GENERAL_NAME, st)
# define sk_GENERAL_NAME_pop_free(st, free_func) SKM_sk_pop_free(GENERAL_NAME, (st), (free_func))
+# define sk_GENERAL_NAME_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(GENERAL_NAME, (st), (copy_func), (free_func))
# define sk_GENERAL_NAME_shift(st) SKM_sk_shift(GENERAL_NAME, (st))
# define sk_GENERAL_NAME_pop(st) SKM_sk_pop(GENERAL_NAME, (st))
# define sk_GENERAL_NAME_sort(st) SKM_sk_sort(GENERAL_NAME, (st))
@@ -856,6 +910,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_GENERAL_NAMES_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(GENERAL_NAMES, (st), (cmp))
# define sk_GENERAL_NAMES_dup(st) SKM_sk_dup(GENERAL_NAMES, st)
# define sk_GENERAL_NAMES_pop_free(st, free_func) SKM_sk_pop_free(GENERAL_NAMES, (st), (free_func))
+# define sk_GENERAL_NAMES_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(GENERAL_NAMES, (st), (copy_func), (free_func))
# define sk_GENERAL_NAMES_shift(st) SKM_sk_shift(GENERAL_NAMES, (st))
# define sk_GENERAL_NAMES_pop(st) SKM_sk_pop(GENERAL_NAMES, (st))
# define sk_GENERAL_NAMES_sort(st) SKM_sk_sort(GENERAL_NAMES, (st))
@@ -877,6 +932,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_GENERAL_SUBTREE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(GENERAL_SUBTREE, (st), (cmp))
# define sk_GENERAL_SUBTREE_dup(st) SKM_sk_dup(GENERAL_SUBTREE, st)
# define sk_GENERAL_SUBTREE_pop_free(st, free_func) SKM_sk_pop_free(GENERAL_SUBTREE, (st), (free_func))
+# define sk_GENERAL_SUBTREE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(GENERAL_SUBTREE, (st), (copy_func), (free_func))
# define sk_GENERAL_SUBTREE_shift(st) SKM_sk_shift(GENERAL_SUBTREE, (st))
# define sk_GENERAL_SUBTREE_pop(st) SKM_sk_pop(GENERAL_SUBTREE, (st))
# define sk_GENERAL_SUBTREE_sort(st) SKM_sk_sort(GENERAL_SUBTREE, (st))
@@ -898,6 +954,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_IPAddressFamily_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(IPAddressFamily, (st), (cmp))
# define sk_IPAddressFamily_dup(st) SKM_sk_dup(IPAddressFamily, st)
# define sk_IPAddressFamily_pop_free(st, free_func) SKM_sk_pop_free(IPAddressFamily, (st), (free_func))
+# define sk_IPAddressFamily_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(IPAddressFamily, (st), (copy_func), (free_func))
# define sk_IPAddressFamily_shift(st) SKM_sk_shift(IPAddressFamily, (st))
# define sk_IPAddressFamily_pop(st) SKM_sk_pop(IPAddressFamily, (st))
# define sk_IPAddressFamily_sort(st) SKM_sk_sort(IPAddressFamily, (st))
@@ -919,6 +976,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_IPAddressOrRange_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(IPAddressOrRange, (st), (cmp))
# define sk_IPAddressOrRange_dup(st) SKM_sk_dup(IPAddressOrRange, st)
# define sk_IPAddressOrRange_pop_free(st, free_func) SKM_sk_pop_free(IPAddressOrRange, (st), (free_func))
+# define sk_IPAddressOrRange_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(IPAddressOrRange, (st), (copy_func), (free_func))
# define sk_IPAddressOrRange_shift(st) SKM_sk_shift(IPAddressOrRange, (st))
# define sk_IPAddressOrRange_pop(st) SKM_sk_pop(IPAddressOrRange, (st))
# define sk_IPAddressOrRange_sort(st) SKM_sk_sort(IPAddressOrRange, (st))
@@ -940,6 +998,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_APREQBODY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_APREQBODY, (st), (cmp))
# define sk_KRB5_APREQBODY_dup(st) SKM_sk_dup(KRB5_APREQBODY, st)
# define sk_KRB5_APREQBODY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_APREQBODY, (st), (free_func))
+# define sk_KRB5_APREQBODY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_APREQBODY, (st), (copy_func), (free_func))
# define sk_KRB5_APREQBODY_shift(st) SKM_sk_shift(KRB5_APREQBODY, (st))
# define sk_KRB5_APREQBODY_pop(st) SKM_sk_pop(KRB5_APREQBODY, (st))
# define sk_KRB5_APREQBODY_sort(st) SKM_sk_sort(KRB5_APREQBODY, (st))
@@ -961,6 +1020,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_AUTHDATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_AUTHDATA, (st), (cmp))
# define sk_KRB5_AUTHDATA_dup(st) SKM_sk_dup(KRB5_AUTHDATA, st)
# define sk_KRB5_AUTHDATA_pop_free(st, free_func) SKM_sk_pop_free(KRB5_AUTHDATA, (st), (free_func))
+# define sk_KRB5_AUTHDATA_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_AUTHDATA, (st), (copy_func), (free_func))
# define sk_KRB5_AUTHDATA_shift(st) SKM_sk_shift(KRB5_AUTHDATA, (st))
# define sk_KRB5_AUTHDATA_pop(st) SKM_sk_pop(KRB5_AUTHDATA, (st))
# define sk_KRB5_AUTHDATA_sort(st) SKM_sk_sort(KRB5_AUTHDATA, (st))
@@ -982,6 +1042,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_AUTHENTBODY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_AUTHENTBODY, (st), (cmp))
# define sk_KRB5_AUTHENTBODY_dup(st) SKM_sk_dup(KRB5_AUTHENTBODY, st)
# define sk_KRB5_AUTHENTBODY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_AUTHENTBODY, (st), (free_func))
+# define sk_KRB5_AUTHENTBODY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_AUTHENTBODY, (st), (copy_func), (free_func))
# define sk_KRB5_AUTHENTBODY_shift(st) SKM_sk_shift(KRB5_AUTHENTBODY, (st))
# define sk_KRB5_AUTHENTBODY_pop(st) SKM_sk_pop(KRB5_AUTHENTBODY, (st))
# define sk_KRB5_AUTHENTBODY_sort(st) SKM_sk_sort(KRB5_AUTHENTBODY, (st))
@@ -1003,6 +1064,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_CHECKSUM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_CHECKSUM, (st), (cmp))
# define sk_KRB5_CHECKSUM_dup(st) SKM_sk_dup(KRB5_CHECKSUM, st)
# define sk_KRB5_CHECKSUM_pop_free(st, free_func) SKM_sk_pop_free(KRB5_CHECKSUM, (st), (free_func))
+# define sk_KRB5_CHECKSUM_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_CHECKSUM, (st), (copy_func), (free_func))
# define sk_KRB5_CHECKSUM_shift(st) SKM_sk_shift(KRB5_CHECKSUM, (st))
# define sk_KRB5_CHECKSUM_pop(st) SKM_sk_pop(KRB5_CHECKSUM, (st))
# define sk_KRB5_CHECKSUM_sort(st) SKM_sk_sort(KRB5_CHECKSUM, (st))
@@ -1024,6 +1086,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_ENCDATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_ENCDATA, (st), (cmp))
# define sk_KRB5_ENCDATA_dup(st) SKM_sk_dup(KRB5_ENCDATA, st)
# define sk_KRB5_ENCDATA_pop_free(st, free_func) SKM_sk_pop_free(KRB5_ENCDATA, (st), (free_func))
+# define sk_KRB5_ENCDATA_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_ENCDATA, (st), (copy_func), (free_func))
# define sk_KRB5_ENCDATA_shift(st) SKM_sk_shift(KRB5_ENCDATA, (st))
# define sk_KRB5_ENCDATA_pop(st) SKM_sk_pop(KRB5_ENCDATA, (st))
# define sk_KRB5_ENCDATA_sort(st) SKM_sk_sort(KRB5_ENCDATA, (st))
@@ -1045,6 +1108,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_ENCKEY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_ENCKEY, (st), (cmp))
# define sk_KRB5_ENCKEY_dup(st) SKM_sk_dup(KRB5_ENCKEY, st)
# define sk_KRB5_ENCKEY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_ENCKEY, (st), (free_func))
+# define sk_KRB5_ENCKEY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_ENCKEY, (st), (copy_func), (free_func))
# define sk_KRB5_ENCKEY_shift(st) SKM_sk_shift(KRB5_ENCKEY, (st))
# define sk_KRB5_ENCKEY_pop(st) SKM_sk_pop(KRB5_ENCKEY, (st))
# define sk_KRB5_ENCKEY_sort(st) SKM_sk_sort(KRB5_ENCKEY, (st))
@@ -1066,6 +1130,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_PRINCNAME_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_PRINCNAME, (st), (cmp))
# define sk_KRB5_PRINCNAME_dup(st) SKM_sk_dup(KRB5_PRINCNAME, st)
# define sk_KRB5_PRINCNAME_pop_free(st, free_func) SKM_sk_pop_free(KRB5_PRINCNAME, (st), (free_func))
+# define sk_KRB5_PRINCNAME_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_PRINCNAME, (st), (copy_func), (free_func))
# define sk_KRB5_PRINCNAME_shift(st) SKM_sk_shift(KRB5_PRINCNAME, (st))
# define sk_KRB5_PRINCNAME_pop(st) SKM_sk_pop(KRB5_PRINCNAME, (st))
# define sk_KRB5_PRINCNAME_sort(st) SKM_sk_sort(KRB5_PRINCNAME, (st))
@@ -1087,6 +1152,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_KRB5_TKTBODY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_TKTBODY, (st), (cmp))
# define sk_KRB5_TKTBODY_dup(st) SKM_sk_dup(KRB5_TKTBODY, st)
# define sk_KRB5_TKTBODY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_TKTBODY, (st), (free_func))
+# define sk_KRB5_TKTBODY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(KRB5_TKTBODY, (st), (copy_func), (free_func))
# define sk_KRB5_TKTBODY_shift(st) SKM_sk_shift(KRB5_TKTBODY, (st))
# define sk_KRB5_TKTBODY_pop(st) SKM_sk_pop(KRB5_TKTBODY, (st))
# define sk_KRB5_TKTBODY_sort(st) SKM_sk_sort(KRB5_TKTBODY, (st))
@@ -1108,6 +1174,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_MEM_OBJECT_DATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(MEM_OBJECT_DATA, (st), (cmp))
# define sk_MEM_OBJECT_DATA_dup(st) SKM_sk_dup(MEM_OBJECT_DATA, st)
# define sk_MEM_OBJECT_DATA_pop_free(st, free_func) SKM_sk_pop_free(MEM_OBJECT_DATA, (st), (free_func))
+# define sk_MEM_OBJECT_DATA_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(MEM_OBJECT_DATA, (st), (copy_func), (free_func))
# define sk_MEM_OBJECT_DATA_shift(st) SKM_sk_shift(MEM_OBJECT_DATA, (st))
# define sk_MEM_OBJECT_DATA_pop(st) SKM_sk_pop(MEM_OBJECT_DATA, (st))
# define sk_MEM_OBJECT_DATA_sort(st) SKM_sk_sort(MEM_OBJECT_DATA, (st))
@@ -1129,6 +1196,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_MIME_HEADER_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(MIME_HEADER, (st), (cmp))
# define sk_MIME_HEADER_dup(st) SKM_sk_dup(MIME_HEADER, st)
# define sk_MIME_HEADER_pop_free(st, free_func) SKM_sk_pop_free(MIME_HEADER, (st), (free_func))
+# define sk_MIME_HEADER_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(MIME_HEADER, (st), (copy_func), (free_func))
# define sk_MIME_HEADER_shift(st) SKM_sk_shift(MIME_HEADER, (st))
# define sk_MIME_HEADER_pop(st) SKM_sk_pop(MIME_HEADER, (st))
# define sk_MIME_HEADER_sort(st) SKM_sk_sort(MIME_HEADER, (st))
@@ -1150,6 +1218,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_MIME_PARAM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(MIME_PARAM, (st), (cmp))
# define sk_MIME_PARAM_dup(st) SKM_sk_dup(MIME_PARAM, st)
# define sk_MIME_PARAM_pop_free(st, free_func) SKM_sk_pop_free(MIME_PARAM, (st), (free_func))
+# define sk_MIME_PARAM_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(MIME_PARAM, (st), (copy_func), (free_func))
# define sk_MIME_PARAM_shift(st) SKM_sk_shift(MIME_PARAM, (st))
# define sk_MIME_PARAM_pop(st) SKM_sk_pop(MIME_PARAM, (st))
# define sk_MIME_PARAM_sort(st) SKM_sk_sort(MIME_PARAM, (st))
@@ -1171,6 +1240,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_NAME_FUNCS_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(NAME_FUNCS, (st), (cmp))
# define sk_NAME_FUNCS_dup(st) SKM_sk_dup(NAME_FUNCS, st)
# define sk_NAME_FUNCS_pop_free(st, free_func) SKM_sk_pop_free(NAME_FUNCS, (st), (free_func))
+# define sk_NAME_FUNCS_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(NAME_FUNCS, (st), (copy_func), (free_func))
# define sk_NAME_FUNCS_shift(st) SKM_sk_shift(NAME_FUNCS, (st))
# define sk_NAME_FUNCS_pop(st) SKM_sk_pop(NAME_FUNCS, (st))
# define sk_NAME_FUNCS_sort(st) SKM_sk_sort(NAME_FUNCS, (st))
@@ -1192,6 +1262,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OCSP_CERTID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_CERTID, (st), (cmp))
# define sk_OCSP_CERTID_dup(st) SKM_sk_dup(OCSP_CERTID, st)
# define sk_OCSP_CERTID_pop_free(st, free_func) SKM_sk_pop_free(OCSP_CERTID, (st), (free_func))
+# define sk_OCSP_CERTID_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(OCSP_CERTID, (st), (copy_func), (free_func))
# define sk_OCSP_CERTID_shift(st) SKM_sk_shift(OCSP_CERTID, (st))
# define sk_OCSP_CERTID_pop(st) SKM_sk_pop(OCSP_CERTID, (st))
# define sk_OCSP_CERTID_sort(st) SKM_sk_sort(OCSP_CERTID, (st))
@@ -1213,6 +1284,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OCSP_ONEREQ_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_ONEREQ, (st), (cmp))
# define sk_OCSP_ONEREQ_dup(st) SKM_sk_dup(OCSP_ONEREQ, st)
# define sk_OCSP_ONEREQ_pop_free(st, free_func) SKM_sk_pop_free(OCSP_ONEREQ, (st), (free_func))
+# define sk_OCSP_ONEREQ_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(OCSP_ONEREQ, (st), (copy_func), (free_func))
# define sk_OCSP_ONEREQ_shift(st) SKM_sk_shift(OCSP_ONEREQ, (st))
# define sk_OCSP_ONEREQ_pop(st) SKM_sk_pop(OCSP_ONEREQ, (st))
# define sk_OCSP_ONEREQ_sort(st) SKM_sk_sort(OCSP_ONEREQ, (st))
@@ -1234,6 +1306,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OCSP_RESPID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_RESPID, (st), (cmp))
# define sk_OCSP_RESPID_dup(st) SKM_sk_dup(OCSP_RESPID, st)
# define sk_OCSP_RESPID_pop_free(st, free_func) SKM_sk_pop_free(OCSP_RESPID, (st), (free_func))
+# define sk_OCSP_RESPID_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(OCSP_RESPID, (st), (copy_func), (free_func))
# define sk_OCSP_RESPID_shift(st) SKM_sk_shift(OCSP_RESPID, (st))
# define sk_OCSP_RESPID_pop(st) SKM_sk_pop(OCSP_RESPID, (st))
# define sk_OCSP_RESPID_sort(st) SKM_sk_sort(OCSP_RESPID, (st))
@@ -1255,6 +1328,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OCSP_SINGLERESP_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_SINGLERESP, (st), (cmp))
# define sk_OCSP_SINGLERESP_dup(st) SKM_sk_dup(OCSP_SINGLERESP, st)
# define sk_OCSP_SINGLERESP_pop_free(st, free_func) SKM_sk_pop_free(OCSP_SINGLERESP, (st), (free_func))
+# define sk_OCSP_SINGLERESP_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(OCSP_SINGLERESP, (st), (copy_func), (free_func))
# define sk_OCSP_SINGLERESP_shift(st) SKM_sk_shift(OCSP_SINGLERESP, (st))
# define sk_OCSP_SINGLERESP_pop(st) SKM_sk_pop(OCSP_SINGLERESP, (st))
# define sk_OCSP_SINGLERESP_sort(st) SKM_sk_sort(OCSP_SINGLERESP, (st))
@@ -1276,6 +1350,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_PKCS12_SAFEBAG_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS12_SAFEBAG, (st), (cmp))
# define sk_PKCS12_SAFEBAG_dup(st) SKM_sk_dup(PKCS12_SAFEBAG, st)
# define sk_PKCS12_SAFEBAG_pop_free(st, free_func) SKM_sk_pop_free(PKCS12_SAFEBAG, (st), (free_func))
+# define sk_PKCS12_SAFEBAG_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(PKCS12_SAFEBAG, (st), (copy_func), (free_func))
# define sk_PKCS12_SAFEBAG_shift(st) SKM_sk_shift(PKCS12_SAFEBAG, (st))
# define sk_PKCS12_SAFEBAG_pop(st) SKM_sk_pop(PKCS12_SAFEBAG, (st))
# define sk_PKCS12_SAFEBAG_sort(st) SKM_sk_sort(PKCS12_SAFEBAG, (st))
@@ -1297,6 +1372,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_PKCS7_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS7, (st), (cmp))
# define sk_PKCS7_dup(st) SKM_sk_dup(PKCS7, st)
# define sk_PKCS7_pop_free(st, free_func) SKM_sk_pop_free(PKCS7, (st), (free_func))
+# define sk_PKCS7_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(PKCS7, (st), (copy_func), (free_func))
# define sk_PKCS7_shift(st) SKM_sk_shift(PKCS7, (st))
# define sk_PKCS7_pop(st) SKM_sk_pop(PKCS7, (st))
# define sk_PKCS7_sort(st) SKM_sk_sort(PKCS7, (st))
@@ -1318,6 +1394,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_PKCS7_RECIP_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS7_RECIP_INFO, (st), (cmp))
# define sk_PKCS7_RECIP_INFO_dup(st) SKM_sk_dup(PKCS7_RECIP_INFO, st)
# define sk_PKCS7_RECIP_INFO_pop_free(st, free_func) SKM_sk_pop_free(PKCS7_RECIP_INFO, (st), (free_func))
+# define sk_PKCS7_RECIP_INFO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(PKCS7_RECIP_INFO, (st), (copy_func), (free_func))
# define sk_PKCS7_RECIP_INFO_shift(st) SKM_sk_shift(PKCS7_RECIP_INFO, (st))
# define sk_PKCS7_RECIP_INFO_pop(st) SKM_sk_pop(PKCS7_RECIP_INFO, (st))
# define sk_PKCS7_RECIP_INFO_sort(st) SKM_sk_sort(PKCS7_RECIP_INFO, (st))
@@ -1339,6 +1416,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_PKCS7_SIGNER_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS7_SIGNER_INFO, (st), (cmp))
# define sk_PKCS7_SIGNER_INFO_dup(st) SKM_sk_dup(PKCS7_SIGNER_INFO, st)
# define sk_PKCS7_SIGNER_INFO_pop_free(st, free_func) SKM_sk_pop_free(PKCS7_SIGNER_INFO, (st), (free_func))
+# define sk_PKCS7_SIGNER_INFO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(PKCS7_SIGNER_INFO, (st), (copy_func), (free_func))
# define sk_PKCS7_SIGNER_INFO_shift(st) SKM_sk_shift(PKCS7_SIGNER_INFO, (st))
# define sk_PKCS7_SIGNER_INFO_pop(st) SKM_sk_pop(PKCS7_SIGNER_INFO, (st))
# define sk_PKCS7_SIGNER_INFO_sort(st) SKM_sk_sort(PKCS7_SIGNER_INFO, (st))
@@ -1360,6 +1438,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_POLICYINFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(POLICYINFO, (st), (cmp))
# define sk_POLICYINFO_dup(st) SKM_sk_dup(POLICYINFO, st)
# define sk_POLICYINFO_pop_free(st, free_func) SKM_sk_pop_free(POLICYINFO, (st), (free_func))
+# define sk_POLICYINFO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(POLICYINFO, (st), (copy_func), (free_func))
# define sk_POLICYINFO_shift(st) SKM_sk_shift(POLICYINFO, (st))
# define sk_POLICYINFO_pop(st) SKM_sk_pop(POLICYINFO, (st))
# define sk_POLICYINFO_sort(st) SKM_sk_sort(POLICYINFO, (st))
@@ -1381,6 +1460,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_POLICYQUALINFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(POLICYQUALINFO, (st), (cmp))
# define sk_POLICYQUALINFO_dup(st) SKM_sk_dup(POLICYQUALINFO, st)
# define sk_POLICYQUALINFO_pop_free(st, free_func) SKM_sk_pop_free(POLICYQUALINFO, (st), (free_func))
+# define sk_POLICYQUALINFO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(POLICYQUALINFO, (st), (copy_func), (free_func))
# define sk_POLICYQUALINFO_shift(st) SKM_sk_shift(POLICYQUALINFO, (st))
# define sk_POLICYQUALINFO_pop(st) SKM_sk_pop(POLICYQUALINFO, (st))
# define sk_POLICYQUALINFO_sort(st) SKM_sk_sort(POLICYQUALINFO, (st))
@@ -1402,10 +1482,33 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_POLICY_MAPPING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(POLICY_MAPPING, (st), (cmp))
# define sk_POLICY_MAPPING_dup(st) SKM_sk_dup(POLICY_MAPPING, st)
# define sk_POLICY_MAPPING_pop_free(st, free_func) SKM_sk_pop_free(POLICY_MAPPING, (st), (free_func))
+# define sk_POLICY_MAPPING_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(POLICY_MAPPING, (st), (copy_func), (free_func))
# define sk_POLICY_MAPPING_shift(st) SKM_sk_shift(POLICY_MAPPING, (st))
# define sk_POLICY_MAPPING_pop(st) SKM_sk_pop(POLICY_MAPPING, (st))
# define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st))
# define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st))
+# define sk_SCT_new(cmp) SKM_sk_new(SCT, (cmp))
+# define sk_SCT_new_null() SKM_sk_new_null(SCT)
+# define sk_SCT_free(st) SKM_sk_free(SCT, (st))
+# define sk_SCT_num(st) SKM_sk_num(SCT, (st))
+# define sk_SCT_value(st, i) SKM_sk_value(SCT, (st), (i))
+# define sk_SCT_set(st, i, val) SKM_sk_set(SCT, (st), (i), (val))
+# define sk_SCT_zero(st) SKM_sk_zero(SCT, (st))
+# define sk_SCT_push(st, val) SKM_sk_push(SCT, (st), (val))
+# define sk_SCT_unshift(st, val) SKM_sk_unshift(SCT, (st), (val))
+# define sk_SCT_find(st, val) SKM_sk_find(SCT, (st), (val))
+# define sk_SCT_find_ex(st, val) SKM_sk_find_ex(SCT, (st), (val))
+# define sk_SCT_delete(st, i) SKM_sk_delete(SCT, (st), (i))
+# define sk_SCT_delete_ptr(st, ptr) SKM_sk_delete_ptr(SCT, (st), (ptr))
+# define sk_SCT_insert(st, val, i) SKM_sk_insert(SCT, (st), (val), (i))
+# define sk_SCT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SCT, (st), (cmp))
+# define sk_SCT_dup(st) SKM_sk_dup(SCT, st)
+# define sk_SCT_pop_free(st, free_func) SKM_sk_pop_free(SCT, (st), (free_func))
+# define sk_SCT_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SCT, (st), (copy_func), (free_func))
+# define sk_SCT_shift(st) SKM_sk_shift(SCT, (st))
+# define sk_SCT_pop(st) SKM_sk_pop(SCT, (st))
+# define sk_SCT_sort(st) SKM_sk_sort(SCT, (st))
+# define sk_SCT_is_sorted(st) SKM_sk_is_sorted(SCT, (st))
# define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp))
# define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN)
# define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st))
@@ -1423,6 +1526,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp))
# define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st)
# define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func))
+# define sk_SRP_gN_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SRP_gN, (st), (copy_func), (free_func))
# define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st))
# define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st))
# define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st))
@@ -1444,6 +1548,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp))
# define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st)
# define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func))
+# define sk_SRP_gN_cache_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SRP_gN_cache, (st), (copy_func), (free_func))
# define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st))
# define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st))
# define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st))
@@ -1465,6 +1570,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp))
# define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st)
# define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func))
+# define sk_SRP_user_pwd_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SRP_user_pwd, (st), (copy_func), (free_func))
# define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st))
# define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st))
# define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st))
@@ -1486,6 +1592,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp))
# define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st)
# define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func))
+# define sk_SRTP_PROTECTION_PROFILE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SRTP_PROTECTION_PROFILE, (st), (copy_func), (free_func))
# define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st))
# define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st))
# define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st))
@@ -1507,6 +1614,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SSL_CIPHER_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SSL_CIPHER, (st), (cmp))
# define sk_SSL_CIPHER_dup(st) SKM_sk_dup(SSL_CIPHER, st)
# define sk_SSL_CIPHER_pop_free(st, free_func) SKM_sk_pop_free(SSL_CIPHER, (st), (free_func))
+# define sk_SSL_CIPHER_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SSL_CIPHER, (st), (copy_func), (free_func))
# define sk_SSL_CIPHER_shift(st) SKM_sk_shift(SSL_CIPHER, (st))
# define sk_SSL_CIPHER_pop(st) SKM_sk_pop(SSL_CIPHER, (st))
# define sk_SSL_CIPHER_sort(st) SKM_sk_sort(SSL_CIPHER, (st))
@@ -1528,6 +1636,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SSL_COMP_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SSL_COMP, (st), (cmp))
# define sk_SSL_COMP_dup(st) SKM_sk_dup(SSL_COMP, st)
# define sk_SSL_COMP_pop_free(st, free_func) SKM_sk_pop_free(SSL_COMP, (st), (free_func))
+# define sk_SSL_COMP_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SSL_COMP, (st), (copy_func), (free_func))
# define sk_SSL_COMP_shift(st) SKM_sk_shift(SSL_COMP, (st))
# define sk_SSL_COMP_pop(st) SKM_sk_pop(SSL_COMP, (st))
# define sk_SSL_COMP_sort(st) SKM_sk_sort(SSL_COMP, (st))
@@ -1549,6 +1658,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_STACK_OF_X509_NAME_ENTRY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(STACK_OF_X509_NAME_ENTRY, (st), (cmp))
# define sk_STACK_OF_X509_NAME_ENTRY_dup(st) SKM_sk_dup(STACK_OF_X509_NAME_ENTRY, st)
# define sk_STACK_OF_X509_NAME_ENTRY_pop_free(st, free_func) SKM_sk_pop_free(STACK_OF_X509_NAME_ENTRY, (st), (free_func))
+# define sk_STACK_OF_X509_NAME_ENTRY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(STACK_OF_X509_NAME_ENTRY, (st), (copy_func), (free_func))
# define sk_STACK_OF_X509_NAME_ENTRY_shift(st) SKM_sk_shift(STACK_OF_X509_NAME_ENTRY, (st))
# define sk_STACK_OF_X509_NAME_ENTRY_pop(st) SKM_sk_pop(STACK_OF_X509_NAME_ENTRY, (st))
# define sk_STACK_OF_X509_NAME_ENTRY_sort(st) SKM_sk_sort(STACK_OF_X509_NAME_ENTRY, (st))
@@ -1570,6 +1680,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_STORE_ATTR_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(STORE_ATTR_INFO, (st), (cmp))
# define sk_STORE_ATTR_INFO_dup(st) SKM_sk_dup(STORE_ATTR_INFO, st)
# define sk_STORE_ATTR_INFO_pop_free(st, free_func) SKM_sk_pop_free(STORE_ATTR_INFO, (st), (free_func))
+# define sk_STORE_ATTR_INFO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(STORE_ATTR_INFO, (st), (copy_func), (free_func))
# define sk_STORE_ATTR_INFO_shift(st) SKM_sk_shift(STORE_ATTR_INFO, (st))
# define sk_STORE_ATTR_INFO_pop(st) SKM_sk_pop(STORE_ATTR_INFO, (st))
# define sk_STORE_ATTR_INFO_sort(st) SKM_sk_sort(STORE_ATTR_INFO, (st))
@@ -1591,6 +1702,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_STORE_OBJECT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(STORE_OBJECT, (st), (cmp))
# define sk_STORE_OBJECT_dup(st) SKM_sk_dup(STORE_OBJECT, st)
# define sk_STORE_OBJECT_pop_free(st, free_func) SKM_sk_pop_free(STORE_OBJECT, (st), (free_func))
+# define sk_STORE_OBJECT_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(STORE_OBJECT, (st), (copy_func), (free_func))
# define sk_STORE_OBJECT_shift(st) SKM_sk_shift(STORE_OBJECT, (st))
# define sk_STORE_OBJECT_pop(st) SKM_sk_pop(STORE_OBJECT, (st))
# define sk_STORE_OBJECT_sort(st) SKM_sk_sort(STORE_OBJECT, (st))
@@ -1612,6 +1724,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_SXNETID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SXNETID, (st), (cmp))
# define sk_SXNETID_dup(st) SKM_sk_dup(SXNETID, st)
# define sk_SXNETID_pop_free(st, free_func) SKM_sk_pop_free(SXNETID, (st), (free_func))
+# define sk_SXNETID_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(SXNETID, (st), (copy_func), (free_func))
# define sk_SXNETID_shift(st) SKM_sk_shift(SXNETID, (st))
# define sk_SXNETID_pop(st) SKM_sk_pop(SXNETID, (st))
# define sk_SXNETID_sort(st) SKM_sk_sort(SXNETID, (st))
@@ -1633,6 +1746,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_UI_STRING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(UI_STRING, (st), (cmp))
# define sk_UI_STRING_dup(st) SKM_sk_dup(UI_STRING, st)
# define sk_UI_STRING_pop_free(st, free_func) SKM_sk_pop_free(UI_STRING, (st), (free_func))
+# define sk_UI_STRING_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(UI_STRING, (st), (copy_func), (free_func))
# define sk_UI_STRING_shift(st) SKM_sk_shift(UI_STRING, (st))
# define sk_UI_STRING_pop(st) SKM_sk_pop(UI_STRING, (st))
# define sk_UI_STRING_sort(st) SKM_sk_sort(UI_STRING, (st))
@@ -1654,6 +1768,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509, (st), (cmp))
# define sk_X509_dup(st) SKM_sk_dup(X509, st)
# define sk_X509_pop_free(st, free_func) SKM_sk_pop_free(X509, (st), (free_func))
+# define sk_X509_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509, (st), (copy_func), (free_func))
# define sk_X509_shift(st) SKM_sk_shift(X509, (st))
# define sk_X509_pop(st) SKM_sk_pop(X509, (st))
# define sk_X509_sort(st) SKM_sk_sort(X509, (st))
@@ -1675,6 +1790,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509V3_EXT_METHOD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509V3_EXT_METHOD, (st), (cmp))
# define sk_X509V3_EXT_METHOD_dup(st) SKM_sk_dup(X509V3_EXT_METHOD, st)
# define sk_X509V3_EXT_METHOD_pop_free(st, free_func) SKM_sk_pop_free(X509V3_EXT_METHOD, (st), (free_func))
+# define sk_X509V3_EXT_METHOD_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509V3_EXT_METHOD, (st), (copy_func), (free_func))
# define sk_X509V3_EXT_METHOD_shift(st) SKM_sk_shift(X509V3_EXT_METHOD, (st))
# define sk_X509V3_EXT_METHOD_pop(st) SKM_sk_pop(X509V3_EXT_METHOD, (st))
# define sk_X509V3_EXT_METHOD_sort(st) SKM_sk_sort(X509V3_EXT_METHOD, (st))
@@ -1696,6 +1812,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_ALGOR_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_ALGOR, (st), (cmp))
# define sk_X509_ALGOR_dup(st) SKM_sk_dup(X509_ALGOR, st)
# define sk_X509_ALGOR_pop_free(st, free_func) SKM_sk_pop_free(X509_ALGOR, (st), (free_func))
+# define sk_X509_ALGOR_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_ALGOR, (st), (copy_func), (free_func))
# define sk_X509_ALGOR_shift(st) SKM_sk_shift(X509_ALGOR, (st))
# define sk_X509_ALGOR_pop(st) SKM_sk_pop(X509_ALGOR, (st))
# define sk_X509_ALGOR_sort(st) SKM_sk_sort(X509_ALGOR, (st))
@@ -1717,6 +1834,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_ATTRIBUTE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_ATTRIBUTE, (st), (cmp))
# define sk_X509_ATTRIBUTE_dup(st) SKM_sk_dup(X509_ATTRIBUTE, st)
# define sk_X509_ATTRIBUTE_pop_free(st, free_func) SKM_sk_pop_free(X509_ATTRIBUTE, (st), (free_func))
+# define sk_X509_ATTRIBUTE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_ATTRIBUTE, (st), (copy_func), (free_func))
# define sk_X509_ATTRIBUTE_shift(st) SKM_sk_shift(X509_ATTRIBUTE, (st))
# define sk_X509_ATTRIBUTE_pop(st) SKM_sk_pop(X509_ATTRIBUTE, (st))
# define sk_X509_ATTRIBUTE_sort(st) SKM_sk_sort(X509_ATTRIBUTE, (st))
@@ -1738,6 +1856,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_CRL_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_CRL, (st), (cmp))
# define sk_X509_CRL_dup(st) SKM_sk_dup(X509_CRL, st)
# define sk_X509_CRL_pop_free(st, free_func) SKM_sk_pop_free(X509_CRL, (st), (free_func))
+# define sk_X509_CRL_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_CRL, (st), (copy_func), (free_func))
# define sk_X509_CRL_shift(st) SKM_sk_shift(X509_CRL, (st))
# define sk_X509_CRL_pop(st) SKM_sk_pop(X509_CRL, (st))
# define sk_X509_CRL_sort(st) SKM_sk_sort(X509_CRL, (st))
@@ -1759,6 +1878,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_EXTENSION_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_EXTENSION, (st), (cmp))
# define sk_X509_EXTENSION_dup(st) SKM_sk_dup(X509_EXTENSION, st)
# define sk_X509_EXTENSION_pop_free(st, free_func) SKM_sk_pop_free(X509_EXTENSION, (st), (free_func))
+# define sk_X509_EXTENSION_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_EXTENSION, (st), (copy_func), (free_func))
# define sk_X509_EXTENSION_shift(st) SKM_sk_shift(X509_EXTENSION, (st))
# define sk_X509_EXTENSION_pop(st) SKM_sk_pop(X509_EXTENSION, (st))
# define sk_X509_EXTENSION_sort(st) SKM_sk_sort(X509_EXTENSION, (st))
@@ -1780,6 +1900,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_INFO, (st), (cmp))
# define sk_X509_INFO_dup(st) SKM_sk_dup(X509_INFO, st)
# define sk_X509_INFO_pop_free(st, free_func) SKM_sk_pop_free(X509_INFO, (st), (free_func))
+# define sk_X509_INFO_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_INFO, (st), (copy_func), (free_func))
# define sk_X509_INFO_shift(st) SKM_sk_shift(X509_INFO, (st))
# define sk_X509_INFO_pop(st) SKM_sk_pop(X509_INFO, (st))
# define sk_X509_INFO_sort(st) SKM_sk_sort(X509_INFO, (st))
@@ -1801,6 +1922,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_LOOKUP_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_LOOKUP, (st), (cmp))
# define sk_X509_LOOKUP_dup(st) SKM_sk_dup(X509_LOOKUP, st)
# define sk_X509_LOOKUP_pop_free(st, free_func) SKM_sk_pop_free(X509_LOOKUP, (st), (free_func))
+# define sk_X509_LOOKUP_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_LOOKUP, (st), (copy_func), (free_func))
# define sk_X509_LOOKUP_shift(st) SKM_sk_shift(X509_LOOKUP, (st))
# define sk_X509_LOOKUP_pop(st) SKM_sk_pop(X509_LOOKUP, (st))
# define sk_X509_LOOKUP_sort(st) SKM_sk_sort(X509_LOOKUP, (st))
@@ -1822,6 +1944,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_NAME_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_NAME, (st), (cmp))
# define sk_X509_NAME_dup(st) SKM_sk_dup(X509_NAME, st)
# define sk_X509_NAME_pop_free(st, free_func) SKM_sk_pop_free(X509_NAME, (st), (free_func))
+# define sk_X509_NAME_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_NAME, (st), (copy_func), (free_func))
# define sk_X509_NAME_shift(st) SKM_sk_shift(X509_NAME, (st))
# define sk_X509_NAME_pop(st) SKM_sk_pop(X509_NAME, (st))
# define sk_X509_NAME_sort(st) SKM_sk_sort(X509_NAME, (st))
@@ -1843,6 +1966,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_NAME_ENTRY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_NAME_ENTRY, (st), (cmp))
# define sk_X509_NAME_ENTRY_dup(st) SKM_sk_dup(X509_NAME_ENTRY, st)
# define sk_X509_NAME_ENTRY_pop_free(st, free_func) SKM_sk_pop_free(X509_NAME_ENTRY, (st), (free_func))
+# define sk_X509_NAME_ENTRY_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_NAME_ENTRY, (st), (copy_func), (free_func))
# define sk_X509_NAME_ENTRY_shift(st) SKM_sk_shift(X509_NAME_ENTRY, (st))
# define sk_X509_NAME_ENTRY_pop(st) SKM_sk_pop(X509_NAME_ENTRY, (st))
# define sk_X509_NAME_ENTRY_sort(st) SKM_sk_sort(X509_NAME_ENTRY, (st))
@@ -1864,6 +1988,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_OBJECT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_OBJECT, (st), (cmp))
# define sk_X509_OBJECT_dup(st) SKM_sk_dup(X509_OBJECT, st)
# define sk_X509_OBJECT_pop_free(st, free_func) SKM_sk_pop_free(X509_OBJECT, (st), (free_func))
+# define sk_X509_OBJECT_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_OBJECT, (st), (copy_func), (free_func))
# define sk_X509_OBJECT_shift(st) SKM_sk_shift(X509_OBJECT, (st))
# define sk_X509_OBJECT_pop(st) SKM_sk_pop(X509_OBJECT, (st))
# define sk_X509_OBJECT_sort(st) SKM_sk_sort(X509_OBJECT, (st))
@@ -1885,6 +2010,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_POLICY_DATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_POLICY_DATA, (st), (cmp))
# define sk_X509_POLICY_DATA_dup(st) SKM_sk_dup(X509_POLICY_DATA, st)
# define sk_X509_POLICY_DATA_pop_free(st, free_func) SKM_sk_pop_free(X509_POLICY_DATA, (st), (free_func))
+# define sk_X509_POLICY_DATA_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_POLICY_DATA, (st), (copy_func), (free_func))
# define sk_X509_POLICY_DATA_shift(st) SKM_sk_shift(X509_POLICY_DATA, (st))
# define sk_X509_POLICY_DATA_pop(st) SKM_sk_pop(X509_POLICY_DATA, (st))
# define sk_X509_POLICY_DATA_sort(st) SKM_sk_sort(X509_POLICY_DATA, (st))
@@ -1906,6 +2032,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_POLICY_NODE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_POLICY_NODE, (st), (cmp))
# define sk_X509_POLICY_NODE_dup(st) SKM_sk_dup(X509_POLICY_NODE, st)
# define sk_X509_POLICY_NODE_pop_free(st, free_func) SKM_sk_pop_free(X509_POLICY_NODE, (st), (free_func))
+# define sk_X509_POLICY_NODE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_POLICY_NODE, (st), (copy_func), (free_func))
# define sk_X509_POLICY_NODE_shift(st) SKM_sk_shift(X509_POLICY_NODE, (st))
# define sk_X509_POLICY_NODE_pop(st) SKM_sk_pop(X509_POLICY_NODE, (st))
# define sk_X509_POLICY_NODE_sort(st) SKM_sk_sort(X509_POLICY_NODE, (st))
@@ -1927,6 +2054,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_PURPOSE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_PURPOSE, (st), (cmp))
# define sk_X509_PURPOSE_dup(st) SKM_sk_dup(X509_PURPOSE, st)
# define sk_X509_PURPOSE_pop_free(st, free_func) SKM_sk_pop_free(X509_PURPOSE, (st), (free_func))
+# define sk_X509_PURPOSE_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_PURPOSE, (st), (copy_func), (free_func))
# define sk_X509_PURPOSE_shift(st) SKM_sk_shift(X509_PURPOSE, (st))
# define sk_X509_PURPOSE_pop(st) SKM_sk_pop(X509_PURPOSE, (st))
# define sk_X509_PURPOSE_sort(st) SKM_sk_sort(X509_PURPOSE, (st))
@@ -1948,6 +2076,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_REVOKED_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_REVOKED, (st), (cmp))
# define sk_X509_REVOKED_dup(st) SKM_sk_dup(X509_REVOKED, st)
# define sk_X509_REVOKED_pop_free(st, free_func) SKM_sk_pop_free(X509_REVOKED, (st), (free_func))
+# define sk_X509_REVOKED_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_REVOKED, (st), (copy_func), (free_func))
# define sk_X509_REVOKED_shift(st) SKM_sk_shift(X509_REVOKED, (st))
# define sk_X509_REVOKED_pop(st) SKM_sk_pop(X509_REVOKED, (st))
# define sk_X509_REVOKED_sort(st) SKM_sk_sort(X509_REVOKED, (st))
@@ -1969,6 +2098,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_TRUST_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_TRUST, (st), (cmp))
# define sk_X509_TRUST_dup(st) SKM_sk_dup(X509_TRUST, st)
# define sk_X509_TRUST_pop_free(st, free_func) SKM_sk_pop_free(X509_TRUST, (st), (free_func))
+# define sk_X509_TRUST_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_TRUST, (st), (copy_func), (free_func))
# define sk_X509_TRUST_shift(st) SKM_sk_shift(X509_TRUST, (st))
# define sk_X509_TRUST_pop(st) SKM_sk_pop(X509_TRUST, (st))
# define sk_X509_TRUST_sort(st) SKM_sk_sort(X509_TRUST, (st))
@@ -1990,6 +2120,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_X509_VERIFY_PARAM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_VERIFY_PARAM, (st), (cmp))
# define sk_X509_VERIFY_PARAM_dup(st) SKM_sk_dup(X509_VERIFY_PARAM, st)
# define sk_X509_VERIFY_PARAM_pop_free(st, free_func) SKM_sk_pop_free(X509_VERIFY_PARAM, (st), (free_func))
+# define sk_X509_VERIFY_PARAM_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(X509_VERIFY_PARAM, (st), (copy_func), (free_func))
# define sk_X509_VERIFY_PARAM_shift(st) SKM_sk_shift(X509_VERIFY_PARAM, (st))
# define sk_X509_VERIFY_PARAM_pop(st) SKM_sk_pop(X509_VERIFY_PARAM, (st))
# define sk_X509_VERIFY_PARAM_sort(st) SKM_sk_sort(X509_VERIFY_PARAM, (st))
@@ -2011,6 +2142,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_nid_triple_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(nid_triple, (st), (cmp))
# define sk_nid_triple_dup(st) SKM_sk_dup(nid_triple, st)
# define sk_nid_triple_pop_free(st, free_func) SKM_sk_pop_free(nid_triple, (st), (free_func))
+# define sk_nid_triple_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(nid_triple, (st), (copy_func), (free_func))
# define sk_nid_triple_shift(st) SKM_sk_shift(nid_triple, (st))
# define sk_nid_triple_pop(st) SKM_sk_pop(nid_triple, (st))
# define sk_nid_triple_sort(st) SKM_sk_sort(nid_triple, (st))
@@ -2032,6 +2164,7 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_void_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(void, (st), (cmp))
# define sk_void_dup(st) SKM_sk_dup(void, st)
# define sk_void_pop_free(st, free_func) SKM_sk_pop_free(void, (st), (free_func))
+# define sk_void_deep_copy(st, copy_func, free_func) SKM_sk_deep_copy(void, (st), (copy_func), (free_func))
# define sk_void_shift(st) SKM_sk_shift(void, (st))
# define sk_void_pop(st) SKM_sk_pop(void, (st))
# define sk_void_sort(st) SKM_sk_sort(void, (st))
@@ -2042,7 +2175,8 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OPENSSL_STRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, val))
# define sk_OPENSSL_STRING_value(st, i) ((OPENSSL_STRING)sk_value(CHECKED_STACK_OF(OPENSSL_STRING, st), i))
# define sk_OPENSSL_STRING_num(st) SKM_sk_num(OPENSSL_STRING, st)
-# define sk_OPENSSL_STRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_STRING, free_func))
+# define sk_OPENSSL_STRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_SK_FREE_FUNC(char, free_func))
+# define sk_OPENSSL_STRING_deep_copy(st, copy_func, free_func) ((STACK_OF(OPENSSL_STRING) *)sk_deep_copy(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_SK_COPY_FUNC(char, copy_func), CHECKED_SK_FREE_FUNC(char, free_func)))
# define sk_OPENSSL_STRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, val), i)
# define sk_OPENSSL_STRING_free(st) SKM_sk_free(OPENSSL_STRING, st)
# define sk_OPENSSL_STRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_STRING, st), i, CHECKED_PTR_OF(char, val))
@@ -2065,7 +2199,8 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OPENSSL_BLOCK_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val))
# define sk_OPENSSL_BLOCK_value(st, i) ((OPENSSL_BLOCK)sk_value(CHECKED_STACK_OF(OPENSSL_BLOCK, st), i))
# define sk_OPENSSL_BLOCK_num(st) SKM_sk_num(OPENSSL_BLOCK, st)
-# define sk_OPENSSL_BLOCK_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_SK_FREE_FUNC2(OPENSSL_BLOCK, free_func))
+# define sk_OPENSSL_BLOCK_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_SK_FREE_FUNC(void, free_func))
+# define sk_OPENSSL_BLOCK_deep_copy(st, copy_func, free_func) ((STACK_OF(OPENSSL_BLOCK) *)sk_deep_copy(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_SK_COPY_FUNC(void, copy_func), CHECKED_SK_FREE_FUNC(void, free_func)))
# define sk_OPENSSL_BLOCK_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val), i)
# define sk_OPENSSL_BLOCK_free(st) SKM_sk_free(OPENSSL_BLOCK, st)
# define sk_OPENSSL_BLOCK_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_BLOCK, st), i, CHECKED_PTR_OF(void, val))
@@ -2088,7 +2223,8 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
# define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
# define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
# define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
-# define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
+# define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC(OPENSSL_STRING, free_func))
+# define sk_OPENSSL_PSTRING_deep_copy(st, copy_func, free_func) ((STACK_OF(OPENSSL_PSTRING) *)sk_deep_copy(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_COPY_FUNC(OPENSSL_STRING, copy_func), CHECKED_SK_FREE_FUNC(OPENSSL_STRING, free_func)))
# define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
# define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
# define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
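
The safestack.h hunks above add one type-safe deep_copy wrapper per stack type, all expanding to the generic sk_deep_copy introduced in stack.c below. Unlike sk_TYPE_dup, which only copies the array of pointers, deep_copy runs every element through a caller-supplied copy function. A minimal usage sketch in C (x509_copy and clone_chain are hypothetical helper names; the copy callback is assumed to take a const-qualified pointer, hence the cast around X509_dup):

    #include <openssl/safestack.h>
    #include <openssl/x509.h>

    /* adapter: the checked copy callback is assumed to be
     * X509 *(*)(const X509 *), but X509_dup is declared without const */
    static X509 *x509_copy(const X509 *x)
    {
        return X509_dup((X509 *)x);
    }

    /* duplicate every certificate, not just the pointer array */
    static STACK_OF(X509) *clone_chain(STACK_OF(X509) *chain)
    {
        /* on a failed element copy, the partially built stack is
         * freed via X509_free and NULL is returned */
        return sk_X509_deep_copy(chain, x509_copy, X509_free);
    }
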
diff --git a/crypto/stack/stack.c b/crypto/stack/stack.c
index 331f907190f7..de437acf6a5c 100644
--- a/crypto/stack/stack.c
+++ b/crypto/stack/stack.c
@@ -115,6 +115,40 @@ _STACK *sk_dup(_STACK *sk)
return (NULL);
}
+_STACK *sk_deep_copy(_STACK *sk, void *(*copy_func) (void *),
+ void (*free_func) (void *))
+{
+ _STACK *ret;
+ int i;
+
+ if ((ret = OPENSSL_malloc(sizeof(_STACK))) == NULL)
+ return ret;
+ ret->comp = sk->comp;
+ ret->sorted = sk->sorted;
+ ret->num = sk->num;
+ ret->num_alloc = sk->num > MIN_NODES ? sk->num : MIN_NODES;
+ ret->data = OPENSSL_malloc(sizeof(char *) * ret->num_alloc);
+ if (ret->data == NULL) {
+ OPENSSL_free(ret);
+ return NULL;
+ }
+ for (i = 0; i < ret->num_alloc; i++)
+ ret->data[i] = NULL;
+
+ for (i = 0; i < ret->num; ++i) {
+ if (sk->data[i] == NULL)
+ continue;
+ if ((ret->data[i] = copy_func(sk->data[i])) == NULL) {
+ while (--i >= 0)
+ if (ret->data[i] != NULL)
+ free_func(ret->data[i]);
+ sk_free(ret);
+ return NULL;
+ }
+ }
+ return ret;
+}
+
_STACK *sk_new_null(void)
{
return sk_new((int (*)(const void *, const void *))0);
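
Because sk_deep_copy takes untyped callbacks, it can also be driven directly through the low-level _STACK API. A self-contained sketch with malloc'd strings (str_copy, str_free and copy_strings are illustrative names, not part of the patch):

    #include <stdlib.h>
    #include <string.h>
    #include <openssl/stack.h>

    static void *str_copy(void *p) { return strdup((char *)p); }
    static void str_free(void *p) { free(p); }

    static _STACK *copy_strings(_STACK *orig)
    {
        /* each element passes through str_copy; if any copy fails,
         * the elements copied so far are released with str_free,
         * the new stack is freed, and NULL is returned */
        return sk_deep_copy(orig, str_copy, str_free);
    }
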
diff --git a/crypto/stack/stack.h b/crypto/stack/stack.h
index 8d6e939a8031..eb07216659d0 100644
--- a/crypto/stack/stack.h
+++ b/crypto/stack/stack.h
@@ -83,6 +83,7 @@ _STACK *sk_new(int (*cmp) (const void *, const void *));
_STACK *sk_new_null(void);
void sk_free(_STACK *);
void sk_pop_free(_STACK *st, void (*func) (void *));
+_STACK *sk_deep_copy(_STACK *, void *(*)(void *), void (*)(void *));
int sk_insert(_STACK *sk, void *data, int where);
void *sk_delete(_STACK *st, int loc);
void *sk_delete_ptr(_STACK *st, void *p);
diff --git a/crypto/symhacks.h b/crypto/symhacks.h
index 2eadf7f3df2b..239fa4fb1b77 100644
--- a/crypto/symhacks.h
+++ b/crypto/symhacks.h
@@ -166,7 +166,7 @@
# undef CRYPTO_get_locked_mem_ex_functions
# define CRYPTO_get_locked_mem_ex_functions CRYPTO_get_locked_mem_ex_funcs
-/* Hack some long SSL names */
+/* Hack some long SSL/TLS names */
# undef SSL_CTX_set_default_verify_paths
# define SSL_CTX_set_default_verify_paths SSL_CTX_set_def_verify_paths
# undef SSL_get_ex_data_X509_STORE_CTX_idx
@@ -183,6 +183,10 @@
# define SSL_CTX_set_default_passwd_cb_userdata SSL_CTX_set_def_passwd_cb_ud
# undef SSL_COMP_get_compression_methods
# define SSL_COMP_get_compression_methods SSL_COMP_get_compress_methods
+# undef SSL_COMP_set0_compression_methods
+# define SSL_COMP_set0_compression_methods SSL_COMP_set0_compress_methods
+# undef SSL_COMP_free_compression_methods
+# define SSL_COMP_free_compression_methods SSL_COMP_free_compress_methods
# undef ssl_add_clienthello_renegotiate_ext
# define ssl_add_clienthello_renegotiate_ext ssl_add_clienthello_reneg_ext
# undef ssl_add_serverhello_renegotiate_ext
@@ -211,6 +215,16 @@
# define SSL_CTX_set_next_protos_advertised_cb SSL_CTX_set_next_protos_adv_cb
# undef SSL_CTX_set_next_proto_select_cb
# define SSL_CTX_set_next_proto_select_cb SSL_CTX_set_next_proto_sel_cb
+
+# undef tls1_send_server_supplemental_data
+# define tls1_send_server_supplemental_data tls1_send_server_suppl_data
+# undef tls1_send_client_supplemental_data
+# define tls1_send_client_supplemental_data tls1_send_client_suppl_data
+# undef tls1_get_server_supplemental_data
+# define tls1_get_server_supplemental_data tls1_get_server_suppl_data
+# undef tls1_get_client_supplemental_data
+# define tls1_get_client_supplemental_data tls1_get_client_suppl_data
+
# undef ssl3_cbc_record_digest_supported
# define ssl3_cbc_record_digest_supported ssl3_cbc_record_digest_support
# undef ssl_check_clienthello_tlsext_late
@@ -218,7 +232,11 @@
# undef ssl_check_clienthello_tlsext_early
# define ssl_check_clienthello_tlsext_early ssl_check_clihello_tlsext_early
-/* Hack some long ENGINE names */
+/* Hack some RSA long names */
+# undef RSA_padding_check_PKCS1_OAEP_mgf1
+# define RSA_padding_check_PKCS1_OAEP_mgf1 RSA_pad_check_PKCS1_OAEP_mgf1
+
+/* Hack some ENGINE long names */
# undef ENGINE_get_default_BN_mod_exp_crt
# define ENGINE_get_default_BN_mod_exp_crt ENGINE_get_def_BN_mod_exp_crt
# undef ENGINE_set_default_BN_mod_exp_crt
@@ -427,6 +445,18 @@
# define CMS_OriginatorIdentifierOrKey_it CMS_OriginatorIdOrKey_it
# undef cms_SignerIdentifier_get0_signer_id
# define cms_SignerIdentifier_get0_signer_id cms_SignerId_get0_signer_id
+# undef CMS_RecipientInfo_kari_get0_orig_id
+# define CMS_RecipientInfo_kari_get0_orig_id CMS_RecipInfo_kari_get0_orig_id
+# undef CMS_RecipientInfo_kari_get0_reks
+# define CMS_RecipientInfo_kari_get0_reks CMS_RecipInfo_kari_get0_reks
+# undef CMS_RecipientEncryptedKey_cert_cmp
+# define CMS_RecipientEncryptedKey_cert_cmp CMS_RecipEncryptedKey_cert_cmp
+# undef CMS_RecipientInfo_kari_set0_pkey
+# define CMS_RecipientInfo_kari_set0_pkey CMS_RecipInfo_kari_set0_pkey
+# undef CMS_RecipientEncryptedKey_get0_id
+# define CMS_RecipientEncryptedKey_get0_id CMS_RecipEncryptedKey_get0_id
+# undef CMS_RecipientInfo_kari_orig_id_cmp
+# define CMS_RecipientInfo_kari_orig_id_cmp CMS_RecipInfo_kari_orig_id_cmp
/* Hack some long DTLS1 names */
# undef dtls1_retransmit_buffered_messages
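
The symhacks.h additions follow from linkers (notably on OpenVMS) that truncate symbol names at 31 characters: every new API name longer than that gets a shorter link-time alias, while source code keeps using the long name. For example, from the CMS block above:

    # undef CMS_RecipientInfo_kari_get0_orig_id    /* 35 characters */
    # define CMS_RecipientInfo_kari_get0_orig_id CMS_RecipInfo_kari_get0_orig_id
                                                   /* alias: 31 characters */
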
diff --git a/crypto/ts/ts_rsp_sign.c b/crypto/ts/ts_rsp_sign.c
index 031d872e2cf6..db6ce3241f73 100644
--- a/crypto/ts/ts_rsp_sign.c
+++ b/crypto/ts/ts_rsp_sign.c
@@ -238,7 +238,6 @@ int TS_RESP_CTX_set_def_policy(TS_RESP_CTX *ctx, ASN1_OBJECT *def_policy)
int TS_RESP_CTX_set_certs(TS_RESP_CTX *ctx, STACK_OF(X509) *certs)
{
- int i;
if (ctx->certs) {
sk_X509_pop_free(ctx->certs, X509_free);
@@ -246,14 +245,10 @@ int TS_RESP_CTX_set_certs(TS_RESP_CTX *ctx, STACK_OF(X509) *certs)
}
if (!certs)
return 1;
- if (!(ctx->certs = sk_X509_dup(certs))) {
+ if (!(ctx->certs = X509_chain_up_ref(certs))) {
TSerr(TS_F_TS_RESP_CTX_SET_CERTS, ERR_R_MALLOC_FAILURE);
return 0;
}
- for (i = 0; i < sk_X509_num(ctx->certs); ++i) {
- X509 *cert = sk_X509_value(ctx->certs, i);
- CRYPTO_add(&cert->references, +1, CRYPTO_LOCK_X509);
- }
return 1;
}
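
The TS_RESP_CTX_set_certs hunk above replaces an open-coded "dup the stack, bump each reference" loop with the new X509_chain_up_ref helper. Reconstructed from the removed lines, the helper behaves roughly like this sketch (chain_up_ref_sketch is an illustrative name; the real function also handles failures inside the loop):

    static STACK_OF(X509) *chain_up_ref_sketch(STACK_OF(X509) *certs)
    {
        STACK_OF(X509) *ret = sk_X509_dup(certs);
        int i;

        if (ret == NULL)
            return NULL;
        for (i = 0; i < sk_X509_num(ret); i++) {
            X509 *x = sk_X509_value(ret, i);
            /* shallow copy: take a reference on each certificate
             * rather than duplicating it */
            CRYPTO_add(&x->references, 1, CRYPTO_LOCK_X509);
        }
        return ret;
    }
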
diff --git a/crypto/ts/ts_rsp_verify.c b/crypto/ts/ts_rsp_verify.c
index 32b4d9923f46..3ce765dfa1b6 100644
--- a/crypto/ts/ts_rsp_verify.c
+++ b/crypto/ts/ts_rsp_verify.c
@@ -637,7 +637,7 @@ static int TS_compute_imprint(BIO *data, TS_TST_INFO *tst_info,
X509_ALGOR_free(*md_alg);
OPENSSL_free(*imprint);
*imprint_len = 0;
- *imprint = 0;
+ *imprint = NULL;
return 0;
}
diff --git a/crypto/ui/ui_openssl.c b/crypto/ui/ui_openssl.c
index 829ea8691eb5..5d66276418fc 100644
--- a/crypto/ui/ui_openssl.c
+++ b/crypto/ui/ui_openssl.c
@@ -208,7 +208,7 @@
# elif !defined(OPENSSL_SYS_VMS) \
&& !defined(OPENSSL_SYS_MSDOS) \
&& !defined(OPENSSL_SYS_MACINTOSH_CLASSIC) \
- && !defined(MAC_OS_GUSI_SOURCE) \
+ && !defined(MAC_OS_GUSI_SOURCE) \
&& !defined(OPENSSL_SYS_VXWORKS) \
&& !defined(OPENSSL_SYS_NETWARE)
# define TERMIOS
diff --git a/crypto/whrlpool/asm/wp-mmx.pl b/crypto/whrlpool/asm/wp-mmx.pl
index cb2381c22ba1..c584e5b92b25 100755
--- a/crypto/whrlpool/asm/wp-mmx.pl
+++ b/crypto/whrlpool/asm/wp-mmx.pl
@@ -118,34 +118,36 @@ $tbl="ebp";
&movq (@mm[0],&QWP(2048*$SCALE,$tbl,"esi",8)); # rc[r]
&mov ("eax",&DWP(0,"esp"));
&mov ("ebx",&DWP(4,"esp"));
+ &movz ("ecx",&LB("eax"));
+ &movz ("edx",&HB("eax"));
for($i=0;$i<8;$i++) {
my $func = ($i==0)? \&movq : \&pxor;
- &movb (&LB("ecx"),&LB("eax"));
- &movb (&LB("edx"),&HB("eax"));
+ &shr ("eax",16);
&scale ("esi","ecx");
+ &movz ("ecx",&LB("eax"));
&scale ("edi","edx");
- &shr ("eax",16);
+ &movz ("edx",&HB("eax"));
&pxor (@mm[0],&QWP(&row(0),$tbl,"esi",8));
&$func (@mm[1],&QWP(&row(1),$tbl,"edi",8));
- &movb (&LB("ecx"),&LB("eax"));
- &movb (&LB("edx"),&HB("eax"));
&mov ("eax",&DWP(($i+1)*8,"esp"));
&scale ("esi","ecx");
+ &movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
+ &movz ("edx",&HB("ebx"));
&$func (@mm[2],&QWP(&row(2),$tbl,"esi",8));
&$func (@mm[3],&QWP(&row(3),$tbl,"edi",8));
- &movb (&LB("ecx"),&LB("ebx"));
- &movb (&LB("edx"),&HB("ebx"));
+ &shr ("ebx",16);
&scale ("esi","ecx");
+ &movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
- &shr ("ebx",16);
+ &movz ("edx",&HB("ebx"));
&$func (@mm[4],&QWP(&row(4),$tbl,"esi",8));
&$func (@mm[5],&QWP(&row(5),$tbl,"edi",8));
- &movb (&LB("ecx"),&LB("ebx"));
- &movb (&LB("edx"),&HB("ebx"));
&mov ("ebx",&DWP(($i+1)*8+4,"esp"));
&scale ("esi","ecx");
+ &movz ("ecx",&LB("eax"));
&scale ("edi","edx");
+ &movz ("edx",&HB("eax"));
&$func (@mm[6],&QWP(&row(6),$tbl,"esi",8));
&$func (@mm[7],&QWP(&row(7),$tbl,"edi",8));
push(@mm,shift(@mm));
@@ -154,32 +156,32 @@ for($i=0;$i<8;$i++) {
for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); } # K=L
for($i=0;$i<8;$i++) {
- &movb (&LB("ecx"),&LB("eax"));
- &movb (&LB("edx"),&HB("eax"));
+ &shr ("eax",16);
&scale ("esi","ecx");
+ &movz ("ecx",&LB("eax"));
&scale ("edi","edx");
- &shr ("eax",16);
+ &movz ("edx",&HB("eax"));
&pxor (@mm[0],&QWP(&row(0),$tbl,"esi",8));
&pxor (@mm[1],&QWP(&row(1),$tbl,"edi",8));
- &movb (&LB("ecx"),&LB("eax"));
- &movb (&LB("edx"),&HB("eax"));
&mov ("eax",&DWP(64+($i+1)*8,"esp")) if ($i<7);
&scale ("esi","ecx");
+ &movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
+ &movz ("edx",&HB("ebx"));
&pxor (@mm[2],&QWP(&row(2),$tbl,"esi",8));
&pxor (@mm[3],&QWP(&row(3),$tbl,"edi",8));
- &movb (&LB("ecx"),&LB("ebx"));
- &movb (&LB("edx"),&HB("ebx"));
+ &shr ("ebx",16);
&scale ("esi","ecx");
+ &movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
- &shr ("ebx",16);
+ &movz ("edx",&HB("ebx"));
&pxor (@mm[4],&QWP(&row(4),$tbl,"esi",8));
&pxor (@mm[5],&QWP(&row(5),$tbl,"edi",8));
- &movb (&LB("ecx"),&LB("ebx"));
- &movb (&LB("edx"),&HB("ebx"));
&mov ("ebx",&DWP(64+($i+1)*8+4,"esp")) if ($i<7);
&scale ("esi","ecx");
+ &movz ("ecx",&LB("eax"));
&scale ("edi","edx");
+ &movz ("edx",&HB("eax"));
&pxor (@mm[6],&QWP(&row(6),$tbl,"esi",8));
&pxor (@mm[7],&QWP(&row(7),$tbl,"edi",8));
push(@mm,shift(@mm));
diff --git a/crypto/whrlpool/asm/wp-x86_64.pl b/crypto/whrlpool/asm/wp-x86_64.pl
index 24b2ff60c38b..5a3bdbcf20d1 100755
--- a/crypto/whrlpool/asm/wp-x86_64.pl
+++ b/crypto/whrlpool/asm/wp-x86_64.pl
@@ -91,41 +91,44 @@ for($i=0;$i<8;$i++) { $code.="mov @mm[$i],64+$i*8(%rsp)\n"; } # S=L
$code.=<<___;
xor %rsi,%rsi
mov %rsi,24(%rbx) # zero round counter
+ jmp .Lround
.align 16
.Lround:
mov 4096(%rbp,%rsi,8),@mm[0] # rc[r]
mov 0(%rsp),%eax
mov 4(%rsp),%ebx
+ movz %al,%ecx
+ movz %ah,%edx
___
for($i=0;$i<8;$i++) {
my $func = ($i==0)? "mov" : "xor";
$code.=<<___;
- mov %al,%cl
- mov %ah,%dl
+ shr \$16,%eax
lea (%rcx,%rcx),%rsi
+ movz %al,%ecx
lea (%rdx,%rdx),%rdi
- shr \$16,%eax
+ movz %ah,%edx
xor 0(%rbp,%rsi,8),@mm[0]
$func 7(%rbp,%rdi,8),@mm[1]
- mov %al,%cl
- mov %ah,%dl
mov $i*8+8(%rsp),%eax # ($i+1)*8
lea (%rcx,%rcx),%rsi
+ movz %bl,%ecx
lea (%rdx,%rdx),%rdi
+ movz %bh,%edx
$func 6(%rbp,%rsi,8),@mm[2]
$func 5(%rbp,%rdi,8),@mm[3]
- mov %bl,%cl
- mov %bh,%dl
+ shr \$16,%ebx
lea (%rcx,%rcx),%rsi
+ movz %bl,%ecx
lea (%rdx,%rdx),%rdi
- shr \$16,%ebx
+ movz %bh,%edx
$func 4(%rbp,%rsi,8),@mm[4]
$func 3(%rbp,%rdi,8),@mm[5]
- mov %bl,%cl
- mov %bh,%dl
mov $i*8+8+4(%rsp),%ebx # ($i+1)*8+4
lea (%rcx,%rcx),%rsi
+ movz %al,%ecx
lea (%rdx,%rdx),%rdi
+ movz %ah,%edx
$func 2(%rbp,%rsi,8),@mm[6]
$func 1(%rbp,%rdi,8),@mm[7]
___
@@ -134,32 +137,32 @@ ___
for($i=0;$i<8;$i++) { $code.="mov @mm[$i],$i*8(%rsp)\n"; } # K=L
for($i=0;$i<8;$i++) {
$code.=<<___;
- mov %al,%cl
- mov %ah,%dl
+ shr \$16,%eax
lea (%rcx,%rcx),%rsi
+ movz %al,%ecx
lea (%rdx,%rdx),%rdi
- shr \$16,%eax
+ movz %ah,%edx
xor 0(%rbp,%rsi,8),@mm[0]
xor 7(%rbp,%rdi,8),@mm[1]
- mov %al,%cl
- mov %ah,%dl
`"mov 64+$i*8+8(%rsp),%eax" if($i<7);` # 64+($i+1)*8
lea (%rcx,%rcx),%rsi
+ movz %bl,%ecx
lea (%rdx,%rdx),%rdi
+ movz %bh,%edx
xor 6(%rbp,%rsi,8),@mm[2]
xor 5(%rbp,%rdi,8),@mm[3]
- mov %bl,%cl
- mov %bh,%dl
+ shr \$16,%ebx
lea (%rcx,%rcx),%rsi
+ movz %bl,%ecx
lea (%rdx,%rdx),%rdi
- shr \$16,%ebx
+ movz %bh,%edx
xor 4(%rbp,%rsi,8),@mm[4]
xor 3(%rbp,%rdi,8),@mm[5]
- mov %bl,%cl
- mov %bh,%dl
`"mov 64+$i*8+8+4(%rsp),%ebx" if($i<7);` # 64+($i+1)*8+4
lea (%rcx,%rcx),%rsi
+ movz %al,%ecx
lea (%rdx,%rdx),%rdi
+ movz %ah,%edx
xor 2(%rbp,%rsi,8),@mm[6]
xor 1(%rbp,%rdi,8),@mm[7]
___
diff --git a/crypto/x509/Makefile b/crypto/x509/Makefile
index aac3ece18cb2..bf197a1d932a 100644
--- a/crypto/x509/Makefile
+++ b/crypto/x509/Makefile
@@ -33,7 +33,7 @@ LIBOBJ= x509_def.o x509_d2.o x509_r2x.o x509_cmp.o \
SRC= $(LIBSRC)
EXHEADER= x509.h x509_vfy.h
-HEADER= $(EXHEADER)
+HEADER= $(EXHEADER) vpm_int.h
ALL= $(GENERAL) $(SRC) $(HEADER)
@@ -314,7 +314,7 @@ x509_vfy.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
x509_vfy.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
x509_vfy.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
x509_vfy.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
-x509_vfy.o: ../cryptlib.h x509_vfy.c
+x509_vfy.o: ../cryptlib.h vpm_int.h x509_vfy.c
x509_vpm.o: ../../e_os.h ../../include/openssl/asn1.h
x509_vpm.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
x509_vpm.o: ../../include/openssl/conf.h ../../include/openssl/crypto.h
@@ -328,7 +328,7 @@ x509_vpm.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
x509_vpm.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
x509_vpm.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
x509_vpm.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
-x509_vpm.o: ../cryptlib.h x509_vpm.c
+x509_vpm.o: ../cryptlib.h vpm_int.h x509_vpm.c
x509cset.o: ../../e_os.h ../../include/openssl/asn1.h
x509cset.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
x509cset.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
@@ -395,15 +395,17 @@ x509type.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
x509type.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
x509type.o: ../../include/openssl/x509_vfy.h ../cryptlib.h x509type.c
x_all.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
-x_all.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-x_all.o: ../../include/openssl/dsa.h ../../include/openssl/e_os2.h
-x_all.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h
-x_all.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h
-x_all.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h
-x_all.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h
+x_all.o: ../../include/openssl/buffer.h ../../include/openssl/conf.h
+x_all.o: ../../include/openssl/crypto.h ../../include/openssl/dsa.h
+x_all.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+x_all.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+x_all.o: ../../include/openssl/err.h ../../include/openssl/evp.h
+x_all.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
+x_all.o: ../../include/openssl/objects.h ../../include/openssl/ocsp.h
x_all.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
x_all.o: ../../include/openssl/ossl_typ.h ../../include/openssl/pkcs7.h
x_all.o: ../../include/openssl/rsa.h ../../include/openssl/safestack.h
x_all.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
x_all.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
-x_all.o: ../../include/openssl/x509_vfy.h ../cryptlib.h x_all.c
+x_all.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
+x_all.o: ../cryptlib.h x_all.c
diff --git a/crypto/x509/verify_extra_test.c b/crypto/x509/verify_extra_test.c
index a1e41f2822bd..08509f013194 100644
--- a/crypto/x509/verify_extra_test.c
+++ b/crypto/x509/verify_extra_test.c
@@ -168,8 +168,7 @@ static int test_alt_chains_cert_forgery(void)
i = X509_verify_cert(sctx);
- if(i == 0 && X509_STORE_CTX_get_error(sctx)
- == X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT) {
+ if(i == 0 && X509_STORE_CTX_get_error(sctx) == X509_V_ERR_INVALID_CA) {
/* This is the result we were expecting: Test passed */
ret = 1;
}
diff --git a/crypto/x509/vpm_int.h b/crypto/x509/vpm_int.h
new file mode 100644
index 000000000000..9c55defc512a
--- /dev/null
+++ b/crypto/x509/vpm_int.h
@@ -0,0 +1,70 @@
+/* vpm_int.h */
+/*
+ * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project
+ * 2013.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/* Internal-only structure to hold additional X509_VERIFY_PARAM data */
+
+struct X509_VERIFY_PARAM_ID_st {
+ STACK_OF(OPENSSL_STRING) *hosts; /* Set of acceptable names */
+ unsigned int hostflags; /* Flags to control matching features */
+ char *peername; /* Matching hostname in peer certificate */
+ char *email; /* If not NULL email address to match */
+ size_t emaillen;
+ unsigned char *ip; /* If not NULL IP address to match */
+ size_t iplen; /* Length of IP address */
+};
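
The struct is internal; applications populate these fields only through the X509_VERIFY_PARAM_set1_host()/X509_VERIFY_PARAM_set_hostflags() setters this patch adds to x509_vfy.h. A hedged sketch, assuming an initialised X509_STORE_CTX and the X509_STORE_CTX_get0_param() accessor:

#include <openssl/x509_vfy.h>

/* Sketch: request hostname checking before verification; check_id() in
 * x509_vfy.c then enforces it against the peer certificate. */
static int verify_with_host(X509_STORE_CTX *ctx, const char *host)
{
    X509_VERIFY_PARAM *param = X509_STORE_CTX_get0_param(ctx);

    if (!X509_VERIFY_PARAM_set1_host(param, host, 0)) /* fills id->hosts */
        return -1;
    X509_VERIFY_PARAM_set_hostflags(param, 0);        /* id->hostflags */
    return X509_verify_cert(ctx);
}
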
diff --git a/crypto/x509/x509.h b/crypto/x509/x509.h
index a4911741005e..99337b849a52 100644
--- a/crypto/x509/x509.h
+++ b/crypto/x509/x509.h
@@ -361,6 +361,7 @@ typedef struct x509_cert_pair_st {
# define X509_FLAG_NO_SIGDUMP (1L << 9)
# define X509_FLAG_NO_AUX (1L << 10)
# define X509_FLAG_NO_ATTRIBUTES (1L << 11)
+# define X509_FLAG_NO_IDS (1L << 12)
/* Flags specific to X509_NAME_print_ex() */
@@ -645,10 +646,12 @@ int X509_signature_print(BIO *bp, X509_ALGOR *alg, ASN1_STRING *sig);
int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md);
int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx);
+int X509_http_nbio(OCSP_REQ_CTX *rctx, X509 **pcert);
int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md);
int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx);
int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md);
int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx);
+int X509_CRL_http_nbio(OCSP_REQ_CTX *rctx, X509_CRL **pcrl);
int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md);
int X509_pubkey_digest(const X509 *data, const EVP_MD *type,
@@ -745,6 +748,7 @@ X509 *X509_dup(X509 *x509);
X509_ATTRIBUTE *X509_ATTRIBUTE_dup(X509_ATTRIBUTE *xa);
X509_EXTENSION *X509_EXTENSION_dup(X509_EXTENSION *ex);
X509_CRL *X509_CRL_dup(X509_CRL *crl);
+X509_REVOKED *X509_REVOKED_dup(X509_REVOKED *rev);
X509_REQ *X509_REQ_dup(X509_REQ *req);
X509_ALGOR *X509_ALGOR_dup(X509_ALGOR *xn);
int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype,
@@ -828,6 +832,12 @@ void *X509_get_ex_data(X509 *r, int idx);
int i2d_X509_AUX(X509 *a, unsigned char **pp);
X509 *d2i_X509_AUX(X509 **a, const unsigned char **pp, long length);
+int i2d_re_X509_tbs(X509 *x, unsigned char **pp);
+
+void X509_get0_signature(ASN1_BIT_STRING **psig, X509_ALGOR **palg,
+ const X509 *x);
+int X509_get_signature_nid(const X509 *x);
+
int X509_alias_set1(X509 *x, unsigned char *name, int len);
int X509_keyid_set1(X509 *x, unsigned char *id, int len);
unsigned char *X509_alias_get0(X509 *x, int *len);
@@ -939,9 +949,17 @@ int X509_CRL_sort(X509_CRL *crl);
int X509_REVOKED_set_serialNumber(X509_REVOKED *x, ASN1_INTEGER *serial);
int X509_REVOKED_set_revocationDate(X509_REVOKED *r, ASN1_TIME *tm);
+X509_CRL *X509_CRL_diff(X509_CRL *base, X509_CRL *newer,
+ EVP_PKEY *skey, const EVP_MD *md, unsigned int flags);
+
int X509_REQ_check_private_key(X509_REQ *x509, EVP_PKEY *pkey);
int X509_check_private_key(X509 *x509, EVP_PKEY *pkey);
+int X509_chain_check_suiteb(int *perror_depth,
+ X509 *x, STACK_OF(X509) *chain,
+ unsigned long flags);
+int X509_CRL_check_suiteb(X509_CRL *crl, EVP_PKEY *pk, unsigned long flags);
+STACK_OF(X509) *X509_chain_up_ref(STACK_OF(X509) *chain);
int X509_issuer_and_serial_cmp(const X509 *a, const X509 *b);
unsigned long X509_issuer_and_serial_hash(X509 *a);
@@ -1236,6 +1254,7 @@ void ERR_load_X509_strings(void);
# define X509_F_X509_ATTRIBUTE_GET0_DATA 139
# define X509_F_X509_ATTRIBUTE_SET1_DATA 138
# define X509_F_X509_CHECK_PRIVATE_KEY 128
+# define X509_F_X509_CRL_DIFF 105
# define X509_F_X509_CRL_PRINT_FP 147
# define X509_F_X509_EXTENSION_CREATE_BY_NID 108
# define X509_F_X509_EXTENSION_CREATE_BY_OBJ 109
@@ -1268,20 +1287,27 @@ void ERR_load_X509_strings(void);
# define X509_F_X509_VERIFY_CERT 127
/* Reason codes. */
+# define X509_R_AKID_MISMATCH 110
# define X509_R_BAD_X509_FILETYPE 100
# define X509_R_BASE64_DECODE_ERROR 118
# define X509_R_CANT_CHECK_DH_KEY 114
# define X509_R_CERT_ALREADY_IN_HASH_TABLE 101
+# define X509_R_CRL_ALREADY_DELTA 127
+# define X509_R_CRL_VERIFY_FAILURE 131
# define X509_R_ERR_ASN1_LIB 102
+# define X509_R_IDP_MISMATCH 128
# define X509_R_INVALID_DIRECTORY 113
# define X509_R_INVALID_FIELD_NAME 119
# define X509_R_INVALID_TRUST 123
+# define X509_R_ISSUER_MISMATCH 129
# define X509_R_KEY_TYPE_MISMATCH 115
# define X509_R_KEY_VALUES_MISMATCH 116
# define X509_R_LOADING_CERT_DIR 103
# define X509_R_LOADING_DEFAULTS 104
# define X509_R_METHOD_NOT_SUPPORTED 124
+# define X509_R_NEWER_CRL_NOT_NEWER 132
# define X509_R_NO_CERT_SET_FOR_US_TO_VERIFY 105
+# define X509_R_NO_CRL_NUMBER 130
# define X509_R_PUBLIC_KEY_DECODE_ERROR 125
# define X509_R_PUBLIC_KEY_ENCODE_ERROR 126
# define X509_R_SHOULD_RETRY 106
diff --git a/crypto/x509/x509_cmp.c b/crypto/x509/x509_cmp.c
index 3c5b717c15a3..49c71b91280e 100644
--- a/crypto/x509/x509_cmp.c
+++ b/crypto/x509/x509_cmp.c
@@ -179,11 +179,23 @@ unsigned long X509_subject_name_hash_old(X509 *x)
*/
int X509_cmp(const X509 *a, const X509 *b)
{
+ int rv;
/* ensure hash is valid */
X509_check_purpose((X509 *)a, -1, 0);
X509_check_purpose((X509 *)b, -1, 0);
- return memcmp(a->sha1_hash, b->sha1_hash, SHA_DIGEST_LENGTH);
+ rv = memcmp(a->sha1_hash, b->sha1_hash, SHA_DIGEST_LENGTH);
+ if (rv)
+ return rv;
+ /* Check for match against stored encoding too */
+ if (!a->cert_info->enc.modified && !b->cert_info->enc.modified) {
+ rv = (int)(a->cert_info->enc.len - b->cert_info->enc.len);
+ if (rv)
+ return rv;
+ return memcmp(a->cert_info->enc.enc, b->cert_info->enc.enc,
+ a->cert_info->enc.len);
+ }
+ return rv;
}
#endif
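
With this change X509_cmp() no longer trusts the SHA-1 digest alone: when both certificates carry unmodified cached encodings it also compares lengths and DER bytes, so a digest collision cannot make two distinct certificates compare equal. A minimal usage sketch:

#include <openssl/x509.h>

/* Sketch: equality now means matching SHA-1 digests and, where both cached
 * encodings are unmodified, byte-identical DER. */
static int same_cert(X509 *a, X509 *b)
{
    return X509_cmp(a, b) == 0;
}
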
@@ -339,3 +351,148 @@ int X509_check_private_key(X509 *x, EVP_PKEY *k)
return 1;
return 0;
}
+
+/*
+ * Check that a Suite B algorithm is permitted: pass in a public key and the
+ * NID of its signature (or -1 if there is no signature). pflags points to a
+ * flags field which must contain the Suite B verification flags.
+ */
+
+#ifndef OPENSSL_NO_EC
+
+static int check_suite_b(EVP_PKEY *pkey, int sign_nid, unsigned long *pflags)
+{
+ const EC_GROUP *grp = NULL;
+ int curve_nid;
+ if (pkey && pkey->type == EVP_PKEY_EC)
+ grp = EC_KEY_get0_group(pkey->pkey.ec);
+ if (!grp)
+ return X509_V_ERR_SUITE_B_INVALID_ALGORITHM;
+ curve_nid = EC_GROUP_get_curve_name(grp);
+ /* Check curve is consistent with LOS */
+ if (curve_nid == NID_secp384r1) { /* P-384 */
+ /*
+ * Check signature algorithm is consistent with curve.
+ */
+ if (sign_nid != -1 && sign_nid != NID_ecdsa_with_SHA384)
+ return X509_V_ERR_SUITE_B_INVALID_SIGNATURE_ALGORITHM;
+ if (!(*pflags & X509_V_FLAG_SUITEB_192_LOS))
+ return X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED;
+ /* If we encounter P-384 we cannot use P-256 later */
+ *pflags &= ~X509_V_FLAG_SUITEB_128_LOS_ONLY;
+ } else if (curve_nid == NID_X9_62_prime256v1) { /* P-256 */
+ if (sign_nid != -1 && sign_nid != NID_ecdsa_with_SHA256)
+ return X509_V_ERR_SUITE_B_INVALID_SIGNATURE_ALGORITHM;
+ if (!(*pflags & X509_V_FLAG_SUITEB_128_LOS_ONLY))
+ return X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED;
+ } else
+ return X509_V_ERR_SUITE_B_INVALID_CURVE;
+
+ return X509_V_OK;
+}
+
+int X509_chain_check_suiteb(int *perror_depth, X509 *x, STACK_OF(X509) *chain,
+ unsigned long flags)
+{
+ int rv, i, sign_nid;
+ EVP_PKEY *pk = NULL;
+ unsigned long tflags;
+ if (!(flags & X509_V_FLAG_SUITEB_128_LOS))
+ return X509_V_OK;
+ tflags = flags;
+ /* If no EE certificate is passed in, it must be first in the chain */
+ if (x == NULL) {
+ x = sk_X509_value(chain, 0);
+ i = 1;
+ } else
+ i = 0;
+
+ if (X509_get_version(x) != 2) {
+ rv = X509_V_ERR_SUITE_B_INVALID_VERSION;
+ /* Correct error depth */
+ i = 0;
+ goto end;
+ }
+
+ pk = X509_get_pubkey(x);
+ /* Check EE key only */
+ rv = check_suite_b(pk, -1, &tflags);
+ if (rv != X509_V_OK) {
+ /* Correct error depth */
+ i = 0;
+ goto end;
+ }
+ for (; i < sk_X509_num(chain); i++) {
+ sign_nid = X509_get_signature_nid(x);
+ x = sk_X509_value(chain, i);
+ if (X509_get_version(x) != 2) {
+ rv = X509_V_ERR_SUITE_B_INVALID_VERSION;
+ goto end;
+ }
+ EVP_PKEY_free(pk);
+ pk = X509_get_pubkey(x);
+ rv = check_suite_b(pk, sign_nid, &tflags);
+ if (rv != X509_V_OK)
+ goto end;
+ }
+
+ /* Final check: root CA signature */
+ rv = check_suite_b(pk, X509_get_signature_nid(x), &tflags);
+ end:
+ if (pk)
+ EVP_PKEY_free(pk);
+ if (rv != X509_V_OK) {
+ /* Invalid signature or LOS errors are for previous cert */
+ if ((rv == X509_V_ERR_SUITE_B_INVALID_SIGNATURE_ALGORITHM
+ || rv == X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED) && i)
+ i--;
+ /*
+ * If we have an LOS error and the flags changed, then we are signing
+ * P-384 with P-256. Use a more meaningful error.
+ */
+ if (rv == X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED && flags != tflags)
+ rv = X509_V_ERR_SUITE_B_CANNOT_SIGN_P_384_WITH_P_256;
+ if (perror_depth)
+ *perror_depth = i;
+ }
+ return rv;
+}
+
+int X509_CRL_check_suiteb(X509_CRL *crl, EVP_PKEY *pk, unsigned long flags)
+{
+ int sign_nid;
+ if (!(flags & X509_V_FLAG_SUITEB_128_LOS))
+ return X509_V_OK;
+ sign_nid = OBJ_obj2nid(crl->crl->sig_alg->algorithm);
+ return check_suite_b(pk, sign_nid, &flags);
+}
+
+#else
+int X509_chain_check_suiteb(int *perror_depth, X509 *x, STACK_OF(X509) *chain,
+ unsigned long flags)
+{
+ return 0;
+}
+
+int X509_CRL_check_suiteb(X509_CRL *crl, EVP_PKEY *pk, unsigned long flags)
+{
+ return 0;
+}
+
+#endif
+/*
+ * Not strictly speaking an "up_ref" as a STACK doesn't have a reference
+ * count, but it has the same effect: it duplicates the STACK and ups the
+ * reference count of each X509 structure.
+ */
+STACK_OF(X509) *X509_chain_up_ref(STACK_OF(X509) *chain)
+{
+ STACK_OF(X509) *ret;
+ int i;
+ ret = sk_X509_dup(chain);
+ for (i = 0; i < sk_X509_num(ret); i++) {
+ X509 *x = sk_X509_value(ret, i);
+ CRYPTO_add(&x->references, 1, CRYPTO_LOCK_X509);
+ }
+ return ret;
+}
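
X509_chain_check_suiteb() can also be called directly, outside X509_verify_cert(). A hedged sketch, assuming chain is a STACK_OF(X509) with the EE certificate first; on failure depth receives the offending index:

#include <stdio.h>
#include <openssl/x509.h>
#include <openssl/x509_vfy.h>

static void report_suiteb(STACK_OF(X509) *chain)
{
    int depth = 0;
    int rv = X509_chain_check_suiteb(&depth, NULL, chain,
                                     X509_V_FLAG_SUITEB_128_LOS);

    if (rv != X509_V_OK)
        fprintf(stderr, "Suite B: %s at depth %d\n",
                X509_verify_cert_error_string(rv), depth);
}
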
diff --git a/crypto/x509/x509_err.c b/crypto/x509/x509_err.c
index 61a19f74109b..43cde18e49a7 100644
--- a/crypto/x509/x509_err.c
+++ b/crypto/x509/x509_err.c
@@ -1,6 +1,6 @@
/* crypto/x509/x509_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2012 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -88,6 +88,7 @@ static ERR_STRING_DATA X509_str_functs[] = {
{ERR_FUNC(X509_F_X509_ATTRIBUTE_GET0_DATA), "X509_ATTRIBUTE_get0_data"},
{ERR_FUNC(X509_F_X509_ATTRIBUTE_SET1_DATA), "X509_ATTRIBUTE_set1_data"},
{ERR_FUNC(X509_F_X509_CHECK_PRIVATE_KEY), "X509_check_private_key"},
+ {ERR_FUNC(X509_F_X509_CRL_DIFF), "X509_CRL_diff"},
{ERR_FUNC(X509_F_X509_CRL_PRINT_FP), "X509_CRL_print_fp"},
{ERR_FUNC(X509_F_X509_EXTENSION_CREATE_BY_NID),
"X509_EXTENSION_create_by_NID"},
@@ -131,22 +132,29 @@ static ERR_STRING_DATA X509_str_functs[] = {
};
static ERR_STRING_DATA X509_str_reasons[] = {
+ {ERR_REASON(X509_R_AKID_MISMATCH), "akid mismatch"},
{ERR_REASON(X509_R_BAD_X509_FILETYPE), "bad x509 filetype"},
{ERR_REASON(X509_R_BASE64_DECODE_ERROR), "base64 decode error"},
{ERR_REASON(X509_R_CANT_CHECK_DH_KEY), "cant check dh key"},
{ERR_REASON(X509_R_CERT_ALREADY_IN_HASH_TABLE),
"cert already in hash table"},
+ {ERR_REASON(X509_R_CRL_ALREADY_DELTA), "crl already delta"},
+ {ERR_REASON(X509_R_CRL_VERIFY_FAILURE), "crl verify failure"},
{ERR_REASON(X509_R_ERR_ASN1_LIB), "err asn1 lib"},
+ {ERR_REASON(X509_R_IDP_MISMATCH), "idp mismatch"},
{ERR_REASON(X509_R_INVALID_DIRECTORY), "invalid directory"},
{ERR_REASON(X509_R_INVALID_FIELD_NAME), "invalid field name"},
{ERR_REASON(X509_R_INVALID_TRUST), "invalid trust"},
+ {ERR_REASON(X509_R_ISSUER_MISMATCH), "issuer mismatch"},
{ERR_REASON(X509_R_KEY_TYPE_MISMATCH), "key type mismatch"},
{ERR_REASON(X509_R_KEY_VALUES_MISMATCH), "key values mismatch"},
{ERR_REASON(X509_R_LOADING_CERT_DIR), "loading cert dir"},
{ERR_REASON(X509_R_LOADING_DEFAULTS), "loading defaults"},
{ERR_REASON(X509_R_METHOD_NOT_SUPPORTED), "method not supported"},
+ {ERR_REASON(X509_R_NEWER_CRL_NOT_NEWER), "newer crl not newer"},
{ERR_REASON(X509_R_NO_CERT_SET_FOR_US_TO_VERIFY),
"no cert set for us to verify"},
+ {ERR_REASON(X509_R_NO_CRL_NUMBER), "no crl number"},
{ERR_REASON(X509_R_PUBLIC_KEY_DECODE_ERROR), "public key decode error"},
{ERR_REASON(X509_R_PUBLIC_KEY_ENCODE_ERROR), "public key encode error"},
{ERR_REASON(X509_R_SHOULD_RETRY), "should retry"},
diff --git a/crypto/x509/x509_lu.c b/crypto/x509/x509_lu.c
index 8415d1d8b353..b0d653903ff5 100644
--- a/crypto/x509/x509_lu.c
+++ b/crypto/x509/x509_lu.c
@@ -238,6 +238,19 @@ void X509_STORE_free(X509_STORE *vfy)
if (vfy == NULL)
return;
+ i = CRYPTO_add(&vfy->references, -1, CRYPTO_LOCK_X509_STORE);
+#ifdef REF_PRINT
+ REF_PRINT("X509_STORE", vfy);
+#endif
+ if (i > 0)
+ return;
+#ifdef REF_CHECK
+ if (i < 0) {
+ fprintf(stderr, "X509_STORE_free, bad reference count\n");
+ abort(); /* ok */
+ }
+#endif
+
sk = vfy->get_cert_methods;
for (i = 0; i < sk_X509_LOOKUP_num(sk); i++) {
lu = sk_X509_LOOKUP_value(sk, i);
@@ -681,6 +694,19 @@ void X509_STORE_set_verify_cb(X509_STORE *ctx,
ctx->verify_cb = verify_cb;
}
+void X509_STORE_set_lookup_crls_cb(X509_STORE *ctx,
+ STACK_OF(X509_CRL) *(*cb) (X509_STORE_CTX
+ *ctx,
+ X509_NAME *nm))
+{
+ ctx->lookup_crls = cb;
+}
+
+X509_STORE *X509_STORE_CTX_get0_store(X509_STORE_CTX *ctx)
+{
+ return ctx->ctx;
+}
+
IMPLEMENT_STACK_OF(X509_LOOKUP)
IMPLEMENT_STACK_OF(X509_OBJECT)
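
X509_STORE_CTX_get0_store() gives callbacks access to the store a context was initialised from. A sketch of a verify callback using it (the application-specific use is illustrative):

#include <openssl/x509_vfy.h>

static int my_verify_cb(int ok, X509_STORE_CTX *ctx)
{
    X509_STORE *store = X509_STORE_CTX_get0_store(ctx);

    (void)store; /* application-specific use of the owning store */
    return ok;   /* keep the library's verdict */
}
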
diff --git a/crypto/x509/x509_set.c b/crypto/x509/x509_set.c
index 4645777634b7..5b802bd6c754 100644
--- a/crypto/x509/x509_set.c
+++ b/crypto/x509/x509_set.c
@@ -67,6 +67,11 @@ int X509_set_version(X509 *x, long version)
{
if (x == NULL)
return (0);
+ if (version == 0) {
+ M_ASN1_INTEGER_free(x->cert_info->version);
+ x->cert_info->version = NULL;
+ return (1);
+ }
if (x->cert_info->version == NULL) {
if ((x->cert_info->version = M_ASN1_INTEGER_new()) == NULL)
return (0);
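
Since the certificate version is DEFAULT v1 in the ASN.1 template, the correct v1 encoding omits the field entirely; X509_set_version(x, 0) now frees any explicit version rather than encoding an integer 0. Sketch:

#include <openssl/x509.h>

/* Sketch: request X.509 v1; the DER encoder then applies the DEFAULT
 * instead of emitting an explicit version field. */
static int make_v1(X509 *x)
{
    return X509_set_version(x, 0); /* returns 1 on success */
}
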
diff --git a/crypto/x509/x509_trs.c b/crypto/x509/x509_trs.c
index 7e444795a5e7..11e0763403ae 100644
--- a/crypto/x509/x509_trs.c
+++ b/crypto/x509/x509_trs.c
@@ -119,6 +119,14 @@ int X509_check_trust(X509 *x, int id, int flags)
int idx;
if (id == -1)
return 1;
+ /* We get this as a default value */
+ if (id == 0) {
+ int rv;
+ rv = obj_trust(NID_anyExtendedKeyUsage, x, 0);
+ if (rv != X509_TRUST_UNTRUSTED)
+ return rv;
+ return trust_compat(NULL, x, 0);
+ }
idx = X509_TRUST_get_by_id(id);
if (idx == -1)
return default_trust(id, x, flags);
diff --git a/crypto/x509/x509_txt.c b/crypto/x509/x509_txt.c
index d834180ec443..3d46d3ff8366 100644
--- a/crypto/x509/x509_txt.c
+++ b/crypto/x509/x509_txt.c
@@ -184,6 +184,26 @@ const char *X509_verify_cert_error_string(long n)
case X509_V_ERR_CRL_PATH_VALIDATION_ERROR:
return ("CRL path validation error");
+ case X509_V_ERR_SUITE_B_INVALID_VERSION:
+ return ("Suite B: certificate version invalid");
+ case X509_V_ERR_SUITE_B_INVALID_ALGORITHM:
+ return ("Suite B: invalid public key algorithm");
+ case X509_V_ERR_SUITE_B_INVALID_CURVE:
+ return ("Suite B: invalid ECC curve");
+ case X509_V_ERR_SUITE_B_INVALID_SIGNATURE_ALGORITHM:
+ return ("Suite B: invalid signature algorithm");
+ case X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED:
+ return ("Suite B: curve not allowed for this LOS");
+ case X509_V_ERR_SUITE_B_CANNOT_SIGN_P_384_WITH_P_256:
+ return ("Suite B: cannot sign P-384 with P-256");
+
+ case X509_V_ERR_HOSTNAME_MISMATCH:
+ return ("Hostname mismatch");
+ case X509_V_ERR_EMAIL_MISMATCH:
+ return ("Email address mismatch");
+ case X509_V_ERR_IP_ADDRESS_MISMATCH:
+ return ("IP address mismatch");
+
default:
BIO_snprintf(buf, sizeof buf, "error number %ld", n);
return (buf);
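
The new codes render through the usual reporting path; a minimal sketch:

#include <stdio.h>
#include <openssl/x509_vfy.h>

static void show_new_error(void)
{
    /* prints "Hostname mismatch" */
    printf("%s\n",
           X509_verify_cert_error_string(X509_V_ERR_HOSTNAME_MISMATCH));
}
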
diff --git a/crypto/x509/x509_vfy.c b/crypto/x509/x509_vfy.c
index 7009ae63076e..a2f1dbefe352 100644
--- a/crypto/x509/x509_vfy.c
+++ b/crypto/x509/x509_vfy.c
@@ -69,6 +69,7 @@
#include <openssl/x509.h>
#include <openssl/x509v3.h>
#include <openssl/objects.h>
+#include "vpm_int.h"
/* CRL score values */
@@ -113,6 +114,7 @@ static int check_issued(X509_STORE_CTX *ctx, X509 *x, X509 *issuer);
static X509 *find_issuer(X509_STORE_CTX *ctx, STACK_OF(X509) *sk, X509 *x);
static int check_chain_extensions(X509_STORE_CTX *ctx);
static int check_name_constraints(X509_STORE_CTX *ctx);
+static int check_id(X509_STORE_CTX *ctx);
static int check_trust(X509_STORE_CTX *ctx);
static int check_revocation(X509_STORE_CTX *ctx);
static int check_cert(X509_STORE_CTX *ctx);
@@ -148,6 +150,40 @@ static int x509_subject_cmp(X509 **a, X509 **b)
return X509_subject_name_cmp(*a, *b);
}
#endif
+/* Return 1 if a certificate is self-signed */
+static int cert_self_signed(X509 *x)
+{
+ X509_check_purpose(x, -1, 0);
+ if (x->ex_flags & EXFLAG_SS)
+ return 1;
+ else
+ return 0;
+}
+
+/* Given a certificate, try to find an exact match in the store */
+
+static X509 *lookup_cert_match(X509_STORE_CTX *ctx, X509 *x)
+{
+ STACK_OF(X509) *certs;
+ X509 *xtmp = NULL;
+ int i;
+ /* Lookup all certs with matching subject name */
+ certs = ctx->lookup_certs(ctx, X509_get_subject_name(x));
+ if (certs == NULL)
+ return NULL;
+ /* Look for exact match */
+ for (i = 0; i < sk_X509_num(certs); i++) {
+ xtmp = sk_X509_value(certs, i);
+ if (!X509_cmp(xtmp, x))
+ break;
+ }
+ if (i < sk_X509_num(certs))
+ CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509);
+ else
+ xtmp = NULL;
+ sk_X509_pop_free(certs, X509_free);
+ return xtmp;
+}
int X509_verify_cert(X509_STORE_CTX *ctx)
{
@@ -205,8 +241,24 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* later. */
/* If we are self signed, we break */
- if (ctx->check_issued(ctx, x, x))
+ if (cert_self_signed(x))
break;
+ /*
+ * If asked, see if we can find the issuer in the trusted store first
+ */
+ if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) {
+ ok = ctx->get_issuer(&xtmp, ctx, x);
+ if (ok < 0)
+ return ok;
+ /*
+ * If successful, free the cert for now; it will be picked up
+ * again later.
+ */
+ if (ok > 0) {
+ X509_free(xtmp);
+ break;
+ }
+ }
/* If we were passed a cert chain, use it first */
if (ctx->untrusted != NULL) {
@@ -244,7 +296,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
*/
i = sk_X509_num(ctx->chain);
x = sk_X509_value(ctx->chain, i - 1);
- if (ctx->check_issued(ctx, x, x)) {
+ if (cert_self_signed(x)) {
/* we have a self signed certificate */
if (sk_X509_num(ctx->chain) == 1) {
/*
@@ -290,9 +342,10 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (depth < num)
break;
/* If we are self signed, we break */
- if (ctx->check_issued(ctx, x, x))
+ if (cert_self_signed(x))
break;
ok = ctx->get_issuer(&xtmp, ctx, x);
+
if (ok < 0)
return ok;
if (ok == 0)
@@ -306,14 +359,22 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
num++;
}
+ /* we now have our chain, let's check it... */
+ i = check_trust(ctx);
+
+ /* If explicitly rejected error */
+ if (i == X509_TRUST_REJECTED)
+ goto end;
/*
- * If we haven't got a least one certificate from our store then check
- * if there is an alternative chain that could be used. We only do this
- * if the user hasn't switched off alternate chain checking
+ * If it's not explicitly trusted then check if there is an alternative
+ * chain that could be used. We only do this if we haven't already
+ * checked via TRUSTED_FIRST and the user hasn't switched off alternate
+ * chain checking
*/
retry = 0;
- if (num == ctx->last_untrusted &&
- !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) {
+ if (i != X509_TRUST_TRUSTED
+ && !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST)
+ && !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) {
while (j-- > 1) {
xtmp2 = sk_X509_value(ctx->chain, j - 1);
ok = ctx->get_issuer(&xtmp, ctx, xtmp2);
@@ -343,8 +404,12 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
}
} while (retry);
- /* Is last certificate looked up self signed? */
- if (!ctx->check_issued(ctx, x, x)) {
+ /*
+ * If not explicitly trusted, indicate an error unless it's a single
+ * self-signed certificate, in which case we've already indicated an
+ * error and set bad_chain == 1
+ */
+ if (i != X509_TRUST_TRUSTED && !bad_chain) {
if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) {
if (ctx->last_untrusted >= num)
ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY;
@@ -381,10 +446,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (!ok)
goto end;
- /* The chain extensions are OK: check trust */
-
- if (param->trust > 0)
- ok = check_trust(ctx);
+ ok = check_id(ctx);
if (!ok)
goto end;
@@ -401,6 +463,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (!ok)
goto end;
+ i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
+ ctx->param->flags);
+ if (i != X509_V_OK) {
+ ctx->error = i;
+ ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth);
+ ok = cb(0, ctx);
+ if (!ok)
+ goto end;
+ }
+
/* At this point, we have a chain and need to verify it */
if (ctx->verify != NULL)
ok = ctx->verify(ctx);
@@ -467,7 +539,6 @@ static int check_issued(X509_STORE_CTX *ctx, X509 *x, X509 *issuer)
ctx->current_cert = x;
ctx->current_issuer = issuer;
return ctx->verify_cb(0, ctx);
- return 0;
}
/* Alternative lookup method: look from a STACK stored in other_ctx */
@@ -667,30 +738,97 @@ static int check_name_constraints(X509_STORE_CTX *ctx)
return 1;
}
-static int check_trust(X509_STORE_CTX *ctx)
+static int check_id_error(X509_STORE_CTX *ctx, int errcode)
{
-#ifdef OPENSSL_NO_CHAIN_VERIFY
+ ctx->error = errcode;
+ ctx->current_cert = ctx->cert;
+ ctx->error_depth = 0;
+ return ctx->verify_cb(0, ctx);
+}
+
+static int check_hosts(X509 *x, X509_VERIFY_PARAM_ID *id)
+{
+ int i;
+ int n = sk_OPENSSL_STRING_num(id->hosts);
+ char *name;
+
+ for (i = 0; i < n; ++i) {
+ name = sk_OPENSSL_STRING_value(id->hosts, i);
+ if (X509_check_host(x, name, 0, id->hostflags, &id->peername) > 0)
+ return 1;
+ }
+ return n == 0;
+}
+
+static int check_id(X509_STORE_CTX *ctx)
+{
+ X509_VERIFY_PARAM *vpm = ctx->param;
+ X509_VERIFY_PARAM_ID *id = vpm->id;
+ X509 *x = ctx->cert;
+ if (id->hosts && check_hosts(x, id) <= 0) {
+ if (!check_id_error(ctx, X509_V_ERR_HOSTNAME_MISMATCH))
+ return 0;
+ }
+ if (id->email && X509_check_email(x, id->email, id->emaillen, 0) <= 0) {
+ if (!check_id_error(ctx, X509_V_ERR_EMAIL_MISMATCH))
+ return 0;
+ }
+ if (id->ip && X509_check_ip(x, id->ip, id->iplen, 0) <= 0) {
+ if (!check_id_error(ctx, X509_V_ERR_IP_ADDRESS_MISMATCH))
+ return 0;
+ }
return 1;
-#else
+}
+
+static int check_trust(X509_STORE_CTX *ctx)
+{
int i, ok;
- X509 *x;
+ X509 *x = NULL;
int (*cb) (int xok, X509_STORE_CTX *xctx);
cb = ctx->verify_cb;
-/* For now just check the last certificate in the chain */
- i = sk_X509_num(ctx->chain) - 1;
- x = sk_X509_value(ctx->chain, i);
- ok = X509_check_trust(x, ctx->param->trust, 0);
- if (ok == X509_TRUST_TRUSTED)
- return 1;
- ctx->error_depth = i;
- ctx->current_cert = x;
- if (ok == X509_TRUST_REJECTED)
- ctx->error = X509_V_ERR_CERT_REJECTED;
- else
- ctx->error = X509_V_ERR_CERT_UNTRUSTED;
- ok = cb(0, ctx);
- return ok;
-#endif
+ /* Check all trusted certificates in chain */
+ for (i = ctx->last_untrusted; i < sk_X509_num(ctx->chain); i++) {
+ x = sk_X509_value(ctx->chain, i);
+ ok = X509_check_trust(x, ctx->param->trust, 0);
+ /* If explicitly trusted return trusted */
+ if (ok == X509_TRUST_TRUSTED)
+ return X509_TRUST_TRUSTED;
+ /*
+ * If explicitly rejected notify callback and reject if not
+ * overridden.
+ */
+ if (ok == X509_TRUST_REJECTED) {
+ ctx->error_depth = i;
+ ctx->current_cert = x;
+ ctx->error = X509_V_ERR_CERT_REJECTED;
+ ok = cb(0, ctx);
+ if (!ok)
+ return X509_TRUST_REJECTED;
+ }
+ }
+ /*
+ * If we accept partial chains and have at least one trusted certificate
+ * return success.
+ */
+ if (ctx->param->flags & X509_V_FLAG_PARTIAL_CHAIN) {
+ X509 *mx;
+ if (ctx->last_untrusted < sk_X509_num(ctx->chain))
+ return X509_TRUST_TRUSTED;
+ x = sk_X509_value(ctx->chain, 0);
+ mx = lookup_cert_match(ctx, x);
+ if (mx) {
+ (void)sk_X509_set(ctx->chain, 0, mx);
+ X509_free(x);
+ ctx->last_untrusted = 0;
+ return X509_TRUST_TRUSTED;
+ }
+ }
+
+ /*
+ * If there are no trusted certs in the chain at all, return untrusted and
+ * allow the standard errors (no issuer cert etc.) to be indicated.
+ */
+ return X509_TRUST_UNTRUSTED;
}
static int check_revocation(X509_STORE_CTX *ctx)
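
Together with the rewritten check_trust() above, X509_V_FLAG_PARTIAL_CHAIN (declared in x509_vfy.h later in this patch) lets verification succeed as soon as any chain element is found in the trusted store. A hedged enabling sketch, assuming the X509_STORE_CTX_get0_param() accessor:

#include <openssl/x509_vfy.h>

/* Sketch: accept chains anchored at any trusted certificate, not only at
 * a self-signed root. */
static void allow_partial(X509_STORE_CTX *ctx)
{
    X509_VERIFY_PARAM *param = X509_STORE_CTX_get0_param(ctx);

    X509_VERIFY_PARAM_set_flags(param, X509_V_FLAG_PARTIAL_CHAIN);
}
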
@@ -1409,6 +1547,14 @@ static int check_crl(X509_STORE_CTX *ctx, X509_CRL *crl)
if (!ok)
goto err;
} else {
+ int rv;
+ rv = X509_CRL_check_suiteb(crl, ikey, ctx->param->flags);
+ if (rv != X509_V_OK) {
+ ctx->error = rv;
+ ok = ctx->verify_cb(0, ctx);
+ if (!ok)
+ goto err;
+ }
/* Verify CRL signature */
if (X509_CRL_verify(crl, ikey) <= 0) {
ctx->error = X509_V_ERR_CRL_SIGNATURE_FAILURE;
@@ -1565,6 +1711,10 @@ static int internal_verify(X509_STORE_CTX *ctx)
if (ctx->check_issued(ctx, xi, xi))
xs = xi;
else {
+ if (ctx->param->flags & X509_V_FLAG_PARTIAL_CHAIN) {
+ xs = xi;
+ goto check_cert;
+ }
if (n <= 0) {
ctx->error = X509_V_ERR_UNABLE_TO_VERIFY_LEAF_SIGNATURE;
ctx->current_cert = xi;
@@ -1610,6 +1760,7 @@ static int internal_verify(X509_STORE_CTX *ctx)
xs->valid = 1;
+ check_cert:
ok = check_cert_time(ctx, xs);
if (!ok)
goto end;
@@ -1824,6 +1975,114 @@ int X509_get_pubkey_parameters(EVP_PKEY *pkey, STACK_OF(X509) *chain)
return 1;
}
+/* Make a delta CRL as the diff between two full CRLs */
+
+X509_CRL *X509_CRL_diff(X509_CRL *base, X509_CRL *newer,
+ EVP_PKEY *skey, const EVP_MD *md, unsigned int flags)
+{
+ X509_CRL *crl = NULL;
+ int i;
+ STACK_OF(X509_REVOKED) *revs = NULL;
+ /* CRLs can't be delta already */
+ if (base->base_crl_number || newer->base_crl_number) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_CRL_ALREADY_DELTA);
+ return NULL;
+ }
+ /* Base and new CRL must have a CRL number */
+ if (!base->crl_number || !newer->crl_number) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_NO_CRL_NUMBER);
+ return NULL;
+ }
+ /* Issuer names must match */
+ if (X509_NAME_cmp(X509_CRL_get_issuer(base), X509_CRL_get_issuer(newer))) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_ISSUER_MISMATCH);
+ return NULL;
+ }
+ /* AKID and IDP must match */
+ if (!crl_extension_match(base, newer, NID_authority_key_identifier)) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_AKID_MISMATCH);
+ return NULL;
+ }
+ if (!crl_extension_match(base, newer, NID_issuing_distribution_point)) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_IDP_MISMATCH);
+ return NULL;
+ }
+ /* The newer CRL number must exceed the base CRL number */
+ if (ASN1_INTEGER_cmp(newer->crl_number, base->crl_number) <= 0) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_NEWER_CRL_NOT_NEWER);
+ return NULL;
+ }
+ /* CRLs must verify */
+ if (skey && (X509_CRL_verify(base, skey) <= 0 ||
+ X509_CRL_verify(newer, skey) <= 0)) {
+ X509err(X509_F_X509_CRL_DIFF, X509_R_CRL_VERIFY_FAILURE);
+ return NULL;
+ }
+ /* Create new CRL */
+ crl = X509_CRL_new();
+ if (!crl || !X509_CRL_set_version(crl, 1))
+ goto memerr;
+ /* Set issuer name */
+ if (!X509_CRL_set_issuer_name(crl, X509_CRL_get_issuer(newer)))
+ goto memerr;
+
+ if (!X509_CRL_set_lastUpdate(crl, X509_CRL_get_lastUpdate(newer)))
+ goto memerr;
+ if (!X509_CRL_set_nextUpdate(crl, X509_CRL_get_nextUpdate(newer)))
+ goto memerr;
+
+ /* Set base CRL number: must be critical */
+
+ if (!X509_CRL_add1_ext_i2d(crl, NID_delta_crl, base->crl_number, 1, 0))
+ goto memerr;
+
+ /*
+ * Copy extensions across from the newer CRL to the delta: this will set
+ * the CRL number to the correct value too.
+ */
+
+ for (i = 0; i < X509_CRL_get_ext_count(newer); i++) {
+ X509_EXTENSION *ext;
+ ext = X509_CRL_get_ext(newer, i);
+ if (!X509_CRL_add_ext(crl, ext, -1))
+ goto memerr;
+ }
+
+ /* Go through revoked entries, copying as needed */
+
+ revs = X509_CRL_get_REVOKED(newer);
+
+ for (i = 0; i < sk_X509_REVOKED_num(revs); i++) {
+ X509_REVOKED *rvn, *rvtmp;
+ rvn = sk_X509_REVOKED_value(revs, i);
+ /*
+ * Add only if not also in base. TODO: need something cleverer here
+ * for some more complex CRLs covering multiple CAs.
+ */
+ if (!X509_CRL_get0_by_serial(base, &rvtmp, rvn->serialNumber)) {
+ rvtmp = X509_REVOKED_dup(rvn);
+ if (!rvtmp)
+ goto memerr;
+ if (!X509_CRL_add0_revoked(crl, rvtmp)) {
+ X509_REVOKED_free(rvtmp);
+ goto memerr;
+ }
+ }
+ }
+ /* TODO: optionally prune deleted entries */
+
+ if (skey && md && !X509_CRL_sign(crl, skey, md))
+ goto memerr;
+
+ return crl;
+
+ memerr:
+ X509err(X509_F_X509_CRL_DIFF, ERR_R_MALLOC_FAILURE);
+ if (crl)
+ X509_CRL_free(crl);
+ return NULL;
+}
+
int X509_STORE_CTX_get_ex_new_index(long argl, void *argp,
CRYPTO_EX_new *new_func,
CRYPTO_EX_dup *dup_func,
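
A hedged usage sketch for X509_CRL_diff(): base and newer are two full CRLs from the same issuer, and passing a key plus digest yields a signed delta (the digest choice here is illustrative):

#include <openssl/evp.h>
#include <openssl/x509.h>

static X509_CRL *make_delta(X509_CRL *base, X509_CRL *newer, EVP_PKEY *skey)
{
    /* NULL on issuer/AKID/IDP mismatch, missing CRL numbers, a non-newer
     * CRL number, a verification failure or an allocation failure. */
    return X509_CRL_diff(base, newer, skey, EVP_sha256(), 0);
}
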
@@ -1874,16 +2133,9 @@ STACK_OF(X509) *X509_STORE_CTX_get_chain(X509_STORE_CTX *ctx)
STACK_OF(X509) *X509_STORE_CTX_get1_chain(X509_STORE_CTX *ctx)
{
- int i;
- X509 *x;
- STACK_OF(X509) *chain;
- if (!ctx->chain || !(chain = sk_X509_dup(ctx->chain)))
+ if (!ctx->chain)
return NULL;
- for (i = 0; i < sk_X509_num(chain); i++) {
- x = sk_X509_value(chain, i);
- CRYPTO_add(&x->references, 1, CRYPTO_LOCK_X509);
- }
- return chain;
+ return X509_chain_up_ref(ctx->chain);
}
X509 *X509_STORE_CTX_get0_current_issuer(X509_STORE_CTX *ctx)
diff --git a/crypto/x509/x509_vfy.h b/crypto/x509/x509_vfy.h
index aacdf55aa279..bd8613c62ba4 100644
--- a/crypto/x509/x509_vfy.h
+++ b/crypto/x509/x509_vfy.h
@@ -156,6 +156,8 @@ typedef struct x509_lookup_method_st {
X509_OBJECT *ret);
} X509_LOOKUP_METHOD;
+typedef struct X509_VERIFY_PARAM_ID_st X509_VERIFY_PARAM_ID;
+
/*
* This structure hold all parameters associated with a verify operation by
* including an X509_VERIFY_PARAM structure in related structures the
@@ -171,6 +173,7 @@ typedef struct X509_VERIFY_PARAM_st {
int trust; /* trust setting to check */
int depth; /* Verify depth */
STACK_OF(ASN1_OBJECT) *policies; /* Permissible policies */
+ X509_VERIFY_PARAM_ID *id; /* opaque ID data */
} X509_VERIFY_PARAM;
DECLARE_STACK_OF(X509_VERIFY_PARAM)
@@ -370,6 +373,19 @@ void X509_STORE_CTX_set_depth(X509_STORE_CTX *ctx, int depth);
# define X509_V_ERR_UNSUPPORTED_NAME_SYNTAX 53
# define X509_V_ERR_CRL_PATH_VALIDATION_ERROR 54
+/* Suite B mode algorithm violation */
+# define X509_V_ERR_SUITE_B_INVALID_VERSION 56
+# define X509_V_ERR_SUITE_B_INVALID_ALGORITHM 57
+# define X509_V_ERR_SUITE_B_INVALID_CURVE 58
+# define X509_V_ERR_SUITE_B_INVALID_SIGNATURE_ALGORITHM 59
+# define X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED 60
+# define X509_V_ERR_SUITE_B_CANNOT_SIGN_P_384_WITH_P_256 61
+
+/* Host, email and IP check errors */
+# define X509_V_ERR_HOSTNAME_MISMATCH 62
+# define X509_V_ERR_EMAIL_MISMATCH 63
+# define X509_V_ERR_IP_ADDRESS_MISMATCH 64
+
/* The application is not happy */
# define X509_V_ERR_APPLICATION_VERIFICATION 50
@@ -405,10 +421,21 @@ void X509_STORE_CTX_set_depth(X509_STORE_CTX *ctx, int depth);
# define X509_V_FLAG_USE_DELTAS 0x2000
/* Check selfsigned CA signature */
# define X509_V_FLAG_CHECK_SS_SIGNATURE 0x4000
+/* Use trusted store first */
+# define X509_V_FLAG_TRUSTED_FIRST 0x8000
+/* Suite B 128 bit only mode: not normally used */
+# define X509_V_FLAG_SUITEB_128_LOS_ONLY 0x10000
+/* Suite B 192 bit only mode */
+# define X509_V_FLAG_SUITEB_192_LOS 0x20000
+/* Suite B 128 bit mode allowing 192 bit algorithms */
+# define X509_V_FLAG_SUITEB_128_LOS 0x30000
+
+/* Allow partial chains if at least one certificate is in trusted store */
+# define X509_V_FLAG_PARTIAL_CHAIN 0x80000
/*
* If the initial chain is not trusted, do not attempt to build an alternative
- * chain. Alternate chain checking was introduced in 1.0.1n/1.0.2b. Setting
- * this flag will force the behaviour to match that of previous versions.
+ * chain. Alternate chain checking was introduced in 1.0.2b. Setting this flag
+ * will force the behaviour to match that of previous versions.
*/
# define X509_V_FLAG_NO_ALT_CHAINS 0x100000
@@ -445,6 +472,11 @@ int X509_STORE_set1_param(X509_STORE *ctx, X509_VERIFY_PARAM *pm);
void X509_STORE_set_verify_cb(X509_STORE *ctx,
int (*verify_cb) (int, X509_STORE_CTX *));
+void X509_STORE_set_lookup_crls_cb(X509_STORE *ctx,
+ STACK_OF(X509_CRL) *(*cb) (X509_STORE_CTX
+ *ctx,
+ X509_NAME *nm));
+
X509_STORE_CTX *X509_STORE_CTX_new(void);
int X509_STORE_CTX_get1_issuer(X509 **issuer, X509_STORE_CTX *ctx, X509 *x);
@@ -455,6 +487,8 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store,
void X509_STORE_CTX_trusted_stack(X509_STORE_CTX *ctx, STACK_OF(X509) *sk);
void X509_STORE_CTX_cleanup(X509_STORE_CTX *ctx);
+X509_STORE *X509_STORE_CTX_get0_store(X509_STORE_CTX *ctx);
+
X509_LOOKUP *X509_STORE_add_lookup(X509_STORE *v, X509_LOOKUP_METHOD *m);
X509_LOOKUP_METHOD *X509_LOOKUP_hash_dir(void);
@@ -552,9 +586,27 @@ int X509_VERIFY_PARAM_add0_policy(X509_VERIFY_PARAM *param,
ASN1_OBJECT *policy);
int X509_VERIFY_PARAM_set1_policies(X509_VERIFY_PARAM *param,
STACK_OF(ASN1_OBJECT) *policies);
+
+int X509_VERIFY_PARAM_set1_host(X509_VERIFY_PARAM *param,
+ const char *name, size_t namelen);
+int X509_VERIFY_PARAM_add1_host(X509_VERIFY_PARAM *param,
+ const char *name, size_t namelen);
+void X509_VERIFY_PARAM_set_hostflags(X509_VERIFY_PARAM *param,
+ unsigned int flags);
+char *X509_VERIFY_PARAM_get0_peername(X509_VERIFY_PARAM *);
+int X509_VERIFY_PARAM_set1_email(X509_VERIFY_PARAM *param,
+ const char *email, size_t emaillen);
+int X509_VERIFY_PARAM_set1_ip(X509_VERIFY_PARAM *param,
+ const unsigned char *ip, size_t iplen);
+int X509_VERIFY_PARAM_set1_ip_asc(X509_VERIFY_PARAM *param,
+ const char *ipasc);
+
int X509_VERIFY_PARAM_get_depth(const X509_VERIFY_PARAM *param);
+const char *X509_VERIFY_PARAM_get0_name(const X509_VERIFY_PARAM *param);
int X509_VERIFY_PARAM_add0_table(X509_VERIFY_PARAM *param);
+int X509_VERIFY_PARAM_get_count(void);
+const X509_VERIFY_PARAM *X509_VERIFY_PARAM_get0(int id);
const X509_VERIFY_PARAM *X509_VERIFY_PARAM_lookup(const char *name);
void X509_VERIFY_PARAM_table_cleanup(void);
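
The identity setters compose; a hedged sketch pinning both an email address and an IP on a fresh parameter set (the values are placeholders):

#include <openssl/x509_vfy.h>

static X509_VERIFY_PARAM *pin_identity(void)
{
    X509_VERIFY_PARAM *vpm = X509_VERIFY_PARAM_new();

    if (vpm == NULL
        || !X509_VERIFY_PARAM_set1_email(vpm, "user@example.com", 0)
        || !X509_VERIFY_PARAM_set1_ip_asc(vpm, "192.0.2.1")) {
        X509_VERIFY_PARAM_free(vpm);
        return NULL;
    }
    return vpm;
}
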
diff --git a/crypto/x509/x509_vpm.c b/crypto/x509/x509_vpm.c
index 6b0bf8a6e69e..1ea0c69f5743 100644
--- a/crypto/x509/x509_vpm.c
+++ b/crypto/x509/x509_vpm.c
@@ -66,10 +66,73 @@
#include <openssl/x509.h>
#include <openssl/x509v3.h>
+#include "vpm_int.h"
+
/* X509_VERIFY_PARAM functions */
+#define SET_HOST 0
+#define ADD_HOST 1
+
+static char *str_copy(const char *s)
+{
+ return OPENSSL_strdup(s);
+}
+
+static void str_free(char *s)
+{
+ OPENSSL_free(s);
+}
+
+#define string_stack_free(sk) sk_OPENSSL_STRING_pop_free(sk, str_free)
+
+static int int_x509_param_set_hosts(X509_VERIFY_PARAM_ID *id, int mode,
+ const char *name, size_t namelen)
+{
+ char *copy;
+
+ /*
+ * Refuse names with embedded NUL bytes, except perhaps as final byte.
+ * XXX: Do we need to push an error onto the error stack?
+ */
+ if (namelen == 0)
+ namelen = name ? strlen(name) : 0;
+ else if (name && memchr(name, '\0', namelen > 1 ? namelen - 1 : namelen))
+ return 0;
+ if (name && namelen > 0 && name[namelen - 1] == '\0')
+ --namelen;
+
+ if (mode == SET_HOST && id->hosts) {
+ string_stack_free(id->hosts);
+ id->hosts = NULL;
+ }
+ if (name == NULL || namelen == 0)
+ return 1;
+
+ copy = BUF_strndup(name, namelen);
+ if (copy == NULL)
+ return 0;
+
+ if (id->hosts == NULL &&
+ (id->hosts = sk_OPENSSL_STRING_new_null()) == NULL) {
+ OPENSSL_free(copy);
+ return 0;
+ }
+
+ if (!sk_OPENSSL_STRING_push(id->hosts, copy)) {
+ OPENSSL_free(copy);
+ if (sk_OPENSSL_STRING_num(id->hosts) == 0) {
+ sk_OPENSSL_STRING_free(id->hosts);
+ id->hosts = NULL;
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
static void x509_verify_param_zero(X509_VERIFY_PARAM *param)
{
+ X509_VERIFY_PARAM_ID *paramid;
if (!param)
return;
param->name = NULL;
@@ -85,15 +148,42 @@ static void x509_verify_param_zero(X509_VERIFY_PARAM *param)
sk_ASN1_OBJECT_pop_free(param->policies, ASN1_OBJECT_free);
param->policies = NULL;
}
+ paramid = param->id;
+ if (paramid->hosts) {
+ string_stack_free(paramid->hosts);
+ paramid->hosts = NULL;
+ }
+ if (paramid->peername)
+ OPENSSL_free(paramid->peername);
+ if (paramid->email) {
+ OPENSSL_free(paramid->email);
+ paramid->email = NULL;
+ paramid->emaillen = 0;
+ }
+ if (paramid->ip) {
+ OPENSSL_free(paramid->ip);
+ paramid->ip = NULL;
+ paramid->iplen = 0;
+ }
+
}
X509_VERIFY_PARAM *X509_VERIFY_PARAM_new(void)
{
X509_VERIFY_PARAM *param;
- param = OPENSSL_malloc(sizeof(X509_VERIFY_PARAM));
+ X509_VERIFY_PARAM_ID *paramid;
+
+ param = OPENSSL_malloc(sizeof *param);
if (!param)
return NULL;
- memset(param, 0, sizeof(X509_VERIFY_PARAM));
+ paramid = OPENSSL_malloc(sizeof *paramid);
+ if (!paramid) {
+ OPENSSL_free(param);
+ return NULL;
+ }
+ memset(param, 0, sizeof *param);
+ memset(paramid, 0, sizeof *paramid);
+ param->id = paramid;
x509_verify_param_zero(param);
return param;
}
@@ -103,6 +193,7 @@ void X509_VERIFY_PARAM_free(X509_VERIFY_PARAM *param)
if (param == NULL)
return;
x509_verify_param_zero(param);
+ OPENSSL_free(param->id);
OPENSSL_free(param);
}
@@ -144,6 +235,11 @@ void X509_VERIFY_PARAM_free(X509_VERIFY_PARAM *param)
(to_overwrite || \
((src->field != def) && (to_default || (dest->field == def))))
+/* As above but for ID fields */
+
+#define test_x509_verify_param_copy_id(idf, def) \
+ test_x509_verify_param_copy(id->idf, def)
+
/* Macro to test and copy a field if necessary */
#define x509_verify_param_copy(field, def) \
@@ -155,8 +251,10 @@ int X509_VERIFY_PARAM_inherit(X509_VERIFY_PARAM *dest,
{
unsigned long inh_flags;
int to_default, to_overwrite;
+ X509_VERIFY_PARAM_ID *id;
if (!src)
return 1;
+ id = src->id;
inh_flags = dest->inh_flags | src->inh_flags;
if (inh_flags & X509_VP_FLAG_ONCE)
@@ -197,6 +295,31 @@ int X509_VERIFY_PARAM_inherit(X509_VERIFY_PARAM *dest,
return 0;
}
+ /* Copy the host flags if and only if we're copying the host list */
+ if (test_x509_verify_param_copy_id(hosts, NULL)) {
+ if (dest->id->hosts) {
+ string_stack_free(dest->id->hosts);
+ dest->id->hosts = NULL;
+ }
+ if (id->hosts) {
+ dest->id->hosts =
+ sk_OPENSSL_STRING_deep_copy(id->hosts, str_copy, str_free);
+ if (dest->id->hosts == NULL)
+ return 0;
+ dest->id->hostflags = id->hostflags;
+ }
+ }
+
+ if (test_x509_verify_param_copy_id(email, NULL)) {
+ if (!X509_VERIFY_PARAM_set1_email(dest, id->email, id->emaillen))
+ return 0;
+ }
+
+ if (test_x509_verify_param_copy_id(ip, NULL)) {
+ if (!X509_VERIFY_PARAM_set1_ip(dest, id->ip, id->iplen))
+ return 0;
+ }
+
return 1;
}
@@ -211,6 +334,30 @@ int X509_VERIFY_PARAM_set1(X509_VERIFY_PARAM *to,
return ret;
}
+static int int_x509_param_set1(char **pdest, size_t *pdestlen,
+ const char *src, size_t srclen)
+{
+ void *tmp;
+ if (src) {
+ if (srclen == 0) {
+ tmp = BUF_strdup(src);
+ srclen = strlen(src);
+ } else
+ tmp = BUF_memdup(src, srclen);
+ if (!tmp)
+ return 0;
+ } else {
+ tmp = NULL;
+ srclen = 0;
+ }
+ if (*pdest)
+ OPENSSL_free(*pdest);
+ *pdest = tmp;
+ if (pdestlen)
+ *pdestlen = srclen;
+ return 1;
+}
+
int X509_VERIFY_PARAM_set1_name(X509_VERIFY_PARAM *param, const char *name)
{
if (param->name)
@@ -308,11 +455,70 @@ int X509_VERIFY_PARAM_set1_policies(X509_VERIFY_PARAM *param,
return 1;
}
+int X509_VERIFY_PARAM_set1_host(X509_VERIFY_PARAM *param,
+ const char *name, size_t namelen)
+{
+ return int_x509_param_set_hosts(param->id, SET_HOST, name, namelen);
+}
+
+int X509_VERIFY_PARAM_add1_host(X509_VERIFY_PARAM *param,
+ const char *name, size_t namelen)
+{
+ return int_x509_param_set_hosts(param->id, ADD_HOST, name, namelen);
+}
+
+void X509_VERIFY_PARAM_set_hostflags(X509_VERIFY_PARAM *param,
+ unsigned int flags)
+{
+ param->id->hostflags = flags;
+}
+
+char *X509_VERIFY_PARAM_get0_peername(X509_VERIFY_PARAM *param)
+{
+ return param->id->peername;
+}
+
+int X509_VERIFY_PARAM_set1_email(X509_VERIFY_PARAM *param,
+ const char *email, size_t emaillen)
+{
+ return int_x509_param_set1(&param->id->email, &param->id->emaillen,
+ email, emaillen);
+}
+
+int X509_VERIFY_PARAM_set1_ip(X509_VERIFY_PARAM *param,
+ const unsigned char *ip, size_t iplen)
+{
+ if (iplen != 0 && iplen != 4 && iplen != 16)
+ return 0;
+ return int_x509_param_set1((char **)&param->id->ip, &param->id->iplen,
+ (char *)ip, iplen);
+}
+
+int X509_VERIFY_PARAM_set1_ip_asc(X509_VERIFY_PARAM *param, const char *ipasc)
+{
+ unsigned char ipout[16];
+ size_t iplen;
+
+ iplen = (size_t)a2i_ipadd(ipout, ipasc);
+ if (iplen == 0)
+ return 0;
+ return X509_VERIFY_PARAM_set1_ip(param, ipout, iplen);
+}
+
int X509_VERIFY_PARAM_get_depth(const X509_VERIFY_PARAM *param)
{
return param->depth;
}
+const char *X509_VERIFY_PARAM_get0_name(const X509_VERIFY_PARAM *param)
+{
+ return param->name;
+}
+
+static X509_VERIFY_PARAM_ID _empty_id = { NULL, 0U, NULL, NULL, 0, NULL, 0 };
+
+#define vpm_empty_id (X509_VERIFY_PARAM_ID *)&_empty_id
+
/*
* Default verify parameters: these are used for various applications and can
* be overridden by the user specified table. NB: the 'name' field *must* be
@@ -328,8 +534,8 @@ static const X509_VERIFY_PARAM default_table[] = {
0, /* purpose */
0, /* trust */
100, /* depth */
- NULL /* policies */
- },
+ NULL, /* policies */
+ vpm_empty_id},
{
"pkcs7", /* S/MIME sign parameters */
0, /* Check time */
@@ -338,8 +544,8 @@ static const X509_VERIFY_PARAM default_table[] = {
X509_PURPOSE_SMIME_SIGN, /* purpose */
X509_TRUST_EMAIL, /* trust */
-1, /* depth */
- NULL /* policies */
- },
+ NULL, /* policies */
+ vpm_empty_id},
{
"smime_sign", /* S/MIME sign parameters */
0, /* Check time */
@@ -348,8 +554,8 @@ static const X509_VERIFY_PARAM default_table[] = {
X509_PURPOSE_SMIME_SIGN, /* purpose */
X509_TRUST_EMAIL, /* trust */
-1, /* depth */
- NULL /* policies */
- },
+ NULL, /* policies */
+ vpm_empty_id},
{
"ssl_client", /* SSL/TLS client parameters */
0, /* Check time */
@@ -358,8 +564,8 @@ static const X509_VERIFY_PARAM default_table[] = {
X509_PURPOSE_SSL_CLIENT, /* purpose */
X509_TRUST_SSL_CLIENT, /* trust */
-1, /* depth */
- NULL /* policies */
- },
+ NULL, /* policies */
+ vpm_empty_id},
{
"ssl_server", /* SSL/TLS server parameters */
0, /* Check time */
@@ -368,8 +574,8 @@ static const X509_VERIFY_PARAM default_table[] = {
X509_PURPOSE_SSL_SERVER, /* purpose */
X509_TRUST_SSL_SERVER, /* trust */
-1, /* depth */
- NULL /* policies */
- }
+ NULL, /* policies */
+ vpm_empty_id}
};
static STACK_OF(X509_VERIFY_PARAM) *param_table = NULL;
@@ -409,6 +615,22 @@ int X509_VERIFY_PARAM_add0_table(X509_VERIFY_PARAM *param)
return 1;
}
+int X509_VERIFY_PARAM_get_count(void)
+{
+ int num = sizeof(default_table) / sizeof(X509_VERIFY_PARAM);
+ if (param_table)
+ num += sk_X509_VERIFY_PARAM_num(param_table);
+ return num;
+}
+
+const X509_VERIFY_PARAM *X509_VERIFY_PARAM_get0(int id)
+{
+ int num = sizeof(default_table) / sizeof(X509_VERIFY_PARAM);
+ if (id < num)
+ return default_table + id;
+ return sk_X509_VERIFY_PARAM_value(param_table, id - num);
+}
+
const X509_VERIFY_PARAM *X509_VERIFY_PARAM_lookup(const char *name)
{
int idx;
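
X509_VERIFY_PARAM_get_count() and X509_VERIFY_PARAM_get0() expose the builtin table followed by any application-added entries; a sketch enumerating them:

#include <stdio.h>
#include <openssl/x509_vfy.h>

static void list_params(void)
{
    int i, n = X509_VERIFY_PARAM_get_count();

    for (i = 0; i < n; i++) {
        const X509_VERIFY_PARAM *p = X509_VERIFY_PARAM_get0(i);
        printf("%s\n", X509_VERIFY_PARAM_get0_name(p));
    }
}
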
diff --git a/crypto/x509/x_all.c b/crypto/x509/x_all.c
index 43152e933fa1..0f26c546d835 100644
--- a/crypto/x509/x_all.c
+++ b/crypto/x509/x_all.c
@@ -63,6 +63,7 @@
#include <openssl/asn1.h>
#include <openssl/evp.h>
#include <openssl/x509.h>
+#include <openssl/ocsp.h>
#ifndef OPENSSL_NO_RSA
# include <openssl/rsa.h>
#endif
@@ -105,6 +106,12 @@ int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx)
x->sig_alg, x->signature, x->cert_info, ctx);
}
+int X509_http_nbio(OCSP_REQ_CTX *rctx, X509 **pcert)
+{
+ return OCSP_REQ_CTX_nbio_d2i(rctx,
+ (ASN1_VALUE **)pcert, ASN1_ITEM_rptr(X509));
+}
+
int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md)
{
return (ASN1_item_sign(ASN1_ITEM_rptr(X509_REQ_INFO), x->sig_alg, NULL,
@@ -133,6 +140,13 @@ int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx)
x->crl, ctx);
}
+int X509_CRL_http_nbio(OCSP_REQ_CTX *rctx, X509_CRL **pcrl)
+{
+ return OCSP_REQ_CTX_nbio_d2i(rctx,
+ (ASN1_VALUE **)pcrl,
+ ASN1_ITEM_rptr(X509_CRL));
+}
+
int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md)
{
return (ASN1_item_sign(ASN1_ITEM_rptr(NETSCAPE_SPKAC), x->sig_algor, NULL,
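
X509_http_nbio() and X509_CRL_http_nbio() reuse the OCSP_REQ_CTX machinery for non-blocking HTTP retrieval. A hedged sketch, assuming rctx was set up over a non-blocking BIO and that these calls follow the usual OCSP_REQ_CTX convention (1 = done, 0 = error, -1 = retry once the BIO is ready):

#include <openssl/ocsp.h>
#include <openssl/x509.h>

static X509_CRL *fetch_crl(OCSP_REQ_CTX *rctx)
{
    X509_CRL *crl = NULL;
    int rv;

    do {
        rv = X509_CRL_http_nbio(rctx, &crl);
        /* on -1, poll/select on the underlying BIO before retrying */
    } while (rv == -1);

    return rv == 1 ? crl : NULL;
}
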
diff --git a/crypto/x509v3/Makefile b/crypto/x509v3/Makefile
index 05125aba2782..9791b77a0765 100644
--- a/crypto/x509v3/Makefile
+++ b/crypto/x509v3/Makefile
@@ -13,7 +13,7 @@ AR= ar r
CFLAGS= $(INCLUDES) $(CFLAG)
GENERAL=Makefile README
-TEST=
+TEST=v3nametest.c
APPS=
LIB=$(TOP)/libcrypto.a
@@ -22,13 +22,13 @@ v3_prn.c v3_utl.c v3err.c v3_genn.c v3_alt.c v3_skey.c v3_akey.c v3_pku.c \
v3_int.c v3_enum.c v3_sxnet.c v3_cpols.c v3_crld.c v3_purp.c v3_info.c \
v3_ocsp.c v3_akeya.c v3_pmaps.c v3_pcons.c v3_ncons.c v3_pcia.c v3_pci.c \
pcy_cache.c pcy_node.c pcy_data.c pcy_map.c pcy_tree.c pcy_lib.c \
-v3_asid.c v3_addr.c
+v3_asid.c v3_addr.c v3_scts.c
LIBOBJ= v3_bcons.o v3_bitst.o v3_conf.o v3_extku.o v3_ia5.o v3_lib.o \
v3_prn.o v3_utl.o v3err.o v3_genn.o v3_alt.o v3_skey.o v3_akey.o v3_pku.o \
v3_int.o v3_enum.o v3_sxnet.o v3_cpols.o v3_crld.o v3_purp.o v3_info.o \
v3_ocsp.o v3_akeya.o v3_pmaps.o v3_pcons.o v3_ncons.o v3_pcia.o v3_pci.o \
pcy_cache.o pcy_node.o pcy_data.o pcy_map.o pcy_tree.o pcy_lib.o \
-v3_asid.o v3_addr.o
+v3_asid.o v3_addr.o v3_scts.o
SRC= $(LIBSRC)
@@ -535,6 +535,20 @@ v3_purp.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
v3_purp.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
v3_purp.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
v3_purp.o: ../cryptlib.h v3_purp.c
+v3_scts.o: ../../e_os.h ../../include/openssl/asn1.h
+v3_scts.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
+v3_scts.o: ../../include/openssl/conf.h ../../include/openssl/crypto.h
+v3_scts.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h
+v3_scts.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h
+v3_scts.o: ../../include/openssl/err.h ../../include/openssl/evp.h
+v3_scts.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h
+v3_scts.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
+v3_scts.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+v3_scts.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
+v3_scts.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
+v3_scts.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
+v3_scts.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h
+v3_scts.o: ../cryptlib.h v3_scts.c
v3_skey.o: ../../e_os.h ../../include/openssl/asn1.h
v3_skey.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
v3_skey.o: ../../include/openssl/conf.h ../../include/openssl/crypto.h
diff --git a/crypto/x509v3/ext_dat.h b/crypto/x509v3/ext_dat.h
index 136b1f8ecfba..c3a6fce7524f 100644
--- a/crypto/x509v3/ext_dat.h
+++ b/crypto/x509v3/ext_dat.h
@@ -69,6 +69,7 @@ extern X509V3_EXT_METHOD v3_crl_hold, v3_pci;
extern X509V3_EXT_METHOD v3_policy_mappings, v3_policy_constraints;
extern X509V3_EXT_METHOD v3_name_constraints, v3_inhibit_anyp, v3_idp;
extern X509V3_EXT_METHOD v3_addr, v3_asid;
+extern X509V3_EXT_METHOD v3_ct_scts[];
/*
* This table will be searched using OBJ_bsearch so it *must* kept in order
@@ -126,6 +127,8 @@ static const X509V3_EXT_METHOD *standard_exts[] = {
&v3_idp,
&v3_alt[2],
&v3_freshest_crl,
+ &v3_ct_scts[0],
+ &v3_ct_scts[1],
};
/* Number of standard extensions */
diff --git a/crypto/x509v3/v3_lib.c b/crypto/x509v3/v3_lib.c
index b5598c9a3e95..8350429aafbe 100644
--- a/crypto/x509v3/v3_lib.c
+++ b/crypto/x509v3/v3_lib.c
@@ -122,6 +122,28 @@ const X509V3_EXT_METHOD *X509V3_EXT_get(X509_EXTENSION *ext)
return X509V3_EXT_get_nid(nid);
}
+int X509V3_EXT_free(int nid, void *ext_data)
+{
+ const X509V3_EXT_METHOD *ext_method = X509V3_EXT_get_nid(nid);
+ if (ext_method == NULL) {
+ X509V3err(X509V3_F_X509V3_EXT_FREE,
+ X509V3_R_CANNOT_FIND_FREE_FUNCTION);
+ return 0;
+ }
+
+ if (ext_method->it != NULL)
+ ASN1_item_free(ext_data, ASN1_ITEM_ptr(ext_method->it));
+ else if (ext_method->ext_free != NULL)
+ ext_method->ext_free(ext_data);
+ else {
+ X509V3err(X509V3_F_X509V3_EXT_FREE,
+ X509V3_R_CANNOT_FIND_FREE_FUNCTION);
+ return 0;
+ }
+
+ return 1;
+}
+
int X509V3_EXT_add_list(X509V3_EXT_METHOD *extlist)
{
for (; extlist->ext_nid != -1; extlist++)
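
X509V3_EXT_free() is the symmetric counterpart to X509V3_EXT_d2i(): it looks up the method table entry for the extension's NID and dispatches to its ASN1 item or ext_free routine. Sketch:

#include <openssl/objects.h>
#include <openssl/x509v3.h>

static void parse_and_drop(X509_EXTENSION *ext)
{
    int nid = OBJ_obj2nid(X509_EXTENSION_get_object(ext));
    void *data = X509V3_EXT_d2i(ext);

    if (data != NULL && !X509V3_EXT_free(nid, data)) {
        /* no free routine registered for this nid; data is leaked here */
    }
}
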
diff --git a/crypto/x509v3/v3_purp.c b/crypto/x509v3/v3_purp.c
index 0d9bc58ce75a..36b0d87a0d8b 100644
--- a/crypto/x509v3/v3_purp.c
+++ b/crypto/x509v3/v3_purp.c
@@ -395,9 +395,6 @@ static void x509v3_cache_extensions(X509 *x)
#ifndef OPENSSL_NO_SHA
X509_digest(x, EVP_sha1(), x->sha1_hash, NULL);
#endif
- /* Does subject name match issuer ? */
- if (!X509_NAME_cmp(X509_get_subject_name(x), X509_get_issuer_name(x)))
- x->ex_flags |= EXFLAG_SI;
/* V1 should mean no extensions ... */
if (!X509_get_version(x))
x->ex_flags |= EXFLAG_V1;
@@ -479,6 +476,10 @@ static void x509v3_cache_extensions(X509 *x)
case NID_dvcs:
x->ex_xkusage |= XKU_DVCS;
break;
+
+ case NID_anyExtendedKeyUsage:
+ x->ex_xkusage |= XKU_ANYEKU;
+ break;
}
}
sk_ASN1_OBJECT_pop_free(extusage, ASN1_OBJECT_free);
@@ -494,6 +495,13 @@ static void x509v3_cache_extensions(X509 *x)
}
x->skid = X509_get_ext_d2i(x, NID_subject_key_identifier, NULL, NULL);
x->akid = X509_get_ext_d2i(x, NID_authority_key_identifier, NULL, NULL);
+ /* Does subject name match issuer ? */
+ if (!X509_NAME_cmp(X509_get_subject_name(x), X509_get_issuer_name(x))) {
+ x->ex_flags |= EXFLAG_SI;
+ /* If SKID matches AKID also indicate self signed */
+ if (X509_check_akid(x, x->akid) == X509_V_OK)
+ x->ex_flags |= EXFLAG_SS;
+ }
x->altname = X509_get_ext_d2i(x, NID_subject_alt_name, NULL, NULL);
x->nc = X509_get_ext_d2i(x, NID_name_constraints, &i, NULL);
if (!x->nc && (i != -1))
@@ -598,8 +606,8 @@ static int check_purpose_ssl_client(const X509_PURPOSE *xp, const X509 *x,
return 0;
if (ca)
return check_ssl_ca(x);
- /* We need to do digital signatures with it */
- if (ku_reject(x, KU_DIGITAL_SIGNATURE))
+ /* We need to do digital signatures or key agreement */
+ if (ku_reject(x, KU_DIGITAL_SIGNATURE | KU_KEY_AGREEMENT))
return 0;
/* nsCertType if present should allow SSL client use */
if (ns_reject(x, NS_SSL_CLIENT))
@@ -607,6 +615,14 @@ static int check_purpose_ssl_client(const X509_PURPOSE *xp, const X509 *x,
return 1;
}
+/*
+ * Key usage needed for TLS/SSL server: digital signature, encipherment or
+ * key agreement. The ssl code can check this more thoroughly for individual
+ * key types.
+ */
+#define KU_TLS \
+ KU_DIGITAL_SIGNATURE|KU_KEY_ENCIPHERMENT|KU_KEY_AGREEMENT
+
static int check_purpose_ssl_server(const X509_PURPOSE *xp, const X509 *x,
int ca)
{
@@ -617,8 +633,7 @@ static int check_purpose_ssl_server(const X509_PURPOSE *xp, const X509 *x,
if (ns_reject(x, NS_SSL_SERVER))
return 0;
- /* Now as for keyUsage: we'll at least need to sign OR encipher */
- if (ku_reject(x, KU_DIGITAL_SIGNATURE | KU_KEY_ENCIPHERMENT))
+ if (ku_reject(x, KU_TLS))
return 0;
return 1;
diff --git a/crypto/x509v3/v3_scts.c b/crypto/x509v3/v3_scts.c
new file mode 100644
index 000000000000..6e0b8d6844c8
--- /dev/null
+++ b/crypto/x509v3/v3_scts.c
@@ -0,0 +1,332 @@
+/* v3_scts.c */
+/*
+ * Written by Rob Stradling (rob@comodo.com) for the OpenSSL project 2014.
+ */
+/* ====================================================================
+ * Copyright (c) 2014 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/x509v3.h>
+
+/* Signature and hash algorithms from RFC 5246 */
+#define TLSEXT_hash_sha256 4
+
+#define TLSEXT_signature_rsa 1
+#define TLSEXT_signature_ecdsa 3
+
+
+#define n2s(c,s) ((s=(((unsigned int)(c[0]))<< 8)| \
+ (((unsigned int)(c[1])) )),c+=2)
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+# define SCT_TIMESTAMP unsigned __int64
+#elif defined(__arch64__)
+# define SCT_TIMESTAMP unsigned long
+#else
+# define SCT_TIMESTAMP unsigned long long
+#endif
+
+#define n2l8(c,l) (l =((SCT_TIMESTAMP)(*((c)++)))<<56, \
+ l|=((SCT_TIMESTAMP)(*((c)++)))<<48, \
+ l|=((SCT_TIMESTAMP)(*((c)++)))<<40, \
+ l|=((SCT_TIMESTAMP)(*((c)++)))<<32, \
+ l|=((SCT_TIMESTAMP)(*((c)++)))<<24, \
+ l|=((SCT_TIMESTAMP)(*((c)++)))<<16, \
+ l|=((SCT_TIMESTAMP)(*((c)++)))<< 8, \
+ l|=((SCT_TIMESTAMP)(*((c)++))))
+
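
[Editor's note] A hedged illustration of the n2l8() macro above (the wrapper and its OPENSSL_assert() self-check are for exposition only): eight network-order bytes become one 64-bit value, and the cursor is advanced as a side effect.

    static void n2l8_example(void)
    {
        static const unsigned char buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        const unsigned char *c = buf;
        SCT_TIMESTAMP ts;

        n2l8(c, ts);    /* reads big-endian, increments c eight times */
        OPENSSL_assert(ts == 0x0001020304050607 && c == buf + 8);
    }
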
+typedef struct SCT_st {
+ /* The encoded SCT */
+ unsigned char *sct;
+ unsigned short sctlen;
+ /*
+ * Components of the SCT. "logid", "ext" and "sig" point to addresses
+ * inside "sct".
+ */
+ unsigned char version;
+ unsigned char *logid;
+ unsigned short logidlen;
+ SCT_TIMESTAMP timestamp;
+ unsigned char *ext;
+ unsigned short extlen;
+ unsigned char hash_alg;
+ unsigned char sig_alg;
+ unsigned char *sig;
+ unsigned short siglen;
+} SCT;
+
+DECLARE_STACK_OF(SCT)
+
+static void SCT_LIST_free(STACK_OF(SCT) *a);
+static STACK_OF(SCT) *d2i_SCT_LIST(STACK_OF(SCT) **a,
+ const unsigned char **pp, long length);
+static int i2r_SCT_LIST(X509V3_EXT_METHOD *method, STACK_OF(SCT) *sct_list,
+ BIO *out, int indent);
+
+const X509V3_EXT_METHOD v3_ct_scts[] = {
+ {NID_ct_precert_scts, 0, NULL,
+ 0, (X509V3_EXT_FREE)SCT_LIST_free,
+ (X509V3_EXT_D2I)d2i_SCT_LIST, 0,
+ 0, 0, 0, 0,
+ (X509V3_EXT_I2R)i2r_SCT_LIST, 0,
+ NULL},
+
+ {NID_ct_cert_scts, 0, NULL,
+ 0, (X509V3_EXT_FREE)SCT_LIST_free,
+ (X509V3_EXT_D2I)d2i_SCT_LIST, 0,
+ 0, 0, 0, 0,
+ (X509V3_EXT_I2R)i2r_SCT_LIST, 0,
+ NULL},
+};
+
+static void tls12_signature_print(BIO *out, const unsigned char hash_alg,
+ const unsigned char sig_alg)
+{
+ int nid = NID_undef;
+ /* RFC6962 only permits two signature algorithms */
+ if (hash_alg == TLSEXT_hash_sha256) {
+ if (sig_alg == TLSEXT_signature_rsa)
+ nid = NID_sha256WithRSAEncryption;
+ else if (sig_alg == TLSEXT_signature_ecdsa)
+ nid = NID_ecdsa_with_SHA256;
+ }
+ if (nid == NID_undef)
+ BIO_printf(out, "%02X%02X", hash_alg, sig_alg);
+ else
+ BIO_printf(out, "%s", OBJ_nid2ln(nid));
+}
+
+static void timestamp_print(BIO *out, SCT_TIMESTAMP timestamp)
+{
+ ASN1_GENERALIZEDTIME *gen;
+ char genstr[20];
+ gen = ASN1_GENERALIZEDTIME_new();
+ ASN1_GENERALIZEDTIME_adj(gen, (time_t)0,
+ (int)(timestamp / 86400000),
+ (timestamp % 86400000) / 1000);
+ /*
+ * Note GeneralizedTime from ASN1_GENERALIZEDTIME_adj is always 15
+ * characters long with a final Z. Update it with fractional seconds.
+ */
+ BIO_snprintf(genstr, sizeof(genstr), "%.14s.%03dZ",
+ ASN1_STRING_data(gen), (unsigned int)(timestamp % 1000));
+ ASN1_GENERALIZEDTIME_set_string(gen, genstr);
+ ASN1_GENERALIZEDTIME_print(out, gen);
+ ASN1_GENERALIZEDTIME_free(gen);
+}
+
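
[Editor's note] The timestamp is milliseconds since the Unix epoch; timestamp_print() splits it into the whole-day and in-day-second offsets that ASN1_GENERALIZEDTIME_adj() expects, keeping the millisecond remainder for the ".%03d" suffix. A hedged worked example with an arbitrary value:

    static void timestamp_split_example(void)
    {
        SCT_TIMESTAMP ts = 1234567890123ULL;        /* illustrative only */

        OPENSSL_assert(ts / 86400000 == 14288);           /* offset_day */
        OPENSSL_assert((ts % 86400000) / 1000 == 84690);  /* offset_sec */
        OPENSSL_assert(ts % 1000 == 123);                 /* ".123" */
    }
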
+static void SCT_free(SCT *sct)
+{
+ if (sct) {
+ if (sct->sct)
+ OPENSSL_free(sct->sct);
+ OPENSSL_free(sct);
+ }
+}
+
+static void SCT_LIST_free(STACK_OF(SCT) *a)
+{
+ sk_SCT_pop_free(a, SCT_free);
+}
+
+static STACK_OF(SCT) *d2i_SCT_LIST(STACK_OF(SCT) **a,
+ const unsigned char **pp, long length)
+{
+ ASN1_OCTET_STRING *oct = NULL;
+ STACK_OF(SCT) *sk = NULL;
+ SCT *sct;
+ unsigned char *p, *p2;
+ unsigned short listlen, sctlen = 0, fieldlen;
+
+ if (d2i_ASN1_OCTET_STRING(&oct, pp, length) == NULL)
+ return NULL;
+ if (oct->length < 2)
+ goto done;
+ p = oct->data;
+ n2s(p, listlen);
+ if (listlen != oct->length - 2)
+ goto done;
+
+ if ((sk = sk_SCT_new_null()) == NULL)
+ goto done;
+
+ while (listlen > 0) {
+ if (listlen < 2)
+ goto err;
+ n2s(p, sctlen);
+ listlen -= 2;
+
+ if ((sctlen < 1) || (sctlen > listlen))
+ goto err;
+ listlen -= sctlen;
+
+ sct = OPENSSL_malloc(sizeof(SCT));
+ if (!sct)
+ goto err;
+ if (!sk_SCT_push(sk, sct)) {
+ OPENSSL_free(sct);
+ goto err;
+ }
+
+ sct->sct = OPENSSL_malloc(sctlen);
+ if (!sct->sct)
+ goto err;
+ memcpy(sct->sct, p, sctlen);
+ sct->sctlen = sctlen;
+ p += sctlen;
+ p2 = sct->sct;
+
+ sct->version = *p2++;
+ if (sct->version == 0) { /* SCT v1 */
+ /*-
+ * Fixed-length header:
+ * struct {
+ * (1 byte) Version sct_version;
+ * (32 bytes) LogID id;
+ * (8 bytes) uint64 timestamp;
+ * (2 bytes + ?) CtExtensions extensions;
+ * };
+ */
+ if (sctlen < 43)
+ goto err;
+ sctlen -= 43;
+
+ sct->logid = p2;
+ sct->logidlen = 32;
+ p2 += 32;
+
+ n2l8(p2, sct->timestamp);
+
+ n2s(p2, fieldlen);
+ if (sctlen < fieldlen)
+ goto err;
+ sct->ext = p2;
+ sct->extlen = fieldlen;
+ p2 += fieldlen;
+ sctlen -= fieldlen;
+
+ /*-
+ * digitally-signed struct header:
+ * (1 byte) Hash algorithm
+ * (1 byte) Signature algorithm
+ * (2 bytes + ?) Signature
+ */
+ if (sctlen < 4)
+ goto err;
+ sctlen -= 4;
+
+ sct->hash_alg = *p2++;
+ sct->sig_alg = *p2++;
+ n2s(p2, fieldlen);
+ if (sctlen != fieldlen)
+ goto err;
+ sct->sig = p2;
+ sct->siglen = fieldlen;
+ }
+ }
+
+ done:
+ ASN1_OCTET_STRING_free(oct);
+ return sk;
+
+ err:
+ SCT_LIST_free(sk);
+ sk = NULL;
+ goto done;
+}
+
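
[Editor's note] For reference, the smallest byte string this parser accepts once the outer DER OCTET STRING has been stripped: a hand-built, illustrative example (all-zero LogID, empty extensions, and an empty signature purely for brevity), not real CT log output.

    static const unsigned char sct_list_example[] = {
        0x00, 0x31,         /* list length: 49 bytes follow             */
        0x00, 0x2f,         /* length of this SCT: 47 bytes             */
        0x00,               /* sct_version = v1                         */
        /* 32-byte LogID (all zero, illustrative) */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 8-byte uint64 timestamp, ms since the epoch */
        0x00, 0x00, 0x01, 0x3d, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00,         /* extensions length = 0                    */
        0x04, 0x03,         /* SHA-256 / ECDSA (RFC 5246 code points)   */
        0x00, 0x00          /* signature length = 0 (brevity only)      */
    };
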
+static int i2r_SCT_LIST(X509V3_EXT_METHOD *method, STACK_OF(SCT) *sct_list,
+ BIO *out, int indent)
+{
+ SCT *sct;
+ int i;
+
+ for (i = 0; i < sk_SCT_num(sct_list);) {
+ sct = sk_SCT_value(sct_list, i);
+
+ BIO_printf(out, "%*sSigned Certificate Timestamp:", indent, "");
+ BIO_printf(out, "\n%*sVersion : ", indent + 4, "");
+
+ if (sct->version == 0) { /* SCT v1 */
+ BIO_printf(out, "v1(0)");
+
+ BIO_printf(out, "\n%*sLog ID : ", indent + 4, "");
+ BIO_hex_string(out, indent + 16, 16, sct->logid, sct->logidlen);
+
+ BIO_printf(out, "\n%*sTimestamp : ", indent + 4, "");
+ timestamp_print(out, sct->timestamp);
+
+ BIO_printf(out, "\n%*sExtensions: ", indent + 4, "");
+ if (sct->extlen == 0)
+ BIO_printf(out, "none");
+ else
+ BIO_hex_string(out, indent + 16, 16, sct->ext, sct->extlen);
+
+ BIO_printf(out, "\n%*sSignature : ", indent + 4, "");
+ tls12_signature_print(out, sct->hash_alg, sct->sig_alg);
+ BIO_printf(out, "\n%*s ", indent + 4, "");
+ BIO_hex_string(out, indent + 16, 16, sct->sig, sct->siglen);
+ } else { /* Unknown version */
+
+ BIO_printf(out, "unknown\n%*s", indent + 16, "");
+ BIO_hex_string(out, indent + 16, 16, sct->sct, sct->sctlen);
+ }
+
+ if (++i < sk_SCT_num(sct_list))
+ BIO_printf(out, "\n");
+ }
+
+ return 1;
+}
diff --git a/crypto/x509v3/v3_utl.c b/crypto/x509v3/v3_utl.c
index 94aaebba3e3f..bdd7b95f4570 100644
--- a/crypto/x509v3/v3_utl.c
+++ b/crypto/x509v3/v3_utl.c
@@ -632,6 +632,433 @@ void X509_email_free(STACK_OF(OPENSSL_STRING) *sk)
sk_OPENSSL_STRING_pop_free(sk, str_free);
}
+typedef int (*equal_fn) (const unsigned char *pattern, size_t pattern_len,
+ const unsigned char *subject, size_t subject_len,
+ unsigned int flags);
+
+/* Skip pattern prefix to match "wildcard" subject */
+static void skip_prefix(const unsigned char **p, size_t *plen,
+ const unsigned char *subject, size_t subject_len,
+ unsigned int flags)
+{
+ const unsigned char *pattern = *p;
+ size_t pattern_len = *plen;
+
+ /*
+ * If subject starts with a leading '.' followed by more octets, and
+ * pattern is longer, compare just an equal-length suffix with the
+ * full subject (starting at the '.'), provided the prefix contains
+ * no NULs.
+ */
+ if ((flags & _X509_CHECK_FLAG_DOT_SUBDOMAINS) == 0)
+ return;
+
+ while (pattern_len > subject_len && *pattern) {
+ if ((flags & X509_CHECK_FLAG_SINGLE_LABEL_SUBDOMAINS) &&
+ *pattern == '.')
+ break;
+ ++pattern;
+ --pattern_len;
+ }
+
+ /* Skip if entire prefix acceptable */
+ if (pattern_len == subject_len) {
+ *p = pattern;
+ *plen = pattern_len;
+ }
+}
+
+/* Compare, ignoring case of ASCII characters. */
+static int equal_nocase(const unsigned char *pattern, size_t pattern_len,
+ const unsigned char *subject, size_t subject_len,
+ unsigned int flags)
+{
+ skip_prefix(&pattern, &pattern_len, subject, subject_len, flags);
+ if (pattern_len != subject_len)
+ return 0;
+ while (pattern_len) {
+ unsigned char l = *pattern;
+ unsigned char r = *subject;
+ /* The pattern must not contain NUL characters. */
+ if (l == 0)
+ return 0;
+ if (l != r) {
+ if ('A' <= l && l <= 'Z')
+ l = (l - 'A') + 'a';
+ if ('A' <= r && r <= 'Z')
+ r = (r - 'A') + 'a';
+ if (l != r)
+ return 0;
+ }
+ ++pattern;
+ ++subject;
+ --pattern_len;
+ }
+ return 1;
+}
+
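
[Editor's note] A hedged self-check of the prefix-skipping behaviour (the wrapper is illustrative; equal_nocase() above applies skip_prefix() first): the longer pattern "www.example.com" is reduced to its equal-length suffix before being compared with the sub-domain reference ".example.com".

    static void subdomain_match_example(void)
    {
        /* With the internal dot-sub-domain flag, the pattern is cut down
         * to an equal-length suffix before the case-insensitive compare. */
        OPENSSL_assert(equal_nocase((const unsigned char *)"www.example.com", 15,
                                    (const unsigned char *)".example.com", 12,
                                    _X509_CHECK_FLAG_DOT_SUBDOMAINS) == 1);
    }
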
+/* Compare using memcmp. */
+static int equal_case(const unsigned char *pattern, size_t pattern_len,
+ const unsigned char *subject, size_t subject_len,
+ unsigned int flags)
+{
+ skip_prefix(&pattern, &pattern_len, subject, subject_len, flags);
+ if (pattern_len != subject_len)
+ return 0;
+ return !memcmp(pattern, subject, pattern_len);
+}
+
+/*
+ * RFC 5280, section 7.5, requires that only the domain is compared in a
+ * case-insensitive manner.
+ */
+static int equal_email(const unsigned char *a, size_t a_len,
+ const unsigned char *b, size_t b_len,
+ unsigned int unused_flags)
+{
+ size_t i = a_len;
+ if (a_len != b_len)
+ return 0;
+ /*
+ * We search backwards for the '@' character, so that we do not have to
+ * deal with quoted local-parts. The domain part is compared in a
+ * case-insensitive manner.
+ */
+ while (i > 0) {
+ --i;
+ if (a[i] == '@' || b[i] == '@') {
+ if (!equal_nocase(a + i, a_len - i, b + i, a_len - i, 0))
+ return 0;
+ break;
+ }
+ }
+ if (i == 0)
+ i = a_len;
+ return equal_case(a, i, b, i, 0);
+}
+
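
[Editor's note] Hedged examples of the resulting comparison semantics (the wrapper and direct calls are for exposition only): the local part is compared exactly, the domain case-insensitively.

    static void equal_email_examples(void)
    {
        const unsigned char a[] = "postmaster@EXAMPLE.COM";
        const unsigned char b[] = "postmaster@example.com";
        const unsigned char c[] = "Postmaster@example.com";

        OPENSSL_assert(equal_email(a, 22, b, 22, 0) == 1); /* domain case ok */
        OPENSSL_assert(equal_email(c, 22, b, 22, 0) == 0); /* local part differs */
    }
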
+/*
+ * Compare the prefix and suffix with the subject, and check that the
+ * characters in-between are valid.
+ */
+static int wildcard_match(const unsigned char *prefix, size_t prefix_len,
+ const unsigned char *suffix, size_t suffix_len,
+ const unsigned char *subject, size_t subject_len,
+ unsigned int flags)
+{
+ const unsigned char *wildcard_start;
+ const unsigned char *wildcard_end;
+ const unsigned char *p;
+ int allow_multi = 0;
+ int allow_idna = 0;
+
+ if (subject_len < prefix_len + suffix_len)
+ return 0;
+ if (!equal_nocase(prefix, prefix_len, subject, prefix_len, flags))
+ return 0;
+ wildcard_start = subject + prefix_len;
+ wildcard_end = subject + (subject_len - suffix_len);
+ if (!equal_nocase(wildcard_end, suffix_len, suffix, suffix_len, flags))
+ return 0;
+ /*
+ * If the wildcard makes up the entire first label, it must match at
+ * least one character.
+ */
+ if (prefix_len == 0 && *suffix == '.') {
+ if (wildcard_start == wildcard_end)
+ return 0;
+ allow_idna = 1;
+ if (flags & X509_CHECK_FLAG_MULTI_LABEL_WILDCARDS)
+ allow_multi = 1;
+ }
+ /* IDNA labels cannot match partial wildcards */
+ if (!allow_idna &&
+ subject_len >= 4 && strncasecmp((char *)subject, "xn--", 4) == 0)
+ return 0;
+ /* The wildcard may match a literal '*' */
+ if (wildcard_end == wildcard_start + 1 && *wildcard_start == '*')
+ return 1;
+ /*
+ * Check that the part matched by the wildcard contains only
+ * permitted characters and only matches a single label unless
+ * allow_multi is set.
+ */
+ for (p = wildcard_start; p != wildcard_end; ++p)
+ if (!(('0' <= *p && *p <= '9') ||
+ ('A' <= *p && *p <= 'Z') ||
+ ('a' <= *p && *p <= 'z') ||
+ *p == '-' || (allow_multi && *p == '.')))
+ return 0;
+ return 1;
+}
+
+#define LABEL_START (1 << 0)
+#define LABEL_END (1 << 1)
+#define LABEL_HYPHEN (1 << 2)
+#define LABEL_IDNA (1 << 3)
+
+static const unsigned char *valid_star(const unsigned char *p, size_t len,
+ unsigned int flags)
+{
+ const unsigned char *star = 0;
+ size_t i;
+ int state = LABEL_START;
+ int dots = 0;
+ for (i = 0; i < len; ++i) {
+ /*
+ * Locate the first and only legal wildcard, either at the start or
+ * the end of a non-IDNA label that is the first, but not the final,
+ * label.
+ */
+ if (p[i] == '*') {
+ int atstart = (state & LABEL_START);
+ int atend = (i == len - 1 || p[i + 1] == '.');
+ /*-
+ * At most one wildcard per pattern.
+ * No wildcards in IDNA labels.
+ * No wildcards after the first label.
+ */
+ if (star != NULL || (state & LABEL_IDNA) != 0 || dots)
+ return NULL;
+ /* Only full-label '*.example.com' wildcards? */
+ if ((flags & X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS)
+ && (!atstart || !atend))
+ return NULL;
+ /* No 'foo*bar' wildcards */
+ if (!atstart && !atend)
+ return NULL;
+ star = &p[i];
+ state &= ~LABEL_START;
+ } else if (('a' <= p[i] && p[i] <= 'z')
+ || ('A' <= p[i] && p[i] <= 'Z')
+ || ('0' <= p[i] && p[i] <= '9')) {
+ if ((state & LABEL_START) != 0
+ && len - i >= 4 && strncasecmp((char *)&p[i], "xn--", 4) == 0)
+ state |= LABEL_IDNA;
+ state &= ~(LABEL_HYPHEN | LABEL_START);
+ } else if (p[i] == '.') {
+ if ((state & (LABEL_HYPHEN | LABEL_START)) != 0)
+ return NULL;
+ state = LABEL_START;
+ ++dots;
+ } else if (p[i] == '-') {
+ if ((state & LABEL_HYPHEN) != 0)
+ return NULL;
+ state |= LABEL_HYPHEN;
+ } else
+ return NULL;
+ }
+
+ /*
+ * The final label must not end in a hyphen or ".", and
+ * there must be at least two dots after the star.
+ */
+ if ((state & (LABEL_START | LABEL_HYPHEN)) != 0 || dots < 2)
+ return NULL;
+ return star;
+}
+
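
[Editor's note] Hedged examples of what valid_star() accepts under default flags (the helper exists only for exposition):

    static void valid_star_examples(void)
    {
        static const unsigned char p1[] = "*.example.com";
        static const unsigned char p2[] = "w*.example.com";
        static const unsigned char p3[] = "*.com";
        static const unsigned char p4[] = "a.*.example.com";

        OPENSSL_assert(valid_star(p1, sizeof(p1) - 1, 0) == &p1[0]); /* ok */
        OPENSSL_assert(valid_star(p2, sizeof(p2) - 1, 0) == &p2[1]); /* partial ok */
        OPENSSL_assert(valid_star(p3, sizeof(p3) - 1, 0) == NULL);   /* one dot */
        OPENSSL_assert(valid_star(p4, sizeof(p4) - 1, 0) == NULL);   /* not 1st label */
    }
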
+/* Compare using wildcards. */
+static int equal_wildcard(const unsigned char *pattern, size_t pattern_len,
+ const unsigned char *subject, size_t subject_len,
+ unsigned int flags)
+{
+ const unsigned char *star = NULL;
+
+ /*
+ * Subject names starting with '.' can only match a wildcard pattern
+ * via a subject sub-domain pattern suffix match.
+ */
+ if (!(subject_len > 1 && subject[0] == '.'))
+ star = valid_star(pattern, pattern_len, flags);
+ if (star == NULL)
+ return equal_nocase(pattern, pattern_len,
+ subject, subject_len, flags);
+ return wildcard_match(pattern, star - pattern,
+ star + 1, (pattern + pattern_len) - star - 1,
+ subject, subject_len, flags);
+}
+
+/*
+ * Compare an ASN1_STRING to a supplied string. If they match, return 1. If
+ * cmp_type > 0, only compare if the string matches the type; otherwise
+ * convert it to UTF-8.
+ */
+
+static int do_check_string(ASN1_STRING *a, int cmp_type, equal_fn equal,
+ unsigned int flags, const char *b, size_t blen,
+ char **peername)
+{
+ int rv = 0;
+
+ if (!a->data || !a->length)
+ return 0;
+ if (cmp_type > 0) {
+ if (cmp_type != a->type)
+ return 0;
+ if (cmp_type == V_ASN1_IA5STRING)
+ rv = equal(a->data, a->length, (unsigned char *)b, blen, flags);
+ else if (a->length == (int)blen && !memcmp(a->data, b, blen))
+ rv = 1;
+ if (rv > 0 && peername)
+ *peername = BUF_strndup((char *)a->data, a->length);
+ } else {
+ int astrlen;
+ unsigned char *astr;
+ astrlen = ASN1_STRING_to_UTF8(&astr, a);
+ if (astrlen < 0) {
+ /*
+ * -1 could be an internal malloc failure or a decoding error from
+ * malformed input; we can't distinguish.
+ */
+ return -1;
+ }
+ rv = equal(astr, astrlen, (unsigned char *)b, blen, flags);
+ if (rv > 0 && peername)
+ *peername = BUF_strndup((char *)astr, astrlen);
+ OPENSSL_free(astr);
+ }
+ return rv;
+}
+
+static int do_x509_check(X509 *x, const char *chk, size_t chklen,
+ unsigned int flags, int check_type, char **peername)
+{
+ GENERAL_NAMES *gens = NULL;
+ X509_NAME *name = NULL;
+ int i;
+ int cnid;
+ int alt_type;
+ int san_present = 0;
+ int rv = 0;
+ equal_fn equal;
+
+ /* See below, this flag is internal-only */
+ flags &= ~_X509_CHECK_FLAG_DOT_SUBDOMAINS;
+ if (check_type == GEN_EMAIL) {
+ cnid = NID_pkcs9_emailAddress;
+ alt_type = V_ASN1_IA5STRING;
+ equal = equal_email;
+ } else if (check_type == GEN_DNS) {
+ cnid = NID_commonName;
+ /* Implicit client-side DNS sub-domain pattern */
+ if (chklen > 1 && chk[0] == '.')
+ flags |= _X509_CHECK_FLAG_DOT_SUBDOMAINS;
+ alt_type = V_ASN1_IA5STRING;
+ if (flags & X509_CHECK_FLAG_NO_WILDCARDS)
+ equal = equal_nocase;
+ else
+ equal = equal_wildcard;
+ } else {
+ cnid = 0;
+ alt_type = V_ASN1_OCTET_STRING;
+ equal = equal_case;
+ }
+
+ if (chklen == 0)
+ chklen = strlen(chk);
+
+ gens = X509_get_ext_d2i(x, NID_subject_alt_name, NULL, NULL);
+ if (gens) {
+ for (i = 0; i < sk_GENERAL_NAME_num(gens); i++) {
+ GENERAL_NAME *gen;
+ ASN1_STRING *cstr;
+ gen = sk_GENERAL_NAME_value(gens, i);
+ if (gen->type != check_type)
+ continue;
+ san_present = 1;
+ if (check_type == GEN_EMAIL)
+ cstr = gen->d.rfc822Name;
+ else if (check_type == GEN_DNS)
+ cstr = gen->d.dNSName;
+ else
+ cstr = gen->d.iPAddress;
+ /* Positive on success, negative on error! */
+ if ((rv = do_check_string(cstr, alt_type, equal, flags,
+ chk, chklen, peername)) != 0)
+ break;
+ }
+ GENERAL_NAMES_free(gens);
+ if (rv != 0)
+ return rv;
+ if (!cnid
+ || (san_present
+ && !(flags & X509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT)))
+ return 0;
+ }
+ i = -1;
+ name = X509_get_subject_name(x);
+ while ((i = X509_NAME_get_index_by_NID(name, cnid, i)) >= 0) {
+ X509_NAME_ENTRY *ne;
+ ASN1_STRING *str;
+ ne = X509_NAME_get_entry(name, i);
+ str = X509_NAME_ENTRY_get_data(ne);
+ /* Positive on success, negative on error! */
+ if ((rv = do_check_string(str, -1, equal, flags,
+ chk, chklen, peername)) != 0)
+ return rv;
+ }
+ return 0;
+}
+
+int X509_check_host(X509 *x, const char *chk, size_t chklen,
+ unsigned int flags, char **peername)
+{
+ if (chk == NULL)
+ return -2;
+ /*
+ * Embedded NULs are disallowed, except as the last character of a
+ * string of length 2 or more (tolerate caller including terminating
+ * NUL in string length).
+ */
+ if (chklen == 0)
+ chklen = strlen(chk);
+ else if (memchr(chk, '\0', chklen > 1 ? chklen - 1 : chklen))
+ return -2;
+ if (chklen > 1 && chk[chklen - 1] == '\0')
+ --chklen;
+ return do_x509_check(x, chk, chklen, flags, GEN_DNS, peername);
+}
+
+int X509_check_email(X509 *x, const char *chk, size_t chklen,
+ unsigned int flags)
+{
+ if (chk == NULL)
+ return -2;
+ /*
+ * Embedded NULs are disallowed, except as the last character of a
+ * string of length 2 or more (tolerate caller including terminating
+ * NUL in string length).
+ */
+ if (chklen == 0)
+ chklen = strlen(chk);
+ else if (memchr(chk, '\0', chklen > 1 ? chklen - 1 : chklen))
+ return -2;
+ if (chklen > 1 && chk[chklen - 1] == '\0')
+ --chklen;
+ return do_x509_check(x, chk, chklen, flags, GEN_EMAIL, NULL);
+}
+
+int X509_check_ip(X509 *x, const unsigned char *chk, size_t chklen,
+ unsigned int flags)
+{
+ if (chk == NULL)
+ return -2;
+ return do_x509_check(x, (char *)chk, chklen, flags, GEN_IPADD, NULL);
+}
+
+int X509_check_ip_asc(X509 *x, const char *ipasc, unsigned int flags)
+{
+ unsigned char ipout[16];
+ size_t iplen;
+
+ if (ipasc == NULL)
+ return -2;
+ iplen = (size_t)a2i_ipadd(ipout, ipasc);
+ if (iplen == 0)
+ return -2;
+ return do_x509_check(x, (char *)ipout, iplen, flags, GEN_IPADD, NULL);
+}
+
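
[Editor's note] A hedged usage sketch of the new public checker (verify_peer_name() and the literal hostname are illustrative; the certificate is assumed to come from, e.g., SSL_get_peer_certificate()). Return values: 1 match, 0 no match, negative on error or malformed input.

    static int verify_peer_name(X509 *cert)
    {
        char *peername = NULL;
        int rv = X509_check_host(cert, "www.example.com", 0,
                                 X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS,
                                 &peername);

        if (rv == 1) {
            if (peername != NULL)
                OPENSSL_free(peername);   /* the pattern that matched */
            return 1;
        }
        return 0;   /* treat "no match" and errors alike */
    }
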
/*
* Convert IP addresses both IPv4 and IPv6 into an OCTET STRING compatible
* with RFC3280.
diff --git a/crypto/x509v3/v3err.c b/crypto/x509v3/v3err.c
index 0138f7a8462a..bcc1be722e42 100644
--- a/crypto/x509v3/v3err.c
+++ b/crypto/x509v3/v3err.c
@@ -1,6 +1,6 @@
/* crypto/x509v3/v3err.c */
/* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2014 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,7 +70,7 @@
# define ERR_REASON(reason) ERR_PACK(ERR_LIB_X509V3,0,reason)
static ERR_STRING_DATA X509V3_str_functs[] = {
- {ERR_FUNC(X509V3_F_A2I_GENERAL_NAME), "A2I_GENERAL_NAME"},
+ {ERR_FUNC(X509V3_F_A2I_GENERAL_NAME), "a2i_GENERAL_NAME"},
{ERR_FUNC(X509V3_F_ASIDENTIFIERCHOICE_CANONIZE),
"ASIDENTIFIERCHOICE_CANONIZE"},
{ERR_FUNC(X509V3_F_ASIDENTIFIERCHOICE_IS_CANONICAL),
@@ -132,6 +132,7 @@ static ERR_STRING_DATA X509V3_str_functs[] = {
{ERR_FUNC(X509V3_F_X509V3_EXT_ADD), "X509V3_EXT_add"},
{ERR_FUNC(X509V3_F_X509V3_EXT_ADD_ALIAS), "X509V3_EXT_add_alias"},
{ERR_FUNC(X509V3_F_X509V3_EXT_CONF), "X509V3_EXT_conf"},
+ {ERR_FUNC(X509V3_F_X509V3_EXT_FREE), "X509V3_EXT_free"},
{ERR_FUNC(X509V3_F_X509V3_EXT_I2D), "X509V3_EXT_i2d"},
{ERR_FUNC(X509V3_F_X509V3_EXT_NCONF), "X509V3_EXT_nconf"},
{ERR_FUNC(X509V3_F_X509V3_GET_SECTION), "X509V3_get_section"},
@@ -149,6 +150,8 @@ static ERR_STRING_DATA X509V3_str_reasons[] = {
{ERR_REASON(X509V3_R_BN_DEC2BN_ERROR), "bn dec2bn error"},
{ERR_REASON(X509V3_R_BN_TO_ASN1_INTEGER_ERROR),
"bn to asn1 integer error"},
+ {ERR_REASON(X509V3_R_CANNOT_FIND_FREE_FUNCTION),
+ "cannot find free function"},
{ERR_REASON(X509V3_R_DIRNAME_ERROR), "dirname error"},
{ERR_REASON(X509V3_R_DISTPOINT_ALREADY_SET), "distpoint already set"},
{ERR_REASON(X509V3_R_DUPLICATE_ZONE_ID), "duplicate zone id"},
@@ -167,7 +170,6 @@ static ERR_STRING_DATA X509V3_str_reasons[] = {
{ERR_REASON(X509V3_R_ILLEGAL_HEX_DIGIT), "illegal hex digit"},
{ERR_REASON(X509V3_R_INCORRECT_POLICY_SYNTAX_TAG),
"incorrect policy syntax tag"},
- {ERR_REASON(X509V3_R_INVALID_MULTIPLE_RDNS), "invalid multiple rdns"},
{ERR_REASON(X509V3_R_INVALID_ASNUMBER), "invalid asnumber"},
{ERR_REASON(X509V3_R_INVALID_ASRANGE), "invalid asrange"},
{ERR_REASON(X509V3_R_INVALID_BOOLEAN_STRING), "invalid boolean string"},
@@ -175,6 +177,7 @@ static ERR_STRING_DATA X509V3_str_reasons[] = {
"invalid extension string"},
{ERR_REASON(X509V3_R_INVALID_INHERITANCE), "invalid inheritance"},
{ERR_REASON(X509V3_R_INVALID_IPADDRESS), "invalid ipaddress"},
+ {ERR_REASON(X509V3_R_INVALID_MULTIPLE_RDNS), "invalid multiple rdns"},
{ERR_REASON(X509V3_R_INVALID_NAME), "invalid name"},
{ERR_REASON(X509V3_R_INVALID_NULL_ARGUMENT), "invalid null argument"},
{ERR_REASON(X509V3_R_INVALID_NULL_NAME), "invalid null name"},
diff --git a/crypto/x509v3/v3nametest.c b/crypto/x509v3/v3nametest.c
new file mode 100644
index 000000000000..7b5c1c8e5127
--- /dev/null
+++ b/crypto/x509v3/v3nametest.c
@@ -0,0 +1,336 @@
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include "../e_os.h"
+#include <string.h>
+
+static const char *const names[] = {
+ "a", "b", ".", "*", "@",
+ ".a", "a.", ".b", "b.", ".*", "*.", "*@", "@*", "a@", "@a", "b@", "..",
+ "@@", "**", "*.com", "*com", "*.*.com", "*com", "com*", "*example.com",
+ "*@example.com", "test@*.example.com", "example.com", "www.example.com",
+ "test.www.example.com", "*.example.com", "*.www.example.com",
+ "test.*.example.com", "www.*.com",
+ ".www.example.com", "*www.example.com",
+ "example.net", "xn--rger-koa.example.com",
+ "a.example.com", "b.example.com",
+ "postmaster@example.com", "Postmaster@example.com",
+ "postmaster@EXAMPLE.COM",
+ NULL
+};
+
+static const char *const exceptions[] = {
+ "set CN: host: [*.example.com] matches [a.example.com]",
+ "set CN: host: [*.example.com] matches [b.example.com]",
+ "set CN: host: [*.example.com] matches [www.example.com]",
+ "set CN: host: [*.example.com] matches [xn--rger-koa.example.com]",
+ "set CN: host: [*.www.example.com] matches [test.www.example.com]",
+ "set CN: host: [*.www.example.com] matches [.www.example.com]",
+ "set CN: host: [*www.example.com] matches [www.example.com]",
+ "set CN: host: [test.www.example.com] matches [.www.example.com]",
+ "set CN: host-no-wildcards: [*.www.example.com] matches [.www.example.com]",
+ "set CN: host-no-wildcards: [test.www.example.com] matches [.www.example.com]",
+ "set emailAddress: email: [postmaster@example.com] does not match [Postmaster@example.com]",
+ "set emailAddress: email: [postmaster@EXAMPLE.COM] does not match [Postmaster@example.com]",
+ "set emailAddress: email: [Postmaster@example.com] does not match [postmaster@example.com]",
+ "set emailAddress: email: [Postmaster@example.com] does not match [postmaster@EXAMPLE.COM]",
+ "set dnsName: host: [*.example.com] matches [www.example.com]",
+ "set dnsName: host: [*.example.com] matches [a.example.com]",
+ "set dnsName: host: [*.example.com] matches [b.example.com]",
+ "set dnsName: host: [*.example.com] matches [xn--rger-koa.example.com]",
+ "set dnsName: host: [*.www.example.com] matches [test.www.example.com]",
+ "set dnsName: host-no-wildcards: [*.www.example.com] matches [.www.example.com]",
+ "set dnsName: host-no-wildcards: [test.www.example.com] matches [.www.example.com]",
+ "set dnsName: host: [*.www.example.com] matches [.www.example.com]",
+ "set dnsName: host: [*www.example.com] matches [www.example.com]",
+ "set dnsName: host: [test.www.example.com] matches [.www.example.com]",
+ "set rfc822Name: email: [postmaster@example.com] does not match [Postmaster@example.com]",
+ "set rfc822Name: email: [Postmaster@example.com] does not match [postmaster@example.com]",
+ "set rfc822Name: email: [Postmaster@example.com] does not match [postmaster@EXAMPLE.COM]",
+ "set rfc822Name: email: [postmaster@EXAMPLE.COM] does not match [Postmaster@example.com]",
+ NULL
+};
+
+static int is_exception(const char *msg)
+{
+ const char *const *p;
+ for (p = exceptions; *p; ++p)
+ if (strcmp(msg, *p) == 0)
+ return 1;
+ return 0;
+}
+
+static int set_cn(X509 *crt, ...)
+{
+ int ret = 0;
+ X509_NAME *n = NULL;
+ va_list ap;
+ va_start(ap, crt);
+ n = X509_NAME_new();
+ if (n == NULL)
+ goto out;
+ while (1) {
+ int nid;
+ const char *name;
+ nid = va_arg(ap, int);
+ if (nid == 0)
+ break;
+ name = va_arg(ap, const char *);
+ if (!X509_NAME_add_entry_by_NID(n, nid, MBSTRING_ASC,
+ (unsigned char *)name, -1, -1, 1))
+ goto out;
+ }
+ if (!X509_set_subject_name(crt, n))
+ goto out;
+ ret = 1;
+ out:
+ X509_NAME_free(n);
+ va_end(ap);
+ return ret;
+}
+
+/*-
+int X509_add_ext(X509 *x, X509_EXTENSION *ex, int loc);
+X509_EXTENSION *X509_EXTENSION_create_by_NID(X509_EXTENSION **ex,
+ int nid, int crit, ASN1_OCTET_STRING *data);
+int X509_add_ext(X509 *x, X509_EXTENSION *ex, int loc);
+*/
+
+static int set_altname(X509 *crt, ...)
+{
+ int ret = 0;
+ GENERAL_NAMES *gens = NULL;
+ GENERAL_NAME *gen = NULL;
+ ASN1_IA5STRING *ia5 = NULL;
+ va_list ap;
+ va_start(ap, crt);
+ gens = sk_GENERAL_NAME_new_null();
+ if (gens == NULL)
+ goto out;
+ while (1) {
+ int type;
+ const char *name;
+ type = va_arg(ap, int);
+ if (type == 0)
+ break;
+ name = va_arg(ap, const char *);
+
+ gen = GENERAL_NAME_new();
+ if (gen == NULL)
+ goto out;
+ ia5 = ASN1_IA5STRING_new();
+ if (ia5 == NULL)
+ goto out;
+ if (!ASN1_STRING_set(ia5, name, -1))
+ goto out;
+ switch (type) {
+ case GEN_EMAIL:
+ case GEN_DNS:
+ GENERAL_NAME_set0_value(gen, type, ia5);
+ ia5 = NULL;
+ break;
+ default:
+ abort();
+ }
+ sk_GENERAL_NAME_push(gens, gen);
+ gen = NULL;
+ }
+ if (!X509_add1_ext_i2d(crt, NID_subject_alt_name, gens, 0, 0))
+ goto out;
+ ret = 1;
+ out:
+ ASN1_IA5STRING_free(ia5);
+ GENERAL_NAME_free(gen);
+ GENERAL_NAMES_free(gens);
+ va_end(ap);
+ return ret;
+}
+
+static int set_cn1(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_commonName, name, 0);
+}
+
+static int set_cn_and_email(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_commonName, name,
+ NID_pkcs9_emailAddress, "dummy@example.com", 0);
+}
+
+static int set_cn2(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_commonName, "dummy value",
+ NID_commonName, name, 0);
+}
+
+static int set_cn3(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_commonName, name,
+ NID_commonName, "dummy value", 0);
+}
+
+static int set_email1(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_pkcs9_emailAddress, name, 0);
+}
+
+static int set_email2(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_pkcs9_emailAddress, "dummy@example.com",
+ NID_pkcs9_emailAddress, name, 0);
+}
+
+static int set_email3(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_pkcs9_emailAddress, name,
+ NID_pkcs9_emailAddress, "dummy@example.com", 0);
+}
+
+static int set_email_and_cn(X509 *crt, const char *name)
+{
+ return set_cn(crt, NID_pkcs9_emailAddress, name,
+ NID_commonName, "www.example.org", 0);
+}
+
+static int set_altname_dns(X509 *crt, const char *name)
+{
+ return set_altname(crt, GEN_DNS, name, 0);
+}
+
+static int set_altname_email(X509 *crt, const char *name)
+{
+ return set_altname(crt, GEN_EMAIL, name, 0);
+}
+
+struct set_name_fn {
+ int (*fn) (X509 *, const char *);
+ const char *name;
+ int host;
+ int email;
+};
+
+static const struct set_name_fn name_fns[] = {
+ {set_cn1, "set CN", 1, 0},
+ {set_cn2, "set CN", 1, 0},
+ {set_cn3, "set CN", 1, 0},
+ {set_cn_and_email, "set CN", 1, 0},
+ {set_email1, "set emailAddress", 0, 1},
+ {set_email2, "set emailAddress", 0, 1},
+ {set_email3, "set emailAddress", 0, 1},
+ {set_email_and_cn, "set emailAddress", 0, 1},
+ {set_altname_dns, "set dnsName", 1, 0},
+ {set_altname_email, "set rfc822Name", 0, 1},
+ {NULL, NULL, 0, 0}
+};
+
+static X509 *make_cert(void)
+{
+ X509 *ret = NULL;
+ X509 *crt = NULL;
+ X509_NAME *issuer = NULL;
+ crt = X509_new();
+ if (crt == NULL)
+ goto out;
+ if (!X509_set_version(crt, 3))
+ goto out;
+ ret = crt;
+ crt = NULL;
+ out:
+ X509_NAME_free(issuer);
+ return ret;
+}
+
+static int errors;
+
+static void check_message(const struct set_name_fn *fn, const char *op,
+ const char *nameincert, int match, const char *name)
+{
+ char msg[1024];
+ if (match < 0)
+ return;
+ BIO_snprintf(msg, sizeof(msg), "%s: %s: [%s] %s [%s]",
+ fn->name, op, nameincert,
+ match ? "matches" : "does not match", name);
+ if (is_exception(msg))
+ return;
+ puts(msg);
+ ++errors;
+}
+
+static void run_cert(X509 *crt, const char *nameincert,
+ const struct set_name_fn *fn)
+{
+ const char *const *pname = names;
+ while (*pname) {
+ int samename = strcasecmp(nameincert, *pname) == 0;
+ size_t namelen = strlen(*pname);
+ char *name = malloc(namelen);
+ int match, ret;
+ memcpy(name, *pname, namelen);
+
+ ret = X509_check_host(crt, name, namelen, 0, NULL);
+ match = -1;
+ if (ret < 0) {
+ fprintf(stderr, "internal error in X509_check_host");
+ ++errors;
+ } else if (fn->host) {
+ if (ret == 1 && !samename)
+ match = 1;
+ if (ret == 0 && samename)
+ match = 0;
+ } else if (ret == 1)
+ match = 1;
+ check_message(fn, "host", nameincert, match, *pname);
+
+ ret = X509_check_host(crt, name, namelen,
+ X509_CHECK_FLAG_NO_WILDCARDS, NULL);
+ match = -1;
+ if (ret < 0) {
+ fprintf(stderr, "internal error in X509_check_host");
+ ++errors;
+ } else if (fn->host) {
+ if (ret == 1 && !samename)
+ match = 1;
+ if (ret == 0 && samename)
+ match = 0;
+ } else if (ret == 1)
+ match = 1;
+ check_message(fn, "host-no-wildcards", nameincert, match, *pname);
+
+ ret = X509_check_email(crt, name, namelen, 0);
+ match = -1;
+ if (fn->email) {
+ if (ret && !samename)
+ match = 1;
+ if (!ret && samename && strchr(nameincert, '@') != NULL)
+ match = 0;
+ } else if (ret)
+ match = 1;
+ check_message(fn, "email", nameincert, match, *pname);
+ ++pname;
+ free(name);
+ }
+}
+
+int main(void)
+{
+ const struct set_name_fn *pfn = name_fns;
+ while (pfn->name) {
+ const char *const *pname = names;
+ while (*pname) {
+ X509 *crt = make_cert();
+ if (crt == NULL) {
+ fprintf(stderr, "make_cert failed\n");
+ return 1;
+ }
+ if (!pfn->fn(crt, *pname)) {
+ fprintf(stderr, "X509 name setting failed\n");
+ return 1;
+ }
+ run_cert(crt, *pname, pfn);
+ X509_free(crt);
+ ++pname;
+ }
+ ++pfn;
+ }
+ return errors > 0 ? 1 : 0;
+}
diff --git a/crypto/x509v3/x509v3.h b/crypto/x509v3/x509v3.h
index db9c3e8bde6e..f5c61560aa10 100644
--- a/crypto/x509v3/x509v3.h
+++ b/crypto/x509v3/x509v3.h
@@ -67,6 +67,13 @@
extern "C" {
#endif
+# ifdef OPENSSL_SYS_WIN32
+/* Under Win32 these are defined in wincrypt.h */
+# undef X509_NAME
+# undef X509_CERT_PAIR
+# undef X509_EXTENSIONS
+# endif
+
/* Forward reference */
struct v3_ext_method;
struct v3_ext_ctx;
@@ -405,7 +412,6 @@ struct ISSUING_DIST_POINT_st {
# define EXFLAG_CA 0x10
/* Really self issued not necessarily self signed */
# define EXFLAG_SI 0x20
-# define EXFLAG_SS 0x20
# define EXFLAG_V1 0x40
# define EXFLAG_INVALID 0x80
# define EXFLAG_SET 0x100
@@ -414,6 +420,8 @@ struct ISSUING_DIST_POINT_st {
# define EXFLAG_INVALID_POLICY 0x800
# define EXFLAG_FRESHEST 0x1000
+/* Self signed */
+# define EXFLAG_SS 0x2000
# define KU_DIGITAL_SIGNATURE 0x0080
# define KU_NON_REPUDIATION 0x0040
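
[Editor's note] Why EXFLAG_SS is renumbered rather than kept: as the removed line above shows, the old value collided with EXFLAG_SI, so the two conditions were indistinguishable in ex_flags. A hedged illustration:

    /* Old definitions: both names denoted the same bit. */
    #define OLD_EXFLAG_SI 0x20
    #define OLD_EXFLAG_SS 0x20              /* same bit as EXFLAG_SI! */
    /* (flags & OLD_EXFLAG_SS) was true for any self-issued certificate,
     * signed or not; the new 0x2000 value gives self-signed its own bit. */
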
@@ -442,6 +450,7 @@ struct ISSUING_DIST_POINT_st {
# define XKU_OCSP_SIGN 0x20
# define XKU_TIMESTAMP 0x40
# define XKU_DVCS 0x80
+# define XKU_ANYEKU 0x100
# define X509_PURPOSE_DYNAMIC 0x1
# define X509_PURPOSE_DYNAMIC_NAME 0x2
@@ -665,6 +674,7 @@ STACK_OF(CONF_VALUE) *X509V3_parse_list(const char *line);
void *X509V3_EXT_d2i(X509_EXTENSION *ext);
void *X509V3_get_d2i(STACK_OF(X509_EXTENSION) *x, int nid, int *crit,
int *idx);
+int X509V3_EXT_free(int nid, void *ext_data);
X509_EXTENSION *X509V3_EXT_i2d(int ext_nid, int crit, void *ext_struc);
int X509V3_add1_i2d(STACK_OF(X509_EXTENSION) **x, int nid, void *value,
@@ -707,6 +717,34 @@ STACK_OF(OPENSSL_STRING) *X509_get1_email(X509 *x);
STACK_OF(OPENSSL_STRING) *X509_REQ_get1_email(X509_REQ *x);
void X509_email_free(STACK_OF(OPENSSL_STRING) *sk);
STACK_OF(OPENSSL_STRING) *X509_get1_ocsp(X509 *x);
+/* Flags for X509_check_* functions */
+
+/*
+ * Always check the subject name for a host match, even if subject
+ * alternative names are present.
+ */
+# define X509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT 0x1
+/* Disable wildcard matching for dnsName fields and common name. */
+# define X509_CHECK_FLAG_NO_WILDCARDS 0x2
+/* Wildcards must not match a partial label. */
+# define X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS 0x4
+/* Allow (non-partial) wildcards to match multiple labels. */
+# define X509_CHECK_FLAG_MULTI_LABEL_WILDCARDS 0x8
+/* Constrain verifier sub-domain patterns to match a single label. */
+# define X509_CHECK_FLAG_SINGLE_LABEL_SUBDOMAINS 0x10
+/*
+ * Match reference identifiers starting with "." to any sub-domain.
+ * This is a non-public flag, turned on implicitly when the subject
+ * reference identity is a DNS name.
+ */
+# define _X509_CHECK_FLAG_DOT_SUBDOMAINS 0x8000
+
+int X509_check_host(X509 *x, const char *chk, size_t chklen,
+ unsigned int flags, char **peername);
+int X509_check_email(X509 *x, const char *chk, size_t chklen,
+ unsigned int flags);
+int X509_check_ip(X509 *x, const unsigned char *chk, size_t chklen,
+ unsigned int flags);
+int X509_check_ip_asc(X509 *x, const char *ipasc, unsigned int flags);
ASN1_OCTET_STRING *a2i_IPADDRESS(const char *ipasc);
ASN1_OCTET_STRING *a2i_IPADDRESS_NC(const char *ipasc);
@@ -930,6 +968,7 @@ void ERR_load_X509V3_strings(void);
# define X509V3_F_X509V3_EXT_ADD 104
# define X509V3_F_X509V3_EXT_ADD_ALIAS 106
# define X509V3_F_X509V3_EXT_CONF 107
+# define X509V3_F_X509V3_EXT_FREE 165
# define X509V3_F_X509V3_EXT_I2D 136
# define X509V3_F_X509V3_EXT_NCONF 152
# define X509V3_F_X509V3_GET_SECTION 142
@@ -944,6 +983,7 @@ void ERR_load_X509V3_strings(void);
# define X509V3_R_BAD_OBJECT 119
# define X509V3_R_BN_DEC2BN_ERROR 100
# define X509V3_R_BN_TO_ASN1_INTEGER_ERROR 101
+# define X509V3_R_CANNOT_FIND_FREE_FUNCTION 168
# define X509V3_R_DIRNAME_ERROR 149
# define X509V3_R_DISTPOINT_ALREADY_SET 160
# define X509V3_R_DUPLICATE_ZONE_ID 133
@@ -959,13 +999,13 @@ void ERR_load_X509V3_strings(void);
# define X509V3_R_ILLEGAL_EMPTY_EXTENSION 151
# define X509V3_R_ILLEGAL_HEX_DIGIT 113
# define X509V3_R_INCORRECT_POLICY_SYNTAX_TAG 152
-# define X509V3_R_INVALID_MULTIPLE_RDNS 161
# define X509V3_R_INVALID_ASNUMBER 162
# define X509V3_R_INVALID_ASRANGE 163
# define X509V3_R_INVALID_BOOLEAN_STRING 104
# define X509V3_R_INVALID_EXTENSION_STRING 105
# define X509V3_R_INVALID_INHERITANCE 165
# define X509V3_R_INVALID_IPADDRESS 166
+# define X509V3_R_INVALID_MULTIPLE_RDNS 161
# define X509V3_R_INVALID_NAME 106
# define X509V3_R_INVALID_NULL_ARGUMENT 107
# define X509V3_R_INVALID_NULL_NAME 108
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index 6ebfd017ea56..d208d02392e9 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -24,7 +24,7 @@ print<<___;
call OPENSSL_cpuid_setup
.hidden OPENSSL_ia32cap_P
-.comm OPENSSL_ia32cap_P,8,4
+.comm OPENSSL_ia32cap_P,16,4
.text
@@ -53,12 +53,13 @@ OPENSSL_rdtsc:
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
.globl OPENSSL_ia32_cpuid
-.type OPENSSL_ia32_cpuid,\@abi-omnipotent
+.type OPENSSL_ia32_cpuid,\@function,1
.align 16
OPENSSL_ia32_cpuid:
mov %rbx,%r8 # save %rbx
xor %eax,%eax
+ mov %eax,8(%rdi) # clear 3rd word
cpuid
mov %eax,%r11d # max value for standard query level
@@ -126,6 +127,14 @@ OPENSSL_ia32_cpuid:
shr \$14,%r10d
and \$0xfff,%r10d # number of cores -1 per L1D
+ cmp \$7,%r11d
+ jb .Lnocacheinfo
+
+ mov \$7,%eax
+ xor %ecx,%ecx
+ cpuid
+ mov %ebx,8(%rdi)
+
.Lnocacheinfo:
mov \$1,%eax
cpuid
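
[Editor's note] The added instructions query CPUID leaf 7 (sub-leaf 0) and stash EBX, whose bit 5 reports AVX2, in the new third word of OPENSSL_ia32cap_P. A hedged C restatement using GCC/Clang's <cpuid.h>:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid_max(0, 0) >= 7) {       /* leaf 7 supported? */
            __cpuid_count(7, 0, eax, ebx, ecx, edx);
            printf("AVX2: %s\n", (ebx & (1u << 5)) ? "yes" : "no");
        }
        return 0;
    }
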
@@ -165,6 +174,7 @@ OPENSSL_ia32_cpuid:
.Lclear_avx:
mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
and %eax,%r9d # clear AVX, FMA and AMD XOP bits
+ andl \$0xffffffdf,8(%rdi) # clear AVX2, ~(1<<5)
.Ldone:
shl \$32,%r9
mov %r10d,%eax
@@ -279,6 +289,21 @@ OPENSSL_ia32_rdrand:
cmove %rcx,%rax
ret
.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
+
+.globl OPENSSL_ia32_rdseed
+.type OPENSSL_ia32_rdseed,\@abi-omnipotent
+.align 16
+OPENSSL_ia32_rdseed:
+ mov \$8,%ecx
+.Loop_rdseed:
+ rdseed %rax
+ jc .Lbreak_rdseed
+ loop .Loop_rdseed
+.Lbreak_rdseed:
+ cmp \$0,%rax
+ cmove %rcx,%rax
+ ret
+.size OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed
___
close STDOUT; # flush
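
[Editor's note] A hedged C restatement of OPENSSL_ia32_rdseed above (x86-64 with <immintrin.h>, built with -mrdseed; the function name is illustrative): up to eight attempts, and a successfully drawn zero is remapped to the remaining loop count, mirroring the cmove, so that a zero return always means failure.

    #include <immintrin.h>

    unsigned long long rdseed_sketch(void)
    {
        unsigned long long v, tries;

        for (tries = 8; tries != 0; tries--)
            if (_rdseed64_step(&v))
                return v != 0 ? v : tries;   /* mirror the cmove fix-up */
        return 0;                            /* hardware gave no seed */
    }
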
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index b270b44337d1..e95f6274f5e0 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -22,6 +22,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&xor ("eax","eax");
&bt ("ecx",21);
&jnc (&label("nocpuid"));
+ &mov ("esi",&wparam(0));
+ &mov (&DWP(8,"esi"),"eax"); # clear 3rd word
&cpuid ();
&mov ("edi","eax"); # max value for standard query level
@@ -79,6 +81,16 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&jmp (&label("generic"));
&set_label("intel");
+ &cmp ("edi",7);
+ &jb (&label("cacheinfo"));
+
+ &mov ("esi",&wparam(0));
+ &mov ("eax",7);
+ &xor ("ecx","ecx");
+ &cpuid ();
+ &mov (&DWP(8,"esi"),"ebx");
+
+&set_label("cacheinfo");
&cmp ("edi",4);
&mov ("edi",-1);
&jb (&label("nocacheinfo"));
@@ -135,6 +147,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&and ("esi",0xfeffffff); # clear FXSR
&set_label("clear_avx");
&and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
+ &mov ("edi",&wparam(0));
+ &and (&DWP(8,"edi"),0xffffffdf); # clear AVX2
&set_label("done");
&mov ("eax","esi");
&mov ("edx","ebp");
@@ -198,7 +212,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&function_begin_B("OPENSSL_far_spin");
&pushf ();
- &pop ("eax")
+ &pop ("eax");
&bt ("eax",9);
&jnc (&label("nospin")); # interrupts are disabled
@@ -353,6 +367,21 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&ret ();
&function_end_B("OPENSSL_ia32_rdrand");
+&function_begin_B("OPENSSL_ia32_rdseed");
+ &mov ("ecx",8);
+&set_label("loop");
+ &rdseed ("eax");
+ &jc (&label("break"));
+ &loop (&label("loop"));
+&set_label("break");
+ &cmp ("eax",0);
+ &cmove ("eax","ecx");
+ &ret ();
+&function_end_B("OPENSSL_ia32_rdseed");
+
&initseg("OPENSSL_cpuid_setup");
+&hidden("OPENSSL_cpuid_setup");
+&hidden("OPENSSL_ia32cap_P");
+
&asm_finish();