diff options
490 files changed, 94368 insertions, 13179 deletions
@@ -2,7 +2,7 @@ OpenSSL CHANGES _______________ - Changes between 1.0.1o and 1.0.1p [9 Jul 2015] + Changes between 1.0.2c and 1.0.2d [9 Jul 2015] *) Alternate chains certificate forgery @@ -17,13 +17,13 @@ (Google/BoringSSL). [Matt Caswell] - Changes between 1.0.1n and 1.0.1o [12 Jun 2015] + Changes between 1.0.2b and 1.0.2c [12 Jun 2015] *) Fix HMAC ABI incompatibility. The previous version introduced an ABI incompatibility in the handling of HMAC. The previous ABI has now been restored. - Changes between 1.0.1m and 1.0.1n [11 Jun 2015] + Changes between 1.0.2a and 1.0.2b [11 Jun 2015] *) Malformed ECParameters causes infinite loop @@ -91,10 +91,65 @@ (CVE-2015-1791) [Matt Caswell] + *) Removed support for the two export grade static DH ciphersuites + EXP-DH-RSA-DES-CBC-SHA and EXP-DH-DSS-DES-CBC-SHA. These two ciphersuites + were newly added (along with a number of other static DH ciphersuites) to + 1.0.2. However the two export ones have *never* worked since they were + introduced. It seems strange in any case to be adding new export + ciphersuites, and given "logjam" it also does not seem correct to fix them. + [Matt Caswell] + + *) Only support 256-bit or stronger elliptic curves with the + 'ecdh_auto' setting (server) or by default (client). Of supported + curves, prefer P-256 (both). + [Emilia Kasper] + *) Reject DH handshakes with parameters shorter than 768 bits. [Kurt Roeckx and Emilia Kasper] - Changes between 1.0.1l and 1.0.1m [19 Mar 2015] + Changes between 1.0.2 and 1.0.2a [19 Mar 2015] + + *) ClientHello sigalgs DoS fix + + If a client connects to an OpenSSL 1.0.2 server and renegotiates with an + invalid signature algorithms extension a NULL pointer dereference will + occur. This can be exploited in a DoS attack against the server. + + This issue was was reported to OpenSSL by David Ramos of Stanford + University. + (CVE-2015-0291) + [Stephen Henson and Matt Caswell] + + *) Multiblock corrupted pointer fix + + OpenSSL 1.0.2 introduced the "multiblock" performance improvement. This + feature only applies on 64 bit x86 architecture platforms that support AES + NI instructions. A defect in the implementation of "multiblock" can cause + OpenSSL's internal write buffer to become incorrectly set to NULL when + using non-blocking IO. Typically, when the user application is using a + socket BIO for writing, this will only result in a failed connection. + However if some other BIO is used then it is likely that a segmentation + fault will be triggered, thus enabling a potential DoS attack. + + This issue was reported to OpenSSL by Daniel Danner and Rainer Mueller. + (CVE-2015-0290) + [Matt Caswell] + + *) Segmentation fault in DTLSv1_listen fix + + The DTLSv1_listen function is intended to be stateless and processes the + initial ClientHello from many peers. It is common for user code to loop + over the call to DTLSv1_listen until a valid ClientHello is received with + an associated cookie. A defect in the implementation of DTLSv1_listen means + that state is preserved in the SSL object from one invocation to the next + that can lead to a segmentation fault. Errors processing the initial + ClientHello can trigger this scenario. An example of such an error could be + that a DTLS1.0 only client is attempting to connect to a DTLS1.2 only + server. + + This issue was reported to OpenSSL by Per Allansson. + (CVE-2015-0207) + [Matt Caswell] *) Segmentation fault in ASN1_TYPE_cmp fix @@ -107,6 +162,20 @@ (CVE-2015-0286) [Stephen Henson] + *) Segmentation fault for invalid PSS parameters fix + + The signature verification routines will crash with a NULL pointer + dereference if presented with an ASN.1 signature using the RSA PSS + algorithm and invalid parameters. Since these routines are used to verify + certificate signature algorithms this can be used to crash any + certificate verification operation and exploited in a DoS attack. Any + application which performs certificate verification is vulnerable including + OpenSSL clients and servers which enable client authentication. + + This issue was was reported to OpenSSL by Brian Carpenter. + (CVE-2015-0208) + [Stephen Henson] + *) ASN.1 structure reuse memory corruption fix Reusing a structure in ASN.1 parsing may allow an attacker to cause @@ -145,6 +214,36 @@ (CVE-2015-0293) [Emilia Käsper] + *) Empty CKE with client auth and DHE fix + + If client auth is used then a server can seg fault in the event of a DHE + ciphersuite being selected and a zero length ClientKeyExchange message + being sent by the client. This could be exploited in a DoS attack. + (CVE-2015-1787) + [Matt Caswell] + + *) Handshake with unseeded PRNG fix + + Under certain conditions an OpenSSL 1.0.2 client can complete a handshake + with an unseeded PRNG. The conditions are: + - The client is on a platform where the PRNG has not been seeded + automatically, and the user has not seeded manually + - A protocol specific client method version has been used (i.e. not + SSL_client_methodv23) + - A ciphersuite is used that does not require additional random data from + the PRNG beyond the initial ClientHello client random (e.g. PSK-RC4-SHA). + + If the handshake succeeds then the client random that has been used will + have been generated from a PRNG with insufficient entropy and therefore the + output may be predictable. + + For example using the following command with an unseeded openssl will + succeed on an unpatched platform: + + openssl s_client -psk 1a2b3c4d -tls1_2 -cipher PSK-RC4-SHA + (CVE-2015-0285) + [Matt Caswell] + *) Use After Free following d2i_ECPrivatekey error fix A malformed EC private key file consumed via the d2i_ECPrivateKey function @@ -171,6 +270,336 @@ *) Removed the export ciphers from the DEFAULT ciphers [Kurt Roeckx] + Changes between 1.0.1l and 1.0.2 [22 Jan 2015] + + *) Facilitate "universal" ARM builds targeting range of ARM ISAs, e.g. + ARMv5 through ARMv8, as opposite to "locking" it to single one. + So far those who have to target multiple plaforms would compromise + and argue that binary targeting say ARMv5 would still execute on + ARMv8. "Universal" build resolves this compromise by providing + near-optimal performance even on newer platforms. + [Andy Polyakov] + + *) Accelerated NIST P-256 elliptic curve implementation for x86_64 + (other platforms pending). + [Shay Gueron & Vlad Krasnov (Intel Corp), Andy Polyakov] + + *) Add support for the SignedCertificateTimestampList certificate and + OCSP response extensions from RFC6962. + [Rob Stradling] + + *) Fix ec_GFp_simple_points_make_affine (thus, EC_POINTs_mul etc.) + for corner cases. (Certain input points at infinity could lead to + bogus results, with non-infinity inputs mapped to infinity too.) + [Bodo Moeller] + + *) Initial support for PowerISA 2.0.7, first implemented in POWER8. + This covers AES, SHA256/512 and GHASH. "Initial" means that most + common cases are optimized and there still is room for further + improvements. Vector Permutation AES for Altivec is also added. + [Andy Polyakov] + + *) Add support for little-endian ppc64 Linux target. + [Marcelo Cerri (IBM)] + + *) Initial support for AMRv8 ISA crypto extensions. This covers AES, + SHA1, SHA256 and GHASH. "Initial" means that most common cases + are optimized and there still is room for further improvements. + Both 32- and 64-bit modes are supported. + [Andy Polyakov, Ard Biesheuvel (Linaro)] + + *) Improved ARMv7 NEON support. + [Andy Polyakov] + + *) Support for SPARC Architecture 2011 crypto extensions, first + implemented in SPARC T4. This covers AES, DES, Camellia, SHA1, + SHA256/512, MD5, GHASH and modular exponentiation. + [Andy Polyakov, David Miller] + + *) Accelerated modular exponentiation for Intel processors, a.k.a. + RSAZ. + [Shay Gueron & Vlad Krasnov (Intel Corp)] + + *) Support for new and upcoming Intel processors, including AVX2, + BMI and SHA ISA extensions. This includes additional "stitched" + implementations, AESNI-SHA256 and GCM, and multi-buffer support + for TLS encrypt. + + This work was sponsored by Intel Corp. + [Andy Polyakov] + + *) Support for DTLS 1.2. This adds two sets of DTLS methods: DTLS_*_method() + supports both DTLS 1.2 and 1.0 and should use whatever version the peer + supports and DTLSv1_2_*_method() which supports DTLS 1.2 only. + [Steve Henson] + + *) Use algorithm specific chains in SSL_CTX_use_certificate_chain_file(): + this fixes a limiation in previous versions of OpenSSL. + [Steve Henson] + + *) Extended RSA OAEP support via EVP_PKEY API. Options to specify digest, + MGF1 digest and OAEP label. + [Steve Henson] + + *) Add EVP support for key wrapping algorithms, to avoid problems with + existing code the flag EVP_CIPHER_CTX_WRAP_ALLOW has to be set in + the EVP_CIPHER_CTX or an error is returned. Add AES and DES3 wrap + algorithms and include tests cases. + [Steve Henson] + + *) Add functions to allocate and set the fields of an ECDSA_METHOD + structure. + [Douglas E. Engert, Steve Henson] + + *) New functions OPENSSL_gmtime_diff and ASN1_TIME_diff to find the + difference in days and seconds between two tm or ASN1_TIME structures. + [Steve Henson] + + *) Add -rev test option to s_server to just reverse order of characters + received by client and send back to server. Also prints an abbreviated + summary of the connection parameters. + [Steve Henson] + + *) New option -brief for s_client and s_server to print out a brief summary + of connection parameters. + [Steve Henson] + + *) Add callbacks for arbitrary TLS extensions. + [Trevor Perrin <trevp@trevp.net> and Ben Laurie] + + *) New option -crl_download in several openssl utilities to download CRLs + from CRLDP extension in certificates. + [Steve Henson] + + *) New options -CRL and -CRLform for s_client and s_server for CRLs. + [Steve Henson] + + *) New function X509_CRL_diff to generate a delta CRL from the difference + of two full CRLs. Add support to "crl" utility. + [Steve Henson] + + *) New functions to set lookup_crls function and to retrieve + X509_STORE from X509_STORE_CTX. + [Steve Henson] + + *) Print out deprecated issuer and subject unique ID fields in + certificates. + [Steve Henson] + + *) Extend OCSP I/O functions so they can be used for simple general purpose + HTTP as well as OCSP. New wrapper function which can be used to download + CRLs using the OCSP API. + [Steve Henson] + + *) Delegate command line handling in s_client/s_server to SSL_CONF APIs. + [Steve Henson] + + *) SSL_CONF* functions. These provide a common framework for application + configuration using configuration files or command lines. + [Steve Henson] + + *) SSL/TLS tracing code. This parses out SSL/TLS records using the + message callback and prints the results. Needs compile time option + "enable-ssl-trace". New options to s_client and s_server to enable + tracing. + [Steve Henson] + + *) New ctrl and macro to retrieve supported points extensions. + Print out extension in s_server and s_client. + [Steve Henson] + + *) New functions to retrieve certificate signature and signature + OID NID. + [Steve Henson] + + *) Add functions to retrieve and manipulate the raw cipherlist sent by a + client to OpenSSL. + [Steve Henson] + + *) New Suite B modes for TLS code. These use and enforce the requirements + of RFC6460: restrict ciphersuites, only permit Suite B algorithms and + only use Suite B curves. The Suite B modes can be set by using the + strings "SUITEB128", "SUITEB192" or "SUITEB128ONLY" for the cipherstring. + [Steve Henson] + + *) New chain verification flags for Suite B levels of security. Check + algorithms are acceptable when flags are set in X509_verify_cert. + [Steve Henson] + + *) Make tls1_check_chain return a set of flags indicating checks passed + by a certificate chain. Add additional tests to handle client + certificates: checks for matching certificate type and issuer name + comparison. + [Steve Henson] + + *) If an attempt is made to use a signature algorithm not in the peer + preference list abort the handshake. If client has no suitable + signature algorithms in response to a certificate request do not + use the certificate. + [Steve Henson] + + *) If server EC tmp key is not in client preference list abort handshake. + [Steve Henson] + + *) Add support for certificate stores in CERT structure. This makes it + possible to have different stores per SSL structure or one store in + the parent SSL_CTX. Include distint stores for certificate chain + verification and chain building. New ctrl SSL_CTRL_BUILD_CERT_CHAIN + to build and store a certificate chain in CERT structure: returing + an error if the chain cannot be built: this will allow applications + to test if a chain is correctly configured. + + Note: if the CERT based stores are not set then the parent SSL_CTX + store is used to retain compatibility with existing behaviour. + + [Steve Henson] + + *) New function ssl_set_client_disabled to set a ciphersuite disabled + mask based on the current session, check mask when sending client + hello and checking the requested ciphersuite. + [Steve Henson] + + *) New ctrls to retrieve and set certificate types in a certificate + request message. Print out received values in s_client. If certificate + types is not set with custom values set sensible values based on + supported signature algorithms. + [Steve Henson] + + *) Support for distinct client and server supported signature algorithms. + [Steve Henson] + + *) Add certificate callback. If set this is called whenever a certificate + is required by client or server. An application can decide which + certificate chain to present based on arbitrary criteria: for example + supported signature algorithms. Add very simple example to s_server. + This fixes many of the problems and restrictions of the existing client + certificate callback: for example you can now clear an existing + certificate and specify the whole chain. + [Steve Henson] + + *) Add new "valid_flags" field to CERT_PKEY structure which determines what + the certificate can be used for (if anything). Set valid_flags field + in new tls1_check_chain function. Simplify ssl_set_cert_masks which used + to have similar checks in it. + + Add new "cert_flags" field to CERT structure and include a "strict mode". + This enforces some TLS certificate requirements (such as only permitting + certificate signature algorithms contained in the supported algorithms + extension) which some implementations ignore: this option should be used + with caution as it could cause interoperability issues. + [Steve Henson] + + *) Update and tidy signature algorithm extension processing. Work out + shared signature algorithms based on preferences and peer algorithms + and print them out in s_client and s_server. Abort handshake if no + shared signature algorithms. + [Steve Henson] + + *) Add new functions to allow customised supported signature algorithms + for SSL and SSL_CTX structures. Add options to s_client and s_server + to support them. + [Steve Henson] + + *) New function SSL_certs_clear() to delete all references to certificates + from an SSL structure. Before this once a certificate had been added + it couldn't be removed. + [Steve Henson] + + *) Integrate hostname, email address and IP address checking with certificate + verification. New verify options supporting checking in opensl utility. + [Steve Henson] + + *) Fixes and wildcard matching support to hostname and email checking + functions. Add manual page. + [Florian Weimer (Red Hat Product Security Team)] + + *) New functions to check a hostname email or IP address against a + certificate. Add options x509 utility to print results of checks against + a certificate. + [Steve Henson] + + *) Fix OCSP checking. + [Rob Stradling <rob.stradling@comodo.com> and Ben Laurie] + + *) Initial experimental support for explicitly trusted non-root CAs. + OpenSSL still tries to build a complete chain to a root but if an + intermediate CA has a trust setting included that is used. The first + setting is used: whether to trust (e.g., -addtrust option to the x509 + utility) or reject. + [Steve Henson] + + *) Add -trusted_first option which attempts to find certificates in the + trusted store even if an untrusted chain is also supplied. + [Steve Henson] + + *) MIPS assembly pack updates: support for MIPS32r2 and SmartMIPS ASE, + platform support for Linux and Android. + [Andy Polyakov] + + *) Support for linux-x32, ILP32 environment in x86_64 framework. + [Andy Polyakov] + + *) Experimental multi-implementation support for FIPS capable OpenSSL. + When in FIPS mode the approved implementations are used as normal, + when not in FIPS mode the internal unapproved versions are used instead. + This means that the FIPS capable OpenSSL isn't forced to use the + (often lower perfomance) FIPS implementations outside FIPS mode. + [Steve Henson] + + *) Transparently support X9.42 DH parameters when calling + PEM_read_bio_DHparameters. This means existing applications can handle + the new parameter format automatically. + [Steve Henson] + + *) Initial experimental support for X9.42 DH parameter format: mainly + to support use of 'q' parameter for RFC5114 parameters. + [Steve Henson] + + *) Add DH parameters from RFC5114 including test data to dhtest. + [Steve Henson] + + *) Support for automatic EC temporary key parameter selection. If enabled + the most preferred EC parameters are automatically used instead of + hardcoded fixed parameters. Now a server just has to call: + SSL_CTX_set_ecdh_auto(ctx, 1) and the server will automatically + support ECDH and use the most appropriate parameters. + [Steve Henson] + + *) Enhance and tidy EC curve and point format TLS extension code. Use + static structures instead of allocation if default values are used. + New ctrls to set curves we wish to support and to retrieve shared curves. + Print out shared curves in s_server. New options to s_server and s_client + to set list of supported curves. + [Steve Henson] + + *) New ctrls to retrieve supported signature algorithms and + supported curve values as an array of NIDs. Extend openssl utility + to print out received values. + [Steve Henson] + + *) Add new APIs EC_curve_nist2nid and EC_curve_nid2nist which convert + between NIDs and the more common NIST names such as "P-256". Enhance + ecparam utility and ECC method to recognise the NIST names for curves. + [Steve Henson] + + *) Enhance SSL/TLS certificate chain handling to support different + chains for each certificate instead of one chain in the parent SSL_CTX. + [Steve Henson] + + *) Support for fixed DH ciphersuite client authentication: where both + server and client use DH certificates with common parameters. + [Steve Henson] + + *) Support for fixed DH ciphersuites: those requiring DH server + certificates. + [Steve Henson] + + *) New function i2d_re_X509_tbs for re-encoding the TBS portion of + the certificate. + Note: Related 1.0.2-beta specific macros X509_get_cert_info, + X509_CINF_set_modified, X509_CINF_get_issuer, X509_CINF_get_extensions and + X509_CINF_get_signature were reverted post internal team review. + Changes between 1.0.1k and 1.0.1l [15 Jan 2015] *) Build fixes for the Windows and OpenVMS platforms diff --git a/Configure b/Configure index 60ec3783e605..d99eed7f9303 100755 --- a/Configure +++ b/Configure @@ -105,6 +105,25 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [experimenta my $gcc_devteam_warn = "-Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED"; +# TODO(openssl-team): fix problems and investigate if (at least) the following +# warnings can also be enabled: +# -Wconditional-uninitialized, -Wswitch-enum, -Wunused-macros, +# -Wmissing-field-initializers, -Wmissing-variable-declarations, +# -Wincompatible-pointer-types-discards-qualifiers, -Wcast-align, +# -Wunreachable-code -Wunused-parameter -Wlanguage-extension-token +# -Wextended-offsetof +my $clang_disabled_warnings = "-Wno-unused-parameter -Wno-missing-field-initializers -Wno-language-extension-token -Wno-extended-offsetof"; + +# These are used in addition to $gcc_devteam_warn when the compiler is clang. +# TODO(openssl-team): fix problems and investigate if (at least) the +# following warnings can also be enabled: -Wconditional-uninitialized, +# -Wswitch-enum, -Wunused-macros, -Wmissing-field-initializers, +# -Wmissing-variable-declarations, +# -Wincompatible-pointer-types-discards-qualifiers, -Wcast-align, +# -Wunreachable-code -Wunused-parameter -Wlanguage-extension-token +# -Wextended-offsetof +my $clang_devteam_warn = "-Wno-unused-parameter -Wno-missing-field-initializers -Wno-language-extension-token -Wno-extended-offsetof -Qunused-arguments"; + my $strict_warnings = 0; my $x86_gcc_des="DES_PTR DES_RISC1 DES_UNROLL"; @@ -124,24 +143,25 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:"; +my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:"; my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:"; -my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; -my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; -my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; -my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; -my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; -my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void"; -my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; -my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; -my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::"; -my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::"; -my $no_asm=":::::::::::::::void"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:"; +my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; +my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o:::::::::::::void"; +my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o::void"; +my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; +my $mips32_asm=$mips64_asm; $mips32_asm =~ s/\s*sha512\-mips\.o//; +my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void"; +my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; +my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; +my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; +my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; +my $ppc32_asm=$ppc64_asm; +my $no_asm="::::::::::::::::void"; # As for $BSDthreads. Idea is to maintain "collective" set of flags, # which would cover all BSD flavors. -pthread applies to them all, @@ -152,7 +172,7 @@ my $no_asm=":::::::::::::::void"; # seems to be sufficient? my $BSDthreads="-pthread -D_THREAD_SAFE -D_REENTRANT"; -#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib +#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $ec_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib my %table=( # File 'TABLE' (created by 'make TABLE') contains the data from this list, @@ -174,14 +194,14 @@ my %table=( "debug-ben-debug-64", "gcc:$gcc_devteam_warn -Wno-error=overlength-strings -DBN_DEBUG -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g3 -O3 -pipe::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-ben-macos", "cc:$gcc_devteam_warn -arch i386 -DBN_DEBUG -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -O3 -DL_ENDIAN -g3 -pipe::(unknown)::-Wl,-search_paths_first::::", "debug-ben-macos-gcc46", "gcc-mp-4.6:$gcc_devteam_warn -Wconversion -DBN_DEBUG -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -O3 -DL_ENDIAN -g3 -pipe::(unknown)::::::", -"debug-ben-darwin64","cc:$gcc_devteam_warn -Wno-language-extension-token -Wno-extended-offsetof -arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:".eval{my $asm=$x86_64_asm;$asm=~s/rc4\-[^:]+//;$asm}.":macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"debug-ben-darwin64","cc:$gcc_devteam_warn -g -Wno-language-extension-token -Wno-extended-offsetof -arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:".eval{my $asm=$x86_64_asm;$asm=~s/rc4\-[^:]+//;$asm}.":macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"debug-ben-debug-64-clang", "clang:$gcc_devteam_warn -Wno-error=overlength-strings -Wno-error=extended-offsetof -Qunused-arguments -DBN_DEBUG -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g3 -O3 -pipe::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-ben-no-opt", "gcc: -Wall -Wmissing-prototypes -Wstrict-prototypes -Wmissing-declarations -DDEBUG_SAFESTACK -DCRYPTO_MDEBUG -Werror -DL_ENDIAN -DTERMIOS -Wall -g3::(unknown)::::::", "debug-ben-strict", "gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DCONST_STRICT -O2 -Wall -Wshadow -Werror -Wpointer-arith -Wcast-qual -Wwrite-strings -pipe::(unknown)::::::", "debug-rse","cc:-DTERMIOS -DL_ENDIAN -pipe -O -g -ggdb3 -Wall::(unknown):::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}", "debug-bodo", "gcc:$gcc_devteam_warn -Wno-error=overlength-strings -DBN_DEBUG -DBN_DEBUG_RAND -DCONF_DEBUG -DBIO_PAIR_DEBUG -m64 -DL_ENDIAN -DTERMIO -g -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", -"debug-ulf", "gcc:-DTERMIOS -DL_ENDIAN -march=i486 -Wall -DBN_DEBUG -DBN_DEBUG_RAND -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DOPENSSL_NO_ASM -g -Wformat -Wshadow -Wmissing-prototypes -Wmissing-declarations:::CYGWIN32:::${no_asm}:win32:cygwin-shared:::.dll", "debug-steve64", "gcc:$gcc_devteam_warn -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -Wno-overlength-strings -g::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"debug-steve32", "gcc:$gcc_devteam_warn -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -g -pipe::-D_REENTRANT::-rdynamic -ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"debug-steve32", "gcc:$gcc_devteam_warn -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -Wno-overlength-strings -g -pipe::-D_REENTRANT::-rdynamic -ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-steve-opt", "gcc:$gcc_devteam_warn -m64 -O3 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -g::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-levitte-linux-elf","gcc:-DLEVITTE_DEBUG -DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -ggdb -g3 -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-levitte-linux-noasm","gcc:-DLEVITTE_DEBUG -DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DOPENSSL_NO_ASM -DL_ENDIAN -ggdb -g3 -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -193,9 +213,9 @@ my %table=( "debug-linux-ppro","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -mcpu=pentiumpro -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn", "debug-linux-elf","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -march=i486 -Wall::-D_REENTRANT::-lefence -ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-elf-noefence","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -march=i486 -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"debug-linux-ia32-aes", "gcc:-DAES_EXPERIMENTAL -DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes_x86core.o aes_cbc.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o::ghash-x86.o::elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"debug-linux-ia32-aes", "gcc:-DAES_EXPERIMENTAL -DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:x86cpuid.o:bn-586.o co-586.o x86-mont.o::des-586.o crypt586.o:aes_x86core.o aes_cbc.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o::ghash-x86.o::elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-generic32","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -g -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"debug-linux-generic64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"debug-linux-generic64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DTERMIO -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-x86_64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -m64 -DL_ENDIAN -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", "dist", "cc:-O::(unknown)::::::", @@ -225,7 +245,7 @@ my %table=( "solaris64-x86_64-gcc","gcc:-m64 -O3 -Wall -DL_ENDIAN::-D_REENTRANT::-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:solaris-shared:-fPIC:-m64 -shared -static-libgcc:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/64", #### Solaris x86 with Sun C setups -"solaris-x86-cc","cc:-fast -O -Xa::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"solaris-x86-cc","cc:-fast -xarch=generic -O -Xa::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "solaris64-x86_64-cc","cc:-fast -xarch=amd64 -xstrconst -Xa -DL_ENDIAN::-D_REENTRANT::-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:solaris-shared:-KPIC:-xarch=amd64 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/64", #### SPARC Solaris with GNU C setups @@ -300,7 +320,7 @@ my %table=( "hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${no_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "hpux-parisc1_1-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${parisc11_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa1.1", "hpux-parisc2-gcc","gcc:-march=2.0 -O3 -DB_ENDIAN -D_REENTRANT::::-Wl,+s -ldld:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL DES_RISC1:".eval{my $asm=$parisc20_asm;$asm=~s/2W\./2\./;$asm=~s/:64/:32/;$asm}.":dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_32", -"hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::pa-risc2W.o::::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64", +"hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::pa-risc2W.o:::::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64", # More attempts at unified 10.X and 11.X targets for HP C compiler. # @@ -347,20 +367,57 @@ my %table=( # throw in -D[BL]_ENDIAN, whichever appropriate... "linux-generic32","gcc:-O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ppc", "gcc:-DB_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc32_asm}:linux32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -# It's believed that majority of ARM toolchains predefine appropriate -march. -# If you compiler does not, do complement config command line with one! -"linux-armv4", "gcc:-O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", + +####################################################################### +# Note that -march is not among compiler options in below linux-armv4 +# target line. Not specifying one is intentional to give you choice to: +# +# a) rely on your compiler default by not specifying one; +# b) specify your target platform explicitly for optimal performance, +# e.g. -march=armv6 or -march=armv7-a; +# c) build "universal" binary that targets *range* of platforms by +# specifying minimum and maximum supported architecture; +# +# As for c) option. It actually makes no sense to specify maximum to be +# less than ARMv7, because it's the least requirement for run-time +# switch between platform-specific code paths. And without run-time +# switch performance would be equivalent to one for minimum. Secondly, +# there are some natural limitations that you'd have to accept and +# respect. Most notably you can *not* build "universal" binary for +# big-endian platform. This is because ARMv7 processor always picks +# instructions in little-endian order. Another similar limitation is +# that -mthumb can't "cross" -march=armv6t2 boundary, because that's +# where it became Thumb-2. Well, this limitation is a bit artificial, +# because it's not really impossible, but it's deemed too tricky to +# support. And of course you have to be sure that your binutils are +# actually up to the task of handling maximum target platform. With all +# this in mind here is an example of how to configure "universal" build: +# +# ./Configure linux-armv4 -march=armv6 -D__ARM_MAX_ARCH__=8 +# +"linux-armv4", "gcc: -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-aarch64","gcc: -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +# Configure script adds minimally required -march for assembly support, +# if no -march was specified at command line. mips32 and mips64 below +# refer to contemporary MIPS Architecture specifications, MIPS32 and +# MIPS64, rather than to kernel bitness. +"linux-mips32", "gcc:-mabi=32 -O3 -Wall -DBN_DIV3W::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips32_asm}:o32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-mips64", "gcc:-mabi=n32 -O3 -Wall -DBN_DIV3W::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips64_asm}:n32:dlfcn:linux-shared:-fPIC:-mabi=n32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::32", +"linux64-mips64", "gcc:-mabi=64 -O3 -Wall -DBN_DIV3W::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips64_asm}:64:dlfcn:linux-shared:-fPIC:-mabi=64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", #### IA-32 targets... -"linux-ia32-icc", "icc:-DL_ENDIAN -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ia32-icc", "icc:-DL_ENDIAN -O2::-D_REENTRANT::-ldl -no_cpprt:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-elf", "gcc:-DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-aout", "gcc:-DL_ENDIAN -O3 -fomit-frame-pointer -march=i486 -Wall::(unknown):::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_asm}:a.out", #### "linux-generic64","gcc:-O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ppc64", "gcc:-m64 -DB_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc64_asm}:linux64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", -"linux-ia64", "gcc:-DL_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-ia64-ecc","ecc:-DL_ENDIAN -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-ia64-icc","icc:-DL_ENDIAN -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ppc64le","gcc:-m64 -DL_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:$ppc64_asm:linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::", +"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ia64-icc","icc:-DL_ENDIAN -O2 -Wall::-D_REENTRANT::-ldl -no_cpprt:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-x86_64", "gcc:-m64 -DL_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", +"linux-x86_64-clang", "clang: -m64 -DL_ENDIAN -O3 -Wall -Wextra $clang_disabled_warnings -Qunused-arguments::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", +"linux-x86_64-icc", "icc:-DL_ENDIAN -O2::-D_REENTRANT::-ldl -no_cpprt:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", +"linux-x32", "gcc:-mx32 -DL_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-mx32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::x32", "linux64-s390x", "gcc:-m64 -DB_ENDIAN -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", #### So called "highgprs" target for z/Architecture CPUs # "Highgprs" is kernel feature first implemented in Linux 2.6.32, see @@ -407,6 +464,7 @@ my %table=( "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "android-x86","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:".eval{my $asm=${x86_elf_asm};$asm=~s/:elf/:android/;$asm}.":dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"android-mips","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips32_asm}:o32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", #### *BSD [do see comment about ${BSDthreads} above!] "BSD-generic32","gcc:-O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -421,7 +479,7 @@ my %table=( # triggered by RIPEMD160 code. "BSD-sparc64", "gcc:-DB_ENDIAN -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR:${sparcv9_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "BSD-ia64", "gcc:-DL_ENDIAN -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"BSD-x86_64", "gcc:-DL_ENDIAN -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"BSD-x86_64", "cc:-DL_ENDIAN -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "bsdi-elf-gcc", "gcc:-DPERL5 -DL_ENDIAN -fomit-frame-pointer -O3 -march=i486 -Wall::(unknown)::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -454,11 +512,11 @@ my %table=( # UnixWare 2.0x fails destest with -O. "unixware-2.0","cc:-DFILIO_H -DNO_STRINGS_H::-Kthread::-lsocket -lnsl -lresolv -lx:${x86_gcc_des} ${x86_gcc_opts}:::", "unixware-2.1","cc:-O -DFILIO_H::-Kthread::-lsocket -lnsl -lresolv -lx:${x86_gcc_des} ${x86_gcc_opts}:::", -"unixware-7","cc:-O -DFILIO_H -Kalloca::-Kthread::-lsocket -lnsl:BN_LLONG MD2_CHAR RC4_INDEX ${x86_gcc_des}:${x86_elf_asm}:dlfcn:svr5-shared:-Kpic::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"unixware-7-gcc","gcc:-DL_ENDIAN -DFILIO_H -O3 -fomit-frame-pointer -march=pentium -Wall::-D_REENTRANT::-lsocket -lnsl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:gnu-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"unixware-7","cc:-O -DFILIO_H -Kalloca::-Kthread::-lsocket -lnsl:BN_LLONG MD2_CHAR RC4_INDEX ${x86_gcc_des}:${x86_elf_asm}-1:dlfcn:svr5-shared:-Kpic::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"unixware-7-gcc","gcc:-DL_ENDIAN -DFILIO_H -O3 -fomit-frame-pointer -march=pentium -Wall::-D_REENTRANT::-lsocket -lnsl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}-1:dlfcn:gnu-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # SCO 5 - Ben Laurie <ben@algroup.co.uk> says the -O breaks the SCO cc. -"sco5-cc", "cc:-belf::(unknown)::-lsocket -lnsl:${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:svr3-shared:-Kpic::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"sco5-gcc", "gcc:-O3 -fomit-frame-pointer::(unknown)::-lsocket -lnsl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:svr3-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"sco5-cc", "cc:-belf::(unknown)::-lsocket -lnsl:${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}-1:dlfcn:svr3-shared:-Kpic::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"sco5-gcc", "gcc:-O3 -fomit-frame-pointer::(unknown)::-lsocket -lnsl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}-1:dlfcn:svr3-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", #### IBM's AIX. "aix3-cc", "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::", @@ -518,9 +576,9 @@ my %table=( # Visual C targets # # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64 -"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32", +"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32", "VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32", -"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32", +"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32", "debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32", # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement # 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE' @@ -547,9 +605,8 @@ my %table=( "UWIN", "cc:-DTERMIOS -DL_ENDIAN -O -Wall:::UWIN::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${no_asm}:win32", # Cygwin -"Cygwin-pre1.3", "gcc:-DTERMIOS -DL_ENDIAN -fomit-frame-pointer -O3 -m486 -Wall::(unknown):CYGWIN32::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${no_asm}:win32", -"Cygwin", "gcc:-DTERMIOS -DL_ENDIAN -fomit-frame-pointer -O3 -march=i486 -Wall:::CYGWIN32::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_asm}:coff:dlfcn:cygwin-shared:-D_WINDLL:-shared:.dll.a", -"debug-Cygwin", "gcc:-DTERMIOS -DL_ENDIAN -march=i486 -Wall -DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DOPENSSL_NO_ASM -g -Wformat -Wshadow -Wmissing-prototypes -Wmissing-declarations -Werror:::CYGWIN32:::${no_asm}:dlfcn:cygwin-shared:-D_WINDLL:-shared:.dll.a", +"Cygwin", "gcc:-DTERMIOS -DL_ENDIAN -fomit-frame-pointer -O3 -march=i486 -Wall:::CYGWIN::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_asm}:coff:dlfcn:cygwin-shared:-D_WINDLL:-shared:.dll.a", +"Cygwin-x86_64", "gcc:-DTERMIOS -DL_ENDIAN -O3 -Wall:::CYGWIN::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:mingw64:dlfcn:cygwin-shared:-D_WINDLL:-shared:.dll.a", # NetWare from David Ward (dsward@novell.com) # requires either MetroWerks NLM development tools, or gcc / nlmconv @@ -581,7 +638,8 @@ my %table=( "darwin64-ppc-cc","cc:-arch ppc64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc64_asm}:osx64:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-i386-cc","cc:-arch i386 -O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:".eval{my $asm=$x86_asm;$asm=~s/cast\-586\.o//;$asm}.":macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", -"darwin64-x86_64-cc","cc:-arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:".eval{my $asm=$x86_64_asm;$asm=~s/rc4\-[^:]+//;$asm}.":macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"darwin64-x86_64-cc","cc:-arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:".eval{my $asm=$x86_64_asm;$asm=~s/rc4\-[^:]+//;$asm}.":macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"debug-darwin64-x86_64-cc","cc:-arch x86_64 -ggdb -g2 -O0 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:".eval{my $asm=$x86_64_asm;$asm=~s/rc4\-[^:]+//;$asm}.":macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", # iPhoneOS/iOS "iphoneos-cross","llvm-gcc:-O3 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fomit-frame-pointer -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", @@ -634,6 +692,7 @@ my $idx_lflags = $idx++; my $idx_bn_ops = $idx++; my $idx_cpuid_obj = $idx++; my $idx_bn_obj = $idx++; +my $idx_ec_obj = $idx++; my $idx_des_obj = $idx++; my $idx_aes_obj = $idx++; my $idx_bf_obj = $idx++; @@ -714,11 +773,13 @@ my %disabled = ( # "what" => "comment" [or special keyword "experimental "ec_nistp_64_gcc_128" => "default", "gmp" => "default", "jpake" => "experimental", + "libunbound" => "experimental", "md2" => "default", "rc5" => "default", "rfc3779" => "default", "sctp" => "default", "shared" => "default", + "ssl-trace" => "default", "store" => "experimental", "unit-test" => "default", "zlib" => "default", @@ -728,7 +789,7 @@ my @experimental = (); # This is what $depflags will look like with the above defaults # (we need this to see if we should advise the user to run "make depend"): -my $default_depflags = " -DOPENSSL_NO_EC_NISTP_64_GCC_128 -DOPENSSL_NO_GMP -DOPENSSL_NO_JPAKE -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SCTP -DOPENSSL_NO_STORE -DOPENSSL_NO_UNIT_TEST"; +my $default_depflags = " -DOPENSSL_NO_EC_NISTP_64_GCC_128 -DOPENSSL_NO_GMP -DOPENSSL_NO_JPAKE -DOPENSSL_NO_LIBUNBOUND -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SCTP -DOPENSSL_NO_SSL_TRACE -DOPENSSL_NO_STORE -DOPENSSL_NO_UNIT_TEST"; # Explicit "no-..." options will be collected in %disabled along with the defaults. # To remove something from %disabled, use "enable-foo" (unless it's experimental). @@ -873,16 +934,7 @@ PROCESS_ARGS: } elsif (/^[-+]/) { - if (/^-[lL](.*)$/ or /^-Wl,/) - { - $libs.=$_." "; - } - elsif (/^-[^-]/ or /^\+/) - { - $_ =~ s/%([0-9a-f]{1,2})/chr(hex($1))/gei; - $flags.=$_." "; - } - elsif (/^--prefix=(.*)$/) + if (/^--prefix=(.*)$/) { $prefix=$1; } @@ -926,10 +978,14 @@ PROCESS_ARGS: { $cross_compile_prefix=$1; } - else + elsif (/^-[lL](.*)$/ or /^-Wl,/) + { + $libs.=$_." "; + } + else # common if (/^[-+]/), just pass down... { - print STDERR $usage; - exit(1); + $_ =~ s/%([0-9a-f]{1,2})/chr(hex($1))/gei; + $flags.=$_." "; } } elsif ($_ =~ /^([^:]+):(.+)$/) @@ -1156,6 +1212,7 @@ my $cc = $fields[$idx_cc]; if($ENV{CC}) { $cc = $ENV{CC}; } + my $cflags = $fields[$idx_cflags]; my $unistd = $fields[$idx_unistd]; my $thread_cflag = $fields[$idx_thread_cflag]; @@ -1164,6 +1221,7 @@ my $lflags = $fields[$idx_lflags]; my $bn_ops = $fields[$idx_bn_ops]; my $cpuid_obj = $fields[$idx_cpuid_obj]; my $bn_obj = $fields[$idx_bn_obj]; +my $ec_obj = $fields[$idx_ec_obj]; my $des_obj = $fields[$idx_des_obj]; my $aes_obj = $fields[$idx_aes_obj]; my $bf_obj = $fields[$idx_bf_obj]; @@ -1209,6 +1267,12 @@ if ($target =~ /^mingw/ && `$cc --target-help 2>&1` !~ m/\-mno\-cygwin/m) $shared_ldflag =~ s/\-mno\-cygwin\s*//; } +if ($target =~ /linux.*\-mips/ && !$no_asm && $flags !~ /\-m(ips|arch=)/) { + # minimally required architecture flags for assembly modules + $cflags="-mips2 $cflags" if ($target =~ /mips32/); + $cflags="-mips3 $cflags" if ($target =~ /mips64/); +} + my $no_shared_warn=0; my $no_user_cflags=0; @@ -1335,7 +1399,7 @@ $lflags="$libs$lflags" if ($libs ne ""); if ($no_asm) { - $cpuid_obj=$bn_obj= + $cpuid_obj=$bn_obj=$ec_obj= $des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=$cmll_obj= $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=""; } @@ -1416,6 +1480,7 @@ if ($target =~ /\-icc$/) # Intel C compiler } if ($iccver>=8) { + $cflags=~s/\-KPIC/-fPIC/; # Eliminate unnecessary dependency from libirc.a. This is # essential for shared library support, as otherwise # apps/openssl can end up in endless loop upon startup... @@ -1423,12 +1488,17 @@ if ($target =~ /\-icc$/) # Intel C compiler } if ($iccver>=9) { - $cflags.=" -i-static"; - $cflags=~s/\-no_cpprt/-no-cpprt/; + $lflags.=" -i-static"; + $lflags=~s/\-no_cpprt/-no-cpprt/; } if ($iccver>=10) { - $cflags=~s/\-i\-static/-static-intel/; + $lflags=~s/\-i\-static/-static-intel/; + } + if ($iccver>=11) + { + $cflags.=" -no-intel-extensions"; # disable Cilk + $lflags=~s/\-no\-cpprt/-no-cxxlib/; } } @@ -1509,7 +1579,7 @@ if ($rmd160_obj =~ /\.o$/) } if ($aes_obj =~ /\.o$/) { - $cflags.=" -DAES_ASM"; + $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/);; # aes-ctr.o is not a real file, only indication that assembler # module implements AES_ctr32_encrypt... $cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes\-ctr\.o//); @@ -1531,10 +1601,14 @@ else { $wp_obj="wp_block.o"; } $cmll_obj=$cmll_enc unless ($cmll_obj =~ /.o$/); -if ($modes_obj =~ /ghash/) +if ($modes_obj =~ /ghash\-/) { $cflags.=" -DGHASH_ASM"; } +if ($ec_obj =~ /ecp_nistz256/) + { + $cflags.=" -DECP_NISTZ256_ASM"; + } # "Stringify" the C flags string. This permits it to be made part of a string # and works as well on command lines. @@ -1574,12 +1648,21 @@ if ($shlib_version_number =~ /(^[0-9]*)\.([0-9\.]*)/) if ($strict_warnings) { + my $ecc = $cc; + $ecc = "clang" if `$cc --version 2>&1` =~ /clang/; my $wopt; - die "ERROR --strict-warnings requires gcc" unless ($cc =~ /gcc$/); + die "ERROR --strict-warnings requires gcc or clang" unless ($ecc =~ /gcc$/ or $ecc =~ /clang$/); foreach $wopt (split /\s+/, $gcc_devteam_warn) { $cflags .= " $wopt" unless ($cflags =~ /$wopt/) } + if ($ecc eq "clang") + { + foreach $wopt (split /\s+/, $clang_devteam_warn) + { + $cflags .= " $wopt" unless ($cflags =~ /$wopt/) + } + } } open(IN,'<Makefile.org') || die "unable to read Makefile.org:$!\n"; @@ -1638,6 +1721,7 @@ while (<IN>) s/^EXE_EXT=.*$/EXE_EXT= $exe_ext/; s/^CPUID_OBJ=.*$/CPUID_OBJ= $cpuid_obj/; s/^BN_ASM=.*$/BN_ASM= $bn_obj/; + s/^EC_ASM=.*$/EC_ASM= $ec_obj/; s/^DES_ENC=.*$/DES_ENC= $des_obj/; s/^AES_ENC=.*$/AES_ENC= $aes_obj/; s/^BF_ENC=.*$/BF_ENC= $bf_obj/; @@ -1699,6 +1783,7 @@ print "CFLAG =$cflags\n"; print "EX_LIBS =$lflags\n"; print "CPUID_OBJ =$cpuid_obj\n"; print "BN_ASM =$bn_obj\n"; +print "EC_ASM =$ec_obj\n"; print "DES_ENC =$des_obj\n"; print "AES_ENC =$aes_obj\n"; print "BF_ENC =$bf_obj\n"; @@ -1997,7 +2082,7 @@ BEGIN VALUE "ProductVersion", "$version\\0" // Optional: //VALUE "Comments", "\\0" - VALUE "LegalCopyright", "Copyright © 1998-2005 The OpenSSL Project. Copyright © 1995-1998 Eric A. Young, Tim J. Hudson. All rights reserved.\\0" + VALUE "LegalCopyright", "Copyright © 1998-2005 The OpenSSL Project. Copyright © 1995-1998 Eric A. Young, Tim J. Hudson. All rights reserved.\\0" //VALUE "LegalTrademarks", "\\0" //VALUE "PrivateBuild", "\\0" //VALUE "SpecialBuild", "\\0" @@ -2106,12 +2191,12 @@ sub print_table_entry { my $target = shift; - (my $cc,my $cflags,my $unistd,my $thread_cflag,my $sys_id,my $lflags, - my $bn_ops,my $cpuid_obj,my $bn_obj,my $des_obj,my $aes_obj, my $bf_obj, - my $md5_obj,my $sha1_obj,my $cast_obj,my $rc4_obj,my $rmd160_obj, - my $rc5_obj,my $wp_obj,my $cmll_obj,my $modes_obj, my $engines_obj, - my $perlasm_scheme,my $dso_scheme,my $shared_target,my $shared_cflag, - my $shared_ldflag,my $shared_extension,my $ranlib,my $arflags,my $multilib)= + my ($cc, $cflags, $unistd, $thread_cflag, $sys_id, $lflags, + $bn_ops, $cpuid_obj, $bn_obj, $ec_obj, $des_obj, $aes_obj, $bf_obj, + $md5_obj, $sha1_obj, $cast_obj, $rc4_obj, $rmd160_obj, + $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, + $perlasm_scheme, $dso_scheme, $shared_target, $shared_cflag, + $shared_ldflag, $shared_extension, $ranlib, $arflags, $multilib)= split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); print <<EOF @@ -2126,6 +2211,7 @@ sub print_table_entry \$bn_ops = $bn_ops \$cpuid_obj = $cpuid_obj \$bn_obj = $bn_obj +\$ec_obj = $ec_obj \$des_obj = $des_obj \$aes_obj = $aes_obj \$bf_obj = $bf_obj @@ -83,7 +83,7 @@ OpenSSL - Frequently Asked Questions * Which is the current version of OpenSSL? The current version is available from <URL: http://www.openssl.org>. -OpenSSL 1.0.1e was released on Feb 11th, 2013. +OpenSSL 1.0.1a was released on Apr 19th, 2012. In addition to the current stable release, you can also access daily snapshots of the OpenSSL development version at <URL: @@ -184,14 +184,18 @@ Therefore the answer to the common question "when will feature X be backported to OpenSSL 1.0.0/0.9.8?" is "never" but it could appear in the next minor release. +* What happens when the letter release reaches z? + +It was decided after the release of OpenSSL 0.9.8y the next version should +be 0.9.8za then 0.9.8zb and so on. + + [LEGAL] ======================================================================= * Do I need patent licenses to use OpenSSL? -The patents section of the README file lists patents that may apply to -you if you want to use OpenSSL. For information on intellectual -property rights, please consult a lawyer. The OpenSSL team does not -offer legal advice. +For information on intellectual property rights, please consult a lawyer. +The OpenSSL team does not offer legal advice. You can configure OpenSSL so as not to use IDEA, MDC2 and RC5 by using ./config no-idea no-mdc2 no-rc5 @@ -608,8 +612,8 @@ valid for the current DOS session. * What is special about OpenSSL on Redhat? Red Hat Linux (release 7.0 and later) include a preinstalled limited -version of OpenSSL. For patent reasons, support for IDEA, RC5 and MDC2 -is disabled in this version. The same may apply to other Linux distributions. +version of OpenSSL. Red Hat has chosen to disable support for IDEA, RC5 and +MDC2 in this version. The same may apply to other Linux distributions. Users may therefore wish to install more or all of the features left out. To do this you MUST ensure that you do not overwrite the openssl that is in @@ -632,11 +636,6 @@ relevant updates in packages up to and including 0.9.6b. A possible way around this is to persuade Red Hat to produce a non-US version of Red Hat Linux. -FYI: Patent numbers and expiry dates of US patents: -MDC-2: 4,908,861 13/03/2007 -IDEA: 5,214,703 25/05/2010 -RC5: 5,724,428 03/03/2015 - * Why does the OpenSSL compilation fail on MacOS X? @@ -862,7 +861,7 @@ The opposite assumes we already have len bytes in buf: p = buf; p7 = d2i_PKCS7(NULL, &p, len); -At this point p7 contains a valid PKCS7 structure of NULL if an error +At this point p7 contains a valid PKCS7 structure or NULL if an error occurred. If an error occurred ERR_print_errors(bio) should give more information. @@ -874,6 +873,21 @@ that has been read or written. This may well be uninitialized data and attempts to free the buffer will have unpredictable results because it no longer points to the same address. +Memory allocation and encoding can also be combined in a single +operation by the ASN1 routines: + + unsigned char *buf = NULL; /* mandatory */ + int len; + len = i2d_PKCS7(p7, &buf); + if (len < 0) + /* Error */ + /* Do some things with 'buf' */ + /* Finished with buf: free it */ + OPENSSL_free(buf); + +In this special case the "buf" parameter is *not* incremented, it points +to the start of the encoding. + * OpenSSL uses DER but I need BER format: does OpenSSL support BER? diff --git a/FREEBSD-Xlist b/FREEBSD-Xlist index 8020b316f2f0..bce7bf41d9fd 100644 --- a/FREEBSD-Xlist +++ b/FREEBSD-Xlist @@ -10,6 +10,7 @@ openssl-*/*/*.save openssl-*/*/*/*.bat openssl-*/*/*/*.com openssl-*/*/*/*.save +openssl-*/Git* openssl-*/INSTALL.DJGPP openssl-*/INSTALL.MacOS openssl-*/INSTALL.NW diff --git a/FREEBSD-upgrade b/FREEBSD-upgrade index bd3c1314a796..f1b3c425d594 100644 --- a/FREEBSD-upgrade +++ b/FREEBSD-upgrade @@ -10,9 +10,9 @@ First, read http://wiki.freebsd.org/SubversionPrimer/VendorImports # Xlist setenv XLIST /FreeBSD/work/openssl/svn-FREEBSD-files/FREEBSD-Xlist -setenv FSVN "svn+ssh://svn.freebsd.org/base" -setenv OSSLVER 1.0.1p -# OSSLTAG format: v1_0_1p +setenv FSVN "svn+ssh://repo.freebsd.org/base" +setenv OSSLVER 1.0.2d +# OSSLTAG format: v1_0_2d ###setenv OSSLTAG v`echo ${OSSLVER} | tr . _` @@ -4,16 +4,16 @@ ## Makefile for OpenSSL ## -VERSION=1.0.1p +VERSION=1.0.2d MAJOR=1 -MINOR=0.1 +MINOR=0.2 SHLIB_VERSION_NUMBER=1.0.0 SHLIB_VERSION_HISTORY= SHLIB_MAJOR=1 SHLIB_MINOR=0.0 SHLIB_EXT= PLATFORM=dist -OPTIONS= no-ec_nistp_64_gcc_128 no-gmp no-jpake no-krb5 no-md2 no-rc5 no-rfc3779 no-sctp no-shared no-store no-unit-test no-zlib no-zlib-dynamic static-engine +OPTIONS= no-ec_nistp_64_gcc_128 no-gmp no-jpake no-krb5 no-libunbound no-md2 no-rc5 no-rfc3779 no-sctp no-shared no-ssl-trace no-store no-unit-test no-zlib no-zlib-dynamic static-engine CONFIGURE_ARGS=dist SHLIB_TARGET= @@ -61,7 +61,7 @@ OPENSSLDIR=/usr/local/ssl CC= cc CFLAG= -O -DEPFLAG= -DOPENSSL_NO_EC_NISTP_64_GCC_128 -DOPENSSL_NO_GMP -DOPENSSL_NO_JPAKE -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SCTP -DOPENSSL_NO_STORE -DOPENSSL_NO_UNIT_TEST +DEPFLAG= -DOPENSSL_NO_EC_NISTP_64_GCC_128 -DOPENSSL_NO_GMP -DOPENSSL_NO_JPAKE -DOPENSSL_NO_LIBUNBOUND -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SCTP -DOPENSSL_NO_SSL_TRACE -DOPENSSL_NO_STORE -DOPENSSL_NO_UNIT_TEST PEX_LIBS= EX_LIBS= EXE_EXT= @@ -71,7 +71,7 @@ RANLIB= /usr/bin/ranlib NM= nm PERL= /usr/bin/perl TAR= tar -TARFLAGS= --no-recursion --record-size=10240 +TARFLAGS= --no-recursion MAKEDEPPROG=makedepend LIBDIR=lib @@ -90,6 +90,7 @@ PROCESSOR= # CPUID module collects small commonly used assembler snippets CPUID_OBJ= mem_clr.o BN_ASM= bn_asm.o +EC_ASM= DES_ENC= des_enc.o fcrypt_b.o AES_ENC= aes_core.o aes_cbc.o BF_ENC= bf_enc.o @@ -223,8 +224,8 @@ BUILDENV= PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)' \ EXE_EXT='$(EXE_EXT)' SHARED_LIBS='$(SHARED_LIBS)' \ SHLIB_EXT='$(SHLIB_EXT)' SHLIB_TARGET='$(SHLIB_TARGET)' \ PEX_LIBS='$(PEX_LIBS)' EX_LIBS='$(EX_LIBS)' \ - CPUID_OBJ='$(CPUID_OBJ)' \ - BN_ASM='$(BN_ASM)' DES_ENC='$(DES_ENC)' \ + CPUID_OBJ='$(CPUID_OBJ)' BN_ASM='$(BN_ASM)' \ + EC_ASM='$(EC_ASM)' DES_ENC='$(DES_ENC)' \ AES_ENC='$(AES_ENC)' CMLL_ENC='$(CMLL_ENC)' \ BF_ENC='$(BF_ENC)' CAST_ENC='$(CAST_ENC)' \ RC4_ENC='$(RC4_ENC)' RC5_ENC='$(RC5_ENC)' \ @@ -332,7 +333,7 @@ clean-shared: done; \ fi; \ ( set -x; rm -f lib$$i$(SHLIB_EXT) ); \ - if [ "$(PLATFORM)" = "Cygwin" ]; then \ + if expr "$(PLATFORM)" : "Cygwin" >/dev/null; then \ ( set -x; rm -f cyg$$i$(SHLIB_EXT) lib$$i$(SHLIB_EXT).a ); \ fi; \ done @@ -381,11 +382,11 @@ libssl.pc: Makefile echo 'libdir=$${exec_prefix}/$(LIBDIR)'; \ echo 'includedir=$${prefix}/include'; \ echo ''; \ - echo 'Name: OpenSSL'; \ + echo 'Name: OpenSSL-libssl'; \ echo 'Description: Secure Sockets Layer and cryptography libraries'; \ echo 'Version: '$(VERSION); \ - echo 'Requires: '; \ - echo 'Libs: -L$${libdir} -lssl -lcrypto'; \ + echo 'Requires.private: libcrypto'; \ + echo 'Libs: -L$${libdir} -lssl'; \ echo 'Libs.private: $(EX_LIBS)'; \ echo 'Cflags: -I$${includedir} $(KRB5_INCLUDES)' ) > libssl.pc @@ -398,10 +399,7 @@ openssl.pc: Makefile echo 'Name: OpenSSL'; \ echo 'Description: Secure Sockets Layer and cryptography libraries and tools'; \ echo 'Version: '$(VERSION); \ - echo 'Requires: '; \ - echo 'Libs: -L$${libdir} -lssl -lcrypto'; \ - echo 'Libs.private: $(EX_LIBS)'; \ - echo 'Cflags: -I$${includedir} $(KRB5_INCLUDES)' ) > openssl.pc + echo 'Requires: libssl libcrypto' ) > openssl.pc Makefile: Makefile.org Configure config @echo "Makefile is older than Makefile.org, Configure or config." @@ -564,11 +562,7 @@ install_sw: do \ if [ -f "$$i" -o -f "$$i.a" ]; then \ ( echo installing $$i; \ - if [ "$(PLATFORM)" != "Cygwin" ]; then \ - cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ - chmod 555 $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ - mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i; \ - else \ + if expr "$(PLATFORM)" : "Cygwin" >/dev/null; then \ c=`echo $$i | sed 's/^lib\(.*\)\.dll\.a/cyg\1-$(SHLIB_VERSION_NUMBER).dll/'`; \ cp $$c $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$c.new; \ chmod 755 $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$c.new; \ @@ -576,6 +570,10 @@ install_sw: cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i; \ + else \ + cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ + chmod 555 $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ + mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i; \ fi ); \ if expr $(PLATFORM) : 'mingw' > /dev/null; then \ ( case $$i in \ @@ -608,6 +606,10 @@ install_sw: install_html_docs: here="`pwd`"; \ + filecase=; \ + case "$(PLATFORM)" in DJGPP|Cygwin*|mingw*|darwin*-*-cc) \ + filecase=-i; \ + esac; \ for subdir in apps crypto ssl; do \ mkdir -p $(INSTALL_PREFIX)$(HTMLDIR)/$$subdir; \ for i in doc/$$subdir/*.pod; do \ @@ -636,9 +638,9 @@ install_docs: @pod2man="`cd ./util; ./pod2mantest $(PERL)`"; \ here="`pwd`"; \ filecase=; \ - if [ "$(PLATFORM)" = "DJGPP" -o "$(PLATFORM)" = "Cygwin" -o "$(PLATFORM)" = "mingw" ]; then \ + case "$(PLATFORM)" in DJGPP|Cygwin*|mingw*|darwin*-*-cc) \ filecase=-i; \ - fi; \ + esac; \ set -e; for i in doc/apps/*.pod; do \ fn=`basename $$i .pod`; \ sec=`$(PERL) util/extract-section.pl 1 < $$i`; \ diff --git a/Makefile.org b/Makefile.org index 55a37008a95b..d77e26495887 100644 --- a/Makefile.org +++ b/Makefile.org @@ -69,7 +69,7 @@ RANLIB= ranlib NM= nm PERL= perl TAR= tar -TARFLAGS= --no-recursion --record-size=10240 +TARFLAGS= --no-recursion MAKEDEPPROG=makedepend LIBDIR=lib @@ -88,6 +88,7 @@ PROCESSOR= # CPUID module collects small commonly used assembler snippets CPUID_OBJ= BN_ASM= bn_asm.o +EC_ASM= DES_ENC= des_enc.o fcrypt_b.o AES_ENC= aes_core.o aes_cbc.o BF_ENC= bf_enc.o @@ -221,8 +222,8 @@ BUILDENV= PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)' \ EXE_EXT='$(EXE_EXT)' SHARED_LIBS='$(SHARED_LIBS)' \ SHLIB_EXT='$(SHLIB_EXT)' SHLIB_TARGET='$(SHLIB_TARGET)' \ PEX_LIBS='$(PEX_LIBS)' EX_LIBS='$(EX_LIBS)' \ - CPUID_OBJ='$(CPUID_OBJ)' \ - BN_ASM='$(BN_ASM)' DES_ENC='$(DES_ENC)' \ + CPUID_OBJ='$(CPUID_OBJ)' BN_ASM='$(BN_ASM)' \ + EC_ASM='$(EC_ASM)' DES_ENC='$(DES_ENC)' \ AES_ENC='$(AES_ENC)' CMLL_ENC='$(CMLL_ENC)' \ BF_ENC='$(BF_ENC)' CAST_ENC='$(CAST_ENC)' \ RC4_ENC='$(RC4_ENC)' RC5_ENC='$(RC5_ENC)' \ @@ -330,7 +331,7 @@ clean-shared: done; \ fi; \ ( set -x; rm -f lib$$i$(SHLIB_EXT) ); \ - if [ "$(PLATFORM)" = "Cygwin" ]; then \ + if expr "$(PLATFORM)" : "Cygwin" >/dev/null; then \ ( set -x; rm -f cyg$$i$(SHLIB_EXT) lib$$i$(SHLIB_EXT).a ); \ fi; \ done @@ -379,11 +380,11 @@ libssl.pc: Makefile echo 'libdir=$${exec_prefix}/$(LIBDIR)'; \ echo 'includedir=$${prefix}/include'; \ echo ''; \ - echo 'Name: OpenSSL'; \ + echo 'Name: OpenSSL-libssl'; \ echo 'Description: Secure Sockets Layer and cryptography libraries'; \ echo 'Version: '$(VERSION); \ - echo 'Requires: '; \ - echo 'Libs: -L$${libdir} -lssl -lcrypto'; \ + echo 'Requires.private: libcrypto'; \ + echo 'Libs: -L$${libdir} -lssl'; \ echo 'Libs.private: $(EX_LIBS)'; \ echo 'Cflags: -I$${includedir} $(KRB5_INCLUDES)' ) > libssl.pc @@ -396,10 +397,7 @@ openssl.pc: Makefile echo 'Name: OpenSSL'; \ echo 'Description: Secure Sockets Layer and cryptography libraries and tools'; \ echo 'Version: '$(VERSION); \ - echo 'Requires: '; \ - echo 'Libs: -L$${libdir} -lssl -lcrypto'; \ - echo 'Libs.private: $(EX_LIBS)'; \ - echo 'Cflags: -I$${includedir} $(KRB5_INCLUDES)' ) > openssl.pc + echo 'Requires: libssl libcrypto' ) > openssl.pc Makefile: Makefile.org Configure config @echo "Makefile is older than Makefile.org, Configure or config." @@ -562,11 +560,7 @@ install_sw: do \ if [ -f "$$i" -o -f "$$i.a" ]; then \ ( echo installing $$i; \ - if [ "$(PLATFORM)" != "Cygwin" ]; then \ - cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ - chmod 555 $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ - mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i; \ - else \ + if expr "$(PLATFORM)" : "Cygwin" >/dev/null; then \ c=`echo $$i | sed 's/^lib\(.*\)\.dll\.a/cyg\1-$(SHLIB_VERSION_NUMBER).dll/'`; \ cp $$c $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$c.new; \ chmod 755 $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$c.new; \ @@ -574,6 +568,10 @@ install_sw: cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i; \ + else \ + cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ + chmod 555 $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new; \ + mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/$(LIBDIR)/$$i; \ fi ); \ if expr $(PLATFORM) : 'mingw' > /dev/null; then \ ( case $$i in \ @@ -606,6 +604,10 @@ install_sw: install_html_docs: here="`pwd`"; \ + filecase=; \ + case "$(PLATFORM)" in DJGPP|Cygwin*|mingw*|darwin*-*-cc) \ + filecase=-i; \ + esac; \ for subdir in apps crypto ssl; do \ mkdir -p $(INSTALL_PREFIX)$(HTMLDIR)/$$subdir; \ for i in doc/$$subdir/*.pod; do \ @@ -634,9 +636,9 @@ install_docs: @pod2man="`cd ./util; ./pod2mantest $(PERL)`"; \ here="`pwd`"; \ filecase=; \ - if [ "$(PLATFORM)" = "DJGPP" -o "$(PLATFORM)" = "Cygwin" -o "$(PLATFORM)" = "mingw" ]; then \ + case "$(PLATFORM)" in DJGPP|Cygwin*|mingw*|darwin*-*-cc) \ filecase=-i; \ - fi; \ + esac; \ set -e; for i in doc/apps/*.pod; do \ fn=`basename $$i .pod`; \ sec=`$(PERL) util/extract-section.pl 1 < $$i`; \ @@ -5,15 +5,15 @@ This file gives a brief overview of the major changes between each OpenSSL release. For more details please read the CHANGES file. - Major changes between OpenSSL 1.0.1o and OpenSSL 1.0.1p [9 Jul 2015] + Major changes between OpenSSL 1.0.2c and OpenSSL 1.0.2d [9 Jul 2015] o Alternate chains certificate forgery (CVE-2015-1793) - Major changes between OpenSSL 1.0.1n and OpenSSL 1.0.1o [12 Jun 2015] + Major changes between OpenSSL 1.0.2b and OpenSSL 1.0.2c [12 Jun 2015] o Fix HMAC ABI incompatibility - Major changes between OpenSSL 1.0.1m and OpenSSL 1.0.1n [11 Jun 2015] + Major changes between OpenSSL 1.0.2a and OpenSSL 1.0.2b [11 Jun 2015] o Malformed ECParameters causes infinite loop (CVE-2015-1788) o Exploitable out-of-bounds read in X509_cmp_time (CVE-2015-1789) @@ -21,16 +21,33 @@ o CMS verify infinite loop with unknown hash function (CVE-2015-1792) o Race condition handling NewSessionTicket (CVE-2015-1791) - Major changes between OpenSSL 1.0.1l and OpenSSL 1.0.1m [19 Mar 2015] + Major changes between OpenSSL 1.0.2 and OpenSSL 1.0.2a [19 Mar 2015] + o OpenSSL 1.0.2 ClientHello sigalgs DoS fix (CVE-2015-0291) + o Multiblock corrupted pointer fix (CVE-2015-0290) + o Segmentation fault in DTLSv1_listen fix (CVE-2015-0207) o Segmentation fault in ASN1_TYPE_cmp fix (CVE-2015-0286) + o Segmentation fault for invalid PSS parameters fix (CVE-2015-0208) o ASN.1 structure reuse memory corruption fix (CVE-2015-0287) o PKCS7 NULL pointer dereferences fix (CVE-2015-0289) o DoS via reachable assert in SSLv2 servers fix (CVE-2015-0293) + o Empty CKE with client auth and DHE fix (CVE-2015-1787) + o Handshake with unseeded PRNG fix (CVE-2015-0285) o Use After Free following d2i_ECPrivatekey error fix (CVE-2015-0209) o X509_to_X509_REQ NULL pointer deref fix (CVE-2015-0288) o Removed the export ciphers from the DEFAULT ciphers + Major changes between OpenSSL 1.0.1l and OpenSSL 1.0.2 [22 Jan 2015]: + + o Suite B support for TLS 1.2 and DTLS 1.2 + o Support for DTLS 1.2 + o TLS automatic EC curve selection. + o API to set TLS supported signature algorithms and curves + o SSL_CONF configuration API. + o TLS Brainpool support. + o ALPN support. + o CMS support for RSA-PSS, RSA-OAEP, ECDH and X9.42 DH. + Major changes between OpenSSL 1.0.1k and OpenSSL 1.0.1l [15 Jan 2015] o Build fixes for the Windows and OpenVMS platforms @@ -1,5 +1,5 @@ - OpenSSL 1.0.1p 9 Jul 2015 + OpenSSL 1.0.2d 9 Jul 2015 Copyright (c) 1998-2011 The OpenSSL Project Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson @@ -90,32 +90,6 @@ SSL/TLS Client and Server Tests Handling of S/MIME signed or encrypted mail - - PATENTS - ------- - - Various companies hold various patents for various algorithms in various - locations around the world. _YOU_ are responsible for ensuring that your use - of any algorithms is legal by checking if there are any patents in your - country. The file contains some of the patents that we know about or are - rumored to exist. This is not a definitive list. - - RSA Security holds software patents on the RC5 algorithm. If you - intend to use this cipher, you must contact RSA Security for - licensing conditions. Their web page is http://www.rsasecurity.com/. - - RC4 is a trademark of RSA Security, so use of this label should perhaps - only be used with RSA Security's permission. - - The IDEA algorithm is patented by Ascom in Austria, France, Germany, Italy, - Japan, the Netherlands, Spain, Sweden, Switzerland, UK and the USA. They - should be contacted if that algorithm is to be used; their web page is - http://www.ascom.ch/. - - NTT and Mitsubishi have patents and pending patents on the Camellia - algorithm, but allow use at no charge without requiring an explicit - licensing agreement: http://info.isl.ntt.co.jp/crypt/eng/info/chiteki.html - INSTALLATION ------------ @@ -161,8 +135,7 @@ - Problem Description (steps that will reproduce the problem, if known) - Stack Traceback (if the application dumps core) - Report the bug to the OpenSSL project via the Request Tracker - (http://www.openssl.org/support/rt.html) by mail to: + Email the report to: openssl-bugs@openssl.org @@ -170,10 +143,11 @@ or support queries. Just because something doesn't work the way you expect does not mean it is necessarily a bug in OpenSSL. - Note that mail to openssl-bugs@openssl.org is recorded in the publicly - readable request tracker database and is forwarded to a public - mailing list. Confidential mail may be sent to openssl-security@openssl.org - (PGP key available from the key servers). + Note that mail to openssl-bugs@openssl.org is recorded in the public + request tracker database (see https://www.openssl.org/support/rt.html + for details) and also forwarded to a public mailing list. Confidential + mail may be sent to openssl-security@openssl.org (PGP key available from + the key servers). HOW TO CONTRIBUTE TO OpenSSL ---------------------------- diff --git a/apps/apps.c b/apps/apps.c index 680123834831..7478fc379a55 100644 --- a/apps/apps.c +++ b/apps/apps.c @@ -119,7 +119,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#if !defined(OPENSSL_SYSNAME_WIN32) && !defined(NETWARE_CLIB) +#if !defined(OPENSSL_SYSNAME_WIN32) && !defined(OPENSSL_SYSNAME_WINCE) && !defined(NETWARE_CLIB) # include <strings.h> #endif #include <sys/types.h> @@ -285,6 +285,8 @@ int str2fmt(char *s) return (FORMAT_PKCS12); else if ((*s == 'E') || (*s == 'e')) return (FORMAT_ENGINE); + else if ((*s == 'H') || (*s == 'h')) + return FORMAT_HTTP; else if ((*s == 'P') || (*s == 'p')) { if (s[1] == 'V' || s[1] == 'v') return FORMAT_PVK; @@ -787,12 +789,72 @@ static int load_pkcs12(BIO *err, BIO *in, const char *desc, return ret; } +int load_cert_crl_http(const char *url, BIO *err, + X509 **pcert, X509_CRL **pcrl) +{ + char *host = NULL, *port = NULL, *path = NULL; + BIO *bio = NULL; + OCSP_REQ_CTX *rctx = NULL; + int use_ssl, rv = 0; + if (!OCSP_parse_url(url, &host, &port, &path, &use_ssl)) + goto err; + if (use_ssl) { + if (err) + BIO_puts(err, "https not supported\n"); + goto err; + } + bio = BIO_new_connect(host); + if (!bio || !BIO_set_conn_port(bio, port)) + goto err; + rctx = OCSP_REQ_CTX_new(bio, 1024); + if (!rctx) + goto err; + if (!OCSP_REQ_CTX_http(rctx, "GET", path)) + goto err; + if (!OCSP_REQ_CTX_add1_header(rctx, "Host", host)) + goto err; + if (pcert) { + do { + rv = X509_http_nbio(rctx, pcert); + } + while (rv == -1); + } else { + do { + rv = X509_CRL_http_nbio(rctx, pcrl); + } while (rv == -1); + } + + err: + if (host) + OPENSSL_free(host); + if (path) + OPENSSL_free(path); + if (port) + OPENSSL_free(port); + if (bio) + BIO_free_all(bio); + if (rctx) + OCSP_REQ_CTX_free(rctx); + if (rv != 1) { + if (bio && err) + BIO_printf(bio_err, "Error loading %s from %s\n", + pcert ? "certificate" : "CRL", url); + ERR_print_errors(bio_err); + } + return rv; +} + X509 *load_cert(BIO *err, const char *file, int format, const char *pass, ENGINE *e, const char *cert_descrip) { X509 *x = NULL; BIO *cert; + if (format == FORMAT_HTTP) { + load_cert_crl_http(file, err, &x, NULL); + return x; + } + if ((cert = BIO_new(BIO_s_file())) == NULL) { ERR_print_errors(err); goto end; @@ -850,6 +912,49 @@ X509 *load_cert(BIO *err, const char *file, int format, return (x); } +X509_CRL *load_crl(const char *infile, int format) +{ + X509_CRL *x = NULL; + BIO *in = NULL; + + if (format == FORMAT_HTTP) { + load_cert_crl_http(infile, bio_err, NULL, &x); + return x; + } + + in = BIO_new(BIO_s_file()); + if (in == NULL) { + ERR_print_errors(bio_err); + goto end; + } + + if (infile == NULL) + BIO_set_fp(in, stdin, BIO_NOCLOSE); + else { + if (BIO_read_filename(in, infile) <= 0) { + perror(infile); + goto end; + } + } + if (format == FORMAT_ASN1) + x = d2i_X509_CRL_bio(in, NULL); + else if (format == FORMAT_PEM) + x = PEM_read_bio_X509_CRL(in, NULL, NULL, NULL); + else { + BIO_printf(bio_err, "bad input format specified for input crl\n"); + goto end; + } + if (x == NULL) { + BIO_printf(bio_err, "unable to load CRL\n"); + ERR_print_errors(bio_err); + goto end; + } + + end: + BIO_free(in); + return (x); +} + EVP_PKEY *load_key(BIO *err, const char *file, int format, int maybe_stdin, const char *pass, ENGINE *e, const char *key_descrip) { @@ -2159,6 +2264,9 @@ int args_verify(char ***pargs, int *pargc, char **oldargs = *pargs; char *arg = **pargs, *argn = (*pargs)[1]; time_t at_time = 0; + char *hostname = NULL; + char *email = NULL; + char *ipasc = NULL; if (!strcmp(arg, "-policy")) { if (!argn) *badarg = 1; @@ -2212,6 +2320,21 @@ int args_verify(char ***pargs, int *pargc, at_time = (time_t)timestamp; } (*pargs)++; + } else if (strcmp(arg, "-verify_hostname") == 0) { + if (!argn) + *badarg = 1; + hostname = argn; + (*pargs)++; + } else if (strcmp(arg, "-verify_email") == 0) { + if (!argn) + *badarg = 1; + email = argn; + (*pargs)++; + } else if (strcmp(arg, "-verify_ip") == 0) { + if (!argn) + *badarg = 1; + ipasc = argn; + (*pargs)++; } else if (!strcmp(arg, "-ignore_critical")) flags |= X509_V_FLAG_IGNORE_CRITICAL; else if (!strcmp(arg, "-issuer_checks")) @@ -2238,6 +2361,16 @@ int args_verify(char ***pargs, int *pargc, flags |= X509_V_FLAG_NOTIFY_POLICY; else if (!strcmp(arg, "-check_ss_sig")) flags |= X509_V_FLAG_CHECK_SS_SIGNATURE; + else if (!strcmp(arg, "-trusted_first")) + flags |= X509_V_FLAG_TRUSTED_FIRST; + else if (!strcmp(arg, "-suiteB_128_only")) + flags |= X509_V_FLAG_SUITEB_128_LOS_ONLY; + else if (!strcmp(arg, "-suiteB_128")) + flags |= X509_V_FLAG_SUITEB_128_LOS; + else if (!strcmp(arg, "-suiteB_192")) + flags |= X509_V_FLAG_SUITEB_192_LOS; + else if (!strcmp(arg, "-partial_chain")) + flags |= X509_V_FLAG_PARTIAL_CHAIN; else if (!strcmp(arg, "-no_alt_chains")) flags |= X509_V_FLAG_NO_ALT_CHAINS; else @@ -2269,6 +2402,15 @@ int args_verify(char ***pargs, int *pargc, if (at_time) X509_VERIFY_PARAM_set_time(*pm, at_time); + if (hostname && !X509_VERIFY_PARAM_set1_host(*pm, hostname, 0)) + *badarg = 1; + + if (email && !X509_VERIFY_PARAM_set1_email(*pm, email, 0)) + *badarg = 1; + + if (ipasc && !X509_VERIFY_PARAM_set1_ip_asc(*pm, ipasc)) + *badarg = 1; + end: (*pargs)++; @@ -2552,6 +2694,9 @@ void jpake_client_auth(BIO *out, BIO *conn, const char *secret) BIO_puts(out, "JPAKE authentication succeeded, setting PSK\n"); + if (psk_key) + OPENSSL_free(psk_key); + psk_key = BN_bn2hex(JPAKE_get_shared_key(ctx)); BIO_pop(bconn); @@ -2581,6 +2726,9 @@ void jpake_server_auth(BIO *out, BIO *conn, const char *secret) BIO_puts(out, "JPAKE authentication succeeded, setting PSK\n"); + if (psk_key) + OPENSSL_free(psk_key); + psk_key = BN_bn2hex(JPAKE_get_shared_key(ctx)); BIO_pop(bconn); @@ -2591,7 +2739,7 @@ void jpake_server_auth(BIO *out, BIO *conn, const char *secret) #endif -#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) +#ifndef OPENSSL_NO_TLSEXT /*- * next_protos_parse parses a comma separated list of strings into a string * in a format suitable for passing to SSL_CTX_set_next_protos_advertised. @@ -2630,8 +2778,106 @@ unsigned char *next_protos_parse(unsigned short *outlen, const char *in) *outlen = len + 1; return out; } -#endif /* !OPENSSL_NO_TLSEXT && - * !OPENSSL_NO_NEXTPROTONEG */ +#endif /* ndef OPENSSL_NO_TLSEXT */ + +void print_cert_checks(BIO *bio, X509 *x, + const char *checkhost, + const char *checkemail, const char *checkip) +{ + if (x == NULL) + return; + if (checkhost) { + BIO_printf(bio, "Hostname %s does%s match certificate\n", + checkhost, X509_check_host(x, checkhost, 0, 0, NULL) == 1 + ? "" : " NOT"); + } + + if (checkemail) { + BIO_printf(bio, "Email %s does%s match certificate\n", + checkemail, X509_check_email(x, checkemail, 0, + 0) ? "" : " NOT"); + } + + if (checkip) { + BIO_printf(bio, "IP %s does%s match certificate\n", + checkip, X509_check_ip_asc(x, checkip, 0) ? "" : " NOT"); + } +} + +/* Get first http URL from a DIST_POINT structure */ + +static const char *get_dp_url(DIST_POINT *dp) +{ + GENERAL_NAMES *gens; + GENERAL_NAME *gen; + int i, gtype; + ASN1_STRING *uri; + if (!dp->distpoint || dp->distpoint->type != 0) + return NULL; + gens = dp->distpoint->name.fullname; + for (i = 0; i < sk_GENERAL_NAME_num(gens); i++) { + gen = sk_GENERAL_NAME_value(gens, i); + uri = GENERAL_NAME_get0_value(gen, >ype); + if (gtype == GEN_URI && ASN1_STRING_length(uri) > 6) { + char *uptr = (char *)ASN1_STRING_data(uri); + if (!strncmp(uptr, "http://", 7)) + return uptr; + } + } + return NULL; +} + +/* + * Look through a CRLDP structure and attempt to find an http URL to + * downloads a CRL from. + */ + +static X509_CRL *load_crl_crldp(STACK_OF(DIST_POINT) *crldp) +{ + int i; + const char *urlptr = NULL; + for (i = 0; i < sk_DIST_POINT_num(crldp); i++) { + DIST_POINT *dp = sk_DIST_POINT_value(crldp, i); + urlptr = get_dp_url(dp); + if (urlptr) + return load_crl(urlptr, FORMAT_HTTP); + } + return NULL; +} + +/* + * Example of downloading CRLs from CRLDP: not usable for real world as it + * always downloads, doesn't support non-blocking I/O and doesn't cache + * anything. + */ + +static STACK_OF(X509_CRL) *crls_http_cb(X509_STORE_CTX *ctx, X509_NAME *nm) +{ + X509 *x; + STACK_OF(X509_CRL) *crls = NULL; + X509_CRL *crl; + STACK_OF(DIST_POINT) *crldp; + x = X509_STORE_CTX_get_current_cert(ctx); + crldp = X509_get_ext_d2i(x, NID_crl_distribution_points, NULL, NULL); + crl = load_crl_crldp(crldp); + sk_DIST_POINT_pop_free(crldp, DIST_POINT_free); + if (!crl) + return NULL; + crls = sk_X509_CRL_new_null(); + sk_X509_CRL_push(crls, crl); + /* Try to download delta CRL */ + crldp = X509_get_ext_d2i(x, NID_freshest_crl, NULL, NULL); + crl = load_crl_crldp(crldp); + sk_DIST_POINT_pop_free(crldp, DIST_POINT_free); + if (crl) + sk_X509_CRL_push(crls, crl); + return crls; +} + +void store_setup_crl_download(X509_STORE *st) +{ + X509_STORE_set_lookup_crls_cb(st, crls_http_cb); +} /* * Platform-specific sections diff --git a/apps/apps.h b/apps/apps.h index 33b293e5b215..8276e708694d 100644 --- a/apps/apps.h +++ b/apps/apps.h @@ -205,7 +205,7 @@ extern BIO *bio_err; # endif # endif -# ifdef OPENSSL_SYSNAME_WIN32 +# if defined(OPENSSL_SYSNAME_WIN32) || defined(OPENSSL_SYSNAME_WINCE) # define openssl_fdset(a,b) FD_SET((unsigned int)a, b) # else # define openssl_fdset(a,b) FD_SET(a, b) @@ -245,6 +245,9 @@ int app_passwd(BIO *err, char *arg1, char *arg2, char **pass1, char **pass2); int add_oid_section(BIO *err, CONF *conf); X509 *load_cert(BIO *err, const char *file, int format, const char *pass, ENGINE *e, const char *cert_descrip); +X509_CRL *load_crl(const char *infile, int format); +int load_cert_crl_http(const char *url, BIO *err, + X509 **pcert, X509_CRL **pcrl); EVP_PKEY *load_key(BIO *err, const char *file, int format, int maybe_stdin, const char *pass, ENGINE *e, const char *key_descrip); EVP_PKEY *load_pubkey(BIO *err, const char *file, int format, int maybe_stdin, @@ -262,8 +265,9 @@ ENGINE *setup_engine(BIO *err, const char *engine, int debug); # ifndef OPENSSL_NO_OCSP OCSP_RESPONSE *process_responder(BIO *err, OCSP_REQUEST *req, - char *host, char *path, char *port, - int use_ssl, STACK_OF(CONF_VALUE) *headers, + const char *host, const char *path, + const char *port, int use_ssl, + const STACK_OF(CONF_VALUE) *headers, int req_timeout); # endif @@ -334,10 +338,15 @@ void jpake_client_auth(BIO *out, BIO *conn, const char *secret); void jpake_server_auth(BIO *out, BIO *conn, const char *secret); # endif -# if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) +# ifndef OPENSSL_NO_TLSEXT unsigned char *next_protos_parse(unsigned short *outlen, const char *in); -# endif /* !OPENSSL_NO_TLSEXT && - * !OPENSSL_NO_NEXTPROTONEG */ +# endif /* ndef OPENSSL_NO_TLSEXT */ + +void print_cert_checks(BIO *bio, X509 *x, + const char *checkhost, + const char *checkemail, const char *checkip); + +void store_setup_crl_download(X509_STORE *st); # define FORMAT_UNDEF 0 # define FORMAT_ASN1 1 @@ -353,6 +362,7 @@ unsigned char *next_protos_parse(unsigned short *outlen, const char *in); # define FORMAT_ASN1RSA 10 /* DER RSAPubicKey format */ # define FORMAT_MSBLOB 11 /* MS Key blob format */ # define FORMAT_PVK 12 /* MS PVK file format */ +# define FORMAT_HTTP 13 /* Download using HTTP */ # define EXT_COPY_NONE 0 # define EXT_COPY_ADD 1 diff --git a/apps/ca.c b/apps/ca.c index 97ad0c1ffe96..3b7336c0466e 100644 --- a/apps/ca.c +++ b/apps/ca.c @@ -479,6 +479,11 @@ int MAIN(int argc, char **argv) goto bad; infile = *(++argv); dorevoke = 1; + } else if (strcmp(*argv, "-valid") == 0) { + if (--argc < 1) + goto bad; + infile = *(++argv); + dorevoke = 2; } else if (strcmp(*argv, "-extensions") == 0) { if (--argc < 1) goto bad; @@ -1441,6 +1446,8 @@ int MAIN(int argc, char **argv) revcert = load_cert(bio_err, infile, FORMAT_PEM, NULL, e, infile); if (revcert == NULL) goto err; + if (dorevoke == 2) + rev_type = -1; j = do_revoke(revcert, db, rev_type, rev_arg); if (j <= 0) goto err; @@ -1968,8 +1975,12 @@ static int do_body(X509 **xret, EVP_PKEY *pkey, X509 *x509, if (enddate == NULL) X509_time_adj_ex(X509_get_notAfter(ret), days, 0, NULL); - else + else { + int tdays; ASN1_TIME_set_string(X509_get_notAfter(ret), enddate); + ASN1_TIME_diff(&tdays, NULL, NULL, X509_get_notAfter(ret)); + days = tdays; + } if (!X509_set_subject_name(ret, subject)) goto err; @@ -2409,13 +2420,20 @@ static int do_revoke(X509 *x509, CA_DB *db, int type, char *value) } /* Revoke Certificate */ - ok = do_revoke(x509, db, type, value); + if (type == -1) + ok = 1; + else + ok = do_revoke(x509, db, type, value); goto err; } else if (index_name_cmp_noconst(row, rrow)) { BIO_printf(bio_err, "ERROR:name does not match %s\n", row[DB_name]); goto err; + } else if (type == -1) { + BIO_printf(bio_err, "ERROR:Already present, serial number %s\n", + row[DB_serial]); + goto err; } else if (rrow[DB_type][0] == 'R') { BIO_printf(bio_err, "ERROR:Already revoked, serial number %s\n", row[DB_serial]); diff --git a/apps/ciphers.c b/apps/ciphers.c index f299175f11e1..66636d2dfd18 100644 --- a/apps/ciphers.c +++ b/apps/ciphers.c @@ -85,6 +85,9 @@ int MAIN(int argc, char **argv) { int ret = 1, i; int verbose = 0, Verbose = 0; +#ifndef OPENSSL_NO_SSL_TRACE + int stdname = 0; +#endif const char **pp; const char *p; int badops = 0; @@ -119,6 +122,10 @@ int MAIN(int argc, char **argv) verbose = 1; else if (strcmp(*argv, "-V") == 0) verbose = Verbose = 1; +#ifndef OPENSSL_NO_SSL_TRACE + else if (strcmp(*argv, "-stdname") == 0) + stdname = verbose = 1; +#endif #ifndef OPENSSL_NO_SSL2 else if (strcmp(*argv, "-ssl2") == 0) meth = SSLv2_client_method(); @@ -202,7 +209,14 @@ int MAIN(int argc, char **argv) id1, id2, id3); } } - +#ifndef OPENSSL_NO_SSL_TRACE + if (stdname) { + const char *nm = SSL_CIPHER_standard_name(c); + if (nm == NULL) + nm = "UNKNOWN"; + BIO_printf(STDout, "%s - ", nm); + } +#endif BIO_puts(STDout, SSL_CIPHER_description(c, buf, sizeof buf)); } } diff --git a/apps/cms.c b/apps/cms.c index d7645c0d62b1..60479374cdf2 100644 --- a/apps/cms.c +++ b/apps/cms.c @@ -75,6 +75,8 @@ static void receipt_request_print(BIO *out, CMS_ContentInfo *cms); static CMS_ReceiptRequest *make_receipt_request(STACK_OF(OPENSSL_STRING) *rr_to, int rr_allorfirst, STACK_OF(OPENSSL_STRING) *rr_from); +static int cms_set_pkey_param(EVP_PKEY_CTX *pctx, + STACK_OF(OPENSSL_STRING) *param); # define SMIME_OP 0x10 # define SMIME_IP 0x20 @@ -98,6 +100,14 @@ static CMS_ReceiptRequest *make_receipt_request(STACK_OF(OPENSSL_STRING) int verify_err = 0; +typedef struct cms_key_param_st cms_key_param; + +struct cms_key_param_st { + int idx; + STACK_OF(OPENSSL_STRING) *param; + cms_key_param *next; +}; + int MAIN(int, char **); int MAIN(int argc, char **argv) @@ -112,7 +122,7 @@ int MAIN(int argc, char **argv) STACK_OF(OPENSSL_STRING) *sksigners = NULL, *skkeys = NULL; char *certfile = NULL, *keyfile = NULL, *contfile = NULL; char *certsoutfile = NULL; - const EVP_CIPHER *cipher = NULL; + const EVP_CIPHER *cipher = NULL, *wrap_cipher = NULL; CMS_ContentInfo *cms = NULL, *rcms = NULL; X509_STORE *store = NULL; X509 *cert = NULL, *recip = NULL, *signer = NULL; @@ -140,6 +150,8 @@ int MAIN(int argc, char **argv) unsigned char *pwri_pass = NULL, *pwri_tmp = NULL; size_t secret_keylen = 0, secret_keyidlen = 0; + cms_key_param *key_first = NULL, *key_param = NULL; + ASN1_OBJECT *econtent_type = NULL; X509_VERIFY_PARAM *vpm = NULL; @@ -201,6 +213,8 @@ int MAIN(int argc, char **argv) cipher = EVP_des_ede3_cbc(); else if (!strcmp(*args, "-des")) cipher = EVP_des_cbc(); + else if (!strcmp(*args, "-des3-wrap")) + wrap_cipher = EVP_des_ede3_wrap(); # endif # ifndef OPENSSL_NO_SEED else if (!strcmp(*args, "-seed")) @@ -221,6 +235,12 @@ int MAIN(int argc, char **argv) cipher = EVP_aes_192_cbc(); else if (!strcmp(*args, "-aes256")) cipher = EVP_aes_256_cbc(); + else if (!strcmp(*args, "-aes128-wrap")) + wrap_cipher = EVP_aes_128_wrap(); + else if (!strcmp(*args, "-aes192-wrap")) + wrap_cipher = EVP_aes_192_wrap(); + else if (!strcmp(*args, "-aes256-wrap")) + wrap_cipher = EVP_aes_256_wrap(); # endif # ifndef OPENSSL_NO_CAMELLIA else if (!strcmp(*args, "-camellia128")) @@ -378,7 +398,17 @@ int MAIN(int argc, char **argv) } else if (!strcmp(*args, "-recip")) { if (!args[1]) goto argerr; - recipfile = *++args; + if (operation == SMIME_ENCRYPT) { + if (!encerts) + encerts = sk_X509_new_null(); + cert = load_cert(bio_err, *++args, FORMAT_PEM, + NULL, e, "recipient certificate file"); + if (!cert) + goto end; + sk_X509_push(encerts, cert); + cert = NULL; + } else + recipfile = *++args; } else if (!strcmp(*args, "-certsout")) { if (!args[1]) goto argerr; @@ -413,6 +443,40 @@ int MAIN(int argc, char **argv) if (!args[1]) goto argerr; keyform = str2fmt(*++args); + } else if (!strcmp(*args, "-keyopt")) { + int keyidx = -1; + if (!args[1]) + goto argerr; + if (operation == SMIME_ENCRYPT) { + if (encerts) + keyidx += sk_X509_num(encerts); + } else { + if (keyfile || signerfile) + keyidx++; + if (skkeys) + keyidx += sk_OPENSSL_STRING_num(skkeys); + } + if (keyidx < 0) { + BIO_printf(bio_err, "No key specified\n"); + goto argerr; + } + if (key_param == NULL || key_param->idx != keyidx) { + cms_key_param *nparam; + nparam = OPENSSL_malloc(sizeof(cms_key_param)); + if (!nparam) { + BIO_printf(bio_err, "Out of memory\n"); + goto argerr; + } + nparam->idx = keyidx; + nparam->param = sk_OPENSSL_STRING_new_null(); + nparam->next = NULL; + if (key_first == NULL) + key_first = nparam; + else + key_param->next = nparam; + key_param = nparam; + } + sk_OPENSSL_STRING_push(key_param->param, *++args); } else if (!strcmp(*args, "-rctform")) { if (!args[1]) goto argerr; @@ -502,7 +566,7 @@ int MAIN(int argc, char **argv) badarg = 1; } } else if (operation == SMIME_ENCRYPT) { - if (!*args && !secret_key && !pwri_pass) { + if (!*args && !secret_key && !pwri_pass && !encerts) { BIO_printf(bio_err, "No recipient(s) certificate(s) specified\n"); badarg = 1; } @@ -567,6 +631,7 @@ int MAIN(int argc, char **argv) "-inkey file input private key (if not signer or recipient)\n"); BIO_printf(bio_err, "-keyform arg input private key format (PEM or ENGINE)\n"); + BIO_printf(bio_err, "-keyopt nm:v set public key parameters\n"); BIO_printf(bio_err, "-out file output file\n"); BIO_printf(bio_err, "-outform arg output format SMIME (default), PEM or DER\n"); @@ -652,7 +717,7 @@ int MAIN(int argc, char **argv) goto end; } - if (*args) + if (*args && !encerts) encerts = sk_X509_new_null(); while (*args) { if (!(cert = load_cert(bio_err, *args, FORMAT_PEM, @@ -804,10 +869,39 @@ int MAIN(int argc, char **argv) } else if (operation == SMIME_COMPRESS) { cms = CMS_compress(in, -1, flags); } else if (operation == SMIME_ENCRYPT) { + int i; flags |= CMS_PARTIAL; - cms = CMS_encrypt(encerts, in, cipher, flags); + cms = CMS_encrypt(NULL, in, cipher, flags); if (!cms) goto end; + for (i = 0; i < sk_X509_num(encerts); i++) { + CMS_RecipientInfo *ri; + cms_key_param *kparam; + int tflags = flags; + X509 *x = sk_X509_value(encerts, i); + for (kparam = key_first; kparam; kparam = kparam->next) { + if (kparam->idx == i) { + tflags |= CMS_KEY_PARAM; + break; + } + } + ri = CMS_add1_recipient_cert(cms, x, tflags); + if (!ri) + goto end; + if (kparam) { + EVP_PKEY_CTX *pctx; + pctx = CMS_RecipientInfo_get0_pkey_ctx(ri); + if (!cms_set_pkey_param(pctx, kparam->param)) + goto end; + } + if (CMS_RecipientInfo_type(ri) == CMS_RECIPINFO_AGREE + && wrap_cipher) { + EVP_CIPHER_CTX *wctx; + wctx = CMS_RecipientInfo_kari_get0_ctx(ri); + EVP_EncryptInit_ex(wctx, wrap_cipher, NULL, NULL, NULL); + } + } + if (secret_key) { if (!CMS_add0_recipient_key(cms, NID_undef, secret_key, secret_keylen, @@ -880,8 +974,11 @@ int MAIN(int argc, char **argv) flags |= CMS_REUSE_DIGEST; for (i = 0; i < sk_OPENSSL_STRING_num(sksigners); i++) { CMS_SignerInfo *si; + cms_key_param *kparam; + int tflags = flags; signerfile = sk_OPENSSL_STRING_value(sksigners, i); keyfile = sk_OPENSSL_STRING_value(skkeys, i); + signer = load_cert(bio_err, signerfile, FORMAT_PEM, NULL, e, "signer certificate"); if (!signer) @@ -890,9 +987,21 @@ int MAIN(int argc, char **argv) "signing key file"); if (!key) goto end; - si = CMS_add1_signer(cms, signer, key, sign_md, flags); + for (kparam = key_first; kparam; kparam = kparam->next) { + if (kparam->idx == i) { + tflags |= CMS_KEY_PARAM; + break; + } + } + si = CMS_add1_signer(cms, signer, key, sign_md, tflags); if (!si) goto end; + if (kparam) { + EVP_PKEY_CTX *pctx; + pctx = CMS_SignerInfo_get0_pkey_ctx(si); + if (!cms_set_pkey_param(pctx, kparam->param)) + goto end; + } if (rr && !CMS_add1_ReceiptRequest(si, rr)) goto end; X509_free(signer); @@ -1047,6 +1156,13 @@ int MAIN(int argc, char **argv) sk_OPENSSL_STRING_free(rr_to); if (rr_from) sk_OPENSSL_STRING_free(rr_from); + for (key_param = key_first; key_param;) { + cms_key_param *tparam; + sk_OPENSSL_STRING_free(key_param->param); + tparam = key_param->next; + OPENSSL_free(key_param); + key_param = tparam; + } X509_STORE_free(store); X509_free(cert); X509_free(recip); @@ -1220,4 +1336,22 @@ static CMS_ReceiptRequest *make_receipt_request(STACK_OF(OPENSSL_STRING) return NULL; } +static int cms_set_pkey_param(EVP_PKEY_CTX *pctx, + STACK_OF(OPENSSL_STRING) *param) +{ + char *keyopt; + int i; + if (sk_OPENSSL_STRING_num(param) <= 0) + return 1; + for (i = 0; i < sk_OPENSSL_STRING_num(param); i++) { + keyopt = sk_OPENSSL_STRING_value(param, i); + if (pkey_ctrl_string(pctx, keyopt) <= 0) { + BIO_printf(bio_err, "parameter error \"%s\"\n", keyopt); + ERR_print_errors(bio_err); + return 0; + } + } + return 1; +} + #endif diff --git a/apps/crl.c b/apps/crl.c index 0a05870ca1fc..c9c3a5f6d196 100644 --- a/apps/crl.c +++ b/apps/crl.c @@ -96,7 +96,6 @@ static const char *crl_usage[] = { NULL }; -static X509_CRL *load_crl(char *file, int format); static BIO *bio_out = NULL; int MAIN(int, char **); @@ -106,10 +105,10 @@ int MAIN(int argc, char **argv) unsigned long nmflag = 0; X509_CRL *x = NULL; char *CAfile = NULL, *CApath = NULL; - int ret = 1, i, num, badops = 0; + int ret = 1, i, num, badops = 0, badsig = 0; BIO *out = NULL; - int informat, outformat; - char *infile = NULL, *outfile = NULL; + int informat, outformat, keyformat; + char *infile = NULL, *outfile = NULL, *crldiff = NULL, *keyfile = NULL; int hash = 0, issuer = 0, lastupdate = 0, nextupdate = 0, noout = 0, text = 0; #ifndef OPENSSL_NO_MD5 @@ -147,6 +146,7 @@ int MAIN(int argc, char **argv) informat = FORMAT_PEM; outformat = FORMAT_PEM; + keyformat = FORMAT_PEM; argc--; argv++; @@ -173,6 +173,18 @@ int MAIN(int argc, char **argv) if (--argc < 1) goto bad; infile = *(++argv); + } else if (strcmp(*argv, "-gendelta") == 0) { + if (--argc < 1) + goto bad; + crldiff = *(++argv); + } else if (strcmp(*argv, "-key") == 0) { + if (--argc < 1) + goto bad; + keyfile = *(++argv); + } else if (strcmp(*argv, "-keyform") == 0) { + if (--argc < 1) + goto bad; + keyformat = str2fmt(*(++argv)); } else if (strcmp(*argv, "-out") == 0) { if (--argc < 1) goto bad; @@ -214,6 +226,8 @@ int MAIN(int argc, char **argv) fingerprint = ++num; else if (strcmp(*argv, "-crlnumber") == 0) crlnumber = ++num; + else if (strcmp(*argv, "-badsig") == 0) + badsig = 1; else if ((md_alg = EVP_get_digestbyname(*argv + 1))) { /* ok */ digest = md_alg; @@ -281,6 +295,33 @@ int MAIN(int argc, char **argv) BIO_printf(bio_err, "verify OK\n"); } + if (crldiff) { + X509_CRL *newcrl, *delta; + if (!keyfile) { + BIO_puts(bio_err, "Missing CRL signing key\n"); + goto end; + } + newcrl = load_crl(crldiff, informat); + if (!newcrl) + goto end; + pkey = load_key(bio_err, keyfile, keyformat, 0, NULL, NULL, + "CRL signing key"); + if (!pkey) { + X509_CRL_free(newcrl); + goto end; + } + delta = X509_CRL_diff(x, newcrl, pkey, digest, 0); + X509_CRL_free(newcrl); + EVP_PKEY_free(pkey); + if (delta) { + X509_CRL_free(x); + x = delta; + } else { + BIO_puts(bio_err, "Error creating delta CRL\n"); + goto end; + } + } + if (num) { for (i = 1; i <= num; i++) { if (issuer == i) { @@ -369,6 +410,9 @@ int MAIN(int argc, char **argv) goto end; } + if (badsig) + x->signature->data[x->signature->length - 1] ^= 0x1; + if (outformat == FORMAT_ASN1) i = (int)i2d_X509_CRL_bio(out, x); else if (outformat == FORMAT_PEM) @@ -383,6 +427,8 @@ int MAIN(int argc, char **argv) } ret = 0; end: + if (ret != 0) + ERR_print_errors(bio_err); BIO_free_all(out); BIO_free_all(bio_out); bio_out = NULL; @@ -394,41 +440,3 @@ int MAIN(int argc, char **argv) apps_shutdown(); OPENSSL_EXIT(ret); } - -static X509_CRL *load_crl(char *infile, int format) -{ - X509_CRL *x = NULL; - BIO *in = NULL; - - in = BIO_new(BIO_s_file()); - if (in == NULL) { - ERR_print_errors(bio_err); - goto end; - } - - if (infile == NULL) - BIO_set_fp(in, stdin, BIO_NOCLOSE); - else { - if (BIO_read_filename(in, infile) <= 0) { - perror(infile); - goto end; - } - } - if (format == FORMAT_ASN1) - x = d2i_X509_CRL_bio(in, NULL); - else if (format == FORMAT_PEM) - x = PEM_read_bio_X509_CRL(in, NULL, NULL, NULL); - else { - BIO_printf(bio_err, "bad input format specified for input crl\n"); - goto end; - } - if (x == NULL) { - BIO_printf(bio_err, "unable to load CRL\n"); - ERR_print_errors(bio_err); - goto end; - } - - end: - BIO_free(in); - return (x); -} diff --git a/apps/dgst.c b/apps/dgst.c index ad2f2348ac09..95e5fa3fc7b7 100644 --- a/apps/dgst.c +++ b/apps/dgst.c @@ -103,7 +103,7 @@ int MAIN(int, char **); int MAIN(int argc, char **argv) { - ENGINE *e = NULL; + ENGINE *e = NULL, *impl = NULL; unsigned char *buf = NULL; int i, err = 1; const EVP_MD *md = NULL, *m; @@ -124,6 +124,7 @@ int MAIN(int argc, char **argv) char *passargin = NULL, *passin = NULL; #ifndef OPENSSL_NO_ENGINE char *engine = NULL; + int engine_impl = 0; #endif char *hmac_key = NULL; char *mac_name = NULL; @@ -199,7 +200,8 @@ int MAIN(int argc, char **argv) break; engine = *(++argv); e = setup_engine(bio_err, engine, 0); - } + } else if (strcmp(*argv, "-engine_impl") == 0) + engine_impl = 1; #endif else if (strcmp(*argv, "-hex") == 0) out_bin = 0; @@ -284,6 +286,10 @@ int MAIN(int argc, char **argv) EVP_MD_do_all_sorted(list_md_fn, bio_err); goto end; } +#ifndef OPENSSL_NO_ENGINE + if (engine_impl) + impl = e; +#endif in = BIO_new(BIO_s_file()); bmd = BIO_new(BIO_f_md()); @@ -357,7 +363,7 @@ int MAIN(int argc, char **argv) if (mac_name) { EVP_PKEY_CTX *mac_ctx = NULL; int r = 0; - if (!init_gen_str(bio_err, &mac_ctx, mac_name, e, 0)) + if (!init_gen_str(bio_err, &mac_ctx, mac_name, impl, 0)) goto mac_end; if (macopts) { char *macopt; @@ -391,7 +397,7 @@ int MAIN(int argc, char **argv) } if (hmac_key) { - sigkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, e, + sigkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, impl, (unsigned char *)hmac_key, -1); if (!sigkey) goto end; @@ -407,9 +413,9 @@ int MAIN(int argc, char **argv) goto end; } if (do_verify) - r = EVP_DigestVerifyInit(mctx, &pctx, md, NULL, sigkey); + r = EVP_DigestVerifyInit(mctx, &pctx, md, impl, sigkey); else - r = EVP_DigestSignInit(mctx, &pctx, md, NULL, sigkey); + r = EVP_DigestSignInit(mctx, &pctx, md, impl, sigkey); if (!r) { BIO_printf(bio_err, "Error setting context\n"); ERR_print_errors(bio_err); @@ -429,9 +435,15 @@ int MAIN(int argc, char **argv) } /* we use md as a filter, reading from 'in' */ else { + EVP_MD_CTX *mctx = NULL; + if (!BIO_get_md_ctx(bmd, &mctx)) { + BIO_printf(bio_err, "Error getting context\n"); + ERR_print_errors(bio_err); + goto end; + } if (md == NULL) md = EVP_md5(); - if (!BIO_set_md(bmd, md)) { + if (!EVP_DigestInit_ex(mctx, md, impl)) { BIO_printf(bio_err, "Error setting digest %s\n", pname); ERR_print_errors(bio_err); goto end; @@ -483,7 +495,8 @@ int MAIN(int argc, char **argv) EVP_PKEY_asn1_get0_info(NULL, NULL, NULL, NULL, &sig_name, ameth); } - md_name = EVP_MD_name(md); + if (md) + md_name = EVP_MD_name(md); } err = 0; for (i = 0; i < argc; i++) { @@ -581,9 +594,12 @@ int do_fp(BIO *out, unsigned char *buf, BIO *bp, int sep, int binout, BIO_printf(out, "%02x", buf[i]); BIO_printf(out, " *%s\n", file); } else { - if (sig_name) - BIO_printf(out, "%s-%s(%s)= ", sig_name, md_name, file); - else if (md_name) + if (sig_name) { + BIO_puts(out, sig_name); + if (md_name) + BIO_printf(out, "-%s", md_name); + BIO_printf(out, "(%s)= ", file); + } else if (md_name) BIO_printf(out, "%s(%s)= ", md_name, file); else BIO_printf(out, "(%s)= ", file); diff --git a/apps/dhparam.c b/apps/dhparam.c index d3b6d58ad2e5..57199a8d2ad8 100644 --- a/apps/dhparam.c +++ b/apps/dhparam.c @@ -489,9 +489,12 @@ int MAIN(int argc, char **argv) if (!noout) { if (outformat == FORMAT_ASN1) i = i2d_DHparams_bio(out, dh); - else if (outformat == FORMAT_PEM) - i = PEM_write_bio_DHparams(out, dh); - else { + else if (outformat == FORMAT_PEM) { + if (dh->q) + i = PEM_write_bio_DHxparams(out, dh); + else + i = PEM_write_bio_DHparams(out, dh); + } else { BIO_printf(bio_err, "bad output format specified for outfile\n"); goto end; } diff --git a/apps/ecparam.c b/apps/ecparam.c index 1f340a9b84e0..06ac77b838a3 100644 --- a/apps/ecparam.c +++ b/apps/ecparam.c @@ -370,6 +370,9 @@ int MAIN(int argc, char **argv) } else nid = OBJ_sn2nid(curve_name); + if (nid == 0) + nid = EC_curve_nist2nid(curve_name); + if (nid == 0) { BIO_printf(bio_err, "unknown curve name (%s)\n", curve_name); goto end; diff --git a/apps/genrsa.c b/apps/genrsa.c index 2eabadcc6b28..91e6550a5767 100644 --- a/apps/genrsa.c +++ b/apps/genrsa.c @@ -80,7 +80,7 @@ # include <openssl/pem.h> # include <openssl/rand.h> -# define DEFBITS 1024 +# define DEFBITS 2048 # undef PROG # define PROG genrsa_main diff --git a/apps/ocsp.c b/apps/ocsp.c index 572f0643e13c..926083dd1b5c 100644 --- a/apps/ocsp.c +++ b/apps/ocsp.c @@ -110,16 +110,17 @@ static int print_ocsp_summary(BIO *out, OCSP_BASICRESP *bs, OCSP_REQUEST *req, static int make_ocsp_response(OCSP_RESPONSE **resp, OCSP_REQUEST *req, CA_DB *db, X509 *ca, X509 *rcert, - EVP_PKEY *rkey, STACK_OF(X509) *rother, - unsigned long flags, int nmin, int ndays); + EVP_PKEY *rkey, const EVP_MD *md, + STACK_OF(X509) *rother, unsigned long flags, + int nmin, int ndays, int badsig); static char **lookup_serial(CA_DB *db, ASN1_INTEGER *ser); -static BIO *init_responder(char *port); +static BIO *init_responder(const char *port); static int do_responder(OCSP_REQUEST **preq, BIO **pcbio, BIO *acbio, - char *port); + const char *port); static int send_ocsp_response(BIO *cbio, OCSP_RESPONSE *resp); -static OCSP_RESPONSE *query_responder(BIO *err, BIO *cbio, char *path, - STACK_OF(CONF_VALUE) *headers, +static OCSP_RESPONSE *query_responder(BIO *err, BIO *cbio, const char *path, + const STACK_OF(CONF_VALUE) *headers, OCSP_REQUEST *req, int req_timeout); # undef PROG @@ -154,12 +155,14 @@ int MAIN(int argc, char **argv) long nsec = MAX_VALIDITY_PERIOD, maxage = -1; char *CAfile = NULL, *CApath = NULL; X509_STORE *store = NULL; + X509_VERIFY_PARAM *vpm = NULL; STACK_OF(X509) *sign_other = NULL, *verify_other = NULL, *rother = NULL; char *sign_certfile = NULL, *verify_certfile = NULL, *rcertfile = NULL; unsigned long sign_flags = 0, verify_flags = 0, rflags = 0; int ret = 1; int accept_count = -1; int badarg = 0; + int badsig = 0; int i; int ignore_err = 0; STACK_OF(OPENSSL_STRING) *reqnames = NULL; @@ -170,7 +173,7 @@ int MAIN(int argc, char **argv) char *rca_filename = NULL; CA_DB *rdb = NULL; int nmin = 0, ndays = -1; - const EVP_MD *cert_id_md = NULL; + const EVP_MD *cert_id_md = NULL, *rsign_md = NULL; if (bio_err == NULL) bio_err = BIO_new_fp(stderr, BIO_NOCLOSE); @@ -206,6 +209,7 @@ int MAIN(int argc, char **argv) OPENSSL_free(tport); if (tpath) OPENSSL_free(tpath); + thost = tport = tpath = NULL; if (args[1]) { args++; if (!OCSP_parse_url(*args, &host, &port, &path, &use_ssl)) { @@ -264,6 +268,8 @@ int MAIN(int argc, char **argv) verify_flags |= OCSP_TRUSTOTHER; else if (!strcmp(*args, "-no_intern")) verify_flags |= OCSP_NOINTERN; + else if (!strcmp(*args, "-badsig")) + badsig = 1; else if (!strcmp(*args, "-text")) { req_text = 1; resp_text = 1; @@ -320,6 +326,10 @@ int MAIN(int argc, char **argv) CApath = *args; } else badarg = 1; + } else if (args_verify(&args, NULL, &badarg, bio_err, &vpm)) { + if (badarg) + goto end; + continue; } else if (!strcmp(*args, "-validity_period")) { if (args[1]) { args++; @@ -465,6 +475,14 @@ int MAIN(int argc, char **argv) rcertfile = *args; } else badarg = 1; + } else if (!strcmp(*args, "-rmd")) { + if (args[1]) { + args++; + rsign_md = EVP_get_digestbyname(*args); + if (!rsign_md) + badarg = 1; + } else + badarg = 1; } else if ((cert_id_md = EVP_get_digestbyname((*args) + 1)) == NULL) { badarg = 1; } @@ -584,7 +602,10 @@ int MAIN(int argc, char **argv) add_nonce = 0; if (!req && reqin) { - derbio = BIO_new_file(reqin, "rb"); + if (!strcmp(reqin, "-")) + derbio = BIO_new_fp(stdin, BIO_NOCLOSE); + else + derbio = BIO_new_file(reqin, "rb"); if (!derbio) { BIO_printf(bio_err, "Error Opening OCSP request file\n"); goto end; @@ -681,7 +702,10 @@ int MAIN(int argc, char **argv) OCSP_REQUEST_print(out, req, 0); if (reqout) { - derbio = BIO_new_file(reqout, "wb"); + if (!strcmp(reqout, "-")) + derbio = BIO_new_fp(stdout, BIO_NOCLOSE); + else + derbio = BIO_new_file(reqout, "wb"); if (!derbio) { BIO_printf(bio_err, "Error opening file %s\n", reqout); goto end; @@ -706,7 +730,7 @@ int MAIN(int argc, char **argv) if (rdb) { i = make_ocsp_response(&resp, req, rdb, rca_cert, rsigner, rkey, - rother, rflags, nmin, ndays); + rsign_md, rother, rflags, nmin, ndays, badsig); if (cbio) send_ocsp_response(cbio, resp); } else if (host) { @@ -721,7 +745,10 @@ int MAIN(int argc, char **argv) goto end; # endif } else if (respin) { - derbio = BIO_new_file(respin, "rb"); + if (!strcmp(respin, "-")) + derbio = BIO_new_fp(stdin, BIO_NOCLOSE); + else + derbio = BIO_new_file(respin, "rb"); if (!derbio) { BIO_printf(bio_err, "Error Opening OCSP response file\n"); goto end; @@ -741,7 +768,10 @@ int MAIN(int argc, char **argv) done_resp: if (respout) { - derbio = BIO_new_file(respout, "wb"); + if (!strcmp(respout, "-")) + derbio = BIO_new_fp(stdout, BIO_NOCLOSE); + else + derbio = BIO_new_file(respout, "wb"); if (!derbio) { BIO_printf(bio_err, "Error opening file %s\n", respout); goto end; @@ -778,6 +808,10 @@ int MAIN(int argc, char **argv) resp = NULL; goto redo_accept; } + ret = 0; + goto end; + } else if (ridx_filename) { + ret = 0; goto end; } @@ -785,6 +819,8 @@ int MAIN(int argc, char **argv) store = setup_verify(bio_err, CAfile, CApath); if (!store) goto end; + if (vpm) + X509_STORE_set1_param(store, vpm); if (verify_certfile) { verify_other = load_certs(bio_err, verify_certfile, FORMAT_PEM, NULL, e, "validator certificate"); @@ -799,37 +835,38 @@ int MAIN(int argc, char **argv) goto end; } + ret = 0; + if (!noverify) { if (req && ((i = OCSP_check_nonce(req, bs)) <= 0)) { if (i == -1) BIO_printf(bio_err, "WARNING: no nonce in response\n"); else { BIO_printf(bio_err, "Nonce Verify error\n"); + ret = 1; goto end; } } i = OCSP_basic_verify(bs, verify_other, store, verify_flags); - if (i < 0) - i = OCSP_basic_verify(bs, NULL, store, 0); - if (i <= 0) { BIO_printf(bio_err, "Response Verify Failure\n"); ERR_print_errors(bio_err); + ret = 1; } else BIO_printf(bio_err, "Response verify OK\n"); } if (!print_ocsp_summary(out, bs, req, reqnames, ids, nsec, maxage)) - goto end; - - ret = 0; + ret = 1; end: ERR_print_errors(bio_err); X509_free(signer); X509_STORE_free(store); + if (vpm) + X509_VERIFY_PARAM_free(vpm); EVP_PKEY_free(key); EVP_PKEY_free(rkey); X509_free(issuer); @@ -984,8 +1021,9 @@ static int print_ocsp_summary(BIO *out, OCSP_BASICRESP *bs, OCSP_REQUEST *req, static int make_ocsp_response(OCSP_RESPONSE **resp, OCSP_REQUEST *req, CA_DB *db, X509 *ca, X509 *rcert, - EVP_PKEY *rkey, STACK_OF(X509) *rother, - unsigned long flags, int nmin, int ndays) + EVP_PKEY *rkey, const EVP_MD *rmd, + STACK_OF(X509) *rother, unsigned long flags, + int nmin, int ndays, int badsig) { ASN1_TIME *thisupd = NULL, *nextupd = NULL; OCSP_CERTID *cid, *ca_id = NULL; @@ -1069,7 +1107,10 @@ static int make_ocsp_response(OCSP_RESPONSE **resp, OCSP_REQUEST *req, OCSP_copy_nonce(bs, req); - OCSP_basic_sign(bs, rcert, rkey, NULL, rother, flags); + OCSP_basic_sign(bs, rcert, rkey, rmd, rother, flags); + + if (badsig) + bs->signature->data[bs->signature->length - 1] ^= 0x1; *resp = OCSP_response_create(OCSP_RESPONSE_STATUS_SUCCESSFUL, bs); @@ -1105,7 +1146,7 @@ static char **lookup_serial(CA_DB *db, ASN1_INTEGER *ser) /* Quick and dirty OCSP server: read in and parse input request */ -static BIO *init_responder(char *port) +static BIO *init_responder(const char *port) { BIO *acbio = NULL, *bufbio = NULL; bufbio = BIO_new(BIO_f_buffer()); @@ -1137,7 +1178,7 @@ static BIO *init_responder(char *port) } static int do_responder(OCSP_REQUEST **preq, BIO **pcbio, BIO *acbio, - char *port) + const char *port) { int have_post = 0, len; OCSP_REQUEST *req = NULL; @@ -1198,8 +1239,8 @@ static int send_ocsp_response(BIO *cbio, OCSP_RESPONSE *resp) return 1; } -static OCSP_RESPONSE *query_responder(BIO *err, BIO *cbio, char *path, - STACK_OF(CONF_VALUE) *headers, +static OCSP_RESPONSE *query_responder(BIO *err, BIO *cbio, const char *path, + const STACK_OF(CONF_VALUE) *headers, OCSP_REQUEST *req, int req_timeout) { int fd; @@ -1286,8 +1327,9 @@ static OCSP_RESPONSE *query_responder(BIO *err, BIO *cbio, char *path, } OCSP_RESPONSE *process_responder(BIO *err, OCSP_REQUEST *req, - char *host, char *path, char *port, - int use_ssl, STACK_OF(CONF_VALUE) *headers, + const char *host, const char *path, + const char *port, int use_ssl, + const STACK_OF(CONF_VALUE) *headers, int req_timeout) { BIO *cbio = NULL; diff --git a/apps/openssl.cnf b/apps/openssl.cnf index 18760c6e673d..1eb86c401263 100644 --- a/apps/openssl.cnf +++ b/apps/openssl.cnf @@ -103,7 +103,7 @@ emailAddress = optional #################################################################### [ req ] -default_bits = 1024 +default_bits = 2048 default_keyfile = privkey.pem distinguished_name = req_distinguished_name attributes = req_attributes diff --git a/apps/pkcs8.c b/apps/pkcs8.c index 5c7290ed09bc..5099e18417e9 100644 --- a/apps/pkcs8.c +++ b/apps/pkcs8.c @@ -124,6 +124,16 @@ int MAIN(int argc, char **argv) } } else badarg = 1; + } else if (!strcmp(*args, "-v2prf")) { + if (args[1]) { + args++; + pbe_nid = OBJ_txt2nid(*args); + if (!EVP_PBE_find(EVP_PBE_TYPE_PRF, pbe_nid, NULL, NULL, 0)) { + BIO_printf(bio_err, "Unknown PRF algorithm %s\n", *args); + badarg = 1; + } + } else + badarg = 1; } else if (!strcmp(*args, "-inform")) { if (args[1]) { args++; diff --git a/apps/s_apps.h b/apps/s_apps.h index 2769b899ed35..5b54bfdc4e1c 100644 --- a/apps/s_apps.h +++ b/apps/s_apps.h @@ -152,15 +152,21 @@ typedef fd_mask fd_set; #define PROTOCOL "tcp" int do_server(int port, int type, int *ret, - int (*cb) (char *hostname, int s, unsigned char *context), - unsigned char *context); + int (*cb) (char *hostname, int s, int stype, + unsigned char *context), unsigned char *context, + int naccept); #ifdef HEADER_X509_H int MS_CALLBACK verify_callback(int ok, X509_STORE_CTX *ctx); #endif #ifdef HEADER_SSL_H int set_cert_stuff(SSL_CTX *ctx, char *cert_file, char *key_file); -int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key); +int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key, + STACK_OF(X509) *chain, int build_chain); +int ssl_print_sigalgs(BIO *out, SSL *s); +int ssl_print_point_formats(BIO *out, SSL *s); +int ssl_print_curves(BIO *out, SSL *s, int noshared); #endif +int ssl_print_tmp_key(BIO *out, SSL *s); int init_client(int *sock, char *server, int port, int type); int should_retry(int i); int extract_port(char *str, short *port_ptr); @@ -182,3 +188,24 @@ int MS_CALLBACK generate_cookie_callback(SSL *ssl, unsigned char *cookie, unsigned int *cookie_len); int MS_CALLBACK verify_cookie_callback(SSL *ssl, unsigned char *cookie, unsigned int cookie_len); + +typedef struct ssl_excert_st SSL_EXCERT; + +void ssl_ctx_set_excert(SSL_CTX *ctx, SSL_EXCERT *exc); +void ssl_excert_free(SSL_EXCERT *exc); +int args_excert(char ***pargs, int *pargc, + int *badarg, BIO *err, SSL_EXCERT **pexc); +int load_excert(SSL_EXCERT **pexc, BIO *err); +void print_ssl_summary(BIO *bio, SSL *s); +#ifdef HEADER_SSL_H +int args_ssl(char ***pargs, int *pargc, SSL_CONF_CTX *cctx, + int *badarg, BIO *err, STACK_OF(OPENSSL_STRING) **pstr); +int args_ssl_call(SSL_CTX *ctx, BIO *err, SSL_CONF_CTX *cctx, + STACK_OF(OPENSSL_STRING) *str, int no_ecdhe, int no_jpake); +int ssl_ctx_add_crls(SSL_CTX *ctx, STACK_OF(X509_CRL) *crls, + int crl_download); +int ssl_load_stores(SSL_CTX *ctx, const char *vfyCApath, + const char *vfyCAfile, const char *chCApath, + const char *chCAfile, STACK_OF(X509_CRL) *crls, + int crl_download); +#endif diff --git a/apps/s_cb.c b/apps/s_cb.c index fabf9cfb2222..dd3aa74e02af 100644 --- a/apps/s_cb.c +++ b/apps/s_cb.c @@ -111,7 +111,7 @@ #include <stdio.h> #include <stdlib.h> -#include <string.h> /* for memcpy() */ +#include <string.h> /* for memcpy() and strcmp() */ #define USE_SOCKETS #define NON_MAIN #include "apps.h" @@ -126,6 +126,7 @@ #define COOKIE_SECRET_LENGTH 16 int verify_depth = 0; +int verify_quiet = 0; int verify_error = X509_V_OK; int verify_return_error = 0; unsigned char cookie_secret[COOKIE_SECRET_LENGTH]; @@ -140,13 +141,16 @@ int MS_CALLBACK verify_callback(int ok, X509_STORE_CTX *ctx) err = X509_STORE_CTX_get_error(ctx); depth = X509_STORE_CTX_get_error_depth(ctx); - BIO_printf(bio_err, "depth=%d ", depth); - if (err_cert) { - X509_NAME_print_ex(bio_err, X509_get_subject_name(err_cert), - 0, XN_FLAG_ONELINE); - BIO_puts(bio_err, "\n"); - } else - BIO_puts(bio_err, "<no cert>\n"); + if (!verify_quiet || !ok) { + BIO_printf(bio_err, "depth=%d ", depth); + if (err_cert) { + X509_NAME_print_ex(bio_err, + X509_get_subject_name(err_cert), + 0, XN_FLAG_ONELINE); + BIO_puts(bio_err, "\n"); + } else + BIO_puts(bio_err, "<no cert>\n"); + } if (!ok) { BIO_printf(bio_err, "verify error:num=%d:%s\n", err, X509_verify_cert_error_string(err)); @@ -179,13 +183,14 @@ int MS_CALLBACK verify_callback(int ok, X509_STORE_CTX *ctx) BIO_printf(bio_err, "\n"); break; case X509_V_ERR_NO_EXPLICIT_POLICY: - policies_print(bio_err, ctx); + if (!verify_quiet) + policies_print(bio_err, ctx); break; } - if (err == X509_V_OK && ok == 2) + if (err == X509_V_OK && ok == 2 && !verify_quiet) policies_print(bio_err, ctx); - - BIO_printf(bio_err, "verify return:%d\n", ok); + if (ok && !verify_quiet) + BIO_printf(bio_err, "verify return:%d\n", ok); return (ok); } @@ -246,8 +251,10 @@ int set_cert_stuff(SSL_CTX *ctx, char *cert_file, char *key_file) return (1); } -int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key) +int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key, + STACK_OF(X509) *chain, int build_chain) { + int chflags = chain ? SSL_BUILD_CHAIN_FLAG_CHECK : 0; if (cert == NULL) return 1; if (SSL_CTX_use_certificate(ctx, cert) <= 0) { @@ -255,6 +262,7 @@ int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key) ERR_print_errors(bio_err); return 0; } + if (SSL_CTX_use_PrivateKey(ctx, key) <= 0) { BIO_printf(bio_err, "error setting private key\n"); ERR_print_errors(bio_err); @@ -269,6 +277,263 @@ int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key) "Private key does not match the certificate public key\n"); return 0; } + if (chain && !SSL_CTX_set1_chain(ctx, chain)) { + BIO_printf(bio_err, "error setting certificate chain\n"); + ERR_print_errors(bio_err); + return 0; + } + if (build_chain && !SSL_CTX_build_cert_chain(ctx, chflags)) { + BIO_printf(bio_err, "error building certificate chain\n"); + ERR_print_errors(bio_err); + return 0; + } + return 1; +} + +static void ssl_print_client_cert_types(BIO *bio, SSL *s) +{ + const unsigned char *p; + int i; + int cert_type_num = SSL_get0_certificate_types(s, &p); + if (!cert_type_num) + return; + BIO_puts(bio, "Client Certificate Types: "); + for (i = 0; i < cert_type_num; i++) { + unsigned char cert_type = p[i]; + char *cname; + switch (cert_type) { + case TLS_CT_RSA_SIGN: + cname = "RSA sign"; + break; + + case TLS_CT_DSS_SIGN: + cname = "DSA sign"; + break; + + case TLS_CT_RSA_FIXED_DH: + cname = "RSA fixed DH"; + break; + + case TLS_CT_DSS_FIXED_DH: + cname = "DSS fixed DH"; + break; + + case TLS_CT_ECDSA_SIGN: + cname = "ECDSA sign"; + break; + + case TLS_CT_RSA_FIXED_ECDH: + cname = "RSA fixed ECDH"; + break; + + case TLS_CT_ECDSA_FIXED_ECDH: + cname = "ECDSA fixed ECDH"; + break; + + case TLS_CT_GOST94_SIGN: + cname = "GOST94 Sign"; + break; + + case TLS_CT_GOST01_SIGN: + cname = "GOST01 Sign"; + break; + + default: + cname = NULL; + } + + if (i) + BIO_puts(bio, ", "); + + if (cname) + BIO_puts(bio, cname); + else + BIO_printf(bio, "UNKNOWN (%d),", cert_type); + } + BIO_puts(bio, "\n"); +} + +static int do_print_sigalgs(BIO *out, SSL *s, int shared) +{ + int i, nsig, client; + client = SSL_is_server(s) ? 0 : 1; + if (shared) + nsig = SSL_get_shared_sigalgs(s, -1, NULL, NULL, NULL, NULL, NULL); + else + nsig = SSL_get_sigalgs(s, -1, NULL, NULL, NULL, NULL, NULL); + if (nsig == 0) + return 1; + + if (shared) + BIO_puts(out, "Shared "); + + if (client) + BIO_puts(out, "Requested "); + BIO_puts(out, "Signature Algorithms: "); + for (i = 0; i < nsig; i++) { + int hash_nid, sign_nid; + unsigned char rhash, rsign; + const char *sstr = NULL; + if (shared) + SSL_get_shared_sigalgs(s, i, &sign_nid, &hash_nid, NULL, + &rsign, &rhash); + else + SSL_get_sigalgs(s, i, &sign_nid, &hash_nid, NULL, &rsign, &rhash); + if (i) + BIO_puts(out, ":"); + if (sign_nid == EVP_PKEY_RSA) + sstr = "RSA"; + else if (sign_nid == EVP_PKEY_DSA) + sstr = "DSA"; + else if (sign_nid == EVP_PKEY_EC) + sstr = "ECDSA"; + if (sstr) + BIO_printf(out, "%s+", sstr); + else + BIO_printf(out, "0x%02X+", (int)rsign); + if (hash_nid != NID_undef) + BIO_printf(out, "%s", OBJ_nid2sn(hash_nid)); + else + BIO_printf(out, "0x%02X", (int)rhash); + } + BIO_puts(out, "\n"); + return 1; +} + +int ssl_print_sigalgs(BIO *out, SSL *s) +{ + int mdnid; + if (!SSL_is_server(s)) + ssl_print_client_cert_types(out, s); + do_print_sigalgs(out, s, 0); + do_print_sigalgs(out, s, 1); + if (SSL_get_peer_signature_nid(s, &mdnid)) + BIO_printf(out, "Peer signing digest: %s\n", OBJ_nid2sn(mdnid)); + return 1; +} + +#ifndef OPENSSL_NO_EC +int ssl_print_point_formats(BIO *out, SSL *s) +{ + int i, nformats; + const char *pformats; + nformats = SSL_get0_ec_point_formats(s, &pformats); + if (nformats <= 0) + return 1; + BIO_puts(out, "Supported Elliptic Curve Point Formats: "); + for (i = 0; i < nformats; i++, pformats++) { + if (i) + BIO_puts(out, ":"); + switch (*pformats) { + case TLSEXT_ECPOINTFORMAT_uncompressed: + BIO_puts(out, "uncompressed"); + break; + + case TLSEXT_ECPOINTFORMAT_ansiX962_compressed_prime: + BIO_puts(out, "ansiX962_compressed_prime"); + break; + + case TLSEXT_ECPOINTFORMAT_ansiX962_compressed_char2: + BIO_puts(out, "ansiX962_compressed_char2"); + break; + + default: + BIO_printf(out, "unknown(%d)", (int)*pformats); + break; + + } + } + if (nformats <= 0) + BIO_puts(out, "NONE"); + BIO_puts(out, "\n"); + return 1; +} + +int ssl_print_curves(BIO *out, SSL *s, int noshared) +{ + int i, ncurves, *curves, nid; + const char *cname; + ncurves = SSL_get1_curves(s, NULL); + if (ncurves <= 0) + return 1; + curves = OPENSSL_malloc(ncurves * sizeof(int)); + if (!curves) { + BIO_puts(out, "Malloc error getting supported curves\n"); + return 0; + } + SSL_get1_curves(s, curves); + + + BIO_puts(out, "Supported Elliptic Curves: "); + for (i = 0; i < ncurves; i++) { + if (i) + BIO_puts(out, ":"); + nid = curves[i]; + /* If unrecognised print out hex version */ + if (nid & TLSEXT_nid_unknown) + BIO_printf(out, "0x%04X", nid & 0xFFFF); + else { + /* Use NIST name for curve if it exists */ + cname = EC_curve_nid2nist(nid); + if (!cname) + cname = OBJ_nid2sn(nid); + BIO_printf(out, "%s", cname); + } + } + if (ncurves == 0) + BIO_puts(out, "NONE"); + OPENSSL_free(curves); + if (noshared) { + BIO_puts(out, "\n"); + return 1; + } + BIO_puts(out, "\nShared Elliptic curves: "); + ncurves = SSL_get_shared_curve(s, -1); + for (i = 0; i < ncurves; i++) { + if (i) + BIO_puts(out, ":"); + nid = SSL_get_shared_curve(s, i); + cname = EC_curve_nid2nist(nid); + if (!cname) + cname = OBJ_nid2sn(nid); + BIO_printf(out, "%s", cname); + } + if (ncurves == 0) + BIO_puts(out, "NONE"); + BIO_puts(out, "\n"); + return 1; +} +#endif +int ssl_print_tmp_key(BIO *out, SSL *s) +{ + EVP_PKEY *key; + if (!SSL_get_server_tmp_key(s, &key)) + return 1; + BIO_puts(out, "Server Temp Key: "); + switch (EVP_PKEY_id(key)) { + case EVP_PKEY_RSA: + BIO_printf(out, "RSA, %d bits\n", EVP_PKEY_bits(key)); + break; + + case EVP_PKEY_DH: + BIO_printf(out, "DH, %d bits\n", EVP_PKEY_bits(key)); + break; +#ifndef OPENSSL_NO_ECDH + case EVP_PKEY_EC: + { + EC_KEY *ec = EVP_PKEY_get1_EC_KEY(key); + int nid; + const char *cname; + nid = EC_GROUP_get_curve_name(EC_KEY_get0_group(ec)); + EC_KEY_free(ec); + cname = EC_curve_nid2nist(nid); + if (!cname) + cname = OBJ_nid2sn(nid); + BIO_printf(out, "ECDH, %s, %d bits\n", cname, EVP_PKEY_bits(key)); + } +#endif + } + EVP_PKEY_free(key); return 1; } @@ -884,3 +1149,504 @@ int MS_CALLBACK verify_cookie_callback(SSL *ssl, unsigned char *cookie, return 0; } + +/* + * Example of extended certificate handling. Where the standard support of + * one certificate per algorithm is not sufficient an application can decide + * which certificate(s) to use at runtime based on whatever criteria it deems + * appropriate. + */ + +/* Linked list of certificates, keys and chains */ +struct ssl_excert_st { + int certform; + const char *certfile; + int keyform; + const char *keyfile; + const char *chainfile; + X509 *cert; + EVP_PKEY *key; + STACK_OF(X509) *chain; + int build_chain; + struct ssl_excert_st *next, *prev; +}; + +struct chain_flags { + int flag; + const char *name; +}; + +struct chain_flags chain_flags_list[] = { + {CERT_PKEY_VALID, "Overall Validity"}, + {CERT_PKEY_SIGN, "Sign with EE key"}, + {CERT_PKEY_EE_SIGNATURE, "EE signature"}, + {CERT_PKEY_CA_SIGNATURE, "CA signature"}, + {CERT_PKEY_EE_PARAM, "EE key parameters"}, + {CERT_PKEY_CA_PARAM, "CA key parameters"}, + {CERT_PKEY_EXPLICIT_SIGN, "Explicity sign with EE key"}, + {CERT_PKEY_ISSUER_NAME, "Issuer Name"}, + {CERT_PKEY_CERT_TYPE, "Certificate Type"}, + {0, NULL} +}; + +static void print_chain_flags(BIO *out, SSL *s, int flags) +{ + struct chain_flags *ctmp = chain_flags_list; + while (ctmp->name) { + BIO_printf(out, "\t%s: %s\n", ctmp->name, + flags & ctmp->flag ? "OK" : "NOT OK"); + ctmp++; + } + BIO_printf(out, "\tSuite B: "); + if (SSL_set_cert_flags(s, 0) & SSL_CERT_FLAG_SUITEB_128_LOS) + BIO_puts(out, flags & CERT_PKEY_SUITEB ? "OK\n" : "NOT OK\n"); + else + BIO_printf(out, "not tested\n"); +} + +/* + * Very basic selection callback: just use any certificate chain reported as + * valid. More sophisticated could prioritise according to local policy. + */ +static int set_cert_cb(SSL *ssl, void *arg) +{ + int i, rv; + SSL_EXCERT *exc = arg; +#ifdef CERT_CB_TEST_RETRY + static int retry_cnt; + if (retry_cnt < 5) { + retry_cnt++; + fprintf(stderr, "Certificate callback retry test: count %d\n", + retry_cnt); + return -1; + } +#endif + SSL_certs_clear(ssl); + + if (!exc) + return 1; + + /* + * Go to end of list and traverse backwards since we prepend newer + * entries this retains the original order. + */ + while (exc->next) + exc = exc->next; + + i = 0; + + while (exc) { + i++; + rv = SSL_check_chain(ssl, exc->cert, exc->key, exc->chain); + BIO_printf(bio_err, "Checking cert chain %d:\nSubject: ", i); + X509_NAME_print_ex(bio_err, X509_get_subject_name(exc->cert), 0, + XN_FLAG_ONELINE); + BIO_puts(bio_err, "\n"); + + print_chain_flags(bio_err, ssl, rv); + if (rv & CERT_PKEY_VALID) { + SSL_use_certificate(ssl, exc->cert); + SSL_use_PrivateKey(ssl, exc->key); + /* + * NB: we wouldn't normally do this as it is not efficient + * building chains on each connection better to cache the chain + * in advance. + */ + if (exc->build_chain) { + if (!SSL_build_cert_chain(ssl, 0)) + return 0; + } else if (exc->chain) + SSL_set1_chain(ssl, exc->chain); + } + exc = exc->prev; + } + return 1; +} + +void ssl_ctx_set_excert(SSL_CTX *ctx, SSL_EXCERT *exc) +{ + SSL_CTX_set_cert_cb(ctx, set_cert_cb, exc); +} + +static int ssl_excert_prepend(SSL_EXCERT **pexc) +{ + SSL_EXCERT *exc; + exc = OPENSSL_malloc(sizeof(SSL_EXCERT)); + if (!exc) + return 0; + exc->certfile = NULL; + exc->keyfile = NULL; + exc->chainfile = NULL; + exc->cert = NULL; + exc->key = NULL; + exc->chain = NULL; + exc->prev = NULL; + exc->build_chain = 0; + + exc->next = *pexc; + *pexc = exc; + + if (exc->next) { + exc->certform = exc->next->certform; + exc->keyform = exc->next->keyform; + exc->next->prev = exc; + } else { + exc->certform = FORMAT_PEM; + exc->keyform = FORMAT_PEM; + } + return 1; + +} + +void ssl_excert_free(SSL_EXCERT *exc) +{ + SSL_EXCERT *curr; + while (exc) { + if (exc->cert) + X509_free(exc->cert); + if (exc->key) + EVP_PKEY_free(exc->key); + if (exc->chain) + sk_X509_pop_free(exc->chain, X509_free); + curr = exc; + exc = exc->next; + OPENSSL_free(curr); + } +} + +int load_excert(SSL_EXCERT **pexc, BIO *err) +{ + SSL_EXCERT *exc = *pexc; + if (!exc) + return 1; + /* If nothing in list, free and set to NULL */ + if (!exc->certfile && !exc->next) { + ssl_excert_free(exc); + *pexc = NULL; + return 1; + } + for (; exc; exc = exc->next) { + if (!exc->certfile) { + BIO_printf(err, "Missing filename\n"); + return 0; + } + exc->cert = load_cert(err, exc->certfile, exc->certform, + NULL, NULL, "Server Certificate"); + if (!exc->cert) + return 0; + if (exc->keyfile) { + exc->key = load_key(err, exc->keyfile, exc->keyform, + 0, NULL, NULL, "Server Key"); + } else { + exc->key = load_key(err, exc->certfile, exc->certform, + 0, NULL, NULL, "Server Key"); + } + if (!exc->key) + return 0; + if (exc->chainfile) { + exc->chain = load_certs(err, + exc->chainfile, FORMAT_PEM, + NULL, NULL, "Server Chain"); + if (!exc->chain) + return 0; + } + } + return 1; +} + +int args_excert(char ***pargs, int *pargc, + int *badarg, BIO *err, SSL_EXCERT **pexc) +{ + char *arg = **pargs, *argn = (*pargs)[1]; + SSL_EXCERT *exc = *pexc; + int narg = 2; + if (!exc) { + if (ssl_excert_prepend(&exc)) + *pexc = exc; + else { + BIO_printf(err, "Error initialising xcert\n"); + *badarg = 1; + goto err; + } + } + if (strcmp(arg, "-xcert") == 0) { + if (!argn) { + *badarg = 1; + return 1; + } + if (exc->certfile && !ssl_excert_prepend(&exc)) { + BIO_printf(err, "Error adding xcert\n"); + *badarg = 1; + goto err; + } + exc->certfile = argn; + } else if (strcmp(arg, "-xkey") == 0) { + if (!argn) { + *badarg = 1; + return 1; + } + if (exc->keyfile) { + BIO_printf(err, "Key already specified\n"); + *badarg = 1; + return 1; + } + exc->keyfile = argn; + } else if (strcmp(arg, "-xchain") == 0) { + if (!argn) { + *badarg = 1; + return 1; + } + if (exc->chainfile) { + BIO_printf(err, "Chain already specified\n"); + *badarg = 1; + return 1; + } + exc->chainfile = argn; + } else if (strcmp(arg, "-xchain_build") == 0) { + narg = 1; + exc->build_chain = 1; + } else if (strcmp(arg, "-xcertform") == 0) { + if (!argn) { + *badarg = 1; + goto err; + } + exc->certform = str2fmt(argn); + } else if (strcmp(arg, "-xkeyform") == 0) { + if (!argn) { + *badarg = 1; + goto err; + } + exc->keyform = str2fmt(argn); + } else + return 0; + + (*pargs) += narg; + + if (pargc) + *pargc -= narg; + + *pexc = exc; + + return 1; + + err: + ERR_print_errors(err); + ssl_excert_free(exc); + *pexc = NULL; + return 1; +} + +static void print_raw_cipherlist(BIO *bio, SSL *s) +{ + const unsigned char *rlist; + static const unsigned char scsv_id[] = { 0, 0, 0xFF }; + size_t i, rlistlen, num; + if (!SSL_is_server(s)) + return; + num = SSL_get0_raw_cipherlist(s, NULL); + rlistlen = SSL_get0_raw_cipherlist(s, &rlist); + BIO_puts(bio, "Client cipher list: "); + for (i = 0; i < rlistlen; i += num, rlist += num) { + const SSL_CIPHER *c = SSL_CIPHER_find(s, rlist); + if (i) + BIO_puts(bio, ":"); + if (c) + BIO_puts(bio, SSL_CIPHER_get_name(c)); + else if (!memcmp(rlist, scsv_id - num + 3, num)) + BIO_puts(bio, "SCSV"); + else { + size_t j; + BIO_puts(bio, "0x"); + for (j = 0; j < num; j++) + BIO_printf(bio, "%02X", rlist[j]); + } + } + BIO_puts(bio, "\n"); +} + +void print_ssl_summary(BIO *bio, SSL *s) +{ + const SSL_CIPHER *c; + X509 *peer; + /* + * const char *pnam = SSL_is_server(s) ? "client" : "server"; + */ + BIO_printf(bio, "Protocol version: %s\n", SSL_get_version(s)); + print_raw_cipherlist(bio, s); + c = SSL_get_current_cipher(s); + BIO_printf(bio, "Ciphersuite: %s\n", SSL_CIPHER_get_name(c)); + do_print_sigalgs(bio, s, 0); + peer = SSL_get_peer_certificate(s); + if (peer) { + int nid; + BIO_puts(bio, "Peer certificate: "); + X509_NAME_print_ex(bio, X509_get_subject_name(peer), + 0, XN_FLAG_ONELINE); + BIO_puts(bio, "\n"); + if (SSL_get_peer_signature_nid(s, &nid)) + BIO_printf(bio, "Hash used: %s\n", OBJ_nid2sn(nid)); + } else + BIO_puts(bio, "No peer certificate\n"); + if (peer) + X509_free(peer); +#ifndef OPENSSL_NO_EC + ssl_print_point_formats(bio, s); + if (SSL_is_server(s)) + ssl_print_curves(bio, s, 1); + else + ssl_print_tmp_key(bio, s); +#else + if (!SSL_is_server(s)) + ssl_print_tmp_key(bio, s); +#endif +} + +int args_ssl(char ***pargs, int *pargc, SSL_CONF_CTX *cctx, + int *badarg, BIO *err, STACK_OF(OPENSSL_STRING) **pstr) +{ + char *arg = **pargs, *argn = (*pargs)[1]; + int rv; + + /* Attempt to run SSL configuration command */ + rv = SSL_CONF_cmd_argv(cctx, pargc, pargs); + /* If parameter not recognised just return */ + if (rv == 0) + return 0; + /* see if missing argument error */ + if (rv == -3) { + BIO_printf(err, "%s needs an argument\n", arg); + *badarg = 1; + goto end; + } + /* Check for some other error */ + if (rv < 0) { + BIO_printf(err, "Error with command: \"%s %s\"\n", + arg, argn ? argn : ""); + *badarg = 1; + goto end; + } + /* Store command and argument */ + /* If only one argument processed store value as NULL */ + if (rv == 1) + argn = NULL; + if (!*pstr) + *pstr = sk_OPENSSL_STRING_new_null(); + if (!*pstr || !sk_OPENSSL_STRING_push(*pstr, arg) || + !sk_OPENSSL_STRING_push(*pstr, argn)) { + BIO_puts(err, "Memory allocation failure\n"); + goto end; + } + + end: + if (*badarg) + ERR_print_errors(err); + + return 1; +} + +int args_ssl_call(SSL_CTX *ctx, BIO *err, SSL_CONF_CTX *cctx, + STACK_OF(OPENSSL_STRING) *str, int no_ecdhe, int no_jpake) +{ + int i; + SSL_CONF_CTX_set_ssl_ctx(cctx, ctx); + for (i = 0; i < sk_OPENSSL_STRING_num(str); i += 2) { + const char *param = sk_OPENSSL_STRING_value(str, i); + const char *value = sk_OPENSSL_STRING_value(str, i + 1); + /* + * If no_ecdhe or named curve already specified don't need a default. + */ + if (!no_ecdhe && !strcmp(param, "-named_curve")) + no_ecdhe = 1; +#ifndef OPENSSL_NO_JPAKE + if (!no_jpake && !strcmp(param, "-cipher")) { + BIO_puts(err, "JPAKE sets cipher to PSK\n"); + return 0; + } +#endif + if (SSL_CONF_cmd(cctx, param, value) <= 0) { + BIO_printf(err, "Error with command: \"%s %s\"\n", + param, value ? value : ""); + ERR_print_errors(err); + return 0; + } + } + /* + * This is a special case to keep existing s_server functionality: if we + * don't have any curve specified *and* we haven't disabled ECDHE then + * use P-256. + */ + if (!no_ecdhe) { + if (SSL_CONF_cmd(cctx, "-named_curve", "P-256") <= 0) { + BIO_puts(err, "Error setting EC curve\n"); + ERR_print_errors(err); + return 0; + } + } +#ifndef OPENSSL_NO_JPAKE + if (!no_jpake) { + if (SSL_CONF_cmd(cctx, "-cipher", "PSK") <= 0) { + BIO_puts(err, "Error setting cipher to PSK\n"); + ERR_print_errors(err); + return 0; + } + } +#endif + if (!SSL_CONF_CTX_finish(cctx)) { + BIO_puts(err, "Error finishing context\n"); + ERR_print_errors(err); + return 0; + } + return 1; +} + +static int add_crls_store(X509_STORE *st, STACK_OF(X509_CRL) *crls) +{ + X509_CRL *crl; + int i; + for (i = 0; i < sk_X509_CRL_num(crls); i++) { + crl = sk_X509_CRL_value(crls, i); + X509_STORE_add_crl(st, crl); + } + return 1; +} + +int ssl_ctx_add_crls(SSL_CTX *ctx, STACK_OF(X509_CRL) *crls, int crl_download) +{ + X509_STORE *st; + st = SSL_CTX_get_cert_store(ctx); + add_crls_store(st, crls); + if (crl_download) + store_setup_crl_download(st); + return 1; +} + +int ssl_load_stores(SSL_CTX *ctx, + const char *vfyCApath, const char *vfyCAfile, + const char *chCApath, const char *chCAfile, + STACK_OF(X509_CRL) *crls, int crl_download) +{ + X509_STORE *vfy = NULL, *ch = NULL; + int rv = 0; + if (vfyCApath || vfyCAfile) { + vfy = X509_STORE_new(); + if (!X509_STORE_load_locations(vfy, vfyCAfile, vfyCApath)) + goto err; + add_crls_store(vfy, crls); + SSL_CTX_set1_verify_cert_store(ctx, vfy); + if (crl_download) + store_setup_crl_download(vfy); + } + if (chCApath || chCAfile) { + ch = X509_STORE_new(); + if (!X509_STORE_load_locations(ch, chCAfile, chCApath)) + goto err; + SSL_CTX_set1_chain_cert_store(ctx, ch); + } + rv = 1; + err: + if (vfy) + X509_STORE_free(vfy); + if (ch) + X509_STORE_free(ch); + return rv; +} diff --git a/apps/s_client.c b/apps/s_client.c index 28737b6d1e6b..e55f2c5abc88 100644 --- a/apps/s_client.c +++ b/apps/s_client.c @@ -202,6 +202,7 @@ typedef unsigned int u_int; extern int verify_depth; extern int verify_error; extern int verify_return_error; +extern int verify_quiet; #ifdef FIONBIO static int c_nbio = 0; @@ -224,8 +225,10 @@ static void print_stuff(BIO *berr, SSL *con, int full); static int ocsp_resp_cb(SSL *s, void *arg); #endif static BIO *bio_c_out = NULL; +static BIO *bio_c_msg = NULL; static int c_quiet = 0; static int c_ign_eof = 0; +static int c_brief = 0; #ifndef OPENSSL_NO_PSK /* Default PSK identity and key */ @@ -304,6 +307,12 @@ static void sc_usage(void) BIO_printf(bio_err, " -connect host:port - who to connect to (default is %s:%s)\n", SSL_HOST_NAME, PORT_STR); + BIO_printf(bio_err, + " -verify_host host - check peer certificate matches \"host\"\n"); + BIO_printf(bio_err, + " -verify_email email - check peer certificate matches \"email\"\n"); + BIO_printf(bio_err, + " -verify_ip ipaddr - check peer certificate matches \"ipaddr\"\n"); BIO_printf(bio_err, " -verify arg - turn on peer certificate verification\n"); @@ -413,12 +422,16 @@ static void sc_usage(void) " -status - request certificate status from server\n"); BIO_printf(bio_err, " -no_ticket - disable use of RFC4507bis session tickets\n"); -# ifndef OPENSSL_NO_NEXTPROTONEG + BIO_printf(bio_err, + " -serverinfo types - send empty ClientHello extensions (comma-separated numbers)\n"); +#endif +#ifndef OPENSSL_NO_NEXTPROTONEG BIO_printf(bio_err, " -nextprotoneg arg - enable NPN extension, considering named protocols supported (comma-separated list)\n"); -# endif #endif BIO_printf(bio_err, + " -alpn arg - enable ALPN extension, considering named protocols supported (comma-separated list)\n"); + BIO_printf(bio_err, " -legacy_renegotiation - enable use of legacy renegotiation (dangerous)\n"); #ifndef OPENSSL_NO_SRTP BIO_printf(bio_err, @@ -605,6 +618,27 @@ static int next_proto_cb(SSL *s, unsigned char **out, unsigned char *outlen, return SSL_TLSEXT_ERR_OK; } # endif /* ndef OPENSSL_NO_NEXTPROTONEG */ + +static int serverinfo_cli_parse_cb(SSL *s, unsigned int ext_type, + const unsigned char *in, size_t inlen, + int *al, void *arg) +{ + char pem_name[100]; + unsigned char ext_buf[4 + 65536]; + + /* Reconstruct the type/len fields prior to extension data */ + ext_buf[0] = ext_type >> 8; + ext_buf[1] = ext_type & 0xFF; + ext_buf[2] = inlen >> 8; + ext_buf[3] = inlen & 0xFF; + memcpy(ext_buf + 4, in, inlen); + + BIO_snprintf(pem_name, sizeof(pem_name), "SERVERINFO FOR EXTENSION %d", + ext_type); + PEM_write_bio(bio_c_out, pem_name, "", ext_buf, 4 + inlen); + return 1; +} + #endif enum { @@ -620,7 +654,7 @@ int MAIN(int, char **); int MAIN(int argc, char **argv) { - unsigned int off = 0, clr = 0; + int build_chain = 0; SSL *con = NULL; #ifndef OPENSSL_NO_KRB5 KSSL_CTX *kctx; @@ -633,13 +667,16 @@ int MAIN(int argc, char **argv) short port = PORT; int full_log = 1; char *host = SSL_HOST_NAME; - char *cert_file = NULL, *key_file = NULL; + char *cert_file = NULL, *key_file = NULL, *chain_file = NULL; int cert_format = FORMAT_PEM, key_format = FORMAT_PEM; char *passarg = NULL, *pass = NULL; X509 *cert = NULL; EVP_PKEY *key = NULL; - char *CApath = NULL, *CAfile = NULL, *cipher = NULL; - int reconnect = 0, badop = 0, verify = SSL_VERIFY_NONE, bugs = 0; + STACK_OF(X509) *chain = NULL; + char *CApath = NULL, *CAfile = NULL; + char *chCApath = NULL, *chCAfile = NULL; + char *vfyCApath = NULL, *vfyCAfile = NULL; + int reconnect = 0, badop = 0, verify = SSL_VERIFY_NONE; int crlf = 0; int write_tty, read_tty, write_ssl, read_ssl, tty_on, ssl_pending; SSL_CTX *ctx = NULL; @@ -672,6 +709,10 @@ int MAIN(int argc, char **argv) # ifndef OPENSSL_NO_NEXTPROTONEG const char *next_proto_neg_in = NULL; # endif + const char *alpn_in = NULL; +# define MAX_SI_TYPES 100 + unsigned short serverinfo_types[MAX_SI_TYPES]; + int serverinfo_types_count = 0; #endif char *sess_in = NULL; char *sess_out = NULL; @@ -681,13 +722,25 @@ int MAIN(int argc, char **argv) int enable_timeouts = 0; long socket_mtu = 0; #ifndef OPENSSL_NO_JPAKE - char *jpake_secret = NULL; + static char *jpake_secret = NULL; +# define no_jpake !jpake_secret +#else +# define no_jpake 1 #endif #ifndef OPENSSL_NO_SRP char *srppass = NULL; int srp_lateuser = 0; SRP_ARG srp_arg = { NULL, NULL, 0, 0, 0, 1024 }; #endif + SSL_EXCERT *exc = NULL; + + SSL_CONF_CTX *cctx = NULL; + STACK_OF(OPENSSL_STRING) *ssl_args = NULL; + + char *crl_file = NULL; + int crl_format = FORMAT_PEM; + int crl_download = 0; + STACK_OF(X509_CRL) *crls = NULL; meth = SSLv23_client_method(); @@ -705,6 +758,12 @@ int MAIN(int argc, char **argv) if (!load_config(bio_err, NULL)) goto end; + cctx = SSL_CONF_CTX_new(); + if (!cctx) + goto end; + SSL_CONF_CTX_set_flags(cctx, SSL_CONF_FLAG_CLIENT); + SSL_CONF_CTX_set_flags(cctx, SSL_CONF_FLAG_CMDLINE); + if (((cbuf = OPENSSL_malloc(BUFSIZZ)) == NULL) || ((sbuf = OPENSSL_malloc(BUFSIZZ)) == NULL) || ((mbuf = OPENSSL_malloc(BUFSIZZ)) == NULL)) { @@ -741,12 +800,19 @@ int MAIN(int argc, char **argv) if (--argc < 1) goto bad; verify_depth = atoi(*(++argv)); - BIO_printf(bio_err, "verify depth is %d\n", verify_depth); + if (!c_quiet) + BIO_printf(bio_err, "verify depth is %d\n", verify_depth); } else if (strcmp(*argv, "-cert") == 0) { if (--argc < 1) goto bad; cert_file = *(++argv); - } else if (strcmp(*argv, "-sess_out") == 0) { + } else if (strcmp(*argv, "-CRL") == 0) { + if (--argc < 1) + goto bad; + crl_file = *(++argv); + } else if (strcmp(*argv, "-crl_download") == 0) + crl_download = 1; + else if (strcmp(*argv, "-sess_out") == 0) { if (--argc < 1) goto bad; sess_out = *(++argv); @@ -758,13 +824,31 @@ int MAIN(int argc, char **argv) if (--argc < 1) goto bad; cert_format = str2fmt(*(++argv)); + } else if (strcmp(*argv, "-CRLform") == 0) { + if (--argc < 1) + goto bad; + crl_format = str2fmt(*(++argv)); } else if (args_verify(&argv, &argc, &badarg, bio_err, &vpm)) { if (badarg) goto bad; continue; } else if (strcmp(*argv, "-verify_return_error") == 0) verify_return_error = 1; - else if (strcmp(*argv, "-prexit") == 0) + else if (strcmp(*argv, "-verify_quiet") == 0) + verify_quiet = 1; + else if (strcmp(*argv, "-brief") == 0) { + c_brief = 1; + verify_quiet = 1; + c_quiet = 1; + } else if (args_excert(&argv, &argc, &badarg, bio_err, &exc)) { + if (badarg) + goto bad; + continue; + } else if (args_ssl(&argv, &argc, cctx, &badarg, bio_err, &ssl_args)) { + if (badarg) + goto bad; + continue; + } else if (strcmp(*argv, "-prexit") == 0) prexit = 1; else if (strcmp(*argv, "-crlf") == 0) crlf = 1; @@ -791,6 +875,15 @@ int MAIN(int argc, char **argv) #endif else if (strcmp(*argv, "-msg") == 0) c_msg = 1; + else if (strcmp(*argv, "-msgfile") == 0) { + if (--argc < 1) + goto bad; + bio_c_msg = BIO_new_file(*(++argv), "w"); + } +#ifndef OPENSSL_NO_SSL_TRACE + else if (strcmp(*argv, "-trace") == 0) + c_msg = 2; +#endif else if (strcmp(*argv, "-showcerts") == 0) c_showcerts = 1; else if (strcmp(*argv, "-nbio_test") == 0) @@ -859,11 +952,15 @@ int MAIN(int argc, char **argv) meth = TLSv1_client_method(); #endif #ifndef OPENSSL_NO_DTLS1 - else if (strcmp(*argv, "-dtls1") == 0) { + else if (strcmp(*argv, "-dtls") == 0) { + meth = DTLS_client_method(); + socket_type = SOCK_DGRAM; + } else if (strcmp(*argv, "-dtls1") == 0) { meth = DTLSv1_client_method(); socket_type = SOCK_DGRAM; - } else if (strcmp(*argv, "-fallback_scsv") == 0) { - fallback_scsv = 1; + } else if (strcmp(*argv, "-dtls1_2") == 0) { + meth = DTLSv1_2_client_method(); + socket_type = SOCK_DGRAM; } else if (strcmp(*argv, "-timeout") == 0) enable_timeouts = 1; else if (strcmp(*argv, "-mtu") == 0) { @@ -872,9 +969,9 @@ int MAIN(int argc, char **argv) socket_mtu = atol(*(++argv)); } #endif - else if (strcmp(*argv, "-bugs") == 0) - bugs = 1; - else if (strcmp(*argv, "-keyform") == 0) { + else if (strcmp(*argv, "-fallback_scsv") == 0) { + fallback_scsv = 1; + } else if (strcmp(*argv, "-keyform") == 0) { if (--argc < 1) goto bad; key_format = str2fmt(*(++argv)); @@ -882,6 +979,10 @@ int MAIN(int argc, char **argv) if (--argc < 1) goto bad; passarg = *(++argv); + } else if (strcmp(*argv, "-cert_chain") == 0) { + if (--argc < 1) + goto bad; + chain_file = *(++argv); } else if (strcmp(*argv, "-key") == 0) { if (--argc < 1) goto bad; @@ -892,27 +993,30 @@ int MAIN(int argc, char **argv) if (--argc < 1) goto bad; CApath = *(++argv); - } else if (strcmp(*argv, "-CAfile") == 0) { + } else if (strcmp(*argv, "-chainCApath") == 0) { + if (--argc < 1) + goto bad; + chCApath = *(++argv); + } else if (strcmp(*argv, "-verifyCApath") == 0) { + if (--argc < 1) + goto bad; + vfyCApath = *(++argv); + } else if (strcmp(*argv, "-build_chain") == 0) + build_chain = 1; + else if (strcmp(*argv, "-CAfile") == 0) { if (--argc < 1) goto bad; CAfile = *(++argv); - } else if (strcmp(*argv, "-no_tls1_2") == 0) - off |= SSL_OP_NO_TLSv1_2; - else if (strcmp(*argv, "-no_tls1_1") == 0) - off |= SSL_OP_NO_TLSv1_1; - else if (strcmp(*argv, "-no_tls1") == 0) - off |= SSL_OP_NO_TLSv1; - else if (strcmp(*argv, "-no_ssl3") == 0) - off |= SSL_OP_NO_SSLv3; - else if (strcmp(*argv, "-no_ssl2") == 0) - off |= SSL_OP_NO_SSLv2; - else if (strcmp(*argv, "-no_comp") == 0) { - off |= SSL_OP_NO_COMPRESSION; + } else if (strcmp(*argv, "-chainCAfile") == 0) { + if (--argc < 1) + goto bad; + chCAfile = *(++argv); + } else if (strcmp(*argv, "-verifyCAfile") == 0) { + if (--argc < 1) + goto bad; + vfyCAfile = *(++argv); } #ifndef OPENSSL_NO_TLSEXT - else if (strcmp(*argv, "-no_ticket") == 0) { - off |= SSL_OP_NO_TICKET; - } # ifndef OPENSSL_NO_NEXTPROTONEG else if (strcmp(*argv, "-nextprotoneg") == 0) { if (--argc < 1) @@ -920,20 +1024,32 @@ int MAIN(int argc, char **argv) next_proto_neg_in = *(++argv); } # endif -#endif - else if (strcmp(*argv, "-serverpref") == 0) - off |= SSL_OP_CIPHER_SERVER_PREFERENCE; - else if (strcmp(*argv, "-legacy_renegotiation") == 0) - off |= SSL_OP_ALLOW_UNSAFE_LEGACY_RENEGOTIATION; - else if (strcmp(*argv, "-legacy_server_connect") == 0) { - off |= SSL_OP_LEGACY_SERVER_CONNECT; - } else if (strcmp(*argv, "-no_legacy_server_connect") == 0) { - clr |= SSL_OP_LEGACY_SERVER_CONNECT; - } else if (strcmp(*argv, "-cipher") == 0) { + else if (strcmp(*argv, "-alpn") == 0) { + if (--argc < 1) + goto bad; + alpn_in = *(++argv); + } else if (strcmp(*argv, "-serverinfo") == 0) { + char *c; + int start = 0; + int len; + if (--argc < 1) goto bad; - cipher = *(++argv); + c = *(++argv); + serverinfo_types_count = 0; + len = strlen(c); + for (i = 0; i <= len; ++i) { + if (i == len || c[i] == ',') { + serverinfo_types[serverinfo_types_count] + = atoi(c + start); + serverinfo_types_count++; + start = i + 1; + } + if (serverinfo_types_count == MAX_SI_TYPES) + break; + } } +#endif #ifdef FIONBIO else if (strcmp(*argv, "-nbio") == 0) { c_nbio = 1; @@ -1024,11 +1140,6 @@ int MAIN(int argc, char **argv) goto end; } psk_identity = "JPAKE"; - if (cipher) { - BIO_printf(bio_err, "JPAKE sets cipher to PSK\n"); - goto end; - } - cipher = "PSK"; } #endif @@ -1087,6 +1198,33 @@ int MAIN(int argc, char **argv) } } + if (chain_file) { + chain = load_certs(bio_err, chain_file, FORMAT_PEM, + NULL, e, "client certificate chain"); + if (!chain) + goto end; + } + + if (crl_file) { + X509_CRL *crl; + crl = load_crl(crl_file, crl_format); + if (!crl) { + BIO_puts(bio_err, "Error loading CRL\n"); + ERR_print_errors(bio_err); + goto end; + } + crls = sk_X509_CRL_new_null(); + if (!crls || !sk_X509_CRL_push(crls, crl)) { + BIO_puts(bio_err, "Error adding CRL\n"); + ERR_print_errors(bio_err); + X509_CRL_free(crl); + goto end; + } + } + + if (!load_excert(&exc, bio_err)) + goto end; + if (!app_RAND_load_file(NULL, bio_err, 1) && inrand == NULL && !RAND_status()) { BIO_printf(bio_err, @@ -1097,8 +1235,10 @@ int MAIN(int argc, char **argv) app_RAND_load_files(inrand)); if (bio_c_out == NULL) { - if (c_quiet && !c_debug && !c_msg) { + if (c_quiet && !c_debug) { bio_c_out = BIO_new(BIO_s_null()); + if (c_msg && !bio_c_msg) + bio_c_msg = BIO_new_fp(stdout, BIO_NOCLOSE); } else { if (bio_c_out == NULL) bio_c_out = BIO_new_fp(stdout, BIO_NOCLOSE); @@ -1120,6 +1260,17 @@ int MAIN(int argc, char **argv) if (vpm) SSL_CTX_set1_param(ctx, vpm); + if (!args_ssl_call(ctx, bio_err, cctx, ssl_args, 1, no_jpake)) { + ERR_print_errors(bio_err); + goto end; + } + + if (!ssl_load_stores(ctx, vfyCApath, vfyCAfile, chCApath, chCAfile, + crls, crl_download)) { + BIO_printf(bio_err, "Error loading store locations\n"); + ERR_print_errors(bio_err); + goto end; + } #ifndef OPENSSL_NO_ENGINE if (ssl_client_engine) { if (!SSL_CTX_set_client_cert_engine(ctx, ssl_client_engine)) { @@ -1149,35 +1300,43 @@ int MAIN(int argc, char **argv) if (srtp_profiles != NULL) SSL_CTX_set_tlsext_use_srtp(ctx, srtp_profiles); #endif - if (bugs) - SSL_CTX_set_options(ctx, SSL_OP_ALL | off); - else - SSL_CTX_set_options(ctx, off); + if (exc) + ssl_ctx_set_excert(ctx, exc); - if (clr) - SSL_CTX_clear_options(ctx, clr); - -#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) +#if !defined(OPENSSL_NO_TLSEXT) +# if !defined(OPENSSL_NO_NEXTPROTONEG) if (next_proto.data) SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto); +# endif + if (alpn_in) { + unsigned short alpn_len; + unsigned char *alpn = next_protos_parse(&alpn_len, alpn_in); + + if (alpn == NULL) { + BIO_printf(bio_err, "Error parsing -alpn argument\n"); + goto end; + } + SSL_CTX_set_alpn_protos(ctx, alpn, alpn_len); + OPENSSL_free(alpn); + } +#endif +#ifndef OPENSSL_NO_TLSEXT + for (i = 0; i < serverinfo_types_count; i++) { + SSL_CTX_add_client_custom_ext(ctx, + serverinfo_types[i], + NULL, NULL, NULL, + serverinfo_cli_parse_cb, NULL); + } #endif if (state) SSL_CTX_set_info_callback(ctx, apps_ssl_info_callback); - if (cipher != NULL) - if (!SSL_CTX_set_cipher_list(ctx, cipher)) { - BIO_printf(bio_err, "error setting cipher list\n"); - ERR_print_errors(bio_err); - goto end; - } #if 0 - else - SSL_CTX_set_cipher_list(ctx, getenv("SSL_CIPHER")); + else + SSL_CTX_set_cipher_list(ctx, getenv("SSL_CIPHER")); #endif SSL_CTX_set_verify(ctx, verify, verify_callback); - if (!set_cert_key_stuff(ctx, cert, key)) - goto end; if ((CAfile || CApath) && !SSL_CTX_load_verify_locations(ctx, CAfile, CApath)) { @@ -1186,6 +1345,11 @@ int MAIN(int argc, char **argv) if (!SSL_CTX_set_default_verify_paths(ctx)) { ERR_print_errors(bio_err); } + + ssl_ctx_add_crls(ctx, crls, crl_download); + if (!set_cert_key_stuff(ctx, cert, key, chain, build_chain)) + goto end; + #ifndef OPENSSL_NO_TLSEXT if (servername != NULL) { tlsextcbp.biodebug = bio_err; @@ -1277,7 +1441,7 @@ int MAIN(int argc, char **argv) if (c_Pause & 0x01) SSL_set_debug(con, 1); - if (SSL_version(con) == DTLS1_VERSION) { + if (socket_type == SOCK_DGRAM) { sbio = BIO_new_dgram(s, BIO_NOCLOSE); if (getsockname(s, &peer, (void *)&peerlen) < 0) { @@ -1331,8 +1495,13 @@ int MAIN(int argc, char **argv) BIO_set_callback_arg(sbio, (char *)bio_c_out); } if (c_msg) { - SSL_set_msg_callback(con, msg_cb); - SSL_set_msg_callback_arg(con, bio_c_out); +#ifndef OPENSSL_NO_SSL_TRACE + if (c_msg == 2) + SSL_set_msg_callback(con, SSL_trace); + else +#endif + SSL_set_msg_callback(con, msg_cb); + SSL_set_msg_callback_arg(con, bio_c_msg ? bio_c_msg : bio_c_out); } #ifndef OPENSSL_NO_TLSEXT if (c_tlsextdebug) { @@ -1515,6 +1684,11 @@ int MAIN(int argc, char **argv) BIO_printf(bio_err, "Error writing session file %s\n", sess_out); } + if (c_brief) { + BIO_puts(bio_err, "CONNECTION ESTABLISHED\n"); + print_ssl_summary(bio_err, con); + } + print_stuff(bio_c_out, con, full_log); if (full_log > 0) full_log--; @@ -1780,7 +1954,10 @@ int MAIN(int argc, char **argv) break; case SSL_ERROR_SYSCALL: ret = get_last_socket_error(); - BIO_printf(bio_err, "read:errno=%d\n", ret); + if (c_brief) + BIO_puts(bio_err, "CONNECTION CLOSED BY SERVER\n"); + else + BIO_printf(bio_err, "read:errno=%d\n", ret); goto shut; case SSL_ERROR_ZERO_RETURN: BIO_printf(bio_c_out, "closed\n"); @@ -1880,12 +2057,25 @@ int MAIN(int argc, char **argv) SSL_CTX_free(ctx); if (cert) X509_free(cert); + if (crls) + sk_X509_CRL_pop_free(crls, X509_CRL_free); if (key) EVP_PKEY_free(key); + if (chain) + sk_X509_pop_free(chain, X509_free); if (pass) OPENSSL_free(pass); if (vpm) X509_VERIFY_PARAM_free(vpm); + ssl_excert_free(exc); + if (ssl_args) + sk_OPENSSL_STRING_free(ssl_args); + if (cctx) + SSL_CONF_CTX_free(cctx); +#ifndef OPENSSL_NO_JPAKE + if (jpake_secret && psk_key) + OPENSSL_free(psk_key); +#endif if (cbuf != NULL) { OPENSSL_cleanse(cbuf, BUFSIZZ); OPENSSL_free(cbuf); @@ -1902,6 +2092,10 @@ int MAIN(int argc, char **argv) BIO_free(bio_c_out); bio_c_out = NULL; } + if (bio_c_msg != NULL) { + BIO_free(bio_c_msg); + bio_c_msg = NULL; + } apps_shutdown(); OPENSSL_EXIT(ret); } @@ -1995,6 +2189,9 @@ static void print_stuff(BIO *bio, SSL *s, int full) BIO_write(bio, "\n", 1); } + ssl_print_sigalgs(bio, s); + ssl_print_tmp_key(bio, s); + BIO_printf(bio, "---\nSSL handshake has read %ld bytes and written %ld bytes\n", BIO_number_read(SSL_get_rbio(s)), @@ -2034,7 +2231,8 @@ static void print_stuff(BIO *bio, SSL *s, int full) } #endif -#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) +#if !defined(OPENSSL_NO_TLSEXT) +# if !defined(OPENSSL_NO_NEXTPROTONEG) if (next_proto.status != -1) { const unsigned char *proto; unsigned int proto_len; @@ -2043,6 +2241,18 @@ static void print_stuff(BIO *bio, SSL *s, int full) BIO_write(bio, proto, proto_len); BIO_write(bio, "\n", 1); } +# endif + { + const unsigned char *proto; + unsigned int proto_len; + SSL_get0_alpn_selected(s, &proto, &proto_len); + if (proto_len > 0) { + BIO_printf(bio, "ALPN protocol: "); + BIO_write(bio, proto, proto_len); + BIO_write(bio, "\n", 1); + } else + BIO_printf(bio, "No ALPN negotiated\n"); + } #endif #ifndef OPENSSL_NO_SRTP diff --git a/apps/s_server.c b/apps/s_server.c index b58e5e07c41c..acef382c2c4b 100644 --- a/apps/s_server.c +++ b/apps/s_server.c @@ -209,14 +209,17 @@ typedef unsigned int u_int; #ifndef OPENSSL_NO_RSA static RSA MS_CALLBACK *tmp_rsa_cb(SSL *s, int is_export, int keylength); #endif -static int sv_body(char *hostname, int s, unsigned char *context); -static int www_body(char *hostname, int s, unsigned char *context); +static int sv_body(char *hostname, int s, int stype, unsigned char *context); +static int www_body(char *hostname, int s, int stype, unsigned char *context); +static int rev_body(char *hostname, int s, int stype, unsigned char *context); static void close_accept_socket(void); static void sv_usage(void); static int init_ssl_connection(SSL *s); static void print_stats(BIO *bp, SSL_CTX *ctx); static int generate_session_id(const SSL *ssl, unsigned char *id, unsigned int *id_len); +static void init_session_cache_ctx(SSL_CTX *sctx); +static void free_sessions(void); #ifndef OPENSSL_NO_DH static DH *load_dh_param(const char *dhfile); static DH *get_dh2048(void); @@ -286,16 +289,16 @@ static int accept_socket = -1; #undef PROG #define PROG s_server_main -extern int verify_depth, verify_return_error; +extern int verify_depth, verify_return_error, verify_quiet; -static char *cipher = NULL; static int s_server_verify = SSL_VERIFY_NONE; static int s_server_session_id_context = 1; /* anything will do */ -static const char *s_cert_file = TEST_CERT, *s_key_file = NULL; +static const char *s_cert_file = TEST_CERT, *s_key_file = + NULL, *s_chain_file = NULL; #ifndef OPENSSL_NO_TLSEXT static const char *s_cert_file2 = TEST_CERT2, *s_key_file2 = NULL; #endif -static char *s_dcert_file = NULL, *s_dkey_file = NULL; +static char *s_dcert_file = NULL, *s_dkey_file = NULL, *s_dchain_file = NULL; #ifdef FIONBIO static int s_nbio = 0; #endif @@ -308,14 +311,18 @@ static SSL_CTX *ctx2 = NULL; static int www = 0; static BIO *bio_s_out = NULL; +static BIO *bio_s_msg = NULL; static int s_debug = 0; #ifndef OPENSSL_NO_TLSEXT static int s_tlsextdebug = 0; static int s_tlsextstatus = 0; static int cert_status_cb(SSL *s, void *arg); #endif +static int no_resume_ephemeral = 0; static int s_msg = 0; static int s_quiet = 0; +static int s_ign_eof = 0; +static int s_brief = 0; static char *keymatexportlabel = NULL; static int keymatexportlen = 20; @@ -332,6 +339,12 @@ static long socket_mtu; static int cert_chain = 0; #endif +#ifndef OPENSSL_NO_TLSEXT +static BIO *serverinfo_in = NULL; +static const char *s_serverinfo_file = NULL; + +#endif + #ifndef OPENSSL_NO_PSK static char *psk_identity = "Client_identity"; char *psk_key = NULL; /* by default PSK is not used */ @@ -447,12 +460,13 @@ static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg) static void s_server_init(void) { accept_socket = -1; - cipher = NULL; s_server_verify = SSL_VERIFY_NONE; s_dcert_file = NULL; s_dkey_file = NULL; + s_dchain_file = NULL; s_cert_file = TEST_CERT; s_key_file = NULL; + s_chain_file = NULL; # ifndef OPENSSL_NO_TLSEXT s_cert_file2 = TEST_CERT2; s_key_file2 = NULL; @@ -469,6 +483,7 @@ static void s_server_init(void) s_debug = 0; s_msg = 0; s_quiet = 0; + s_brief = 0; hack = 0; # ifndef OPENSSL_NO_ENGINE engine_id = NULL; @@ -482,6 +497,12 @@ static void sv_usage(void) BIO_printf(bio_err, "\n"); BIO_printf(bio_err, " -accept arg - port to accept on (default is %d)\n", PORT); + BIO_printf(bio_err, + " -verify_host host - check peer certificate matches \"host\"\n"); + BIO_printf(bio_err, + " -verify_email email - check peer certificate matches \"email\"\n"); + BIO_printf(bio_err, + " -verify_ip ipaddr - check peer certificate matches \"ipaddr\"\n"); BIO_printf(bio_err, " -context arg - set session ID context\n"); BIO_printf(bio_err, " -verify arg - turn on peer certificate verification\n"); @@ -491,6 +512,16 @@ static void sv_usage(void) " -verify_return_error - return verification errors\n"); BIO_printf(bio_err, " -cert arg - certificate file to use\n"); BIO_printf(bio_err, " (default is %s)\n", TEST_CERT); +#ifndef OPENSSL_NO_TLSEXT + BIO_printf(bio_err, + " -serverinfo arg - PEM serverinfo file for certificate\n"); + BIO_printf(bio_err, + " -auth - send and receive RFC 5878 TLS auth extensions and supplemental data\n"); + BIO_printf(bio_err, + " -auth_require_reneg - Do not send TLS auth extensions until renegotiation\n"); +#endif + BIO_printf(bio_err, + " -no_resumption_on_reneg - set SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION flag\n"); BIO_printf(bio_err, " -crl_check - check the peer certificate has not been revoked by its CA.\n" " The CRL(s) are appended to the certificate file\n"); @@ -569,6 +600,7 @@ static void sv_usage(void) BIO_printf(bio_err, " -tls1_1 - Just talk TLSv1.1\n"); BIO_printf(bio_err, " -tls1 - Just talk TLSv1\n"); BIO_printf(bio_err, " -dtls1 - Just talk DTLSv1\n"); + BIO_printf(bio_err, " -dtls1_2 - Just talk DTLSv1.2\n"); BIO_printf(bio_err, " -timeout - Enable timeouts\n"); BIO_printf(bio_err, " -mtu - Set link layer MTU\n"); BIO_printf(bio_err, " -chain - Read a certificate chain\n"); @@ -628,6 +660,8 @@ static void sv_usage(void) BIO_printf(bio_err, " -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n"); # endif + BIO_printf(bio_err, + " -alpn arg - set the advertised protocols for the ALPN extension (comma-separated list)\n"); #endif BIO_printf(bio_err, " -keymatexport label - Export keying material using label\n"); @@ -988,12 +1022,53 @@ static int next_proto_cb(SSL *s, const unsigned char **data, } # endif /* ndef OPENSSL_NO_NEXTPROTONEG */ -#endif +/* This the context that we pass to alpn_cb */ +typedef struct tlsextalpnctx_st { + unsigned char *data; + unsigned short len; +} tlsextalpnctx; + +static int alpn_cb(SSL *s, const unsigned char **out, unsigned char *outlen, + const unsigned char *in, unsigned int inlen, void *arg) +{ + tlsextalpnctx *alpn_ctx = arg; + + if (!s_quiet) { + /* We can assume that |in| is syntactically valid. */ + unsigned i; + BIO_printf(bio_s_out, "ALPN protocols advertised by the client: "); + for (i = 0; i < inlen;) { + if (i) + BIO_write(bio_s_out, ", ", 2); + BIO_write(bio_s_out, &in[i + 1], in[i]); + i += in[i] + 1; + } + BIO_write(bio_s_out, "\n", 1); + } + + if (SSL_select_next_proto + ((unsigned char **)out, outlen, alpn_ctx->data, alpn_ctx->len, in, + inlen) != OPENSSL_NPN_NEGOTIATED) { + return SSL_TLSEXT_ERR_NOACK; + } + + if (!s_quiet) { + BIO_printf(bio_s_out, "ALPN protocols selected: "); + BIO_write(bio_s_out, *out, *outlen); + BIO_write(bio_s_out, "\n", 1); + } + + return SSL_TLSEXT_ERR_OK; +} +#endif /* ndef OPENSSL_NO_TLSEXT */ int MAIN(int, char **); #ifndef OPENSSL_NO_JPAKE static char *jpake_secret = NULL; +# define no_jpake !jpake_secret +#else +# define no_jpake 1 #endif #ifndef OPENSSL_NO_SRP static srpsrvparm srp_callback_parm; @@ -1008,18 +1083,14 @@ int MAIN(int argc, char *argv[]) int badarg = 0; short port = PORT; char *CApath = NULL, *CAfile = NULL; + char *chCApath = NULL, *chCAfile = NULL; + char *vfyCApath = NULL, *vfyCAfile = NULL; unsigned char *context = NULL; char *dhfile = NULL; -#ifndef OPENSSL_NO_ECDH - char *named_curve = NULL; -#endif - int badop = 0, bugs = 0; + int badop = 0; int ret = 1; - int off = 0; - int no_tmp_rsa = 0, no_dhe = 0, nocert = 0; -#ifndef OPENSSL_NO_ECDH - int no_ecdhe = 0; -#endif + int build_chain = 0; + int no_tmp_rsa = 0, no_dhe = 0, no_ecdhe = 0, nocert = 0; int state = 0; const SSL_METHOD *meth = NULL; int socket_type = SOCK_STREAM; @@ -1030,16 +1101,20 @@ int MAIN(int argc, char *argv[]) char *dpassarg = NULL, *dpass = NULL; int s_dcert_format = FORMAT_PEM, s_dkey_format = FORMAT_PEM; X509 *s_cert = NULL, *s_dcert = NULL; + STACK_OF(X509) *s_chain = NULL, *s_dchain = NULL; EVP_PKEY *s_key = NULL, *s_dkey = NULL; - int no_cache = 0; + int no_cache = 0, ext_cache = 0; + int rev = 0, naccept = -1; #ifndef OPENSSL_NO_TLSEXT EVP_PKEY *s_key2 = NULL; X509 *s_cert2 = NULL; tlsextctx tlsextcbp = { NULL, NULL, SSL_TLSEXT_ERR_ALERT_WARNING }; # ifndef OPENSSL_NO_NEXTPROTONEG const char *next_proto_neg_in = NULL; - tlsextnextprotoctx next_proto; + tlsextnextprotoctx next_proto = { NULL, 0 }; # endif + const char *alpn_in = NULL; + tlsextalpnctx alpn_ctx = { NULL, 0 }; #endif #ifndef OPENSSL_NO_PSK /* by default do not send a PSK identity hint */ @@ -1049,6 +1124,15 @@ int MAIN(int argc, char *argv[]) char *srpuserseed = NULL; char *srp_verifier_file = NULL; #endif + SSL_EXCERT *exc = NULL; + SSL_CONF_CTX *cctx = NULL; + STACK_OF(OPENSSL_STRING) *ssl_args = NULL; + + char *crl_file = NULL; + int crl_format = FORMAT_PEM; + int crl_download = 0; + STACK_OF(X509_CRL) *crls = NULL; + meth = SSLv23_server_method(); local_argc = argc; @@ -1065,6 +1149,12 @@ int MAIN(int argc, char *argv[]) if (!load_config(bio_err, NULL)) goto end; + cctx = SSL_CONF_CTX_new(); + if (!cctx) + goto end; + SSL_CONF_CTX_set_flags(cctx, SSL_CONF_FLAG_SERVER); + SSL_CONF_CTX_set_flags(cctx, SSL_CONF_FLAG_CMDLINE); + verify_depth = 0; #ifdef FIONBIO s_nbio = 0; @@ -1080,12 +1170,21 @@ int MAIN(int argc, char *argv[]) goto bad; if (!extract_port(*(++argv), &port)) goto bad; + } else if (strcmp(*argv, "-naccept") == 0) { + if (--argc < 1) + goto bad; + naccept = atol(*(++argv)); + if (naccept <= 0) { + BIO_printf(bio_err, "bad accept value %s\n", *argv); + goto bad; + } } else if (strcmp(*argv, "-verify") == 0) { s_server_verify = SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE; if (--argc < 1) goto bad; verify_depth = atoi(*(++argv)); - BIO_printf(bio_err, "verify depth is %d\n", verify_depth); + if (!s_quiet) + BIO_printf(bio_err, "verify depth is %d\n", verify_depth); } else if (strcmp(*argv, "-Verify") == 0) { s_server_verify = SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT | @@ -1093,9 +1192,10 @@ int MAIN(int argc, char *argv[]) if (--argc < 1) goto bad; verify_depth = atoi(*(++argv)); - BIO_printf(bio_err, - "verify depth is %d, must return a certificate\n", - verify_depth); + if (!s_quiet) + BIO_printf(bio_err, + "verify depth is %d, must return a certificate\n", + verify_depth); } else if (strcmp(*argv, "-context") == 0) { if (--argc < 1) goto bad; @@ -1104,7 +1204,20 @@ int MAIN(int argc, char *argv[]) if (--argc < 1) goto bad; s_cert_file = *(++argv); - } else if (strcmp(*argv, "-certform") == 0) { + } else if (strcmp(*argv, "-CRL") == 0) { + if (--argc < 1) + goto bad; + crl_file = *(++argv); + } else if (strcmp(*argv, "-crl_download") == 0) + crl_download = 1; +#ifndef OPENSSL_NO_TLSEXT + else if (strcmp(*argv, "-serverinfo") == 0) { + if (--argc < 1) + goto bad; + s_serverinfo_file = *(++argv); + } +#endif + else if (strcmp(*argv, "-certform") == 0) { if (--argc < 1) goto bad; s_cert_format = str2fmt(*(++argv)); @@ -1120,19 +1233,15 @@ int MAIN(int argc, char *argv[]) if (--argc < 1) goto bad; passarg = *(++argv); - } else if (strcmp(*argv, "-dhparam") == 0) { + } else if (strcmp(*argv, "-cert_chain") == 0) { if (--argc < 1) goto bad; - dhfile = *(++argv); - } -#ifndef OPENSSL_NO_ECDH - else if (strcmp(*argv, "-named_curve") == 0) { + s_chain_file = *(++argv); + } else if (strcmp(*argv, "-dhparam") == 0) { if (--argc < 1) goto bad; - named_curve = *(++argv); - } -#endif - else if (strcmp(*argv, "-dcertform") == 0) { + dhfile = *(++argv); + } else if (strcmp(*argv, "-dcertform") == 0) { if (--argc < 1) goto bad; s_dcert_format = str2fmt(*(++argv)); @@ -1152,32 +1261,62 @@ int MAIN(int argc, char *argv[]) if (--argc < 1) goto bad; s_dkey_file = *(++argv); + } else if (strcmp(*argv, "-dcert_chain") == 0) { + if (--argc < 1) + goto bad; + s_dchain_file = *(++argv); } else if (strcmp(*argv, "-nocert") == 0) { nocert = 1; } else if (strcmp(*argv, "-CApath") == 0) { if (--argc < 1) goto bad; CApath = *(++argv); + } else if (strcmp(*argv, "-chainCApath") == 0) { + if (--argc < 1) + goto bad; + chCApath = *(++argv); + } else if (strcmp(*argv, "-verifyCApath") == 0) { + if (--argc < 1) + goto bad; + vfyCApath = *(++argv); } else if (strcmp(*argv, "-no_cache") == 0) no_cache = 1; - else if (args_verify(&argv, &argc, &badarg, bio_err, &vpm)) { + else if (strcmp(*argv, "-ext_cache") == 0) + ext_cache = 1; + else if (strcmp(*argv, "-CRLform") == 0) { + if (--argc < 1) + goto bad; + crl_format = str2fmt(*(++argv)); + } else if (args_verify(&argv, &argc, &badarg, bio_err, &vpm)) { + if (badarg) + goto bad; + continue; + } else if (args_excert(&argv, &argc, &badarg, bio_err, &exc)) { + if (badarg) + goto bad; + continue; + } else if (args_ssl(&argv, &argc, cctx, &badarg, bio_err, &ssl_args)) { if (badarg) goto bad; continue; } else if (strcmp(*argv, "-verify_return_error") == 0) verify_return_error = 1; - else if (strcmp(*argv, "-serverpref") == 0) { - off |= SSL_OP_CIPHER_SERVER_PREFERENCE; - } else if (strcmp(*argv, "-legacy_renegotiation") == 0) - off |= SSL_OP_ALLOW_UNSAFE_LEGACY_RENEGOTIATION; - else if (strcmp(*argv, "-cipher") == 0) { + else if (strcmp(*argv, "-verify_quiet") == 0) + verify_quiet = 1; + else if (strcmp(*argv, "-build_chain") == 0) + build_chain = 1; + else if (strcmp(*argv, "-CAfile") == 0) { if (--argc < 1) goto bad; - cipher = *(++argv); - } else if (strcmp(*argv, "-CAfile") == 0) { + CAfile = *(++argv); + } else if (strcmp(*argv, "-chainCAfile") == 0) { if (--argc < 1) goto bad; - CAfile = *(++argv); + chCAfile = *(++argv); + } else if (strcmp(*argv, "-verifyCAfile") == 0) { + if (--argc < 1) + goto bad; + vfyCAfile = *(++argv); } #ifdef FIONBIO else if (strcmp(*argv, "-nbio") == 0) { @@ -1189,7 +1328,11 @@ int MAIN(int argc, char *argv[]) s_nbio = 1; #endif s_nbio_test = 1; - } else if (strcmp(*argv, "-debug") == 0) { + } else if (strcmp(*argv, "-ign_eof") == 0) + s_ign_eof = 1; + else if (strcmp(*argv, "-no_ign_eof") == 0) + s_ign_eof = 0; + else if (strcmp(*argv, "-debug") == 0) { s_debug = 1; } #ifndef OPENSSL_NO_TLSEXT @@ -1220,7 +1363,17 @@ int MAIN(int argc, char *argv[]) #endif else if (strcmp(*argv, "-msg") == 0) { s_msg = 1; - } else if (strcmp(*argv, "-hack") == 0) { + } else if (strcmp(*argv, "-msgfile") == 0) { + if (--argc < 1) + goto bad; + bio_s_msg = BIO_new_file(*(++argv), "w"); + } +#ifndef OPENSSL_NO_SSL_TRACE + else if (strcmp(*argv, "-trace") == 0) { + s_msg = 2; + } +#endif + else if (strcmp(*argv, "-hack") == 0) { hack = 1; } else if (strcmp(*argv, "-state") == 0) { state = 1; @@ -1228,18 +1381,19 @@ int MAIN(int argc, char *argv[]) s_crlf = 1; } else if (strcmp(*argv, "-quiet") == 0) { s_quiet = 1; - } else if (strcmp(*argv, "-bugs") == 0) { - bugs = 1; + } else if (strcmp(*argv, "-brief") == 0) { + s_quiet = 1; + s_brief = 1; + verify_quiet = 1; } else if (strcmp(*argv, "-no_tmp_rsa") == 0) { no_tmp_rsa = 1; } else if (strcmp(*argv, "-no_dhe") == 0) { no_dhe = 1; - } -#ifndef OPENSSL_NO_ECDH - else if (strcmp(*argv, "-no_ecdhe") == 0) { + } else if (strcmp(*argv, "-no_ecdhe") == 0) { no_ecdhe = 1; + } else if (strcmp(*argv, "-no_resume_ephemeral") == 0) { + no_resume_ephemeral = 1; } -#endif #ifndef OPENSSL_NO_PSK else if (strcmp(*argv, "-psk_hint") == 0) { if (--argc < 1) @@ -1272,32 +1426,18 @@ int MAIN(int argc, char *argv[]) meth = TLSv1_server_method(); } #endif - else if (strcmp(*argv, "-www") == 0) { + else if (strcmp(*argv, "-rev") == 0) { + rev = 1; + } else if (strcmp(*argv, "-www") == 0) { www = 1; } else if (strcmp(*argv, "-WWW") == 0) { www = 2; } else if (strcmp(*argv, "-HTTP") == 0) { www = 3; - } else if (strcmp(*argv, "-no_ssl2") == 0) { - off |= SSL_OP_NO_SSLv2; - } else if (strcmp(*argv, "-no_ssl3") == 0) { - off |= SSL_OP_NO_SSLv3; - } else if (strcmp(*argv, "-no_tls1") == 0) { - off |= SSL_OP_NO_TLSv1; - } else if (strcmp(*argv, "-no_tls1_1") == 0) { - off |= SSL_OP_NO_TLSv1_1; - } else if (strcmp(*argv, "-no_tls1_2") == 0) { - off |= SSL_OP_NO_TLSv1_2; - } else if (strcmp(*argv, "-no_comp") == 0) { - off |= SSL_OP_NO_COMPRESSION; } -#ifndef OPENSSL_NO_TLSEXT - else if (strcmp(*argv, "-no_ticket") == 0) { - off |= SSL_OP_NO_TICKET; - } -#endif #ifndef OPENSSL_NO_SSL2 else if (strcmp(*argv, "-ssl2") == 0) { + no_ecdhe = 1; meth = SSLv2_server_method(); } #endif @@ -1316,9 +1456,15 @@ int MAIN(int argc, char *argv[]) } #endif #ifndef OPENSSL_NO_DTLS1 - else if (strcmp(*argv, "-dtls1") == 0) { + else if (strcmp(*argv, "-dtls") == 0) { + meth = DTLS_server_method(); + socket_type = SOCK_DGRAM; + } else if (strcmp(*argv, "-dtls1") == 0) { meth = DTLSv1_server_method(); socket_type = SOCK_DGRAM; + } else if (strcmp(*argv, "-dtls1_2") == 0) { + meth = DTLSv1_2_server_method(); + socket_type = SOCK_DGRAM; } else if (strcmp(*argv, "-timeout") == 0) enable_timeouts = 1; else if (strcmp(*argv, "-mtu") == 0) { @@ -1368,6 +1514,11 @@ int MAIN(int argc, char *argv[]) next_proto_neg_in = *(++argv); } # endif + else if (strcmp(*argv, "-alpn") == 0) { + if (--argc < 1) + goto bad; + alpn_in = *(++argv); + } #endif #if !defined(OPENSSL_NO_JPAKE) && !defined(OPENSSL_NO_PSK) else if (strcmp(*argv, "-jpake") == 0) { @@ -1420,11 +1571,6 @@ int MAIN(int argc, char *argv[]) goto end; } psk_identity = "JPAKE"; - if (cipher) { - BIO_printf(bio_err, "JPAKE sets cipher to PSK\n"); - goto end; - } - cipher = "PSK"; } #endif @@ -1447,6 +1593,9 @@ int MAIN(int argc, char *argv[]) s_key_file2 = s_cert_file2; #endif + if (!load_excert(&exc, bio_err)) + goto end; + if (nocert == 0) { s_key = load_key(bio_err, s_key_file, s_key_format, 0, pass, e, "server certificate private key file"); @@ -1462,6 +1611,12 @@ int MAIN(int argc, char *argv[]) ERR_print_errors(bio_err); goto end; } + if (s_chain_file) { + s_chain = load_certs(bio_err, s_chain_file, FORMAT_PEM, + NULL, e, "server certificate chain"); + if (!s_chain) + goto end; + } #ifndef OPENSSL_NO_TLSEXT if (tlsextcbp.servername) { s_key2 = load_key(bio_err, s_key_file2, s_key_format, 0, pass, e, @@ -1479,9 +1634,10 @@ int MAIN(int argc, char *argv[]) goto end; } } -#endif +#endif /* OPENSSL_NO_TLSEXT */ } -#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) +#if !defined(OPENSSL_NO_TLSEXT) +# if !defined(OPENSSL_NO_NEXTPROTONEG) if (next_proto_neg_in) { unsigned short len; next_proto.data = next_protos_parse(&len, next_proto_neg_in); @@ -1491,8 +1647,34 @@ int MAIN(int argc, char *argv[]) } else { next_proto.data = NULL; } +# endif + alpn_ctx.data = NULL; + if (alpn_in) { + unsigned short len; + alpn_ctx.data = next_protos_parse(&len, alpn_in); + if (alpn_ctx.data == NULL) + goto end; + alpn_ctx.len = len; + } #endif + if (crl_file) { + X509_CRL *crl; + crl = load_crl(crl_file, crl_format); + if (!crl) { + BIO_puts(bio_err, "Error loading CRL\n"); + ERR_print_errors(bio_err); + goto end; + } + crls = sk_X509_CRL_new_null(); + if (!crls || !sk_X509_CRL_push(crls, crl)) { + BIO_puts(bio_err, "Error adding CRL\n"); + ERR_print_errors(bio_err); + X509_CRL_free(crl); + goto end; + } + } + if (s_dcert_file) { if (s_dkey_file == NULL) @@ -1512,6 +1694,12 @@ int MAIN(int argc, char *argv[]) ERR_print_errors(bio_err); goto end; } + if (s_dchain_file) { + s_dchain = load_certs(bio_err, s_dchain_file, FORMAT_PEM, + NULL, e, "second server certificate chain"); + if (!s_dchain) + goto end; + } } @@ -1525,8 +1713,10 @@ int MAIN(int argc, char *argv[]) app_RAND_load_files(inrand)); if (bio_s_out == NULL) { - if (s_quiet && !s_debug && !s_msg) { + if (s_quiet && !s_debug) { bio_s_out = BIO_new(BIO_s_null()); + if (s_msg && !bio_s_msg) + bio_s_msg = BIO_new_fp(stdout, BIO_NOCLOSE); } else { if (bio_s_out == NULL) bio_s_out = BIO_new_fp(stdout, BIO_NOCLOSE); @@ -1566,16 +1756,17 @@ int MAIN(int argc, char *argv[]) BIO_printf(bio_err, "id_prefix '%s' set.\n", session_id_prefix); } SSL_CTX_set_quiet_shutdown(ctx, 1); - if (bugs) - SSL_CTX_set_options(ctx, SSL_OP_ALL); if (hack) SSL_CTX_set_options(ctx, SSL_OP_NETSCAPE_DEMO_CIPHER_CHANGE_BUG); - SSL_CTX_set_options(ctx, off); + if (exc) + ssl_ctx_set_excert(ctx, exc); if (state) SSL_CTX_set_info_callback(ctx, apps_ssl_info_callback); if (no_cache) SSL_CTX_set_session_cache_mode(ctx, SSL_SESS_CACHE_OFF); + else if (ext_cache) + init_session_cache_ctx(ctx); else SSL_CTX_sess_set_cache_size(ctx, 128); @@ -1606,6 +1797,17 @@ int MAIN(int argc, char *argv[]) if (vpm) SSL_CTX_set1_param(ctx, vpm); + ssl_ctx_add_crls(ctx, crls, 0); + + if (!args_ssl_call(ctx, bio_err, cctx, ssl_args, no_ecdhe, no_jpake)) + goto end; + + if (!ssl_load_stores(ctx, vfyCApath, vfyCAfile, chCApath, chCAfile, + crls, crl_download)) { + BIO_printf(bio_err, "Error loading store locations\n"); + ERR_print_errors(bio_err); + goto end; + } #ifndef OPENSSL_NO_TLSEXT if (s_cert2) { ctx2 = SSL_CTX_new(meth); @@ -1633,17 +1835,18 @@ int MAIN(int argc, char *argv[]) BIO_printf(bio_err, "id_prefix '%s' set.\n", session_id_prefix); } SSL_CTX_set_quiet_shutdown(ctx2, 1); - if (bugs) - SSL_CTX_set_options(ctx2, SSL_OP_ALL); if (hack) SSL_CTX_set_options(ctx2, SSL_OP_NETSCAPE_DEMO_CIPHER_CHANGE_BUG); - SSL_CTX_set_options(ctx2, off); + if (exc) + ssl_ctx_set_excert(ctx2, exc); if (state) SSL_CTX_set_info_callback(ctx2, apps_ssl_info_callback); if (no_cache) SSL_CTX_set_session_cache_mode(ctx2, SSL_SESS_CACHE_OFF); + else if (ext_cache) + init_session_cache_ctx(ctx2); else SSL_CTX_sess_set_cache_size(ctx2, 128); @@ -1653,12 +1856,20 @@ int MAIN(int argc, char *argv[]) } if (vpm) SSL_CTX_set1_param(ctx2, vpm); + + ssl_ctx_add_crls(ctx2, crls, 0); + + if (!args_ssl_call(ctx2, bio_err, cctx, ssl_args, no_ecdhe, no_jpake)) + goto end; + } # ifndef OPENSSL_NO_NEXTPROTONEG if (next_proto.data) SSL_CTX_set_next_protos_advertised_cb(ctx, next_proto_cb, &next_proto); # endif + if (alpn_ctx.data) + SSL_CTX_set_alpn_select_cb(ctx, alpn_cb, &alpn_ctx); #endif #ifndef OPENSSL_NO_DH @@ -1702,54 +1913,21 @@ int MAIN(int argc, char *argv[]) } #endif -#ifndef OPENSSL_NO_ECDH - if (!no_ecdhe) { - EC_KEY *ecdh = NULL; - - if (named_curve) { - int nid = OBJ_sn2nid(named_curve); - - if (nid == 0) { - BIO_printf(bio_err, "unknown curve name (%s)\n", named_curve); - goto end; - } - ecdh = EC_KEY_new_by_curve_name(nid); - if (ecdh == NULL) { - BIO_printf(bio_err, "unable to create curve (%s)\n", - named_curve); - goto end; - } - } - - if (ecdh != NULL) { - BIO_printf(bio_s_out, "Setting temp ECDH parameters\n"); - } else { - BIO_printf(bio_s_out, "Using default temp ECDH parameters\n"); - ecdh = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1); - if (ecdh == NULL) { - BIO_printf(bio_err, "unable to create curve (nistp256)\n"); - goto end; - } - } - (void)BIO_flush(bio_s_out); - - SSL_CTX_set_tmp_ecdh(ctx, ecdh); -# ifndef OPENSSL_NO_TLSEXT - if (ctx2) - SSL_CTX_set_tmp_ecdh(ctx2, ecdh); -# endif - EC_KEY_free(ecdh); + if (!set_cert_key_stuff(ctx, s_cert, s_key, s_chain, build_chain)) + goto end; +#ifndef OPENSSL_NO_TLSEXT + if (s_serverinfo_file != NULL + && !SSL_CTX_use_serverinfo_file(ctx, s_serverinfo_file)) { + ERR_print_errors(bio_err); + goto end; } #endif - - if (!set_cert_key_stuff(ctx, s_cert, s_key)) - goto end; #ifndef OPENSSL_NO_TLSEXT - if (ctx2 && !set_cert_key_stuff(ctx2, s_cert2, s_key2)) + if (ctx2 && !set_cert_key_stuff(ctx2, s_cert2, s_key2, NULL, build_chain)) goto end; #endif if (s_dcert != NULL) { - if (!set_cert_key_stuff(ctx, s_dcert, s_dkey)) + if (!set_cert_key_stuff(ctx, s_dcert, s_dkey, s_dchain, build_chain)) goto end; } #ifndef OPENSSL_NO_RSA @@ -1808,20 +1986,6 @@ int MAIN(int argc, char *argv[]) } #endif - if (cipher != NULL) { - if (!SSL_CTX_set_cipher_list(ctx, cipher)) { - BIO_printf(bio_err, "error setting cipher list\n"); - ERR_print_errors(bio_err); - goto end; - } -#ifndef OPENSSL_NO_TLSEXT - if (ctx2 && !SSL_CTX_set_cipher_list(ctx2, cipher)) { - BIO_printf(bio_err, "error setting cipher list\n"); - ERR_print_errors(bio_err); - goto end; - } -#endif - } SSL_CTX_set_verify(ctx, s_server_verify, verify_callback); SSL_CTX_set_session_id_context(ctx, (void *)&s_server_session_id_context, sizeof s_server_session_id_context); @@ -1873,10 +2037,15 @@ int MAIN(int argc, char *argv[]) BIO_printf(bio_s_out, "ACCEPT\n"); (void)BIO_flush(bio_s_out); - if (www) - do_server(port, socket_type, &accept_socket, www_body, context); + if (rev) + do_server(port, socket_type, &accept_socket, rev_body, context, + naccept); + else if (www) + do_server(port, socket_type, &accept_socket, www_body, context, + naccept); else - do_server(port, socket_type, &accept_socket, sv_body, context); + do_server(port, socket_type, &accept_socket, sv_body, context, + naccept); print_stats(bio_s_out, ctx); ret = 0; end: @@ -1884,18 +2053,25 @@ int MAIN(int argc, char *argv[]) SSL_CTX_free(ctx); if (s_cert) X509_free(s_cert); + if (crls) + sk_X509_CRL_pop_free(crls, X509_CRL_free); if (s_dcert) X509_free(s_dcert); if (s_key) EVP_PKEY_free(s_key); if (s_dkey) EVP_PKEY_free(s_dkey); + if (s_chain) + sk_X509_pop_free(s_chain, X509_free); + if (s_dchain) + sk_X509_pop_free(s_dchain, X509_free); if (pass) OPENSSL_free(pass); if (dpass) OPENSSL_free(dpass); if (vpm) X509_VERIFY_PARAM_free(vpm); + free_sessions(); #ifndef OPENSSL_NO_TLSEXT if (tlscstatp.host) OPENSSL_free(tlscstatp.host); @@ -1909,11 +2085,32 @@ int MAIN(int argc, char *argv[]) X509_free(s_cert2); if (s_key2) EVP_PKEY_free(s_key2); + if (serverinfo_in != NULL) + BIO_free(serverinfo_in); +# ifndef OPENSSL_NO_NEXTPROTONEG + if (next_proto.data) + OPENSSL_free(next_proto.data); +# endif + if (alpn_ctx.data) + OPENSSL_free(alpn_ctx.data); +#endif + ssl_excert_free(exc); + if (ssl_args) + sk_OPENSSL_STRING_free(ssl_args); + if (cctx) + SSL_CONF_CTX_free(cctx); +#ifndef OPENSSL_NO_JPAKE + if (jpake_secret && psk_key) + OPENSSL_free(psk_key); #endif if (bio_s_out != NULL) { BIO_free(bio_s_out); bio_s_out = NULL; } + if (bio_s_msg != NULL) { + BIO_free(bio_s_msg); + bio_s_msg = NULL; + } apps_shutdown(); OPENSSL_EXIT(ret); } @@ -1946,7 +2143,7 @@ static void print_stats(BIO *bio, SSL_CTX *ssl_ctx) SSL_CTX_sess_get_cache_size(ssl_ctx)); } -static int sv_body(char *hostname, int s, unsigned char *context) +static int sv_body(char *hostname, int s, int stype, unsigned char *context) { char *buf = NULL; fd_set readfds; @@ -2010,7 +2207,7 @@ static int sv_body(char *hostname, int s, unsigned char *context) # endif #endif - if (SSL_version(con) == DTLS1_VERSION) { + if (stype == SOCK_DGRAM) { sbio = BIO_new_dgram(s, BIO_NOCLOSE); @@ -2069,8 +2266,13 @@ static int sv_body(char *hostname, int s, unsigned char *context) BIO_set_callback_arg(SSL_get_rbio(con), (char *)bio_s_out); } if (s_msg) { - SSL_set_msg_callback(con, msg_cb); - SSL_set_msg_callback_arg(con, bio_s_out); +#ifndef OPENSSL_NO_SSL_TRACE + if (s_msg == 2) + SSL_set_msg_callback(con, SSL_trace); + else +#endif + SSL_set_msg_callback(con, msg_cb); + SSL_set_msg_callback_arg(con, bio_s_msg ? bio_s_msg : bio_s_out); } #ifndef OPENSSL_NO_TLSEXT if (s_tlsextdebug) { @@ -2168,7 +2370,7 @@ static int sv_body(char *hostname, int s, unsigned char *context) assert(lf_num == 0); } else i = raw_read_stdin(buf, bufsize); - if (!s_quiet) { + if (!s_quiet && !s_brief) { if ((i <= 0) || (buf[0] == 'Q')) { BIO_printf(bio_s_out, "DONE\n"); SHUTDOWN(s); @@ -2383,6 +2585,16 @@ static int init_ssl_connection(SSL *con) unsigned char *exportedkeymat; i = SSL_accept(con); +#ifdef CERT_CB_TEST_RETRY + { + while (i <= 0 && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP + && SSL_state(con) == SSL3_ST_SR_CLNT_HELLO_C) { + fprintf(stderr, + "LOOKUP from certificate callback during accept\n"); + i = SSL_accept(con); + } + } +#endif #ifndef OPENSSL_NO_SRP while (i <= 0 && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP during accept %s\n", @@ -2398,6 +2610,7 @@ static int init_ssl_connection(SSL *con) i = SSL_accept(con); } #endif + if (i <= 0) { if (BIO_sock_should_retry(i)) { BIO_printf(bio_s_out, "DELAY\n"); @@ -2409,11 +2622,15 @@ static int init_ssl_connection(SSL *con) if (verify_error != X509_V_OK) { BIO_printf(bio_err, "verify error:%s\n", X509_verify_cert_error_string(verify_error)); - } else - ERR_print_errors(bio_err); + } + /* Always print any error messages */ + ERR_print_errors(bio_err); return (0); } + if (s_brief) + print_ssl_summary(bio_err, con); + PEM_write_bio_SSL_SESSION(bio_s_out, SSL_get_session(con)); peer = SSL_get_peer_certificate(con); @@ -2430,6 +2647,11 @@ static int init_ssl_connection(SSL *con) if (SSL_get_shared_ciphers(con, buf, sizeof buf) != NULL) BIO_printf(bio_s_out, "Shared ciphers:%s\n", buf); str = SSL_CIPHER_get_name(SSL_get_current_cipher(con)); + ssl_print_sigalgs(bio_s_out, con); +#ifndef OPENSSL_NO_EC + ssl_print_point_formats(bio_s_out, con); + ssl_print_curves(bio_s_out, con, 0); +#endif BIO_printf(bio_s_out, "CIPHER is %s\n", (str != NULL) ? str : "(NONE)"); #if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) @@ -2529,7 +2751,7 @@ static int load_CA(SSL_CTX *ctx, char *file) } #endif -static int www_body(char *hostname, int s, unsigned char *context) +static int www_body(char *hostname, int s, int stype, unsigned char *context) { char *buf = NULL; int ret = 1; @@ -2604,8 +2826,13 @@ static int www_body(char *hostname, int s, unsigned char *context) BIO_set_callback_arg(SSL_get_rbio(con), (char *)bio_s_out); } if (s_msg) { - SSL_set_msg_callback(con, msg_cb); - SSL_set_msg_callback_arg(con, bio_s_out); +#ifndef OPENSSL_NO_SSL_TRACE + if (s_msg == 2) + SSL_set_msg_callback(con, SSL_trace); + else +#endif + SSL_set_msg_callback(con, msg_cb); + SSL_set_msg_callback_arg(con, bio_s_msg ? bio_s_msg : bio_s_out); } for (;;) { @@ -2724,6 +2951,10 @@ static int www_body(char *hostname, int s, unsigned char *context) } BIO_puts(io, "\n"); } + ssl_print_sigalgs(io, con); +#ifndef OPENSSL_NO_EC + ssl_print_curves(io, con, 0); +#endif BIO_printf(io, (SSL_cache_hit(con) ? "---\nReused, " : "---\nNew, ")); c = SSL_get_current_cipher(con); @@ -2906,6 +3137,140 @@ static int www_body(char *hostname, int s, unsigned char *context) return (ret); } +static int rev_body(char *hostname, int s, int stype, unsigned char *context) +{ + char *buf = NULL; + int i; + int ret = 1; + SSL *con; + BIO *io, *ssl_bio, *sbio; +#ifndef OPENSSL_NO_KRB5 + KSSL_CTX *kctx; +#endif + + buf = OPENSSL_malloc(bufsize); + if (buf == NULL) + return (0); + io = BIO_new(BIO_f_buffer()); + ssl_bio = BIO_new(BIO_f_ssl()); + if ((io == NULL) || (ssl_bio == NULL)) + goto err; + + /* lets make the output buffer a reasonable size */ + if (!BIO_set_write_buffer_size(io, bufsize)) + goto err; + + if ((con = SSL_new(ctx)) == NULL) + goto err; +#ifndef OPENSSL_NO_TLSEXT + if (s_tlsextdebug) { + SSL_set_tlsext_debug_callback(con, tlsext_cb); + SSL_set_tlsext_debug_arg(con, bio_s_out); + } +#endif +#ifndef OPENSSL_NO_KRB5 + if ((kctx = kssl_ctx_new()) != NULL) { + kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC); + kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB); + } +#endif /* OPENSSL_NO_KRB5 */ + if (context) + SSL_set_session_id_context(con, context, strlen((char *)context)); + + sbio = BIO_new_socket(s, BIO_NOCLOSE); + SSL_set_bio(con, sbio, sbio); + SSL_set_accept_state(con); + + BIO_set_ssl(ssl_bio, con, BIO_CLOSE); + BIO_push(io, ssl_bio); +#ifdef CHARSET_EBCDIC + io = BIO_push(BIO_new(BIO_f_ebcdic_filter()), io); +#endif + + if (s_debug) { + SSL_set_debug(con, 1); + BIO_set_callback(SSL_get_rbio(con), bio_dump_callback); + BIO_set_callback_arg(SSL_get_rbio(con), (char *)bio_s_out); + } + if (s_msg) { +#ifndef OPENSSL_NO_SSL_TRACE + if (s_msg == 2) + SSL_set_msg_callback(con, SSL_trace); + else +#endif + SSL_set_msg_callback(con, msg_cb); + SSL_set_msg_callback_arg(con, bio_s_msg ? bio_s_msg : bio_s_out); + } + + for (;;) { + i = BIO_do_handshake(io); + if (i > 0) + break; + if (!BIO_should_retry(io)) { + BIO_puts(bio_err, "CONNECTION FAILURE\n"); + ERR_print_errors(bio_err); + goto end; + } + } + BIO_printf(bio_err, "CONNECTION ESTABLISHED\n"); + print_ssl_summary(bio_err, con); + + for (;;) { + i = BIO_gets(io, buf, bufsize - 1); + if (i < 0) { /* error */ + if (!BIO_should_retry(io)) { + if (!s_quiet) + ERR_print_errors(bio_err); + goto err; + } else { + BIO_printf(bio_s_out, "read R BLOCK\n"); +#if defined(OPENSSL_SYS_NETWARE) + delay(1000); +#elif !defined(OPENSSL_SYS_MSDOS) && !defined(__DJGPP__) + sleep(1); +#endif + continue; + } + } else if (i == 0) { /* end of input */ + ret = 1; + BIO_printf(bio_err, "CONNECTION CLOSED\n"); + goto end; + } else { + char *p = buf + i - 1; + while (i && (*p == '\n' || *p == '\r')) { + p--; + i--; + } + if (!s_ign_eof && i == 5 && !strncmp(buf, "CLOSE", 5)) { + ret = 1; + BIO_printf(bio_err, "CONNECTION CLOSED\n"); + goto end; + } + BUF_reverse((unsigned char *)buf, NULL, i); + buf[i] = '\n'; + BIO_write(io, buf, i + 1); + for (;;) { + i = BIO_flush(io); + if (i > 0) + break; + if (!BIO_should_retry(io)) + goto end; + } + } + } + end: + /* make sure we re-use sessions */ + SSL_set_shutdown(con, SSL_SENT_SHUTDOWN | SSL_RECEIVED_SHUTDOWN); + + err: + + if (buf != NULL) + OPENSSL_free(buf); + if (io != NULL) + BIO_free_all(io); + return (ret); +} + #ifndef OPENSSL_NO_RSA static RSA MS_CALLBACK *tmp_rsa_cb(SSL *s, int is_export, int keylength) { @@ -2961,3 +3326,116 @@ static int generate_session_id(const SSL *ssl, unsigned char *id, return 0; return 1; } + +/* + * By default s_server uses an in-memory cache which caches SSL_SESSION + * structures without any serialisation. This hides some bugs which only + * become apparent in deployed servers. By implementing a basic external + * session cache some issues can be debugged using s_server. + */ + +typedef struct simple_ssl_session_st { + unsigned char *id; + unsigned int idlen; + unsigned char *der; + int derlen; + struct simple_ssl_session_st *next; +} simple_ssl_session; + +static simple_ssl_session *first = NULL; + +static int add_session(SSL *ssl, SSL_SESSION *session) +{ + simple_ssl_session *sess; + unsigned char *p; + + sess = OPENSSL_malloc(sizeof(simple_ssl_session)); + if (!sess) { + BIO_printf(bio_err, "Out of memory adding session to external cache\n"); + return 0; + } + + SSL_SESSION_get_id(session, &sess->idlen); + sess->derlen = i2d_SSL_SESSION(session, NULL); + + sess->id = BUF_memdup(SSL_SESSION_get_id(session, NULL), sess->idlen); + + sess->der = OPENSSL_malloc(sess->derlen); + if (!sess->id || !sess->der) { + BIO_printf(bio_err, "Out of memory adding session to external cache\n"); + + if (sess->id) + OPENSSL_free(sess->id); + if (sess->der) + OPENSSL_free(sess->der); + OPENSSL_free(sess); + return 0; + } + p = sess->der; + i2d_SSL_SESSION(session, &p); + + sess->next = first; + first = sess; + BIO_printf(bio_err, "New session added to external cache\n"); + return 0; +} + +static SSL_SESSION *get_session(SSL *ssl, unsigned char *id, int idlen, + int *do_copy) +{ + simple_ssl_session *sess; + *do_copy = 0; + for (sess = first; sess; sess = sess->next) { + if (idlen == (int)sess->idlen && !memcmp(sess->id, id, idlen)) { + const unsigned char *p = sess->der; + BIO_printf(bio_err, "Lookup session: cache hit\n"); + return d2i_SSL_SESSION(NULL, &p, sess->derlen); + } + } + BIO_printf(bio_err, "Lookup session: cache miss\n"); + return NULL; +} + +static void del_session(SSL_CTX *sctx, SSL_SESSION *session) +{ + simple_ssl_session *sess, *prev = NULL; + const unsigned char *id; + unsigned int idlen; + id = SSL_SESSION_get_id(session, &idlen); + for (sess = first; sess; sess = sess->next) { + if (idlen == sess->idlen && !memcmp(sess->id, id, idlen)) { + if (prev) + prev->next = sess->next; + else + first = sess->next; + OPENSSL_free(sess->id); + OPENSSL_free(sess->der); + OPENSSL_free(sess); + return; + } + prev = sess; + } +} + +static void init_session_cache_ctx(SSL_CTX *sctx) +{ + SSL_CTX_set_session_cache_mode(sctx, + SSL_SESS_CACHE_NO_INTERNAL | + SSL_SESS_CACHE_SERVER); + SSL_CTX_sess_set_new_cb(sctx, add_session); + SSL_CTX_sess_set_get_cb(sctx, get_session); + SSL_CTX_sess_set_remove_cb(sctx, del_session); +} + +static void free_sessions(void) +{ + simple_ssl_session *sess, *tsess; + for (sess = first; sess;) { + OPENSSL_free(sess->id); + OPENSSL_free(sess->der); + tsess = sess; + sess = sess->next; + OPENSSL_free(tsess); + } + first = NULL; +} diff --git a/apps/s_socket.c b/apps/s_socket.c index 9e5565d16a4b..77a7688f8d0c 100644 --- a/apps/s_socket.c +++ b/apps/s_socket.c @@ -290,8 +290,9 @@ static int init_client_ip(int *sock, unsigned char ip[4], int port, int type) } int do_server(int port, int type, int *ret, - int (*cb) (char *hostname, int s, unsigned char *context), - unsigned char *context) + int (*cb) (char *hostname, int s, int stype, + unsigned char *context), unsigned char *context, + int naccept) { int sock; char *name = NULL; @@ -313,12 +314,14 @@ int do_server(int port, int type, int *ret, } } else sock = accept_socket; - i = (*cb) (name, sock, context); + i = (*cb) (name, sock, type, context); if (name != NULL) OPENSSL_free(name); if (type == SOCK_STREAM) SHUTDOWN2(sock); - if (i < 0) { + if (naccept != -1) + naccept--; + if (i < 0 || naccept == 0) { SHUTDOWN2(accept_socket); return (i); } diff --git a/apps/smime.c b/apps/smime.c index 53e43c5e7322..6044ccf5f590 100644 --- a/apps/smime.c +++ b/apps/smime.c @@ -634,6 +634,12 @@ int MAIN(int argc, char **argv) p7 = PKCS7_sign(NULL, NULL, other, in, flags); if (!p7) goto end; + if (flags & PKCS7_NOCERTS) { + for (i = 0; i < sk_X509_num(other); i++) { + X509 *x = sk_X509_value(other, i); + PKCS7_add_certificate(p7, x); + } + } } else flags |= PKCS7_REUSE_DIGEST; for (i = 0; i < sk_OPENSSL_STRING_num(sksigners); i++) { diff --git a/apps/speed.c b/apps/speed.c index 7d9fd8ac9823..3697b71ec18b 100644 --- a/apps/speed.c +++ b/apps/speed.c @@ -366,6 +366,8 @@ static void *KDF1_SHA1(const void *in, size_t inlen, void *out, } # endif /* OPENSSL_NO_ECDH */ +static void multiblock_speed(const EVP_CIPHER *evp_cipher); + int MAIN(int, char **); int MAIN(int argc, char **argv) @@ -646,6 +648,7 @@ int MAIN(int argc, char **argv) # ifndef NO_FORK int multi = 0; # endif + int multiblock = 0; # ifndef TIMES usertime = -1; @@ -776,6 +779,9 @@ int MAIN(int argc, char **argv) mr = 1; j--; /* Otherwise, -mr gets confused with an * algorithm. */ + } else if (argc > 0 && !strcmp(*argv, "-mb")) { + multiblock = 1; + j--; } else # ifndef OPENSSL_NO_MD2 if (strcmp(*argv, "md2") == 0) @@ -1941,6 +1947,20 @@ int MAIN(int argc, char **argv) # endif if (doit[D_EVP]) { +# ifdef EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK + if (multiblock && evp_cipher) { + if (! + (EVP_CIPHER_flags(evp_cipher) & + EVP_CIPH_FLAG_TLS1_1_MULTIBLOCK)) { + fprintf(stderr, "%s is not multi-block capable\n", + OBJ_nid2ln(evp_cipher->nid)); + goto end; + } + multiblock_speed(evp_cipher); + mret = 0; + goto end; + } +# endif for (j = 0; j < SIZE_NUM; j++) { if (evp_cipher) { EVP_CIPHER_CTX ctx; @@ -2742,4 +2762,113 @@ static int do_multi(int multi) return 1; } # endif + +static void multiblock_speed(const EVP_CIPHER *evp_cipher) +{ + static int mblengths[] = + { 8 * 1024, 2 * 8 * 1024, 4 * 8 * 1024, 8 * 8 * 1024, 8 * 16 * 1024 }; + int j, count, num = sizeof(lengths) / sizeof(lengths[0]); + const char *alg_name; + unsigned char *inp, *out, no_key[32], no_iv[16]; + EVP_CIPHER_CTX ctx; + double d = 0.0; + + inp = OPENSSL_malloc(mblengths[num - 1]); + out = OPENSSL_malloc(mblengths[num - 1] + 1024); + if (!inp || !out) { + BIO_printf(bio_err,"Out of memory\n"); + goto end; + } + + + EVP_CIPHER_CTX_init(&ctx); + EVP_EncryptInit_ex(&ctx, evp_cipher, NULL, no_key, no_iv); + EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_AEAD_SET_MAC_KEY, sizeof(no_key), + no_key); + alg_name = OBJ_nid2ln(evp_cipher->nid); + + for (j = 0; j < num; j++) { + print_message(alg_name, 0, mblengths[j]); + Time_F(START); + for (count = 0, run = 1; run && count < 0x7fffffff; count++) { + unsigned char aad[EVP_AEAD_TLS1_AAD_LEN]; + EVP_CTRL_TLS1_1_MULTIBLOCK_PARAM mb_param; + size_t len = mblengths[j]; + int packlen; + + memset(aad, 0, 8); /* avoid uninitialized values */ + aad[8] = 23; /* SSL3_RT_APPLICATION_DATA */ + aad[9] = 3; /* version */ + aad[10] = 2; + aad[11] = 0; /* length */ + aad[12] = 0; + mb_param.out = NULL; + mb_param.inp = aad; + mb_param.len = len; + mb_param.interleave = 8; + + packlen = EVP_CIPHER_CTX_ctrl(&ctx, + EVP_CTRL_TLS1_1_MULTIBLOCK_AAD, + sizeof(mb_param), &mb_param); + + if (packlen > 0) { + mb_param.out = out; + mb_param.inp = inp; + mb_param.len = len; + EVP_CIPHER_CTX_ctrl(&ctx, + EVP_CTRL_TLS1_1_MULTIBLOCK_ENCRYPT, + sizeof(mb_param), &mb_param); + } else { + int pad; + + RAND_bytes(out, 16); + len += 16; + aad[11] = len >> 8; + aad[12] = len; + pad = EVP_CIPHER_CTX_ctrl(&ctx, + EVP_CTRL_AEAD_TLS1_AAD, + EVP_AEAD_TLS1_AAD_LEN, aad); + EVP_Cipher(&ctx, out, inp, len + pad); + } + } + d = Time_F(STOP); + BIO_printf(bio_err, + mr ? "+R:%d:%s:%f\n" + : "%d %s's in %.2fs\n", count, "evp", d); + results[D_EVP][j] = ((double)count) / d * mblengths[j]; + } + + if (mr) { + fprintf(stdout, "+H"); + for (j = 0; j < num; j++) + fprintf(stdout, ":%d", mblengths[j]); + fprintf(stdout, "\n"); + fprintf(stdout, "+F:%d:%s", D_EVP, alg_name); + for (j = 0; j < num; j++) + fprintf(stdout, ":%.2f", results[D_EVP][j]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, + "The 'numbers' are in 1000s of bytes per second processed.\n"); + fprintf(stdout, "type "); + for (j = 0; j < num; j++) + fprintf(stdout, "%7d bytes", mblengths[j]); + fprintf(stdout, "\n"); + fprintf(stdout, "%-24s", alg_name); + + for (j = 0; j < num; j++) { + if (results[D_EVP][j] > 10000) + fprintf(stdout, " %11.2fk", results[D_EVP][j] / 1e3); + else + fprintf(stdout, " %11.2f ", results[D_EVP][j]); + } + fprintf(stdout, "\n"); + } + +end: + if (inp) + OPENSSL_free(inp); + if (out) + OPENSSL_free(out); +} #endif diff --git a/apps/verify.c b/apps/verify.c index e29f9bb7e09c..78e729fc890f 100644 --- a/apps/verify.c +++ b/apps/verify.c @@ -88,6 +88,7 @@ int MAIN(int argc, char **argv) X509_STORE *cert_ctx = NULL; X509_LOOKUP *lookup = NULL; X509_VERIFY_PARAM *vpm = NULL; + int crl_download = 0; #ifndef OPENSSL_NO_ENGINE char *engine = NULL; #endif @@ -136,7 +137,8 @@ int MAIN(int argc, char **argv) if (argc-- < 1) goto end; crlfile = *(++argv); - } + } else if (strcmp(*argv, "-crl_download") == 0) + crl_download = 1; #ifndef OPENSSL_NO_ENGINE else if (strcmp(*argv, "-engine") == 0) { if (--argc < 1) @@ -214,6 +216,9 @@ int MAIN(int argc, char **argv) } ret = 0; + + if (crl_download) + store_setup_crl_download(cert_ctx); if (argc < 1) { if (1 != check(cert_ctx, NULL, untrusted, trusted, crls, e)) ret = -1; diff --git a/apps/x509.c b/apps/x509.c index 929359b0da0b..864a60dda2e7 100644 --- a/apps/x509.c +++ b/apps/x509.c @@ -150,6 +150,9 @@ static const char *x509_usage[] = { " -engine e - use engine e, possibly a hardware device.\n", #endif " -certopt arg - various certificate text options\n", + " -checkhost host - check certificate matches \"host\"\n", + " -checkemail email - check certificate matches \"email\"\n", + " -checkip ipaddr - check certificate matches \"ipaddr\"\n", NULL }; @@ -163,6 +166,9 @@ static int x509_certify(X509_STORE *ctx, char *CAfile, const EVP_MD *digest, char *section, ASN1_INTEGER *sno); static int purpose_print(BIO *bio, X509 *cert, X509_PURPOSE *pt); static int reqfile = 0; +#ifdef OPENSSL_SSL_DEBUG_BROKEN_PROTOCOL +static int force_version = 2; +#endif int MAIN(int, char **); @@ -174,15 +180,16 @@ int MAIN(int argc, char **argv) X509 *x = NULL, *xca = NULL; ASN1_OBJECT *objtmp; STACK_OF(OPENSSL_STRING) *sigopts = NULL; - EVP_PKEY *Upkey = NULL, *CApkey = NULL; + EVP_PKEY *Upkey = NULL, *CApkey = NULL, *fkey = NULL; ASN1_INTEGER *sno = NULL; - int i, num, badops = 0; + int i, num, badops = 0, badsig = 0; BIO *out = NULL; BIO *STDout = NULL; STACK_OF(ASN1_OBJECT) *trust = NULL, *reject = NULL; int informat, outformat, keyformat, CAformat, CAkeyformat; char *infile = NULL, *outfile = NULL, *keyfile = NULL, *CAfile = NULL; char *CAkeyfile = NULL, *CAserial = NULL; + char *fkeyfile = NULL; char *alias = NULL; int text = 0, serial = 0, subject = 0, issuer = 0, startdate = 0, enddate = 0; @@ -208,6 +215,9 @@ int MAIN(int argc, char **argv) int need_rand = 0; int checkend = 0, checkoffset = 0; unsigned long nmflag = 0, certflag = 0; + char *checkhost = NULL; + char *checkemail = NULL; + char *checkip = NULL; #ifndef OPENSSL_NO_ENGINE char *engine = NULL; #endif @@ -274,7 +284,15 @@ int MAIN(int argc, char **argv) sigopts = sk_OPENSSL_STRING_new_null(); if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv))) goto bad; - } else if (strcmp(*argv, "-days") == 0) { + } +#ifdef OPENSSL_SSL_DEBUG_BROKEN_PROTOCOL + else if (strcmp(*argv, "-force_version") == 0) { + if (--argc < 1) + goto bad; + force_version = atoi(*(++argv)) - 1; + } +#endif + else if (strcmp(*argv, "-days") == 0) { if (--argc < 1) goto bad; days = atoi(*(++argv)); @@ -327,6 +345,10 @@ int MAIN(int argc, char **argv) goto bad; if (!(sno = s2i_ASN1_INTEGER(NULL, *(++argv)))) goto bad; + } else if (strcmp(*argv, "-force_pubkey") == 0) { + if (--argc < 1) + goto bad; + fkeyfile = *(++argv); } else if (strcmp(*argv, "-addtrust") == 0) { if (--argc < 1) goto bad; @@ -424,6 +446,18 @@ int MAIN(int argc, char **argv) goto bad; checkoffset = atoi(*(++argv)); checkend = 1; + } else if (strcmp(*argv, "-checkhost") == 0) { + if (--argc < 1) + goto bad; + checkhost = *(++argv); + } else if (strcmp(*argv, "-checkemail") == 0) { + if (--argc < 1) + goto bad; + checkemail = *(++argv); + } else if (strcmp(*argv, "-checkip") == 0) { + if (--argc < 1) + goto bad; + checkip = *(++argv); } else if (strcmp(*argv, "-noout") == 0) noout = ++num; else if (strcmp(*argv, "-trustout") == 0) @@ -447,6 +481,8 @@ int MAIN(int argc, char **argv) #endif else if (strcmp(*argv, "-ocspid") == 0) ocspid = ++num; + else if (strcmp(*argv, "-badsig") == 0) + badsig = 1; else if ((md_alg = EVP_get_digestbyname(*argv + 1))) { /* ok */ digest = md_alg; @@ -484,6 +520,13 @@ int MAIN(int argc, char **argv) goto end; } + if (fkeyfile) { + fkey = load_pubkey(bio_err, fkeyfile, keyformat, 0, + NULL, e, "Forced key"); + if (fkey == NULL) + goto end; + } + if ((CAkeyfile == NULL) && (CA_flag) && (CAformat == FORMAT_PEM)) { CAkeyfile = CAfile; } else if ((CA_flag) && (CAkeyfile == NULL)) { @@ -605,10 +648,13 @@ int MAIN(int argc, char **argv) X509_gmtime_adj(X509_get_notBefore(x), 0); X509_time_adj_ex(X509_get_notAfter(x), days, 0, NULL); - - pkey = X509_REQ_get_pubkey(req); - X509_set_pubkey(x, pkey); - EVP_PKEY_free(pkey); + if (fkey) + X509_set_pubkey(x, fkey); + else { + pkey = X509_REQ_get_pubkey(req); + X509_set_pubkey(x, pkey); + EVP_PKEY_free(pkey); + } } else x = load_cert(bio_err, infile, informat, NULL, e, "Certificate"); @@ -937,11 +983,16 @@ int MAIN(int argc, char **argv) goto end; } + print_cert_checks(STDout, x, checkhost, checkemail, checkip); + if (noout) { ret = 0; goto end; } + if (badsig) + x->signature->data[x->signature->length - 1] ^= 0x1; + if (outformat == FORMAT_ASN1) i = i2d_X509_bio(out, x); else if (outformat == FORMAT_PEM) { @@ -982,6 +1033,7 @@ int MAIN(int argc, char **argv) X509_free(xca); EVP_PKEY_free(Upkey); EVP_PKEY_free(CApkey); + EVP_PKEY_free(fkey); if (sigopts) sk_OPENSSL_STRING_free(sigopts); X509_REQ_free(rq); @@ -1101,7 +1153,11 @@ static int x509_certify(X509_STORE *ctx, char *CAfile, const EVP_MD *digest, if (conf) { X509V3_CTX ctx2; +#ifdef OPENSSL_SSL_DEBUG_BROKEN_PROTOCOL + X509_set_version(x, force_version); +#else X509_set_version(x, 2); /* version 3 certificate */ +#endif X509V3_set_ctx(&ctx2, xca, x, NULL, NULL, 0); X509V3_set_nconf(&ctx2, conf); if (!X509V3_EXT_add_nconf(conf, &ctx2, section, x)) @@ -1186,7 +1242,11 @@ static int sign(X509 *x, EVP_PKEY *pkey, int days, int clrext, } if (conf) { X509V3_CTX ctx; +#ifdef OPENSSL_SSL_DEBUG_BROKEN_PROTOCOL + X509_set_version(x, force_version); +#else X509_set_version(x, 2); /* version 3 certificate */ +#endif X509V3_set_ctx(&ctx, x, x, NULL, NULL, 0); X509V3_set_nconf(&ctx, conf); if (!X509V3_EXT_add_nconf(conf, &ctx, section, x)) @@ -587,15 +587,33 @@ case "$GUESSOS" in fi ;; ppc64-*-linux2) + if [ -z "$KERNEL_BITS" ]; then + echo "WARNING! If you wish to build 64-bit library, then you have to" + echo " invoke './Configure linux-ppc64' *manually*." + if [ "$TEST" = "false" -a -t 1 ]; then + echo " You have about 5 seconds to press Ctrl-C to abort." + (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1 + fi + fi + if [ "$KERNEL_BITS" = "64" ]; then + OUT="linux-ppc64" + else + OUT="linux-ppc" + (echo "__LP64__" | gcc -E -x c - 2>/dev/null | grep "^__LP64__" 2>&1 > /dev/null) || options="$options -m32" + fi + ;; + ppc64le-*-linux2) OUT="linux-ppc64le" ;; + ppc-*-linux2) OUT="linux-ppc" ;; + mips64*-*-linux2) echo "WARNING! If you wish to build 64-bit library, then you have to" - echo " invoke './Configure linux-ppc64' *manually*." + echo " invoke './Configure linux64-mips64' *manually*." if [ "$TEST" = "false" -a -t 1 ]; then echo " You have about 5 seconds to press Ctrl-C to abort." (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1 fi - OUT="linux-ppc" + OUT="linux-mips64" ;; - ppc-*-linux2) OUT="linux-ppc" ;; + mips*-*-linux2) OUT="linux-mips32" ;; ppc60x-*-vxworks*) OUT="vxworks-ppc60x" ;; ppcgen-*-vxworks*) OUT="vxworks-ppcgen" ;; pentium-*-vxworks*) OUT="vxworks-pentium" ;; @@ -644,6 +662,7 @@ case "$GUESSOS" in armv[1-3]*-*-linux2) OUT="linux-generic32" ;; armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;; arm*-*-linux2) OUT="linux-armv4" ;; + aarch64-*-linux2) OUT="linux-aarch64" ;; sh*b-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;; m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; diff --git a/crypto/Makefile b/crypto/Makefile index 618c95878ce4..7869996a9c07 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -74,9 +74,9 @@ ia64cpuid.s: ia64cpuid.S; $(CC) $(CFLAGS) -E ia64cpuid.S > $@ ppccpuid.s: ppccpuid.pl; $(PERL) ppccpuid.pl $(PERLASM_SCHEME) $@ pariscid.s: pariscid.pl; $(PERL) pariscid.pl $(PERLASM_SCHEME) $@ alphacpuid.s: alphacpuid.pl - (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \ + (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \ $(PERL) alphacpuid.pl > $$preproc && \ - $(CC) -E $$preproc > $@ && rm $$preproc) + $(CC) -E -P $$preproc > $@ && rm $$preproc) testapps: [ -z "$(THIS)" ] || ( if echo $(SDIRS) | fgrep ' des '; \ @@ -88,7 +88,7 @@ subdirs: @target=all; $(RECURSIVE_MAKE) files: - $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO + $(PERL) $(TOP)/util/files.pl "CPUID_OBJ=$(CPUID_OBJ)" Makefile >> $(TOP)/MINFO @target=files; $(RECURSIVE_MAKE) links: @@ -102,7 +102,7 @@ lib: $(LIB) @touch lib $(LIB): $(LIBOBJ) $(AR) $(LIB) $(LIBOBJ) - [ -z "$(FIPSLIBDIR)" ] || $(AR) $(LIB) $(FIPSLIBDIR)fipscanister.o + test -z "$(FIPSLIBDIR)" || $(AR) $(LIB) $(FIPSLIBDIR)fipscanister.o $(RANLIB) $(LIB) || echo Never mind. shared: buildinf.h lib subdirs diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile index b3a95812af3b..e825c140194f 100644 --- a/crypto/aes/Makefile +++ b/crypto/aes/Makefile @@ -65,12 +65,22 @@ aesni-x86_64.s: asm/aesni-x86_64.pl $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@ aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-sha256-x86_64.s: asm/aesni-sha256-x86_64.pl + $(PERL) asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-mb-x86_64.s: asm/aesni-mb-x86_64.pl + $(PERL) asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME) > $@ aes-sparcv9.s: asm/aes-sparcv9.pl $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ +aest4-sparcv9.s: asm/aest4-sparcv9.pl ../perlasm/sparcv9_modes.pl + $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@ aes-ppc.s: asm/aes-ppc.pl $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@ +vpaes-ppc.s: asm/vpaes-ppc.pl + $(PERL) asm/vpaes-ppc.pl $(PERLASM_SCHEME) $@ +aesp8-ppc.s: asm/aesp8-ppc.pl + $(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@ aes-parisc.s: asm/aes-parisc.pl $(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@ @@ -78,12 +88,18 @@ aes-parisc.s: asm/aes-parisc.pl aes-mips.S: asm/aes-mips.pl $(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@ +aesv8-armx.S: asm/aesv8-armx.pl + $(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@ +aesv8-armx.o: aesv8-armx.S + # GNU make "catch all" aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@ aes-armv4.o: aes-armv4.S +bsaes-%.S: asm/bsaes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ +bsaes-armv7.o: bsaes-armv7.S files: - $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO + $(PERL) $(TOP)/util/files.pl "AES_ENC=$(AES_ENC)" Makefile >> $(TOP)/MINFO links: @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) @@ -149,7 +165,7 @@ aes_wrap.o: ../../e_os.h ../../include/openssl/aes.h aes_wrap.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h aes_wrap.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h aes_wrap.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -aes_wrap.o: ../../include/openssl/opensslconf.h +aes_wrap.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h aes_wrap.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h aes_wrap.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h aes_wrap.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_wrap.c diff --git a/crypto/aes/aes_wrap.c b/crypto/aes/aes_wrap.c index b1ab8e25ef37..b7b64d57a487 100644 --- a/crypto/aes/aes_wrap.c +++ b/crypto/aes/aes_wrap.c @@ -54,197 +54,19 @@ #include "cryptlib.h" #include <openssl/aes.h> -#include <openssl/bio.h> - -static const unsigned char default_iv[] = { - 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, -}; +#include <openssl/modes.h> int AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out, const unsigned char *in, unsigned int inlen) { - unsigned char *A, B[16], *R; - unsigned int i, j, t; - if ((inlen & 0x7) || (inlen < 8)) - return -1; - A = B; - t = 1; - memcpy(out + 8, in, inlen); - if (!iv) - iv = default_iv; - - memcpy(A, iv, 8); - - for (j = 0; j < 6; j++) { - R = out + 8; - for (i = 0; i < inlen; i += 8, t++, R += 8) { - memcpy(B + 8, R, 8); - AES_encrypt(B, B, key); - A[7] ^= (unsigned char)(t & 0xff); - if (t > 0xff) { - A[6] ^= (unsigned char)((t >> 8) & 0xff); - A[5] ^= (unsigned char)((t >> 16) & 0xff); - A[4] ^= (unsigned char)((t >> 24) & 0xff); - } - memcpy(R, B + 8, 8); - } - } - memcpy(out, A, 8); - return inlen + 8; + return CRYPTO_128_wrap(key, iv, out, in, inlen, (block128_f) AES_encrypt); } int AES_unwrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out, const unsigned char *in, unsigned int inlen) { - unsigned char *A, B[16], *R; - unsigned int i, j, t; - inlen -= 8; - if (inlen & 0x7) - return -1; - if (inlen < 8) - return -1; - A = B; - t = 6 * (inlen >> 3); - memcpy(A, in, 8); - memcpy(out, in + 8, inlen); - for (j = 0; j < 6; j++) { - R = out + inlen - 8; - for (i = 0; i < inlen; i += 8, t--, R -= 8) { - A[7] ^= (unsigned char)(t & 0xff); - if (t > 0xff) { - A[6] ^= (unsigned char)((t >> 8) & 0xff); - A[5] ^= (unsigned char)((t >> 16) & 0xff); - A[4] ^= (unsigned char)((t >> 24) & 0xff); - } - memcpy(B + 8, R, 8); - AES_decrypt(B, B, key); - memcpy(R, B + 8, 8); - } - } - if (!iv) - iv = default_iv; - if (memcmp(A, iv, 8)) { - OPENSSL_cleanse(out, inlen); - return 0; - } - return inlen; -} - -#ifdef AES_WRAP_TEST - -int AES_wrap_unwrap_test(const unsigned char *kek, int keybits, - const unsigned char *iv, - const unsigned char *eout, - const unsigned char *key, int keylen) -{ - unsigned char *otmp = NULL, *ptmp = NULL; - int r, ret = 0; - AES_KEY wctx; - otmp = OPENSSL_malloc(keylen + 8); - ptmp = OPENSSL_malloc(keylen); - if (!otmp || !ptmp) - return 0; - if (AES_set_encrypt_key(kek, keybits, &wctx)) - goto err; - r = AES_wrap_key(&wctx, iv, otmp, key, keylen); - if (r <= 0) - goto err; - - if (eout && memcmp(eout, otmp, keylen)) - goto err; - - if (AES_set_decrypt_key(kek, keybits, &wctx)) - goto err; - r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r); - - if (memcmp(key, ptmp, keylen)) - goto err; - - ret = 1; - - err: - if (otmp) - OPENSSL_free(otmp); - if (ptmp) - OPENSSL_free(ptmp); - - return ret; - + return CRYPTO_128_unwrap(key, iv, out, in, inlen, + (block128_f) AES_decrypt); } - -int main(int argc, char **argv) -{ - - static const unsigned char kek[] = { - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, - 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - }; - - static const unsigned char key[] = { - 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, - 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - }; - - static const unsigned char e1[] = { - 0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47, - 0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82, - 0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5 - }; - - static const unsigned char e2[] = { - 0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35, - 0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2, - 0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d - }; - - static const unsigned char e3[] = { - 0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2, - 0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a, - 0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7 - }; - - static const unsigned char e4[] = { - 0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32, - 0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc, - 0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93, - 0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2 - }; - - static const unsigned char e5[] = { - 0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f, - 0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4, - 0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95, - 0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1 - }; - - static const unsigned char e6[] = { - 0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4, - 0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26, - 0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26, - 0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b, - 0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21 - }; - - AES_KEY wctx, xctx; - int ret; - ret = AES_wrap_unwrap_test(kek, 128, NULL, e1, key, 16); - fprintf(stderr, "Key test result %d\n", ret); - ret = AES_wrap_unwrap_test(kek, 192, NULL, e2, key, 16); - fprintf(stderr, "Key test result %d\n", ret); - ret = AES_wrap_unwrap_test(kek, 256, NULL, e3, key, 16); - fprintf(stderr, "Key test result %d\n", ret); - ret = AES_wrap_unwrap_test(kek, 192, NULL, e4, key, 24); - fprintf(stderr, "Key test result %d\n", ret); - ret = AES_wrap_unwrap_test(kek, 256, NULL, e5, key, 24); - fprintf(stderr, "Key test result %d\n", ret); - ret = AES_wrap_unwrap_test(kek, 256, NULL, e6, key, 32); - fprintf(stderr, "Key test result %d\n", ret); -} - -#endif diff --git a/crypto/aes/aes_x86core.c b/crypto/aes/aes_x86core.c index 1defbb1abfb2..c869ed719852 100644 --- a/crypto/aes/aes_x86core.c +++ b/crypto/aes/aes_x86core.c @@ -89,8 +89,10 @@ typedef unsigned long long u64; #endif #undef ROTATE -#if defined(_MSC_VER) || defined(__ICC) -# define ROTATE(a,n) _lrotl(a,n) +#if defined(_MSC_VER) +# define ROTATE(a,n) _lrotl(a,n) +#elif defined(__ICC) +# define ROTATE(a,n) _rotl(a,n) #elif defined(__GNUC__) && __GNUC__>=2 # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) # define ROTATE(a,n) ({ register unsigned int ret; \ diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl index 687ed811be47..451d0e0ed1e2 100755 --- a/crypto/aes/asm/aes-586.pl +++ b/crypto/aes/asm/aes-586.pl @@ -39,7 +39,7 @@ # but exhibits up to 10% improvement on other cores. # # Second version is "monolithic" replacement for aes_core.c, which in -# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key. +# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. # This made it possible to implement little-endian variant of the # algorithm without modifying the base C code. Motivating factor for # the undertaken effort was that it appeared that in tight IA-32 @@ -103,11 +103,12 @@ # byte for 128-bit key. # # ECB encrypt ECB decrypt CBC large chunk -# P4 56[60] 84[100] 23 -# AMD K8 48[44] 70[79] 18 -# PIII 41[50] 61[91] 24 -# Core 2 32[38] 45[70] 18.5 -# Pentium 120 160 77 +# P4 52[54] 83[95] 23 +# AMD K8 46[41] 66[70] 18 +# PIII 41[50] 60[77] 24 +# Core 2 31[36] 45[64] 18.5 +# Atom 76[100] 96[138] 60 +# Pentium 115 150 77 # # Version 4.1 switches to compact S-box even in key schedule setup. # @@ -242,7 +243,7 @@ $vertical_spin=0; # shift "verticaly" defaults to 0, because of sub encvert() { my ($te,@s) = @_; - my $v0 = $acc, $v1 = $key; + my ($v0,$v1) = ($acc,$key); &mov ($v0,$s[3]); # copy s3 &mov (&DWP(4,"esp"),$s[2]); # save s2 @@ -299,7 +300,7 @@ sub encvert() # Another experimental routine, which features "horizontal spin," but # eliminates one reference to stack. Strangely enough runs slower... sub enchoriz() -{ my $v0 = $key, $v1 = $acc; +{ my ($v0,$v1) = ($key,$acc); &movz ($v0,&LB($s0)); # 3, 2, 1, 0* &rotr ($s2,8); # 8,11,10, 9 @@ -427,7 +428,7 @@ sub sse_encbody() ###################################################################### sub enccompact() -{ my $Fn = mov; +{ my $Fn = \&mov; while ($#_>5) { pop(@_); $Fn=sub{}; } my ($i,$te,@s)=@_; my $tmp = $key; @@ -476,24 +477,25 @@ sub enctransform() my $tmp = $tbl; my $r2 = $key ; - &mov ($acc,$s[$i]); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$s[$i]); &lea ($r2,&DWP(0,$s[$i],$s[$i])); - &sub ($acc,$tmp); + &mov ($acc,$tmp); + &shr ($tmp,7); &and ($r2,0xfefefefe); - &and ($acc,0x1b1b1b1b); + &sub ($acc,$tmp); &mov ($tmp,$s[$i]); + &and ($acc,0x1b1b1b1b); + &rotr ($tmp,16); &xor ($acc,$r2); # r2 + &mov ($r2,$s[$i]); &xor ($s[$i],$acc); # r0 ^ r2 + &rotr ($r2,16+8); + &xor ($acc,$tmp); &rotl ($s[$i],24); - &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2 - &rotr ($tmp,16); - &xor ($s[$i],$tmp); - &rotr ($tmp,8); - &xor ($s[$i],$tmp); + &xor ($acc,$r2); + &mov ($tmp,0x80808080) if ($i!=1); + &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 } &function_begin_B("_x86_AES_encrypt_compact"); @@ -526,6 +528,7 @@ sub enctransform() &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); + &mov ($tbl,0x80808080); &enctransform(2); &enctransform(3); &enctransform(0); @@ -607,82 +610,84 @@ sub sse_enccompact() &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 &movd ("eax","mm1"); # 5, 4, 1, 0 &movd ("ebx","mm5"); # 15,14,11,10 + &mov ($__key,$key); &movz ($acc,&LB("eax")); # 0 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 - &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("edx",&HB("eax")); # 1 + &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 + &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 + &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 - &shl ("edx",8); # 1 &shr ("eax",16); # 5, 4 + &shl ("edx",8); # 1 - &movz ($acc,&LB("ebx")); # 10 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 + &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 - &or ("ecx",$acc); # 10 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 - &movz ($acc,&HB("ebx")); # 11 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 + &or ("ecx",$acc); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 + &movz ($key,&HB("eax")); # 5 &shl ($acc,24); # 11 - &or ("edx",$acc); # 11 &shr ("ebx",16); # 15,14 + &or ("edx",$acc); # 11 - &movz ($acc,&HB("eax")); # 5 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 + &movz ($key,&HB("ebx")); # 15 &shl ($acc,8); # 5 &or ("ecx",$acc); # 5 - &movz ($acc,&HB("ebx")); # 15 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 15 + &movz ($key,&LB("eax")); # 4 &shl ($acc,24); # 15 &or ("ecx",$acc); # 15 - &movd ("mm0","ecx"); # t[0] collected - &movz ($acc,&LB("eax")); # 4 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 + &movz ($key,&LB("ebx")); # 14 &movd ("eax","mm2"); # 7, 6, 3, 2 - &movz ($acc,&LB("ebx")); # 14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 - &shl ($acc,16); # 14 + &movd ("mm0","ecx"); # t[0] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14 + &movz ($key,&HB("eax")); # 3 + &shl ("ecx",16); # 14 + &movd ("ebx","mm6"); # 13,12, 9, 8 &or ("ecx",$acc); # 14 - &movd ("ebx","mm6"); # 13,12, 9, 8 - &movz ($acc,&HB("eax")); # 3 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 3 + &movz ($key,&HB("ebx")); # 9 &shl ($acc,24); # 3 &or ("ecx",$acc); # 3 - &movz ($acc,&HB("ebx")); # 9 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 + &movz ($key,&LB("ebx")); # 8 &shl ($acc,8); # 9 + &shr ("ebx",16); # 13,12 &or ("ecx",$acc); # 9 - &movd ("mm1","ecx"); # t[1] collected - &movz ($acc,&LB("ebx")); # 8 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 - &shr ("ebx",16); # 13,12 - &movz ($acc,&LB("eax")); # 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 - &shl ($acc,16); # 2 - &or ("ecx",$acc); # 2 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 8 + &movz ($key,&LB("eax")); # 2 &shr ("eax",16); # 7, 6 + &movd ("mm1","ecx"); # t[1] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2 + &movz ($key,&HB("eax")); # 7 + &shl ("ecx",16); # 2 + &and ("eax",0xff); # 6 + &or ("ecx",$acc); # 2 &punpckldq ("mm0","mm1"); # t[0,1] collected - &movz ($acc,&HB("eax")); # 7 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 + &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 - &or ("ecx",$acc); # 7 - &and ("eax",0xff); # 6 + &and ("ebx",0xff); # 12 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 + &or ("ecx",$acc); # 7 &shl ("eax",16); # 6 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &or ("edx","eax"); # 6 - &movz ($acc,&HB("ebx")); # 13 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &shl ($acc,8); # 13 - &or ("ecx",$acc); # 13 - &movd ("mm4","ecx"); # t[2] collected - &and ("ebx",0xff); # 12 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 + &or ("ecx",$acc); # 13 &or ("edx","ebx"); # 12 + &mov ($key,$__key); + &movd ("mm4","ecx"); # t[2] collected &movd ("mm5","edx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected @@ -1222,7 +1227,7 @@ sub enclast() ###################################################################### sub deccompact() -{ my $Fn = mov; +{ my $Fn = \&mov; while ($#_>5) { pop(@_); $Fn=sub{}; } my ($i,$td,@s)=@_; my $tmp = $key; @@ -1270,30 +1275,30 @@ sub dectransform() my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); my $tp8 = $tbl; - &mov ($acc,$s[$i]); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &mov ($tmp,0x80808080); + &and ($tmp,$s[$i]); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); - &xor ($acc,$tp2); - &mov ($tp2,$acc); + &xor ($tp2,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &and ($tmp,$tp2); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 - &xor ($acc,$tp4); - &mov ($tp4,$acc); + &xor ($tp4,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &and ($tmp,$tp4); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); @@ -1305,13 +1310,13 @@ sub dectransform() &xor ($s[$i],$tp2); &xor ($tp2,$tp8); - &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); - &rotl ($tp4,16); + &rotl ($tp2,24); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) - &rotl ($tp8,8); + &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) + &rotl ($tp8,8); &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 @@ -1389,85 +1394,87 @@ sub dectransform() sub sse_deccompact() { &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 + &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movd ("eax","mm1"); # 7, 6, 1, 0 + &movd ("ebx","mm5"); # 13,12,11,10 + &mov ($__key,$key); - &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movz ($acc,&LB("eax")); # 0 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 - &movd ("ebx","mm5"); # 13,12,11,10 &movz ("edx",&HB("eax")); # 1 + &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 + &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 + &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 + &shr ("eax",16); # 7, 6 &shl ("edx",8); # 1 - &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 - &movz ($acc,&LB("ebx")); # 10 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 + &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 + &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &or ("ecx",$acc); # 10 - &shr ("eax",16); # 7, 6 - &movz ($acc,&HB("ebx")); # 11 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 + &movz ($key,&HB("eax")); # 7 &shl ($acc,24); # 11 - &or ("edx",$acc); # 11 &shr ("ebx",16); # 13,12 + &or ("edx",$acc); # 11 - &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 - &movz ($acc,&HB("eax")); # 7 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 + &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 - &movz ($acc,&HB("ebx")); # 13 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 + &movz ($key,&LB("eax")); # 6 &shl ($acc,8); # 13 + &movd ("eax","mm2"); # 3, 2, 5, 4 &or ("ecx",$acc); # 13 - &movd ("mm0","ecx"); # t[0] collected - &movz ($acc,&LB("eax")); # 6 - &movd ("eax","mm2"); # 3, 2, 5, 4 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 - &shl ("ecx",16); # 6 - &movz ($acc,&LB("ebx")); # 12 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 6 + &movz ($key,&LB("ebx")); # 12 + &shl ($acc,16); # 6 &movd ("ebx","mm6"); # 9, 8,15,14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 + &movd ("mm0","ecx"); # t[0] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12 + &movz ($key,&LB("eax")); # 4 &or ("ecx",$acc); # 12 - &movz ($acc,&LB("eax")); # 4 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 + &movz ($key,&LB("ebx")); # 14 &or ("edx",$acc); # 4 - &movz ($acc,&LB("ebx")); # 14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 14 + &movz ($key,&HB("eax")); # 5 &shl ($acc,16); # 14 + &shr ("eax",16); # 3, 2 &or ("edx",$acc); # 14 - &movd ("mm1","edx"); # t[1] collected - &movz ($acc,&HB("eax")); # 5 - &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 - &shl ("edx",8); # 5 - &movz ($acc,&HB("ebx")); # 15 - &shr ("eax",16); # 3, 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 - &shl ($acc,24); # 15 - &or ("edx",$acc); # 15 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 + &movz ($key,&HB("ebx")); # 15 &shr ("ebx",16); # 9, 8 + &shl ($acc,8); # 5 + &movd ("mm1","edx"); # t[1] collected + &movz ("edx",&BP(-128,$tbl,$key,1)); # 15 + &movz ($key,&HB("ebx")); # 9 + &shl ("edx",24); # 15 + &and ("ebx",0xff); # 8 + &or ("edx",$acc); # 15 &punpckldq ("mm0","mm1"); # t[0,1] collected - &movz ($acc,&HB("ebx")); # 9 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 + &movz ($key,&LB("eax")); # 2 &shl ($acc,8); # 9 - &or ("ecx",$acc); # 9 - &and ("ebx",0xff); # 8 + &movz ("eax",&HB("eax")); # 3 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 + &or ("ecx",$acc); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 2 &or ("edx","ebx"); # 8 - &movz ($acc,&LB("eax")); # 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 - &or ("edx",$acc); # 2 - &movd ("mm4","edx"); # t[2] collected - &movz ("eax",&HB("eax")); # 3 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 + &or ("edx",$acc); # 2 &shl ("eax",24); # 3 &or ("ecx","eax"); # 3 + &mov ($key,$__key); + &movd ("mm4","edx"); # t[2] collected &movd ("mm5","ecx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected @@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds &mov ("ecx",240/4); &xor ("eax","eax"); &align (4); - &data_word(0xABF3F689); # rep stosd - &set_label("skip_ezero") + &data_word(0xABF3F689); # rep stosd + &set_label("skip_ezero"); &mov ("esp",$_esp); &popf (); &set_label("drop_out"); @@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds &mov ("ecx",240/4); &xor ("eax","eax"); &align (4); - &data_word(0xABF3F689); # rep stosd - &set_label("skip_dzero") + &data_word(0xABF3F689); # rep stosd + &set_label("skip_dzero"); &mov ("esp",$_esp); &popf (); &function_end_A(); @@ -2865,32 +2872,32 @@ sub deckey() { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; my $tmp = $tbl; - &mov ($acc,$tp1); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &mov ($tmp,0x80808080); + &and ($tmp,$tp1); &lea ($tp2,&DWP(0,$tp1,$tp1)); + &mov ($acc,$tmp); + &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); - &xor ($acc,$tp2); - &mov ($tp2,$acc); + &xor ($tp2,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$tp2); &lea ($tp4,&DWP(0,$tp2,$tp2)); + &mov ($acc,$tmp); + &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 - &xor ($acc,$tp4); - &mov ($tp4,$acc); + &xor ($tp4,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$tp4); &lea ($tp8,&DWP(0,$tp4,$tp4)); + &mov ($acc,$tmp); + &shr ($tmp,7); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &and ($tp8,0xfefefefe); diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl index 86b86c4a0fbd..4f8917089f6c 100755 --- a/crypto/aes/asm/aes-armv4.pl +++ b/crypto/aes/asm/aes-armv4.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -51,9 +51,23 @@ $key="r11"; $rounds="r12"; $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + .text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else .code 32 +# endif +#endif .type AES_Te,%object .align 5 @@ -167,7 +181,11 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -409,11 +427,21 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in ARM +#endif movne r0,#-1 bne .Labrt @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#216 beq .Ldone @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#256 beq .Ldone @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key: str $i3,[$key,#-4] b .L256_loop +.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global private_AES_set_decrypt_key @@ -688,34 +731,57 @@ private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldrne lr,[sp],#4 @ pop lr + ldr lr,[sp],#4 @ pop lr bne .Labrt - stmdb sp!,{r4-r12} + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, - mov $key,r2 @ which is AES_KEY *key - mov $i1,r2 - add $i2,r2,$rounds,lsl#4 +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr $rounds,[r0,#240] + mov $i1,r0 @ input + add $i2,r0,$rounds,lsl#4 + mov $key,r1 @ ouput + add $tbl,r1,$rounds,lsl#4 + str $rounds,[r1,#240] + +.Linv: ldr $s0,[$i1],#16 + ldr $s1,[$i1,#-12] + ldr $s2,[$i1,#-8] + ldr $s3,[$i1,#-4] + ldr $t1,[$i2],#-16 + ldr $t2,[$i2,#16+4] + ldr $t3,[$i2,#16+8] + ldr $i3,[$i2,#16+12] + str $s0,[$tbl],#-16 + str $s1,[$tbl,#16+4] + str $s2,[$tbl,#16+8] + str $s3,[$tbl,#16+12] + str $t1,[$key],#16 + str $t2,[$key,#-12] + str $t3,[$key,#-8] + str $i3,[$key,#-4] + teq $i1,$i2 + bne .Linv -.Linv: ldr $s0,[$i1] + ldr $s0,[$i1] ldr $s1,[$i1,#4] ldr $s2,[$i1,#8] ldr $s3,[$i1,#12] - ldr $t1,[$i2] - ldr $t2,[$i2,#4] - ldr $t3,[$i2,#8] - ldr $i3,[$i2,#12] - str $s0,[$i2],#-16 - str $s1,[$i2,#16+4] - str $s2,[$i2,#16+8] - str $s3,[$i2,#16+12] - str $t1,[$i1],#16 - str $t2,[$i1,#-12] - str $t3,[$i1,#-8] - str $i3,[$i1,#-4] - teq $i1,$i2 - bne .Linv + str $s0,[$key] + str $s1,[$key,#4] + str $s2,[$key,#8] + str $s3,[$key,#12] + sub $key,$key,$rounds,lsl#3 ___ $mask80=$i1; $mask1b=$i2; @@ -773,7 +839,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key .type AES_Td,%object .align 5 @@ -883,7 +949,11 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt: ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] and $i3,lr,$s1,lsr#8 + add $s1,$tbl,$s1,lsr#24 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] - ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] + ldrb $s1,[$s1] @ Td4[s1>>24] ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] eor $s0,$i1,$s0,lsl#24 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] + add $s2,$tbl,$s2,lsr#24 + ldrb $s2,[$s2] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] eor $s1,$i2,$s1,lsl#16 @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 + add $s3,$tbl,$s3,lsr#24 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] - ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] + ldrb $s3,[$s3] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 ldr $i1,[$key,#0] eor $s1,$s1,$i2,lsl#8 @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt: ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx\tlr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl index 537c8d3172b6..4de3ee26bb74 100755 --- a/crypto/aes/asm/aes-mips.pl +++ b/crypto/aes/asm/aes-mips.pl @@ -20,6 +20,13 @@ # thing about this module is its endian neutrality, which means that # it processes data without ever changing byte order... +# September 2012 +# +# Add MIPS32R2 (~10% less instructions) and SmartMIPS ASE (further +# ~25% less instructions) code. Note that there is no run-time switch, +# instead, code path is chosen upon pre-process time, pass -mips32r2 +# or/and -msmartmips. + ###################################################################### # There is a number of MIPS ABI in use, O32 and N32/64 are most # widely used. Then there is a new contender: NUBI. It appears that if @@ -47,11 +54,12 @@ # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_INS="dins"; $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 @@ -59,6 +67,7 @@ if ($flavour =~ /64|n32/i) { } else { $PTR_ADD="add"; $PTR_SUB="sub"; + $PTR_INS="ins"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; @@ -89,7 +98,11 @@ $code.=<<___; # include <openssl/fipssyms.h> #endif -#if !defined(__vxworks) || defined(__pic__) +#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) +#define _MIPS_ARCH_MIPS32R2 +#endif + +#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) .option pic2 #endif .set noat @@ -125,6 +138,89 @@ _mips_AES_encrypt: xor $s3,$t3 sub $cnt,1 +#if defined(__mips_smartmips) + ext $i0,$s1,16,8 +.Loop_enc: + ext $i1,$s2,16,8 + ext $i2,$s3,16,8 + ext $i3,$s0,16,8 + lwxs $t0,$i0($Tbl) # Te1[s1>>16] + ext $i0,$s2,8,8 + lwxs $t1,$i1($Tbl) # Te1[s2>>16] + ext $i1,$s3,8,8 + lwxs $t2,$i2($Tbl) # Te1[s3>>16] + ext $i2,$s0,8,8 + lwxs $t3,$i3($Tbl) # Te1[s0>>16] + ext $i3,$s1,8,8 + + lwxs $t4,$i0($Tbl) # Te2[s2>>8] + ext $i0,$s3,0,8 + lwxs $t5,$i1($Tbl) # Te2[s3>>8] + ext $i1,$s0,0,8 + lwxs $t6,$i2($Tbl) # Te2[s0>>8] + ext $i2,$s1,0,8 + lwxs $t7,$i3($Tbl) # Te2[s1>>8] + ext $i3,$s2,0,8 + + lwxs $t8,$i0($Tbl) # Te3[s3] + ext $i0,$s0,24,8 + lwxs $t9,$i1($Tbl) # Te3[s0] + ext $i1,$s1,24,8 + lwxs $t10,$i2($Tbl) # Te3[s1] + ext $i2,$s2,24,8 + lwxs $t11,$i3($Tbl) # Te3[s2] + ext $i3,$s3,24,8 + + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 + + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + xor $t0,$t4 + lwxs $t4,$i0($Tbl) # Te0[s0>>24] + xor $t1,$t5 + lwxs $t5,$i1($Tbl) # Te0[s1>>24] + xor $t2,$t6 + lwxs $t6,$i2($Tbl) # Te0[s2>>24] + xor $t3,$t7 + lwxs $t7,$i3($Tbl) # Te0[s3>>24] + + rotr $t8,$t8,24 + lw $s0,0($key0) + rotr $t9,$t9,24 + lw $s1,4($key0) + rotr $t10,$t10,24 + lw $s2,8($key0) + rotr $t11,$t11,24 + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_enc + ext $i0,$s1,16,8 + + _xtr $i0,$s1,16-2 +#else _xtr $i0,$s1,16-2 .Loop_enc: _xtr $i1,$s2,16-2 @@ -138,19 +234,29 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $t0,0($i0) # Te1[s1>>16] + _xtr $i0,$s2,8-2 + lw $t1,0($i1) # Te1[s2>>16] + _xtr $i1,$s3,8-2 + lw $t2,0($i2) # Te1[s3>>16] + _xtr $i2,$s0,8-2 + lw $t3,0($i3) # Te1[s0>>16] + _xtr $i3,$s1,8-2 +#else lwl $t0,3($i0) # Te1[s1>>16] lwl $t1,3($i1) # Te1[s2>>16] lwl $t2,3($i2) # Te1[s3>>16] lwl $t3,3($i3) # Te1[s0>>16] lwr $t0,2($i0) # Te1[s1>>16] - lwr $t1,2($i1) # Te1[s2>>16] - lwr $t2,2($i2) # Te1[s3>>16] - lwr $t3,2($i3) # Te1[s0>>16] - _xtr $i0,$s2,8-2 + lwr $t1,2($i1) # Te1[s2>>16] _xtr $i1,$s3,8-2 + lwr $t2,2($i2) # Te1[s3>>16] _xtr $i2,$s0,8-2 + lwr $t3,2($i3) # Te1[s0>>16] _xtr $i3,$s1,8-2 +#endif and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -159,19 +265,88 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 +# if defined(_MIPSEL) + lw $t4,0($i0) # Te2[s2>>8] + _xtr $i0,$s3,0-2 + lw $t5,0($i1) # Te2[s3>>8] + _xtr $i1,$s0,0-2 + lw $t6,0($i2) # Te2[s0>>8] + _xtr $i2,$s1,0-2 + lw $t7,0($i3) # Te2[s1>>8] + _xtr $i3,$s2,0-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lw $t8,0($i0) # Te3[s3] + $PTR_INS $i0,$s0,2,8 + lw $t9,0($i1) # Te3[s0] + $PTR_INS $i1,$s1,2,8 + lw $t10,0($i2) # Te3[s1] + $PTR_INS $i2,$s2,2,8 + lw $t11,0($i3) # Te3[s2] + $PTR_INS $i3,$s3,2,8 +# else + lw $t4,0($i0) # Te2[s2>>8] + $PTR_INS $i0,$s3,2,8 + lw $t5,0($i1) # Te2[s3>>8] + $PTR_INS $i1,$s0,2,8 + lw $t6,0($i2) # Te2[s0>>8] + $PTR_INS $i2,$s1,2,8 + lw $t7,0($i3) # Te2[s1>>8] + $PTR_INS $i3,$s2,2,8 + + lw $t8,0($i0) # Te3[s3] + _xtr $i0,$s0,24-2 + lw $t9,0($i1) # Te3[s0] + _xtr $i1,$s1,24-2 + lw $t10,0($i2) # Te3[s1] + _xtr $i2,$s2,24-2 + lw $t11,0($i3) # Te3[s2] + _xtr $i3,$s3,24-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +# endif + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + rotr $t8,$t8,24 + rotr $t9,$t9,24 + rotr $t10,$t10,24 + rotr $t11,$t11,24 +#else lwl $t4,2($i0) # Te2[s2>>8] lwl $t5,2($i1) # Te2[s3>>8] lwl $t6,2($i2) # Te2[s0>>8] lwl $t7,2($i3) # Te2[s1>>8] lwr $t4,1($i0) # Te2[s2>>8] - lwr $t5,1($i1) # Te2[s3>>8] - lwr $t6,1($i2) # Te2[s0>>8] - lwr $t7,1($i3) # Te2[s1>>8] - _xtr $i0,$s3,0-2 + lwr $t5,1($i1) # Te2[s3>>8] _xtr $i1,$s0,0-2 + lwr $t6,1($i2) # Te2[s0>>8] _xtr $i2,$s1,0-2 + lwr $t7,1($i3) # Te2[s1>>8] _xtr $i3,$s2,0-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -185,14 +360,14 @@ _mips_AES_encrypt: lwl $t10,1($i2) # Te3[s1] lwl $t11,1($i3) # Te3[s2] lwr $t8,0($i0) # Te3[s3] - lwr $t9,0($i1) # Te3[s0] - lwr $t10,0($i2) # Te3[s1] - lwr $t11,0($i3) # Te3[s2] - _xtr $i0,$s0,24-2 + lwr $t9,0($i1) # Te3[s0] _xtr $i1,$s1,24-2 + lwr $t10,0($i2) # Te3[s1] _xtr $i2,$s2,24-2 + lwr $t11,0($i3) # Te3[s2] _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -201,24 +376,24 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#endif xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 lw $t4,0($i0) # Te0[s0>>24] + xor $t1,$t5 lw $t5,0($i1) # Te0[s1>>24] + xor $t2,$t6 lw $t6,0($i2) # Te0[s2>>24] + xor $t3,$t7 lw $t7,0($i3) # Te0[s3>>24] - lw $s0,0($key0) - lw $s1,4($key0) - lw $s2,8($key0) - lw $s3,12($key0) - xor $t0,$t8 + lw $s0,0($key0) xor $t1,$t9 + lw $s1,4($key0) xor $t2,$t10 + lw $s2,8($key0) xor $t3,$t11 + lw $s3,12($key0) xor $t0,$t4 xor $t1,$t5 @@ -234,6 +409,7 @@ _mips_AES_encrypt: .set noreorder bnez $cnt,.Loop_enc _xtr $i0,$s1,16-2 +#endif .set reorder _xtr $i1,$s2,16-2 @@ -248,14 +424,14 @@ _mips_AES_encrypt: $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t0,2($i0) # Te4[s1>>16] - lbu $t1,2($i1) # Te4[s2>>16] - lbu $t2,2($i2) # Te4[s3>>16] - lbu $t3,2($i3) # Te4[s0>>16] - _xtr $i0,$s2,8-2 + lbu $t1,2($i1) # Te4[s2>>16] _xtr $i1,$s3,8-2 + lbu $t2,2($i2) # Te4[s3>>16] _xtr $i2,$s0,8-2 + lbu $t3,2($i3) # Te4[s0>>16] _xtr $i3,$s1,8-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -264,15 +440,44 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) lbu $t4,2($i0) # Te4[s2>>8] + $PTR_INS $i0,$s0,2,8 lbu $t5,2($i1) # Te4[s3>>8] + $PTR_INS $i1,$s1,2,8 lbu $t6,2($i2) # Te4[s0>>8] + $PTR_INS $i2,$s2,2,8 lbu $t7,2($i3) # Te4[s1>>8] + $PTR_INS $i3,$s3,2,8 + lbu $t8,2($i0) # Te4[s0>>24] + _xtr $i0,$s3,0-2 + lbu $t9,2($i1) # Te4[s1>>24] + _xtr $i1,$s0,0-2 + lbu $t10,2($i2) # Te4[s2>>24] + _xtr $i2,$s1,0-2 + lbu $t11,2($i3) # Te4[s3>>24] + _xtr $i3,$s2,0-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +# else + lbu $t4,2($i0) # Te4[s2>>8] _xtr $i0,$s0,24-2 + lbu $t5,2($i1) # Te4[s3>>8] _xtr $i1,$s1,24-2 + lbu $t6,2($i2) # Te4[s0>>8] _xtr $i2,$s2,24-2 + lbu $t7,2($i3) # Te4[s1>>8] _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -282,18 +487,76 @@ _mips_AES_encrypt: $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t8,2($i0) # Te4[s0>>24] + $PTR_INS $i0,$s3,2,8 lbu $t9,2($i1) # Te4[s1>>24] + $PTR_INS $i1,$s0,2,8 lbu $t10,2($i2) # Te4[s2>>24] + $PTR_INS $i2,$s1,2,8 lbu $t11,2($i3) # Te4[s3>>24] + $PTR_INS $i3,$s2,2,8 +# endif + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + _ins2 $t0,$t4,8 + lbu $t4,2($i0) # Te4[s3] + _ins2 $t1,$t5,8 + lbu $t5,2($i1) # Te4[s0] + _ins2 $t2,$t6,8 + lbu $t6,2($i2) # Te4[s1] + _ins2 $t3,$t7,8 + lbu $t7,2($i3) # Te4[s2] + + _ins2 $t0,$t8,24 + lw $s0,0($key0) + _ins2 $t1,$t9,24 + lw $s1,4($key0) + _ins2 $t2,$t10,24 + lw $s2,8($key0) + _ins2 $t3,$t11,24 + lw $s3,12($key0) + + _ins2 $t0,$t4,0 + _ins2 $t1,$t5,0 + _ins2 $t2,$t6,0 + _ins2 $t3,$t7,0 +#else + lbu $t4,2($i0) # Te4[s2>>8] + _xtr $i0,$s0,24-2 + lbu $t5,2($i1) # Te4[s3>>8] + _xtr $i1,$s1,24-2 + lbu $t6,2($i2) # Te4[s0>>8] + _xtr $i2,$s2,24-2 + lbu $t7,2($i3) # Te4[s1>>8] + _xtr $i3,$s3,24-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,2($i0) # Te4[s0>>24] _xtr $i0,$s3,0-2 + lbu $t9,2($i1) # Te4[s1>>24] _xtr $i1,$s0,0-2 + lbu $t10,2($i2) # Te4[s2>>24] _xtr $i2,$s1,0-2 + lbu $t11,2($i3) # Te4[s3>>24] _xtr $i3,$s2,0-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl _ins $t0,16 _ins $t1,16 @@ -306,27 +569,21 @@ _mips_AES_encrypt: _ins $t7,8 xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 - - $PTR_ADD $i0,$Tbl - $PTR_ADD $i1,$Tbl - $PTR_ADD $i2,$Tbl - $PTR_ADD $i3,$Tbl lbu $t4,2($i0) # Te4[s3] + xor $t1,$t5 lbu $t5,2($i1) # Te4[s0] + xor $t2,$t6 lbu $t6,2($i2) # Te4[s1] + xor $t3,$t7 lbu $t7,2($i3) # Te4[s2] _ins $t8,24 - _ins $t9,24 - _ins $t10,24 - _ins $t11,24 - lw $s0,0($key0) + _ins $t9,24 lw $s1,4($key0) + _ins $t10,24 lw $s2,8($key0) + _ins $t11,24 lw $s3,12($key0) xor $t0,$t8 @@ -343,7 +600,7 @@ _mips_AES_encrypt: xor $t1,$t5 xor $t2,$t6 xor $t3,$t7 - +#endif xor $s0,$t0 xor $s1,$t1 xor $s2,$t2 @@ -455,6 +712,89 @@ _mips_AES_decrypt: xor $s3,$t3 sub $cnt,1 +#if defined(__mips_smartmips) + ext $i0,$s3,16,8 +.Loop_dec: + ext $i1,$s0,16,8 + ext $i2,$s1,16,8 + ext $i3,$s2,16,8 + lwxs $t0,$i0($Tbl) # Td1[s3>>16] + ext $i0,$s2,8,8 + lwxs $t1,$i1($Tbl) # Td1[s0>>16] + ext $i1,$s3,8,8 + lwxs $t2,$i2($Tbl) # Td1[s1>>16] + ext $i2,$s0,8,8 + lwxs $t3,$i3($Tbl) # Td1[s2>>16] + ext $i3,$s1,8,8 + + lwxs $t4,$i0($Tbl) # Td2[s2>>8] + ext $i0,$s1,0,8 + lwxs $t5,$i1($Tbl) # Td2[s3>>8] + ext $i1,$s2,0,8 + lwxs $t6,$i2($Tbl) # Td2[s0>>8] + ext $i2,$s3,0,8 + lwxs $t7,$i3($Tbl) # Td2[s1>>8] + ext $i3,$s0,0,8 + + lwxs $t8,$i0($Tbl) # Td3[s1] + ext $i0,$s0,24,8 + lwxs $t9,$i1($Tbl) # Td3[s2] + ext $i1,$s1,24,8 + lwxs $t10,$i2($Tbl) # Td3[s3] + ext $i2,$s2,24,8 + lwxs $t11,$i3($Tbl) # Td3[s0] + ext $i3,$s3,24,8 + + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 + + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + xor $t0,$t4 + lwxs $t4,$i0($Tbl) # Td0[s0>>24] + xor $t1,$t5 + lwxs $t5,$i1($Tbl) # Td0[s1>>24] + xor $t2,$t6 + lwxs $t6,$i2($Tbl) # Td0[s2>>24] + xor $t3,$t7 + lwxs $t7,$i3($Tbl) # Td0[s3>>24] + + rotr $t8,$t8,24 + lw $s0,0($key0) + rotr $t9,$t9,24 + lw $s1,4($key0) + rotr $t10,$t10,24 + lw $s2,8($key0) + rotr $t11,$t11,24 + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_dec + ext $i0,$s3,16,8 + + _xtr $i0,$s3,16-2 +#else _xtr $i0,$s3,16-2 .Loop_dec: _xtr $i1,$s0,16-2 @@ -468,19 +808,88 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $t0,0($i0) # Td1[s3>>16] + _xtr $i0,$s2,8-2 + lw $t1,0($i1) # Td1[s0>>16] + _xtr $i1,$s3,8-2 + lw $t2,0($i2) # Td1[s1>>16] + _xtr $i2,$s0,8-2 + lw $t3,0($i3) # Td1[s2>>16] + _xtr $i3,$s1,8-2 +#else lwl $t0,3($i0) # Td1[s3>>16] lwl $t1,3($i1) # Td1[s0>>16] lwl $t2,3($i2) # Td1[s1>>16] lwl $t3,3($i3) # Td1[s2>>16] lwr $t0,2($i0) # Td1[s3>>16] - lwr $t1,2($i1) # Td1[s0>>16] - lwr $t2,2($i2) # Td1[s1>>16] - lwr $t3,2($i3) # Td1[s2>>16] - _xtr $i0,$s2,8-2 + lwr $t1,2($i1) # Td1[s0>>16] _xtr $i1,$s3,8-2 + lwr $t2,2($i2) # Td1[s1>>16] _xtr $i2,$s0,8-2 + lwr $t3,2($i3) # Td1[s2>>16] _xtr $i3,$s1,8-2 +#endif + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 +# if defined(_MIPSEL) + lw $t4,0($i0) # Td2[s2>>8] + _xtr $i0,$s1,0-2 + lw $t5,0($i1) # Td2[s3>>8] + _xtr $i1,$s2,0-2 + lw $t6,0($i2) # Td2[s0>>8] + _xtr $i2,$s3,0-2 + lw $t7,0($i3) # Td2[s1>>8] + _xtr $i3,$s0,0-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lw $t8,0($i0) # Td3[s1] + $PTR_INS $i0,$s0,2,8 + lw $t9,0($i1) # Td3[s2] + $PTR_INS $i1,$s1,2,8 + lw $t10,0($i2) # Td3[s3] + $PTR_INS $i2,$s2,2,8 + lw $t11,0($i3) # Td3[s0] + $PTR_INS $i3,$s3,2,8 +#else + lw $t4,0($i0) # Td2[s2>>8] + $PTR_INS $i0,$s1,2,8 + lw $t5,0($i1) # Td2[s3>>8] + $PTR_INS $i1,$s2,2,8 + lw $t6,0($i2) # Td2[s0>>8] + $PTR_INS $i2,$s3,2,8 + lw $t7,0($i3) # Td2[s1>>8] + $PTR_INS $i3,$s0,2,8 + + lw $t8,0($i0) # Td3[s1] + _xtr $i0,$s0,24-2 + lw $t9,0($i1) # Td3[s2] + _xtr $i1,$s1,24-2 + lw $t10,0($i2) # Td3[s3] + _xtr $i2,$s2,24-2 + lw $t11,0($i3) # Td3[s0] + _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -489,19 +898,30 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#endif + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + rotr $t8,$t8,24 + rotr $t9,$t9,24 + rotr $t10,$t10,24 + rotr $t11,$t11,24 +#else lwl $t4,2($i0) # Td2[s2>>8] lwl $t5,2($i1) # Td2[s3>>8] lwl $t6,2($i2) # Td2[s0>>8] lwl $t7,2($i3) # Td2[s1>>8] lwr $t4,1($i0) # Td2[s2>>8] - lwr $t5,1($i1) # Td2[s3>>8] - lwr $t6,1($i2) # Td2[s0>>8] - lwr $t7,1($i3) # Td2[s1>>8] - _xtr $i0,$s1,0-2 + lwr $t5,1($i1) # Td2[s3>>8] _xtr $i1,$s2,0-2 + lwr $t6,1($i2) # Td2[s0>>8] _xtr $i2,$s3,0-2 + lwr $t7,1($i3) # Td2[s1>>8] _xtr $i3,$s0,0-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -515,14 +935,14 @@ _mips_AES_decrypt: lwl $t10,1($i2) # Td3[s3] lwl $t11,1($i3) # Td3[s0] lwr $t8,0($i0) # Td3[s1] - lwr $t9,0($i1) # Td3[s2] - lwr $t10,0($i2) # Td3[s3] - lwr $t11,0($i3) # Td3[s0] - _xtr $i0,$s0,24-2 + lwr $t9,0($i1) # Td3[s2] _xtr $i1,$s1,24-2 + lwr $t10,0($i2) # Td3[s3] _xtr $i2,$s2,24-2 + lwr $t11,0($i3) # Td3[s0] _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -531,27 +951,25 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#endif xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 - - lw $t4,0($i0) # Td0[s0>>24] + xor $t1,$t5 lw $t5,0($i1) # Td0[s1>>24] + xor $t2,$t6 lw $t6,0($i2) # Td0[s2>>24] + xor $t3,$t7 lw $t7,0($i3) # Td0[s3>>24] - lw $s0,0($key0) - lw $s1,4($key0) - lw $s2,8($key0) - lw $s3,12($key0) - xor $t0,$t8 + lw $s0,0($key0) xor $t1,$t9 + lw $s1,4($key0) xor $t2,$t10 + lw $s2,8($key0) xor $t3,$t11 + lw $s3,12($key0) xor $t0,$t4 xor $t1,$t5 @@ -567,38 +985,39 @@ _mips_AES_decrypt: .set noreorder bnez $cnt,.Loop_dec _xtr $i0,$s3,16-2 +#endif .set reorder lw $t4,1024($Tbl) # prefetch Td4 - lw $t5,1024+32($Tbl) - lw $t6,1024+64($Tbl) - lw $t7,1024+96($Tbl) - lw $t8,1024+128($Tbl) - lw $t9,1024+160($Tbl) - lw $t10,1024+192($Tbl) - lw $t11,1024+224($Tbl) - _xtr $i0,$s3,16 + lw $t5,1024+32($Tbl) _xtr $i1,$s0,16 + lw $t6,1024+64($Tbl) _xtr $i2,$s1,16 + lw $t7,1024+96($Tbl) _xtr $i3,$s2,16 + lw $t8,1024+128($Tbl) and $i0,0xff + lw $t9,1024+160($Tbl) and $i1,0xff + lw $t10,1024+192($Tbl) and $i2,0xff + lw $t11,1024+224($Tbl) and $i3,0xff + $PTR_ADD $i0,$Tbl $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t0,1024($i0) # Td4[s3>>16] - lbu $t1,1024($i1) # Td4[s0>>16] - lbu $t2,1024($i2) # Td4[s1>>16] - lbu $t3,1024($i3) # Td4[s2>>16] - _xtr $i0,$s2,8 + lbu $t1,1024($i1) # Td4[s0>>16] _xtr $i1,$s3,8 + lbu $t2,1024($i2) # Td4[s1>>16] _xtr $i2,$s0,8 + lbu $t3,1024($i3) # Td4[s2>>16] _xtr $i3,$s1,8 + and $i0,0xff and $i1,0xff and $i2,0xff @@ -607,29 +1026,108 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) lbu $t4,1024($i0) # Td4[s2>>8] + $PTR_INS $i0,$s0,0,8 lbu $t5,1024($i1) # Td4[s3>>8] + $PTR_INS $i1,$s1,0,8 lbu $t6,1024($i2) # Td4[s0>>8] + $PTR_INS $i2,$s2,0,8 lbu $t7,1024($i3) # Td4[s1>>8] + $PTR_INS $i3,$s3,0,8 + lbu $t8,1024($i0) # Td4[s0>>24] + _xtr $i0,$s1,0 + lbu $t9,1024($i1) # Td4[s1>>24] + _xtr $i1,$s2,0 + lbu $t10,1024($i2) # Td4[s2>>24] + _xtr $i2,$s3,0 + lbu $t11,1024($i3) # Td4[s3>>24] + _xtr $i3,$s0,0 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +# else + lbu $t4,1024($i0) # Td4[s2>>8] _xtr $i0,$s0,24 + lbu $t5,1024($i1) # Td4[s3>>8] _xtr $i1,$s1,24 + lbu $t6,1024($i2) # Td4[s0>>8] _xtr $i2,$s2,24 + lbu $t7,1024($i3) # Td4[s1>>8] _xtr $i3,$s3,24 + $PTR_ADD $i0,$Tbl $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t8,1024($i0) # Td4[s0>>24] + $PTR_INS $i0,$s1,0,8 lbu $t9,1024($i1) # Td4[s1>>24] + $PTR_INS $i1,$s2,0,8 lbu $t10,1024($i2) # Td4[s2>>24] + $PTR_INS $i2,$s3,0,8 lbu $t11,1024($i3) # Td4[s3>>24] + $PTR_INS $i3,$s0,0,8 +# endif + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins2 $t0,$t4,8 + lbu $t4,1024($i0) # Td4[s1] + _ins2 $t1,$t5,8 + lbu $t5,1024($i1) # Td4[s2] + _ins2 $t2,$t6,8 + lbu $t6,1024($i2) # Td4[s3] + _ins2 $t3,$t7,8 + lbu $t7,1024($i3) # Td4[s0] + + _ins2 $t0,$t8,24 + lw $s0,0($key0) + _ins2 $t1,$t9,24 + lw $s1,4($key0) + _ins2 $t2,$t10,24 + lw $s2,8($key0) + _ins2 $t3,$t11,24 + lw $s3,12($key0) + _ins2 $t0,$t4,0 + _ins2 $t1,$t5,0 + _ins2 $t2,$t6,0 + _ins2 $t3,$t7,0 +#else + lbu $t4,1024($i0) # Td4[s2>>8] + _xtr $i0,$s0,24 + lbu $t5,1024($i1) # Td4[s3>>8] + _xtr $i1,$s1,24 + lbu $t6,1024($i2) # Td4[s0>>8] + _xtr $i2,$s2,24 + lbu $t7,1024($i3) # Td4[s1>>8] + _xtr $i3,$s3,24 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,1024($i0) # Td4[s0>>24] _xtr $i0,$s1,0 + lbu $t9,1024($i1) # Td4[s1>>24] _xtr $i1,$s2,0 + lbu $t10,1024($i2) # Td4[s2>>24] _xtr $i2,$s3,0 + lbu $t11,1024($i3) # Td4[s3>>24] _xtr $i3,$s0,0 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + _ins $t0,16 _ins $t1,16 _ins $t2,16 @@ -641,44 +1139,38 @@ _mips_AES_decrypt: _ins $t7,8 xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 - - $PTR_ADD $i0,$Tbl - $PTR_ADD $i1,$Tbl - $PTR_ADD $i2,$Tbl - $PTR_ADD $i3,$Tbl lbu $t4,1024($i0) # Td4[s1] + xor $t1,$t5 lbu $t5,1024($i1) # Td4[s2] + xor $t2,$t6 lbu $t6,1024($i2) # Td4[s3] + xor $t3,$t7 lbu $t7,1024($i3) # Td4[s0] _ins $t8,24 - _ins $t9,24 - _ins $t10,24 - _ins $t11,24 - lw $s0,0($key0) + _ins $t9,24 lw $s1,4($key0) + _ins $t10,24 lw $s2,8($key0) + _ins $t11,24 lw $s3,12($key0) - _ins $t4,0 - _ins $t5,0 - _ins $t6,0 - _ins $t7,0 - - xor $t0,$t8 xor $t1,$t9 xor $t2,$t10 xor $t3,$t11 + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + xor $t0,$t4 xor $t1,$t5 xor $t2,$t6 xor $t3,$t7 +#endif xor $s0,$t0 xor $s1,$t1 @@ -791,7 +1283,7 @@ _mips_AES_set_encrypt_key: beqz $inp,.Lekey_done li $t0,-1 beqz $key,.Lekey_done - $PTR_ADD $rcon,$Tbl,1024+256 + $PTR_ADD $rcon,$Tbl,256 .set reorder lwl $rk0,0+$MSB($inp) # load 128 bits @@ -843,10 +1335,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sw $rk0,0($key) sw $rk1,4($key) @@ -898,10 +1390,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sw $rk0,0($key) sw $rk1,4($key) @@ -957,10 +1449,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sw $rk0,0($key) sw $rk1,4($key) @@ -999,10 +1491,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sll $i0,24 sll $i1,16 sll $i2,8 @@ -1064,7 +1556,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Te # PIC-ified 'load address' + la $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1119,7 +1611,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Te # PIC-ified 'load address' + la $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1190,6 +1682,16 @@ $code.=<<___; xor $tpb,$tp9,$tp2 xor $tpd,$tp9,$tp4 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $tp1,$tpd,16 + xor $tpe,$tp2 + rotr $tp2,$tp9,8 + xor $tpe,$tp1 + rotr $tp4,$tpb,24 + xor $tpe,$tp2 + lw $tp1,4($key) # modulo-scheduled + xor $tpe,$tp4 +#else _ror $tp1,$tpd,16 xor $tpe,$tp2 _ror $tp2,$tpd,-16 @@ -1204,6 +1706,7 @@ $code.=<<___; xor $tpe,$tp1 lw $tp1,4($key) # modulo-scheduled xor $tpe,$tp2 +#endif sub $cnt,1 sw $tpe,0($key) $PTR_ADD $key,4 @@ -1234,7 +1737,7 @@ ___ # Tables are kept in endian-neutral manner $code.=<<___; .rdata -.align 6 +.align 10 AES_Te: .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d @@ -1365,46 +1868,6 @@ AES_Te: .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a -.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 -.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 -.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 -.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 -.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc -.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 -.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a -.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 -.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 -.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 -.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b -.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf -.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 -.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 -.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 -.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 -.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 -.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 -.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 -.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb -.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c -.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 -.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 -.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 -.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 -.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a -.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e -.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e -.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 -.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf -.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 -.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 - -.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon -.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 -.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 -.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 -.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 - -.align 6 AES_Td: .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 @@ -1567,6 +2030,46 @@ AES_Td: .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + +AES_Te4: +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon +.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 +.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 +.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 +.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 ___ foreach (split("\n",$code)) { @@ -1583,6 +2086,9 @@ foreach (split("\n",$code)) { s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) : eval("24-$3"))/e or + s/_ins2\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("ins\t$1,$2,%d,8",$big_endian ? eval($3) + : eval("24-$3"))/e or s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) : eval("$3*-1"))/e or @@ -1605,6 +2111,11 @@ foreach (split("\n",$code)) { sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; } + if (!$big_endian) { + s/(rotr\s+\$[0-9]+,\$[0-9]+),([0-9]+)/sprintf("$1,%d",32-$2)/e; + s/(ext\s+\$[0-9]+,\$[0-9]+),([0-9]+),8/sprintf("$1,%d,8",24-$2)/e; + } + print $_,"\n"; } diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl index 7c52cbe5f9fa..7a99fc3d0452 100755 --- a/crypto/aes/asm/aes-ppc.pl +++ b/crypto/aes/asm/aes-ppc.pl @@ -45,6 +45,8 @@ if ($flavour =~ /64/) { $PUSH ="stw"; } else { die "nonsense $flavour"; } +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or @@ -68,7 +70,7 @@ $key="r5"; $Tbl0="r3"; $Tbl1="r6"; $Tbl2="r7"; -$Tbl3="r2"; +$Tbl3=$out; # stay away from "r2"; $out is offloaded to stack $s0="r8"; $s1="r9"; @@ -76,7 +78,7 @@ $s2="r10"; $s3="r11"; $t0="r12"; -$t1="r13"; +$t1="r0"; # stay away from "r13"; $t2="r14"; $t3="r15"; @@ -100,9 +102,6 @@ $acc13="r29"; $acc14="r30"; $acc15="r31"; -# stay away from TLS pointer -if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; } -else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; } $mask80=$Tbl2; $mask1b=$Tbl3; @@ -337,8 +336,7 @@ $code.=<<___; $STU $sp,-$FRAME($sp) mflr r0 - $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) - $PUSH r13,`$FRAME-$SIZE_T*19`($sp) + $PUSH $out,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) @@ -365,16 +363,61 @@ $code.=<<___; bne Lenc_unaligned Lenc_unaligned_ok: +___ +$code.=<<___ if (!$LITTLE_ENDIAN); lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) lwz $s3,12($inp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $t0,0($inp) + lwz $t1,4($inp) + lwz $t2,8($inp) + lwz $t3,12($inp) + rotlwi $s0,$t0,8 + rotlwi $s1,$t1,8 + rotlwi $s2,$t2,8 + rotlwi $s3,$t3,8 + rlwimi $s0,$t0,24,0,7 + rlwimi $s1,$t1,24,0,7 + rlwimi $s2,$t2,24,0,7 + rlwimi $s3,$t3,24,0,7 + rlwimi $s0,$t0,24,16,23 + rlwimi $s1,$t1,24,16,23 + rlwimi $s2,$t2,24,16,23 + rlwimi $s3,$t3,24,16,23 +___ +$code.=<<___; bl LAES_Te bl Lppc_AES_encrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + rotlwi $t0,$s0,8 + rotlwi $t1,$s1,8 + rotlwi $t2,$s2,8 + rotlwi $t3,$s3,8 + rlwimi $t0,$s0,24,0,7 + rlwimi $t1,$s1,24,0,7 + rlwimi $t2,$s2,24,0,7 + rlwimi $t3,$s3,24,0,7 + rlwimi $t0,$s0,24,16,23 + rlwimi $t1,$s1,24,16,23 + rlwimi $t2,$s2,24,16,23 + rlwimi $t3,$s3,24,16,23 + stw $t0,0($out) + stw $t1,4($out) + stw $t2,8($out) + stw $t3,12($out) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); stw $s0,0($out) stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) +___ +$code.=<<___; b Lenc_done Lenc_unaligned: @@ -417,6 +460,7 @@ Lenc_xpage: bl LAES_Te bl Lppc_AES_encrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) extrwi $acc00,$s0,8,0 extrwi $acc01,$s0,8,8 @@ -449,8 +493,6 @@ Lenc_xpage: Lenc_done: $POP r0,`$FRAME+$LRSAVE`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) $POP r16,`$FRAME-$SIZE_T*16`($sp) @@ -764,6 +806,7 @@ Lenc_compact_done: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 +.size .AES_encrypt,.-.AES_encrypt .globl .AES_decrypt .align 7 @@ -771,8 +814,7 @@ Lenc_compact_done: $STU $sp,-$FRAME($sp) mflr r0 - $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) - $PUSH r13,`$FRAME-$SIZE_T*19`($sp) + $PUSH $out,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) @@ -799,16 +841,61 @@ Lenc_compact_done: bne Ldec_unaligned Ldec_unaligned_ok: +___ +$code.=<<___ if (!$LITTLE_ENDIAN); lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) lwz $s3,12($inp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $t0,0($inp) + lwz $t1,4($inp) + lwz $t2,8($inp) + lwz $t3,12($inp) + rotlwi $s0,$t0,8 + rotlwi $s1,$t1,8 + rotlwi $s2,$t2,8 + rotlwi $s3,$t3,8 + rlwimi $s0,$t0,24,0,7 + rlwimi $s1,$t1,24,0,7 + rlwimi $s2,$t2,24,0,7 + rlwimi $s3,$t3,24,0,7 + rlwimi $s0,$t0,24,16,23 + rlwimi $s1,$t1,24,16,23 + rlwimi $s2,$t2,24,16,23 + rlwimi $s3,$t3,24,16,23 +___ +$code.=<<___; bl LAES_Td bl Lppc_AES_decrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + rotlwi $t0,$s0,8 + rotlwi $t1,$s1,8 + rotlwi $t2,$s2,8 + rotlwi $t3,$s3,8 + rlwimi $t0,$s0,24,0,7 + rlwimi $t1,$s1,24,0,7 + rlwimi $t2,$s2,24,0,7 + rlwimi $t3,$s3,24,0,7 + rlwimi $t0,$s0,24,16,23 + rlwimi $t1,$s1,24,16,23 + rlwimi $t2,$s2,24,16,23 + rlwimi $t3,$s3,24,16,23 + stw $t0,0($out) + stw $t1,4($out) + stw $t2,8($out) + stw $t3,12($out) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); stw $s0,0($out) stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) +___ +$code.=<<___; b Ldec_done Ldec_unaligned: @@ -851,6 +938,7 @@ Ldec_xpage: bl LAES_Td bl Lppc_AES_decrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) extrwi $acc00,$s0,8,0 extrwi $acc01,$s0,8,8 @@ -883,8 +971,6 @@ Ldec_xpage: Ldec_done: $POP r0,`$FRAME+$LRSAVE`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) $POP r16,`$FRAME-$SIZE_T*16`($sp) @@ -1355,6 +1441,7 @@ Ldec_compact_done: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 +.size .AES_decrypt,.-.AES_decrypt .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" .align 7 diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl index 34cbb5d84429..47f416375d1e 100755 --- a/crypto/aes/asm/aes-x86_64.pl +++ b/crypto/aes/asm/aes-x86_64.pl @@ -19,9 +19,10 @@ # Performance in number of cycles per processed byte for 128-bit key: # # ECB encrypt ECB decrypt CBC large chunk -# AMD64 33 41 13.0 -# EM64T 38 59 18.6(*) -# Core 2 30 43 14.5(*) +# AMD64 33 43 13.0 +# EM64T 38 56 18.6(*) +# Core 2 30 42 14.5(*) +# Atom 65 86 32.1(*) # # (*) with hyper-threading off @@ -366,68 +367,66 @@ $code.=<<___; movzb `&lo("$s0")`,$t0 movzb `&lo("$s1")`,$t1 movzb `&lo("$s2")`,$t2 - movzb ($sbox,$t0,1),$t0 - movzb ($sbox,$t1,1),$t1 - movzb ($sbox,$t2,1),$t2 - movzb `&lo("$s3")`,$t3 movzb `&hi("$s1")`,$acc0 movzb `&hi("$s2")`,$acc1 + shr \$16,$s2 + movzb `&hi("$s3")`,$acc2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 movzb ($sbox,$t3,1),$t3 - movzb ($sbox,$acc0,1),$t4 #$t0 - movzb ($sbox,$acc1,1),$t5 #$t1 - movzb `&hi("$s3")`,$acc2 + movzb ($sbox,$acc0,1),$t4 #$t0 movzb `&hi("$s0")`,$acc0 - shr \$16,$s2 + movzb ($sbox,$acc1,1),$t5 #$t1 + movzb `&lo("$s2")`,$acc1 movzb ($sbox,$acc2,1),$acc2 #$t2 movzb ($sbox,$acc0,1),$acc0 #$t3 - shr \$16,$s3 - movzb `&lo("$s2")`,$acc1 shl \$8,$t4 + shr \$16,$s3 shl \$8,$t5 - movzb ($sbox,$acc1,1),$acc1 #$t0 xor $t4,$t0 - xor $t5,$t1 - - movzb `&lo("$s3")`,$t4 shr \$16,$s0 + movzb `&lo("$s3")`,$t4 shr \$16,$s1 - movzb `&lo("$s0")`,$t5 + xor $t5,$t1 shl \$8,$acc2 - shl \$8,$acc0 - movzb ($sbox,$t4,1),$t4 #$t1 - movzb ($sbox,$t5,1),$t5 #$t2 + movzb `&lo("$s0")`,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 xor $acc2,$t2 - xor $acc0,$t3 + shl \$8,$acc0 movzb `&lo("$s1")`,$acc2 - movzb `&hi("$s3")`,$acc0 shl \$16,$acc1 - movzb ($sbox,$acc2,1),$acc2 #$t3 - movzb ($sbox,$acc0,1),$acc0 #$t0 + xor $acc0,$t3 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb `&hi("$s3")`,$acc0 + movzb ($sbox,$t5,1),$t5 #$t2 xor $acc1,$t0 - movzb `&hi("$s0")`,$acc1 shr \$8,$s2 + movzb `&hi("$s0")`,$acc1 + shl \$16,$t4 shr \$8,$s1 + shl \$16,$t5 + xor $t4,$t1 + movzb ($sbox,$acc2,1),$acc2 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t0 movzb ($sbox,$acc1,1),$acc1 #$t1 movzb ($sbox,$s2,1),$s3 #$t3 movzb ($sbox,$s1,1),$s2 #$t2 - shl \$16,$t4 - shl \$16,$t5 + shl \$16,$acc2 - xor $t4,$t1 xor $t5,$t2 - xor $acc2,$t3 - shl \$24,$acc0 + xor $acc2,$t3 shl \$24,$acc1 - shl \$24,$s3 xor $acc0,$t0 - shl \$24,$s2 + shl \$24,$s3 xor $acc1,$t1 + shl \$24,$s2 mov $t0,$s0 mov $t1,$s1 xor $t2,$s2 @@ -466,12 +465,12 @@ sub enctransform() { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); $code.=<<___; - mov $s0,$acc0 - mov $s1,$acc1 - and \$0x80808080,$acc0 - and \$0x80808080,$acc1 - mov $acc0,$t0 - mov $acc1,$t1 + mov \$0x80808080,$t0 + mov \$0x80808080,$t1 + and $s0,$t0 + and $s1,$t1 + mov $t0,$acc0 + mov $t1,$acc1 shr \$7,$t0 lea ($s0,$s0),$r20 shr \$7,$t1 @@ -489,25 +488,25 @@ $code.=<<___; xor $r20,$s0 xor $r21,$s1 - mov $s2,$acc0 - mov $s3,$acc1 + mov \$0x80808080,$t2 rol \$24,$s0 + mov \$0x80808080,$t3 rol \$24,$s1 - and \$0x80808080,$acc0 - and \$0x80808080,$acc1 + and $s2,$t2 + and $s3,$t3 xor $r20,$s0 xor $r21,$s1 - mov $acc0,$t2 - mov $acc1,$t3 + mov $t2,$acc0 ror \$16,$t0 + mov $t3,$acc1 ror \$16,$t1 - shr \$7,$t2 lea ($s2,$s2),$r20 + shr \$7,$t2 xor $t0,$s0 - xor $t1,$s1 shr \$7,$t3 - lea ($s3,$s3),$r21 + xor $t1,$s1 ror \$8,$t0 + lea ($s3,$s3),$r21 ror \$8,$t1 sub $t2,$acc0 sub $t3,$acc1 @@ -523,23 +522,23 @@ $code.=<<___; xor $acc0,$r20 xor $acc1,$r21 + ror \$16,$t2 xor $r20,$s2 + ror \$16,$t3 xor $r21,$s3 rol \$24,$s2 + mov 0($sbox),$acc0 # prefetch Te4 rol \$24,$s3 xor $r20,$s2 - xor $r21,$s3 - mov 0($sbox),$acc0 # prefetch Te4 - ror \$16,$t2 - ror \$16,$t3 mov 64($sbox),$acc1 - xor $t2,$s2 - xor $t3,$s3 + xor $r21,$s3 mov 128($sbox),$r20 + xor $t2,$s2 ror \$8,$t2 + xor $t3,$s3 ror \$8,$t3 - mov 192($sbox),$r21 xor $t2,$s2 + mov 192($sbox),$r21 xor $t3,$s3 ___ } @@ -936,70 +935,69 @@ $code.=<<___; movzb `&lo("$s0")`,$t0 movzb `&lo("$s1")`,$t1 movzb `&lo("$s2")`,$t2 - movzb ($sbox,$t0,1),$t0 - movzb ($sbox,$t1,1),$t1 - movzb ($sbox,$t2,1),$t2 - movzb `&lo("$s3")`,$t3 movzb `&hi("$s3")`,$acc0 movzb `&hi("$s0")`,$acc1 + shr \$16,$s3 + movzb `&hi("$s1")`,$acc2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 movzb ($sbox,$t3,1),$t3 - movzb ($sbox,$acc0,1),$t4 #$t0 - movzb ($sbox,$acc1,1),$t5 #$t1 - movzb `&hi("$s1")`,$acc2 + movzb ($sbox,$acc0,1),$t4 #$t0 movzb `&hi("$s2")`,$acc0 - shr \$16,$s2 + movzb ($sbox,$acc1,1),$t5 #$t1 movzb ($sbox,$acc2,1),$acc2 #$t2 movzb ($sbox,$acc0,1),$acc0 #$t3 - shr \$16,$s3 - movzb `&lo("$s2")`,$acc1 - shl \$8,$t4 + shr \$16,$s2 shl \$8,$t5 - movzb ($sbox,$acc1,1),$acc1 #$t0 - xor $t4,$t0 - xor $t5,$t1 - - movzb `&lo("$s3")`,$t4 + shl \$8,$t4 + movzb `&lo("$s2")`,$acc1 shr \$16,$s0 + xor $t4,$t0 shr \$16,$s1 - movzb `&lo("$s0")`,$t5 + movzb `&lo("$s3")`,$t4 + shl \$8,$acc2 + xor $t5,$t1 shl \$8,$acc0 - movzb ($sbox,$t4,1),$t4 #$t1 - movzb ($sbox,$t5,1),$t5 #$t2 + movzb `&lo("$s0")`,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 xor $acc2,$t2 - xor $acc0,$t3 - movzb `&lo("$s1")`,$acc2 - movzb `&hi("$s1")`,$acc0 + shl \$16,$acc1 + xor $acc0,$t3 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb `&hi("$s1")`,$acc0 movzb ($sbox,$acc2,1),$acc2 #$t3 - movzb ($sbox,$acc0,1),$acc0 #$t0 xor $acc1,$t0 - + movzb ($sbox,$t5,1),$t5 #$t2 movzb `&hi("$s2")`,$acc1 + + shl \$16,$acc2 shl \$16,$t4 shl \$16,$t5 - movzb ($sbox,$acc1,1),$s1 #$t1 + xor $acc2,$t3 + movzb `&hi("$s3")`,$acc2 xor $t4,$t1 + shr \$8,$s0 xor $t5,$t2 - movzb `&hi("$s3")`,$acc1 - shr \$8,$s0 - shl \$16,$acc2 - movzb ($sbox,$acc1,1),$s2 #$t2 + movzb ($sbox,$acc0,1),$acc0 #$t0 + movzb ($sbox,$acc1,1),$s1 #$t1 + movzb ($sbox,$acc2,1),$s2 #$t2 movzb ($sbox,$s0,1),$s3 #$t3 - xor $acc2,$t3 + mov $t0,$s0 shl \$24,$acc0 shl \$24,$s1 shl \$24,$s2 - xor $acc0,$t0 + xor $acc0,$s0 shl \$24,$s3 xor $t1,$s1 - mov $t0,$s0 xor $t2,$s2 xor $t3,$s3 ___ @@ -1014,12 +1012,12 @@ sub dectransform() my $prefetch = shift; $code.=<<___; - mov $tp10,$acc0 - mov $tp18,$acc8 - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp40 - mov $acc8,$tp48 + mov $mask80,$tp40 + mov $mask80,$tp48 + and $tp10,$tp40 + and $tp18,$tp48 + mov $tp40,$acc0 + mov $tp48,$acc8 shr \$7,$tp40 lea ($tp10,$tp10),$tp20 shr \$7,$tp48 @@ -1030,15 +1028,15 @@ $code.=<<___; and $maskfe,$tp28 and $mask1b,$acc0 and $mask1b,$acc8 - xor $tp20,$acc0 - xor $tp28,$acc8 - mov $acc0,$tp20 - mov $acc8,$tp28 - - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp80 - mov $acc8,$tp88 + xor $acc0,$tp20 + xor $acc8,$tp28 + mov $mask80,$tp80 + mov $mask80,$tp88 + + and $tp20,$tp80 + and $tp28,$tp88 + mov $tp80,$acc0 + mov $tp88,$acc8 shr \$7,$tp80 lea ($tp20,$tp20),$tp40 shr \$7,$tp88 @@ -1049,15 +1047,15 @@ $code.=<<___; and $maskfe,$tp48 and $mask1b,$acc0 and $mask1b,$acc8 - xor $tp40,$acc0 - xor $tp48,$acc8 - mov $acc0,$tp40 - mov $acc8,$tp48 - - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp80 - mov $acc8,$tp88 + xor $acc0,$tp40 + xor $acc8,$tp48 + mov $mask80,$tp80 + mov $mask80,$tp88 + + and $tp40,$tp80 + and $tp48,$tp88 + mov $tp80,$acc0 + mov $tp88,$acc8 shr \$7,$tp80 xor $tp10,$tp20 # tp2^=tp1 shr \$7,$tp88 @@ -1082,51 +1080,51 @@ $code.=<<___; mov $tp10,$acc0 mov $tp18,$acc8 xor $tp80,$tp40 # tp4^tp1^=tp8 - xor $tp88,$tp48 # tp4^tp1^=tp8 shr \$32,$acc0 + xor $tp88,$tp48 # tp4^tp1^=tp8 shr \$32,$acc8 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 - xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) + xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 + rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 - rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) xor `&LO("$tp80")`,`&LO("$tp10")` - xor `&LO("$tp88")`,`&LO("$tp18")` shr \$32,$tp80 + xor `&LO("$tp88")`,`&LO("$tp18")` shr \$32,$tp88 xor `&LO("$tp80")`,`&LO("$acc0")` xor `&LO("$tp88")`,`&LO("$acc8")` mov $tp20,$tp80 - mov $tp28,$tp88 - shr \$32,$tp80 - shr \$32,$tp88 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) + mov $tp28,$tp88 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) - rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) - rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) + shr \$32,$tp80 xor `&LO("$tp20")`,`&LO("$tp10")` + shr \$32,$tp88 xor `&LO("$tp28")`,`&LO("$tp18")` + rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) mov $tp40,$tp20 + rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) mov $tp48,$tp28 + shr \$32,$tp20 xor `&LO("$tp80")`,`&LO("$acc0")` + shr \$32,$tp28 xor `&LO("$tp88")`,`&LO("$acc8")` `"mov 0($sbox),$mask80" if ($prefetch)` - shr \$32,$tp20 - shr \$32,$tp28 - `"mov 64($sbox),$maskfe" if ($prefetch)` rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) + `"mov 64($sbox),$maskfe" if ($prefetch)` rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) `"mov 128($sbox),$mask1b" if ($prefetch)` rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) - rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) `"mov 192($sbox),$tp80" if ($prefetch)` xor `&LO("$tp40")`,`&LO("$tp10")` + rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) xor `&LO("$tp48")`,`&LO("$tp18")` `"mov 256($sbox),$tp88" if ($prefetch)` xor `&LO("$tp20")`,`&LO("$acc0")` @@ -1302,10 +1300,6 @@ private_AES_set_encrypt_key: call _x86_64_AES_set_encrypt_key - mov 8(%rsp),%r15 - mov 16(%rsp),%r14 - mov 24(%rsp),%r13 - mov 32(%rsp),%r12 mov 40(%rsp),%rbp mov 48(%rsp),%rbx add \$56,%rsp diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl new file mode 100755 index 000000000000..33b1aed3c0b4 --- /dev/null +++ b/crypto/aes/asm/aesni-mb-x86_64.pl @@ -0,0 +1,1395 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer AES-NI procedures process several independent buffers +# in parallel by interleaving independent instructions. +# +# Cycles per byte for interleave factor 4: +# +# asymptotic measured +# --------------------------- +# Westmere 5.00/4=1.25 5.13/4=1.28 +# Atom 15.0/4=3.75 ?15.7/4=3.93 +# Sandy Bridge 5.06/4=1.27 5.18/4=1.29 +# Ivy Bridge 5.06/4=1.27 5.14/4=1.29 +# Haswell 4.44/4=1.11 4.44/4=1.11 +# Bulldozer 5.75/4=1.44 5.76/4=1.44 +# +# Cycles per byte for interleave factor 8 (not implemented for +# pre-AVX processors, where higher interleave factor incidentally +# doesn't result in improvement): +# +# asymptotic measured +# --------------------------- +# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) +# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) +# Haswell 5.00/8=0.63 5.00/8=0.63 +# Bulldozer 5.75/8=0.72 5.77/8=0.72 +# +# (*) Sandy/Ivy Bridge are known to handle high interleave factors +# suboptimally; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# void aesni_multi_cbc_encrypt ( +# struct { void *inp,*out; int blocks; double iv[2]; } inp[8]; +# const AES_KEY *key, +# int num); /* 1 or 2 */ +# +$inp="%rdi"; # 1st arg +$key="%rsi"; # 2nd arg +$num="%edx"; + +@inptr=map("%r$_",(8..11)); +@outptr=map("%r$_",(12..15)); + +($rndkey0,$rndkey1)=("%xmm0","%xmm1"); +@out=map("%xmm$_",(2..5)); +@inp=map("%xmm$_",(6..9)); +($counters,$mask,$zero)=map("%xmm$_",(10..12)); + +($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl aesni_multi_cbc_encrypt +.type aesni_multi_cbc_encrypt,\@function,3 +.align 32 +aesni_multi_cbc_encrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Lenc_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_enc_shortcut + jmp .Lenc_non_avx +.align 16 +.Lenc_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Lenc4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Lenc4x_done + + movups 0x10-0x78($key),$rndkey1 + pxor $zero,@out[0] + movups 0x20-0x78($key),$rndkey0 + pxor $zero,@out[1] + mov 0xf0-0x78($key),$rounds + pxor $zero,@out[2] + movdqu (@inptr[0]),@inp[0] # load inputs + pxor $zero,@out[3] + movdqu (@inptr[1]),@inp[1] + pxor @inp[0],@out[0] + movdqu (@inptr[2]),@inp[2] + pxor @inp[1],@out[1] + movdqu (@inptr[3]),@inp[3] + pxor @inp[2],@out[2] + pxor @inp[3],@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_enc4x + +.align 32 +.Loop_enc4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesenc $rndkey1,@out[0] + prefetcht0 31(@inptr[0],$offset) # prefetch input + prefetcht0 31(@inptr[1],$offset) + aesenc $rndkey1,@out[1] + prefetcht0 31(@inptr[2],$offset) + prefetcht0 31(@inptr[2],$offset) + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesenc $rndkey,@out[0] + aesenc $rndkey,@out[1] + aesenc $rndkey,@out[2] + cmovge $sink,@inptr[$i] # cancel input + cmovg $sink,@outptr[$i] # sink output + aesenc $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesenc $rndkey0,@out[0] + prefetcht0 15(@outptr[0],$offset) # prefetch output + prefetcht0 15(@outptr[1],$offset) + aesenc $rndkey0,@out[1] + prefetcht0 15(@outptr[2],$offset) + prefetcht0 15(@outptr[3],$offset) + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesenc $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesenc $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + jmp .Lenc4x_tail + +.align 32 +.Lenc4x_tail: + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movdqu (@inptr[0],$offset),@inp[0] + movdqu 0x10-0x78($key),$rndkey1 + + aesenclast $rndkey0,@out[0] + movdqu (@inptr[1],$offset),@inp[1] + pxor $zero,@inp[0] + aesenclast $rndkey0,@out[1] + movdqu (@inptr[2],$offset),@inp[2] + pxor $zero,@inp[1] + aesenclast $rndkey0,@out[2] + movdqu (@inptr[3],$offset),@inp[3] + pxor $zero,@inp[2] + aesenclast $rndkey0,@out[3] + movdqu 0x20-0x78($key),$rndkey0 + pxor $zero,@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + pxor @inp[0],@out[0] + movups @out[1],-16(@outptr[1],$offset) + pxor @inp[1],@out[1] + movups @out[2],-16(@outptr[2],$offset) + pxor @inp[2],@out[2] + movups @out[3],-16(@outptr[3],$offset) + pxor @inp[3],@out[3] + + dec $num + jnz .Loop_enc4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + #pxor @inp[0],@out[0] + #pxor @inp[1],@out[1] + #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + #pxor @inp[2],@out[2] + #movdqu @out[1],`40*1+24-40*2`($inp) + #pxor @inp[3],@out[3] + #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller + #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + + lea `40*4`($inp),$inp + dec $num + jnz .Lenc4x_loop_grande + +.Lenc4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + #movaps -0x68(%rax),%xmm13 + #movaps -0x58(%rax),%xmm14 + #movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lenc4x_epilogue: + ret +.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt + +.globl aesni_multi_cbc_decrypt +.type aesni_multi_cbc_decrypt,\@function,3 +.align 32 +aesni_multi_cbc_decrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Ldec_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_dec_shortcut + jmp .Ldec_non_avx +.align 16 +.Ldec_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Ldec4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldec4x_done + + movups 0x10-0x78($key),$rndkey1 + movups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + movdqu (@inptr[0]),@out[0] # load inputs + movdqu (@inptr[1]),@out[1] + pxor $zero,@out[0] + movdqu (@inptr[2]),@out[2] + pxor $zero,@out[1] + movdqu (@inptr[3]),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_dec4x + +.align 32 +.Loop_dec4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesdec $rndkey1,@out[0] + prefetcht0 31(@inptr[0],$offset) # prefetch input + prefetcht0 31(@inptr[1],$offset) + aesdec $rndkey1,@out[1] + prefetcht0 31(@inptr[2],$offset) + prefetcht0 31(@inptr[3],$offset) + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesdec $rndkey,@out[0] + aesdec $rndkey,@out[1] + aesdec $rndkey,@out[2] + cmovge $sink,@inptr[$i] # cancel input + cmovg $sink,@outptr[$i] # sink output + aesdec $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesdec $rndkey0,@out[0] + prefetcht0 15(@outptr[0],$offset) # prefetch output + prefetcht0 15(@outptr[1],$offset) + aesdec $rndkey0,@out[1] + prefetcht0 15(@outptr[2],$offset) + prefetcht0 15(@outptr[3],$offset) + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesdec $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesdec $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + jmp .Ldec4x_tail + +.align 32 +.Ldec4x_tail: + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + pxor $rndkey0,@inp[0] + pxor $rndkey0,@inp[1] + aesdec $rndkey1,@out[3] + movdqu 0x10-0x78($key),$rndkey1 + pxor $rndkey0,@inp[2] + pxor $rndkey0,@inp[3] + movdqu 0x20-0x78($key),$rndkey0 + + aesdeclast @inp[0],@out[0] + aesdeclast @inp[1],@out[1] + movdqu -16(@inptr[0],$offset),@inp[0] # load next IV + movdqu -16(@inptr[1],$offset),@inp[1] + aesdeclast @inp[2],@out[2] + aesdeclast @inp[3],@out[3] + movdqu -16(@inptr[2],$offset),@inp[2] + movdqu -16(@inptr[3],$offset),@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + movdqu (@inptr[0],$offset),@out[0] + movups @out[1],-16(@outptr[1],$offset) + movdqu (@inptr[1],$offset),@out[1] + pxor $zero,@out[0] + movups @out[2],-16(@outptr[2],$offset) + movdqu (@inptr[2],$offset),@out[2] + pxor $zero,@out[1] + movups @out[3],-16(@outptr[3],$offset) + movdqu (@inptr[3],$offset),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + + dec $num + jnz .Loop_dec4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + lea `40*4`($inp),$inp + dec $num + jnz .Ldec4x_loop_grande + +.Ldec4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + #movaps -0x68(%rax),%xmm13 + #movaps -0x58(%rax),%xmm14 + #movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Ldec4x_epilogue: + ret +.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt +___ + + if ($avx) {{{ +my @ptr=map("%r$_",(8..15)); +my $offload=$sink; + +my @out=map("%xmm$_",(2..9)); +my @inp=map("%xmm$_",(10..13)); +my ($counters,$zero)=("%xmm14","%xmm15"); + +$code.=<<___; +.type aesni_multi_cbc_encrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_encrypt_avx: +_avx_cbc_enc_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + + sub \$192,%rsp + and \$-128,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Lenc8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances +___ +} +$code.=<<___; + test $num,$num + jz .Lenc8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + + vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round + lea 128(%rsp),$offload # offload area + vpxor (@ptr[1]),$zero,@inp[1] + vpxor (@ptr[2]),$zero,@inp[2] + vpxor (@ptr[3]),$zero,@inp[3] + vpxor @inp[0],@out[0],@out[0] + vpxor (@ptr[4]),$zero,@inp[0] + vpxor @inp[1],@out[1],@out[1] + vpxor (@ptr[5]),$zero,@inp[1] + vpxor @inp[2],@out[2],@out[2] + vpxor (@ptr[6]),$zero,@inp[2] + vpxor @inp[3],@out[3],@out[3] + vpxor (@ptr[7]),$zero,@inp[3] + vpxor @inp[0],@out[4],@out[4] + mov \$1,$one # constant of 1 + vpxor @inp[1],@out[5],@out[5] + vpxor @inp[2],@out[6],@out[6] + vpxor @inp[3],@out[7],@out[7] + jmp .Loop_enc8x + +.align 32 +.Loop_enc8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code.=<<___; + vaesenc $rndkey,@out[0],@out[0] + cmp 32+4*$i(%rsp),$one +___ +$code.=<<___ if ($i); + mov 64+8*$i(%rsp),$offset +___ +$code.=<<___; + vaesenc $rndkey,@out[1],@out[1] + prefetcht0 31(@ptr[$i]) # prefetch input + vaesenc $rndkey,@out[2],@out[2] +___ +$code.=<<___ if ($i>1); + prefetcht0 15(@ptr[$i-2]) # prefetch output +___ +$code.=<<___; + vaesenc $rndkey,@out[3],@out[3] + lea (@ptr[$i],$offset),$offset + cmovge %rsp,@ptr[$i] # cancel input + vaesenc $rndkey,@out[4],@out[4] + cmovg %rsp,$offset # sink output + vaesenc $rndkey,@out[5],@out[5] + sub @ptr[$i],$offset + vaesenc $rndkey,@out[6],@out[6] + vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round + mov $offset,64+8*$i(%rsp) + vaesenc $rndkey,@out[7],@out[7] + vmovups `16*(3+$i)-0x78`($key),$rndkey + lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output +___ +$code.=<<___ if ($i<4) + vmovdqu @inp[$i%4],`16*$i`($offload) # off-load +___ +} +$code.=<<___; + vmovdqu 32(%rsp),$counters + prefetcht0 15(@ptr[$i-2]) # prefetch output + prefetcht0 15(@ptr[$i-1]) + cmp \$11,$rounds + jb .Lenc8x_tail + + vaesenc $rndkey1,@out[0],@out[0] + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vaesenc $rndkey1,@out[5],@out[5] + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0xb0-0x78($key),$rndkey1 + + vaesenc $rndkey0,@out[0],@out[0] + vaesenc $rndkey0,@out[1],@out[1] + vaesenc $rndkey0,@out[2],@out[2] + vaesenc $rndkey0,@out[3],@out[3] + vaesenc $rndkey0,@out[4],@out[4] + vaesenc $rndkey0,@out[5],@out[5] + vaesenc $rndkey0,@out[6],@out[6] + vaesenc $rndkey0,@out[7],@out[7] + vmovups 0xc0-0x78($key),$rndkey0 + je .Lenc8x_tail + + vaesenc $rndkey1,@out[0],@out[0] + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vaesenc $rndkey1,@out[5],@out[5] + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0xd0-0x78($key),$rndkey1 + + vaesenc $rndkey0,@out[0],@out[0] + vaesenc $rndkey0,@out[1],@out[1] + vaesenc $rndkey0,@out[2],@out[2] + vaesenc $rndkey0,@out[3],@out[3] + vaesenc $rndkey0,@out[4],@out[4] + vaesenc $rndkey0,@out[5],@out[5] + vaesenc $rndkey0,@out[6],@out[6] + vaesenc $rndkey0,@out[7],@out[7] + vmovups 0xe0-0x78($key),$rndkey0 + +.Lenc8x_tail: + vaesenc $rndkey1,@out[0],@out[0] + vpxor $zero,$zero,$zero + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vpaddd $counters,$zero,$zero # decrement counters + vmovdqu 48(%rsp),$counters + vaesenc $rndkey1,@out[5],@out[5] + mov 64(%rsp),$offset # pre-load 1st offset + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0x10-0x78($key),$rndkey1 + + vaesenclast $rndkey0,@out[0],@out[0] + vmovdqa $zero,32(%rsp) # update counters + vpxor $zero,$zero,$zero + vaesenclast $rndkey0,@out[1],@out[1] + vaesenclast $rndkey0,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesenclast $rndkey0,@out[3],@out[3] + vaesenclast $rndkey0,@out[4],@out[4] + vpaddd $zero,$counters,$counters # decrement counters + vmovdqu -0x78($key),$zero # 0-round + vaesenclast $rndkey0,@out[5],@out[5] + vaesenclast $rndkey0,@out[6],@out[6] + vmovdqa $counters,48(%rsp) # update counters + vaesenclast $rndkey0,@out[7],@out[7] + vmovups 0x20-0x78($key),$rndkey0 + + vmovups @out[0],-16(@ptr[0]) # write output + sub $offset,@ptr[0] # switch to input + vpxor 0x00($offload),@out[0],@out[0] + vmovups @out[1],-16(@ptr[1]) + sub `64+1*8`(%rsp),@ptr[1] + vpxor 0x10($offload),@out[1],@out[1] + vmovups @out[2],-16(@ptr[2]) + sub `64+2*8`(%rsp),@ptr[2] + vpxor 0x20($offload),@out[2],@out[2] + vmovups @out[3],-16(@ptr[3]) + sub `64+3*8`(%rsp),@ptr[3] + vpxor 0x30($offload),@out[3],@out[3] + vmovups @out[4],-16(@ptr[4]) + sub `64+4*8`(%rsp),@ptr[4] + vpxor @inp[0],@out[4],@out[4] + vmovups @out[5],-16(@ptr[5]) + sub `64+5*8`(%rsp),@ptr[5] + vpxor @inp[1],@out[5],@out[5] + vmovups @out[6],-16(@ptr[6]) + sub `64+6*8`(%rsp),@ptr[6] + vpxor @inp[2],@out[6],@out[6] + vmovups @out[7],-16(@ptr[7]) + sub `64+7*8`(%rsp),@ptr[7] + vpxor @inp[3],@out[7],@out[7] + + dec $num + jnz .Loop_enc8x + + mov 16(%rsp),%rax # original %rsp + #mov 24(%rsp),$num + #lea `40*8`($inp),$inp + #dec $num + #jnz .Lenc8x_loop_grande + +.Lenc8x_done: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lenc8x_epilogue: + ret +.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx + +.type aesni_multi_cbc_decrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_decrypt_avx: +_avx_cbc_dec_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + # +192 IV/input offload + + sub \$256,%rsp + and \$-256,%rsp + sub \$192,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Ldec8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances + vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV +___ +} +$code.=<<___; + test $num,$num + jz .Ldec8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + lea 192+128(%rsp),$offload # offload area + + vmovdqu (@ptr[0]),@out[0] # load inputs + vmovdqu (@ptr[1]),@out[1] + vmovdqu (@ptr[2]),@out[2] + vmovdqu (@ptr[3]),@out[3] + vmovdqu (@ptr[4]),@out[4] + vmovdqu (@ptr[5]),@out[5] + vmovdqu (@ptr[6]),@out[6] + vmovdqu (@ptr[7]),@out[7] + vmovdqu @out[0],0x00($offload) # offload inputs + vpxor $zero,@out[0],@out[0] # xor inputs with 0-round + vmovdqu @out[1],0x10($offload) + vpxor $zero,@out[1],@out[1] + vmovdqu @out[2],0x20($offload) + vpxor $zero,@out[2],@out[2] + vmovdqu @out[3],0x30($offload) + vpxor $zero,@out[3],@out[3] + vmovdqu @out[4],0x40($offload) + vpxor $zero,@out[4],@out[4] + vmovdqu @out[5],0x50($offload) + vpxor $zero,@out[5],@out[5] + vmovdqu @out[6],0x60($offload) + vpxor $zero,@out[6],@out[6] + vmovdqu @out[7],0x70($offload) + vpxor $zero,@out[7],@out[7] + xor \$0x80,$offload + mov \$1,$one # constant of 1 + jmp .Loop_dec8x + +.align 32 +.Loop_dec8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code.=<<___; + vaesdec $rndkey,@out[0],@out[0] + cmp 32+4*$i(%rsp),$one +___ +$code.=<<___ if ($i); + mov 64+8*$i(%rsp),$offset +___ +$code.=<<___; + vaesdec $rndkey,@out[1],@out[1] + prefetcht0 31(@ptr[$i]) # prefetch input + vaesdec $rndkey,@out[2],@out[2] +___ +$code.=<<___ if ($i>1); + prefetcht0 15(@ptr[$i-2]) # prefetch output +___ +$code.=<<___; + vaesdec $rndkey,@out[3],@out[3] + lea (@ptr[$i],$offset),$offset + cmovge %rsp,@ptr[$i] # cancel input + vaesdec $rndkey,@out[4],@out[4] + cmovg %rsp,$offset # sink output + vaesdec $rndkey,@out[5],@out[5] + sub @ptr[$i],$offset + vaesdec $rndkey,@out[6],@out[6] + vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input + mov $offset,64+8*$i(%rsp) + vaesdec $rndkey,@out[7],@out[7] + vmovups `16*(3+$i)-0x78`($key),$rndkey + lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output +___ +$code.=<<___ if ($i<4); + vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load +___ +} +$code.=<<___; + vmovdqu 32(%rsp),$counters + prefetcht0 15(@ptr[$i-2]) # prefetch output + prefetcht0 15(@ptr[$i-1]) + cmp \$11,$rounds + jb .Ldec8x_tail + + vaesdec $rndkey1,@out[0],@out[0] + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vaesdec $rndkey1,@out[5],@out[5] + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0xb0-0x78($key),$rndkey1 + + vaesdec $rndkey0,@out[0],@out[0] + vaesdec $rndkey0,@out[1],@out[1] + vaesdec $rndkey0,@out[2],@out[2] + vaesdec $rndkey0,@out[3],@out[3] + vaesdec $rndkey0,@out[4],@out[4] + vaesdec $rndkey0,@out[5],@out[5] + vaesdec $rndkey0,@out[6],@out[6] + vaesdec $rndkey0,@out[7],@out[7] + vmovups 0xc0-0x78($key),$rndkey0 + je .Ldec8x_tail + + vaesdec $rndkey1,@out[0],@out[0] + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vaesdec $rndkey1,@out[5],@out[5] + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0xd0-0x78($key),$rndkey1 + + vaesdec $rndkey0,@out[0],@out[0] + vaesdec $rndkey0,@out[1],@out[1] + vaesdec $rndkey0,@out[2],@out[2] + vaesdec $rndkey0,@out[3],@out[3] + vaesdec $rndkey0,@out[4],@out[4] + vaesdec $rndkey0,@out[5],@out[5] + vaesdec $rndkey0,@out[6],@out[6] + vaesdec $rndkey0,@out[7],@out[7] + vmovups 0xe0-0x78($key),$rndkey0 + +.Ldec8x_tail: + vaesdec $rndkey1,@out[0],@out[0] + vpxor $zero,$zero,$zero + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vpaddd $counters,$zero,$zero # decrement counters + vmovdqu 48(%rsp),$counters + vaesdec $rndkey1,@out[5],@out[5] + mov 64(%rsp),$offset # pre-load 1st offset + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0x10-0x78($key),$rndkey1 + + vaesdeclast $rndkey0,@out[0],@out[0] + vmovdqa $zero,32(%rsp) # update counters + vpxor $zero,$zero,$zero + vaesdeclast $rndkey0,@out[1],@out[1] + vpxor 0x00($offload),@out[0],@out[0] # xor with IV + vaesdeclast $rndkey0,@out[2],@out[2] + vpxor 0x10($offload),@out[1],@out[1] + vpcmpgtd $zero,$counters,$zero + vaesdeclast $rndkey0,@out[3],@out[3] + vpxor 0x20($offload),@out[2],@out[2] + vaesdeclast $rndkey0,@out[4],@out[4] + vpxor 0x30($offload),@out[3],@out[3] + vpaddd $zero,$counters,$counters # decrement counters + vmovdqu -0x78($key),$zero # 0-round + vaesdeclast $rndkey0,@out[5],@out[5] + vpxor 0x40($offload),@out[4],@out[4] + vaesdeclast $rndkey0,@out[6],@out[6] + vpxor 0x50($offload),@out[5],@out[5] + vmovdqa $counters,48(%rsp) # update counters + vaesdeclast $rndkey0,@out[7],@out[7] + vpxor 0x60($offload),@out[6],@out[6] + vmovups 0x20-0x78($key),$rndkey0 + + vmovups @out[0],-16(@ptr[0]) # write output + sub $offset,@ptr[0] # switch to input + vmovdqu 128+0(%rsp),@out[0] + vpxor 0x70($offload),@out[7],@out[7] + vmovups @out[1],-16(@ptr[1]) + sub `64+1*8`(%rsp),@ptr[1] + vmovdqu @out[0],0x00($offload) + vpxor $zero,@out[0],@out[0] + vmovdqu 128+16(%rsp),@out[1] + vmovups @out[2],-16(@ptr[2]) + sub `64+2*8`(%rsp),@ptr[2] + vmovdqu @out[1],0x10($offload) + vpxor $zero,@out[1],@out[1] + vmovdqu 128+32(%rsp),@out[2] + vmovups @out[3],-16(@ptr[3]) + sub `64+3*8`(%rsp),@ptr[3] + vmovdqu @out[2],0x20($offload) + vpxor $zero,@out[2],@out[2] + vmovdqu 128+48(%rsp),@out[3] + vmovups @out[4],-16(@ptr[4]) + sub `64+4*8`(%rsp),@ptr[4] + vmovdqu @out[3],0x30($offload) + vpxor $zero,@out[3],@out[3] + vmovdqu @inp[0],0x40($offload) + vpxor @inp[0],$zero,@out[4] + vmovups @out[5],-16(@ptr[5]) + sub `64+5*8`(%rsp),@ptr[5] + vmovdqu @inp[1],0x50($offload) + vpxor @inp[1],$zero,@out[5] + vmovups @out[6],-16(@ptr[6]) + sub `64+6*8`(%rsp),@ptr[6] + vmovdqu @inp[2],0x60($offload) + vpxor @inp[2],$zero,@out[6] + vmovups @out[7],-16(@ptr[7]) + sub `64+7*8`(%rsp),@ptr[7] + vmovdqu @inp[3],0x70($offload) + vpxor @inp[3],$zero,@out[7] + + xor \$128,$offload + dec $num + jnz .Loop_dec8x + + mov 16(%rsp),%rax # original %rsp + #mov 24(%rsp),$num + #lea `40*8`($inp),$inp + #dec $num + #jnz .Ldec8x_loop_grande + +.Ldec8x_done: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Ldec8x_epilogue: + ret +.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx +___ + }}} + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov 16(%rax),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore cotnext->R12 + mov %r13,224($context) # restore cotnext->R13 + mov %r14,232($context) # restore cotnext->R14 + mov %r15,240($context) # restore cotnext->R15 + + lea -56-10*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_multi_cbc_encrypt + .rva .LSEH_end_aesni_multi_cbc_encrypt + .rva .LSEH_info_aesni_multi_cbc_encrypt + .rva .LSEH_begin_aesni_multi_cbc_decrypt + .rva .LSEH_end_aesni_multi_cbc_decrypt + .rva .LSEH_info_aesni_multi_cbc_decrypt +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx + .rva .LSEH_end_aesni_multi_cbc_encrypt_avx + .rva .LSEH_info_aesni_multi_cbc_encrypt_avx + .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx + .rva .LSEH_end_aesni_multi_cbc_decrypt_avx + .rva .LSEH_info_aesni_multi_cbc_decrypt_avx +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_aesni_multi_cbc_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[] +.LSEH_info_aesni_multi_cbc_decrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_aesni_multi_cbc_encrypt_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[] +.LSEH_info_aesni_multi_cbc_decrypt_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[] +___ +} +#################################################################### + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; +close STDOUT; diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl index 3c8f6c19e7bd..97992adca7c3 100755 --- a/crypto/aes/asm/aesni-sha1-x86_64.pl +++ b/crypto/aes/asm/aesni-sha1-x86_64.pl @@ -21,16 +21,25 @@ # subroutine: # # AES-128-CBC +SHA1 stitch gain -# Westmere 3.77[+5.6] 9.37 6.65 +41% -# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) +# Westmere 3.77[+5.3] 9.07 6.55 +38% +# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%) +# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74% +# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%) +# Bulldozer 5.77[+6.0] 11.72 6.37 +84% # # AES-192-CBC -# Westmere 4.51 10.11 6.97 +45% -# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) +# Westmere 4.51 9.81 6.80 +44% +# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%) +# Ivy Bridge 6.05 10.65 6.07 +75% +# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%) +# Bulldozer 6.89 12.84 6.96 +84% # # AES-256-CBC -# Westmere 5.25 10.85 7.25 +50% -# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) +# Westmere 5.25 10.55 7.21 +46% +# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%) +# Ivy Bridge 7.05 11.65 7.12 +64% +# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%) +# Bulldozer 8.00 13.95 8.25 +69% # # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for # background information. Above numbers in parentheses are SSSE3 @@ -45,8 +54,25 @@ # standalone AESNI-CBC decrypt: # # AES-128-CBC AES-192-CBC AES-256-CBC -# Westmere 1.31 1.55 1.80 -# Sandy Bridge 0.93 1.06 1.22 +# Westmere 1.25 1.50 1.75 +# Sandy Bridge 0.74 0.91 1.09 +# Ivy Bridge 0.74 0.90 1.11 +# Haswell 0.63 0.76 0.88 +# Bulldozer 0.70 0.85 0.99 + +# And indeed: +# +# AES-256-CBC +SHA1 stitch gain +# Westmere 1.75 7.20 6.68 +7.8% +# Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%) +# Ivy Bridge 1.11 5.70 5.45 +4.6% +# Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%) +# Bulldozer 0.99 6.95 5.95 +17%(**) +# +# (*) Tiny improvement coefficient on Haswell is because we compare +# AVX1 stitch to sum with AVX2 SHA1. +# (**) Execution is fully dominated by integer code sequence and +# SIMD still hardly shows [in single-process benchmark;-] $flavour = shift; $output = shift; @@ -68,6 +94,11 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); +$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0); + +$shaext=1; ### set to zero if compiling for 1.0.1 + +$stitched_decrypt=0; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; @@ -86,11 +117,15 @@ $code.=<<___; .globl aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc,\@abi-omnipotent -.align 16 +.align 32 aesni_cbc_sha1_enc: # caller should check for SSSE3 and AES-NI bits mov OPENSSL_ia32cap_P+0(%rip),%r10d - mov OPENSSL_ia32cap_P+4(%rip),%r11d + mov OPENSSL_ia32cap_P+4(%rip),%r11 +___ +$code.=<<___ if ($shaext); + bt \$61,%r11 # check SHA bit + jc aesni_cbc_sha1_enc_shaext ___ $code.=<<___ if ($avx); and \$`1<<28`,%r11d # mask AVX bit @@ -112,10 +147,21 @@ my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); -my $j=0; my $jj=0; my $r=0; my $sn=0; +my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0; my $K_XX_XX="%r11"; -my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); -my @rndkey=("%xmm14","%xmm15"); +my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc +my @rndkey=("%xmm14","%xmm15"); # for enc +my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec + +if (1) { # reassign for Atom Silvermont + # The goal is to minimize amount of instructions with more than + # 3 prefix bytes. Or in more practical terms to keep AES-NI *and* + # SSSE3 instructions to upper half of the register bank. + @X=map("%xmm$_",(8..11,4..7)); + @Tx=map("%xmm$_",(12,13,3)); + ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); + @rndkey=("%xmm0","%xmm1"); +} sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; @@ -129,7 +175,7 @@ my $_ror=sub { &ror(@_) }; $code.=<<___; .type aesni_cbc_sha1_enc_ssse3,\@function,6 -.align 16 +.align 32 aesni_cbc_sha1_enc_ssse3: mov `($win64?56:8)`(%rsp),$inp # load 7th argument #shr \$6,$len # debugging artefact @@ -161,16 +207,16 @@ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 - mov $key,%r15 + lea 112($key),%r15 # size optimization movdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ -my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out - mov 240($key),$rounds + mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX @@ -180,19 +226,22 @@ $code.=<<___; mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] - movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa 64($K_XX_XX),@Tx[2] # pbswap mask movdqa 0($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] - pshufb @X[2],@X[-4&7] # byte swap + pshufb @Tx[2],@X[-4&7] # byte swap + pshufb @Tx[2],@X[-3&7] + pshufb @Tx[2],@X[-2&7] add \$64,$inp - pshufb @X[2],@X[-3&7] - pshufb @X[2],@X[-2&7] - pshufb @X[2],@X[-1&7] paddd @Tx[1],@X[-4&7] # add K_00_19 + pshufb @Tx[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU @@ -201,8 +250,8 @@ $code.=<<___; psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] - movups ($key),$rndkey0 # $key[0] - movups 16($key),$rndkey[0] # forward reference + movups -112($key),$rndkey0 # $key[0] + movups 16-112($key),$rndkey[0] # forward reference jmp .Loop_ssse3 ___ @@ -219,31 +268,31 @@ ___ ___ $code.=<<___; xorps $in,$iv + movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*$k`($key),$rndkey[1] ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn - movups `32+16*($k+0)`($key),$rndkey[1] + movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*($k+1)`($key),$rndkey[0] + movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn - movups `32+16*($k+2)`($key),$rndkey[1] + movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*($k+3)`($key),$rndkey[0] + movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv - movups 16($key),$rndkey[1] # forward reference + movups 16-112($key),$rndkey[1] # forward reference ___ } else { $code.=<<___; + movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*$k`($key),$rndkey[1] ___ } $r++; unshift(@rndkey,pop(@rndkey)); @@ -255,61 +304,61 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); - &movdqa (@X[0],@X[-3&7]); - eval(shift(@insns)); + eval(shift(@insns)); # ror + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); + &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); - eval(shift(@insns)); - + eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); - &movdqa (@Tx[0],@X[0]); - eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); - &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); + eval(shift(@insns)); # ror + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); @@ -317,12 +366,13 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); - eval(shift(@insns)); &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] @@ -333,27 +383,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 sub Xupdate_ssse3_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); - &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); - eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)) if ($Xi==8); + eval(shift(@insns)); # body_20_39 eval(shift(@insns)); + eval(shift(@insns)) if (@insns[1] =~ /_ror/); + eval(shift(@insns)) if (@insns[0] =~ /_ror/); + &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); } - &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); # ror + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" @@ -361,29 +414,31 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol + eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); + eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); - eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &psrld (@Tx[0],30); eval(shift(@insns)); - eval(shift(@insns)); # rol + &psrld (@Tx[0],30); + eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror - eval(shift(@insns)); &por (@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if (@insns[1] =~ /_rol/); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -404,10 +459,11 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU @@ -415,17 +471,17 @@ sub Xuplast_ssse3_80() foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); - &je (".Ldone_ssse3"); + &je (shift); unshift(@Tx,pop(@Tx)); - &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 &movdqu (@X[-4&7],"0($inp)"); # load input &movdqu (@X[-3&7],"16($inp)"); &movdqu (@X[-2&7],"32($inp)"); &movdqu (@X[-1&7],"48($inp)"); - &pshufb (@X[-4&7],@X[2]); # byte swap + &pshufb (@X[-4&7],@Tx[2]); # byte swap &add ($inp,64); $Xi=0; @@ -439,7 +495,10 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); - &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); @@ -450,6 +509,8 @@ sub Xloop_ssse3() &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } @@ -465,76 +526,106 @@ sub Xtail_ssse3() foreach (@insns) { eval; } } -sub body_00_19 () { - use integer; - my ($k,$n); - my @r=( +my @body_00_19 = ( '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer - '&xor ($c,$d);', - '&mov (@T[1],$a);', # $b in next round - '&$_rol ($a,5);', - '&and (@T[0],$c);', # ($b&($c^$d)) - '&xor ($c,$d);', # restore $c - '&xor (@T[0],$d);', - '&add ($e,$a);', '&$_ror ($b,$j?7:2);', # $b>>>2 - '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&xor (@T[0],$d);', + '&mov (@T[1],$a);', # $b for next round + + '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer + '&xor ($b,$c);', # $c^$d for next round + + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&and (@T[1],$b);', # ($b&($c^$d)) for next round + + '&xor ($b,$c);', # restore $b + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); + +sub body_00_19 () { # ((c^d)&b)^d + # on start @T[0]=(c^d)&b + return &body_20_39() if ($rx==19); $rx++; + + use integer; + my ($k,$n); + my @r=@body_00_19; + $n = scalar(@r); $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); $jj++; + return @r; } -sub body_20_39 () { - use integer; - my ($k,$n); - my @r=( +my @body_20_39 = ( '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&xor (@T[0],$d);', # ($b^$d) - '&mov (@T[1],$a);', # $b in next round + '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer + '&xor (@T[0],$d) if($j==19);'. + '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) + '&mov (@T[1],$a);', # $b for next round + '&$_rol ($a,5);', - '&xor (@T[0],$c);', # ($b^$d^$c) - '&add ($e,$a);', + '&add ($e,@T[0]);', + '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round + '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); + +sub body_20_39 () { # b^d^c + # on entry @T[0]=b^d + return &body_40_59() if ($rx==39); $rx++; + + use integer; + my ($k,$n); + my @r=@body_20_39; + $n = scalar(@r); $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20); $jj++; + return @r; } -sub body_40_59 () { - use integer; - my ($k,$n); - my @r=( +my @body_40_59 = ( '($a,$b,$c,$d,$e)=@V;'. - '&mov (@T[1],$c);', - '&xor ($c,$d);', - '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&and (@T[1],$d);', - '&and (@T[0],$c);', # ($b&($c^$d)) + '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer + '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) + '&xor ($c,$d) if ($j>=40);', # restore $c + '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[1]);', - '&mov (@T[1],$a);', # $b in next round + '&mov (@T[1],$a);', # $b for next round + '&xor (@T[0],$c);', + '&$_rol ($a,5);', '&add ($e,@T[0]);', - '&xor ($c,$d);', # restore $c - '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&xor (@T[1],$c) if ($j==59);'. + '&xor (@T[1],$b) if ($j< 59);', # b^c for next round + + '&xor ($b,$c) if ($j< 59);', # c^d for next round + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); + +sub body_40_59 () { # ((b^c)&(c^d))^c + # on entry @T[0]=(b^c), (c^=d) + $rx++; + + use integer; + my ($k,$n); + my @r=@body_40_59; + $n = scalar(@r); $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40); $jj++; + return @r; } $code.=<<___; -.align 16 +.align 32 .Loop_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19); @@ -553,7 +644,7 @@ ___ &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); - &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; @@ -575,11 +666,13 @@ $code.=<<___; mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) + mov $C,@T[1] mov $D,12($ctx) + xor $D,@T[1] mov $E,16($ctx) + and @T[1],@T[0] jmp .Loop_ssse3 -.align 16 .Ldone_ssse3: ___ $jj=$j=$saved_j; @V=@saved_V; @@ -631,7 +724,278 @@ $code.=<<___; .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 ___ -$j=$jj=$r=$sn=0; + if ($stitched_decrypt) {{{ +# reset +($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); +$j=$jj=$r=$rx=0; +$Xi=4; + +# reassign for Atom Silvermont (see above) +($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4)); +@X=map("%xmm$_",(8..13,6,7)); +@Tx=map("%xmm$_",(14,15,5)); + +my @aes256_dec = ( + '&movdqu($inout0,"0x00($in0)");', + '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);', + '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);', + '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);', + + '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");', + '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3] + undef,undef + ); +for ($i=0;$i<13;$i++) { + push (@aes256_dec,( + '&aesdec ($inout0,$rndkey0);', + '&aesdec ($inout1,$rndkey0);', + '&aesdec ($inout2,$rndkey0);', + '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");' + )); + push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); + push (@aes256_dec,(undef,undef)) if ($i==5); +} +push(@aes256_dec,( + '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");', + '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");', + '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");', + '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");', + + '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");', + '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);', + '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);', + '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);', + + '&movups ("0x30($out,$in0)",$inout3);' + )); + +sub body_00_19_dec () { # ((c^d)&b)^d + # on start @T[0]=(c^d)&b + return &body_20_39_dec() if ($rx==19); + + my @r=@body_00_19; + + unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); + $rx++; + + return @r; +} + +sub body_20_39_dec () { # b^d^c + # on entry @T[0]=b^d + return &body_40_59_dec() if ($rx==39); + + my @r=@body_20_39; + + unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); + $rx++; + + return @r; +} + +sub body_40_59_dec () { # ((b^c)&(c^d))^c + # on entry @T[0]=(b^c), (c^=d) + + my @r=@body_40_59; + + unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); + $rx++; + + return @r; +} + +$code.=<<___; +.globl aesni256_cbc_sha1_dec +.type aesni256_cbc_sha1_dec,\@abi-omnipotent +.align 32 +aesni256_cbc_sha1_dec: + # caller should check for SSSE3 and AES-NI bits + mov OPENSSL_ia32cap_P+0(%rip),%r10d + mov OPENSSL_ia32cap_P+4(%rip),%r11d +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r11d # mask AVX bit + and \$`1<<30`,%r10d # mask "Intel CPU" bit + or %r11d,%r10d + cmp \$`1<<28|1<<30`,%r10d + je aesni256_cbc_sha1_dec_avx +___ +$code.=<<___; + jmp aesni256_cbc_sha1_dec_ssse3 + ret +.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec + +.type aesni256_cbc_sha1_dec_ssse3,\@function,6 +.align 32 +aesni256_cbc_sha1_dec_ssse3: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_dec_ssse3: +___ +$code.=<<___; + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + lea 112($key),%r15 # size optimization + movdqu ($ivp),@X[3] # load IV + #mov $ivp,88(%rsp) # save $ivp +___ +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +$code.=<<___; + shl \$6,$len + sub $in0,$out + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] + + movdqa 64($K_XX_XX),@Tx[2] # pbswap mask + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @Tx[2],@X[-4&7] # byte swap + add \$64,$inp + pshufb @Tx[2],@X[-3&7] + pshufb @Tx[2],@X[-2&7] + pshufb @Tx[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + movdqu -112($key),$rndkey0 # $key[0] + jmp .Loop_dec_ssse3 + +.align 32 +.Loop_dec_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_32_79(\&body_00_19_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_rx=$rx; + + &Xloop_ssse3(\&body_20_39_dec); + &Xloop_ssse3(\&body_20_39_dec); + &Xloop_ssse3(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $C,@T[1] + mov $D,12($ctx) + xor $D,@T[1] + mov $E,16($ctx) + and @T[1],@T[0] + jmp .Loop_dec_ssse3 + +.Ldone_dec_ssse3: +___ + $jj=$j=$saved_j; @V=@saved_V; + $rx=$saved_rx; + + &Xtail_ssse3(\&body_20_39_dec); + &Xtail_ssse3(\&body_20_39_dec); + &Xtail_ssse3(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + movups @X[3],($ivp) # write IV +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_dec_ssse3: + ret +.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3 +___ + }}} +$j=$jj=$r=$rx=0; if ($avx) { my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); @@ -641,13 +1005,17 @@ my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); +my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); +my @rndkey=("%xmm14","%xmm15"); +my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec +my $Kx=@Tx[2]; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; $code.=<<___; .type aesni_cbc_sha1_enc_avx,\@function,6 -.align 16 +.align 32 aesni_cbc_sha1_enc_avx: mov `($win64?56:8)`(%rsp),$inp # load 7th argument #shr \$6,$len # debugging artefact @@ -680,17 +1048,16 @@ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 - mov $key,%r15 + lea 112($key),%r15 # size optimization vmovdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ -my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out - mov 240($key),$rounds - add \$112,$key # size optimization + mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX @@ -700,9 +1067,12 @@ $code.=<<___; mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask - vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 + vmovdqa 0($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] @@ -712,13 +1082,13 @@ $code.=<<___; vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] - vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 - vpaddd @Tx[1],@X[-3&7],@X[1] - vpaddd @Tx[1],@X[-2&7],@X[2] + vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 + vpaddd $Kx,@X[-3&7],@X[1] + vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) - vmovups -112($key),$rndkey0 # $key[0] + vmovups -112($key),$rndkey[1] # $key[0] vmovups 16-112($key),$rndkey[0] # forward reference jmp .Loop_avx ___ @@ -728,14 +1098,14 @@ my $aesenc=sub { my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; - vmovups `16*$n`($in0),$in # load input - vxorps $rndkey0,$in,$in + vmovdqu `16*$n`($in0),$in # load input + vpxor $rndkey[1],$in,$in ___ $code.=<<___ if ($n); vmovups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; - vxorps $in,$iv,$iv + vpxor $in,$iv,$iv vaesenc $rndkey[0],$iv,$iv vmovups `32+16*$k-112`($key),$rndkey[1] ___ @@ -755,6 +1125,7 @@ ___ vmovups `32+16*($k+3)-112`($key),$rndkey[0] .Lvaesenclast$sn: vaesenclast $rndkey[0],$iv,$iv + vmovups -112($key),$rndkey[0] vmovups 16-112($key),$rndkey[1] # forward reference ___ } else { @@ -778,10 +1149,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 eval(shift(@insns)); eval(shift(@insns)); - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" @@ -807,31 +1178,31 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 eval(shift(@insns)); eval(shift(@insns)); - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpsrld (@Tx[1],@Tx[2],30); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + &vpsrld (@Tx[0],@Tx[1],30); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); + &vpslld (@Tx[1],@Tx[1],2); + &vpxor (@X[0],@X[0],@Tx[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); - &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); @@ -839,7 +1210,6 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); } sub Xupdate_avx_32_79() @@ -858,12 +1228,8 @@ sub Xupdate_avx_32_79() &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - if ($Xi%5) { - &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... - } else { # ... or load next one - &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); - } - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); + &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0); eval(shift(@insns)); # ror eval(shift(@insns)); @@ -893,7 +1259,6 @@ sub Xupdate_avx_32_79() &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -904,7 +1269,6 @@ sub Xupdate_avx_32_79() foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); } sub Xuplast_avx_80() @@ -914,28 +1278,26 @@ sub Xuplast_avx_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); - &je (".Ldone_avx"); + &je (shift); - unshift(@Tx,pop(@Tx)); - - &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask - &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 + &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask + &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19 &vmovdqu(@X[-4&7],"0($inp)"); # load input &vmovdqu(@X[-3&7],"16($inp)"); &vmovdqu(@X[-2&7],"32($inp)"); &vmovdqu(@X[-1&7],"48($inp)"); - &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap &add ($inp,64); $Xi=0; @@ -949,15 +1311,15 @@ sub Xloop_avx() eval(shift(@insns)); eval(shift(@insns)); - &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); - &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); + &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); @@ -975,7 +1337,7 @@ sub Xtail_avx() } $code.=<<___; -.align 16 +.align 32 .Loop_avx: ___ &Xupdate_avx_16_31(\&body_00_19); @@ -994,7 +1356,7 @@ ___ &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); - &Xuplast_avx_80(\&body_20_39); # can jump to "done" + &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; @@ -1016,11 +1378,13 @@ $code.=<<___; mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) + mov $C,@T[1] mov $D,12($ctx) + xor $D,@T[1] mov $E,16($ctx) + and @T[1],@T[0] jmp .Loop_avx -.align 16 .Ldone_avx: ___ $jj=$j=$saved_j; @V=@saved_V; @@ -1072,6 +1436,218 @@ $code.=<<___; ret .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx ___ + + if ($stitched_decrypt) {{{ +# reset +($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +$j=$jj=$r=$rx=0; +$Xi=4; + +@aes256_dec = ( + '&vpxor ($inout0,$rndkey0,"0x00($in0)");', + '&vpxor ($inout1,$rndkey0,"0x10($in0)");', + '&vpxor ($inout2,$rndkey0,"0x20($in0)");', + '&vpxor ($inout3,$rndkey0,"0x30($in0)");', + + '&vmovups($rndkey0,"16-112($key)");', + '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3] + undef,undef + ); +for ($i=0;$i<13;$i++) { + push (@aes256_dec,( + '&vaesdec ($inout0,$inout0,$rndkey0);', + '&vaesdec ($inout1,$inout1,$rndkey0);', + '&vaesdec ($inout2,$inout2,$rndkey0);', + '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");' + )); + push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); + push (@aes256_dec,(undef,undef)) if ($i==5); +} +push(@aes256_dec,( + '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");', + '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");', + '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");', + '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");', + + '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");', + '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);', + '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);', + '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);', + + '&vmovups ("0x30($out,$in0)",$inout3);' + )); + +$code.=<<___; +.type aesni256_cbc_sha1_dec_avx,\@function,6 +.align 32 +aesni256_cbc_sha1_dec_avx: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_dec_avx: +___ +$code.=<<___; + vzeroall + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + lea 112($key),%r15 # size optimization + vmovdqu ($ivp),@X[3] # load IV +___ +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +$code.=<<___; + shl \$6,$len + sub $in0,$out + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa 0($K_XX_XX),$Kx # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 + vpaddd $Kx,@X[-3&7],@X[1] + vpaddd $Kx,@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + vmovups -112($key),$rndkey0 # $key[0] + jmp .Loop_dec_avx + +.align 32 +.Loop_dec_avx: +___ + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_32_79(\&body_00_19_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_rx=$rx; + + &Xloop_avx(\&body_20_39_dec); + &Xloop_avx(\&body_20_39_dec); + &Xloop_avx(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $C,@T[1] + mov $D,12($ctx) + xor $D,@T[1] + mov $E,16($ctx) + and @T[1],@T[0] + jmp .Loop_dec_avx + +.Ldone_dec_avx: +___ + $jj=$j=$saved_j; @V=@saved_V; + $rx=$saved_rx; + + &Xtail_avx(\&body_20_39_dec); + &Xtail_avx(\&body_20_39_dec); + &Xtail_avx(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + vmovups @X[3],($ivp) # write IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_dec_avx: + ret +.size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx +___ + }}} } $code.=<<___; .align 64 @@ -1081,11 +1657,180 @@ K_XX_XX: .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 64 ___ + if ($shaext) {{{ +($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +$rounds="%r11d"; + +($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); +@rndkey=("%xmm0","%xmm1"); +$r=0; + +my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.type aesni_cbc_sha1_enc_shaext,\@function,6 +.align 32 +aesni_cbc_sha1_enc_shaext: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument +___ +$code.=<<___ if ($win64); + lea `-8-10*16`(%rsp),%rsp + movaps %xmm6,-8-10*16(%rax) + movaps %xmm7,-8-9*16(%rax) + movaps %xmm8,-8-8*16(%rax) + movaps %xmm9,-8-7*16(%rax) + movaps %xmm10,-8-6*16(%rax) + movaps %xmm11,-8-5*16(%rax) + movaps %xmm12,-8-4*16(%rax) + movaps %xmm13,-8-3*16(%rax) + movaps %xmm14,-8-2*16(%rax) + movaps %xmm15,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + movdqu ($ctx),$ABCD + movd 16($ctx),$E + movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap + + mov 240($key),$rounds + sub $in0,$out + movups ($key),$rndkey0 # $key[0] + movups 16($key),$rndkey[0] # forward reference + lea 112($key),$key # size optimization + + pshufd \$0b00011011,$ABCD,$ABCD # flip word order + pshufd \$0b00011011,$E,$E # flip word order + jmp .Loop_shaext +.align 16 +.Loop_shaext: +___ + &$aesenc(); +$code.=<<___; + movdqu ($inp),@MSG[0] + movdqa $E,$E_SAVE # offload $E + pshufb $BSWAP,@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqa $ABCD,$ABCD_SAVE # offload $ABCD +___ + &$aesenc(); +$code.=<<___; + pshufb $BSWAP,@MSG[1] + + paddd @MSG[0],$E + movdqu 0x20($inp),@MSG[2] + lea 0x40($inp),$inp + pxor $E_SAVE,@MSG[0] # black magic +___ + &$aesenc(); +$code.=<<___; + pxor $E_SAVE,@MSG[0] # black magic + movdqa $ABCD,$E_ + pshufb $BSWAP,@MSG[2] + sha1rnds4 \$0,$E,$ABCD # 0-3 + sha1nexte @MSG[1],$E_ +___ + &$aesenc(); +$code.=<<___; + sha1msg1 @MSG[1],@MSG[0] + movdqu -0x10($inp),@MSG[3] + movdqa $ABCD,$E + pshufb $BSWAP,@MSG[3] +___ + &$aesenc(); +$code.=<<___; + sha1rnds4 \$0,$E_,$ABCD # 4-7 + sha1nexte @MSG[2],$E + pxor @MSG[2],@MSG[0] + sha1msg1 @MSG[2],@MSG[1] +___ + &$aesenc(); + +for($i=2;$i<20-4;$i++) { +$code.=<<___; + movdqa $ABCD,$E_ + sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11 + sha1nexte @MSG[3],$E_ +___ + &$aesenc(); +$code.=<<___; + sha1msg2 @MSG[3],@MSG[0] + pxor @MSG[3],@MSG[1] + sha1msg1 @MSG[3],@MSG[2] +___ + ($E,$E_)=($E_,$E); + push(@MSG,shift(@MSG)); + + &$aesenc(); +} +$code.=<<___; + movdqa $ABCD,$E_ + sha1rnds4 \$3,$E,$ABCD # 64-67 + sha1nexte @MSG[3],$E_ + sha1msg2 @MSG[3],@MSG[0] + pxor @MSG[3],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + movdqa $ABCD,$E + sha1rnds4 \$3,$E_,$ABCD # 68-71 + sha1nexte @MSG[0],$E + sha1msg2 @MSG[0],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + movdqa $E_SAVE,@MSG[0] + movdqa $ABCD,$E_ + sha1rnds4 \$3,$E,$ABCD # 72-75 + sha1nexte @MSG[1],$E_ +___ + &$aesenc(); +$code.=<<___; + movdqa $ABCD,$E + sha1rnds4 \$3,$E_,$ABCD # 76-79 + sha1nexte $MSG[0],$E +___ + while($r<40) { &$aesenc(); } # remaining aesenc's +$code.=<<___; + dec $len + + paddd $ABCD_SAVE,$ABCD + movups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + jnz .Loop_shaext + + pshufd \$0b00011011,$ABCD,$ABCD + pshufd \$0b00011011,$E,$E + movups $iv,($ivp) # write IV + movdqu $ABCD,($ctx) + movd $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-10*16(%rax),%xmm6 + movaps -8-9*16(%rax),%xmm7 + movaps -8-8*16(%rax),%xmm8 + movaps -8-7*16(%rax),%xmm9 + movaps -8-6*16(%rax),%xmm10 + movaps -8-5*16(%rax),%xmm11 + movaps -8-4*16(%rax),%xmm12 + movaps -8-3*16(%rax),%xmm13 + movaps -8-2*16(%rax),%xmm14 + movaps -8-1*16(%rax),%xmm15 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext +___ + }}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { @@ -1127,7 +1872,21 @@ ssse3_handler: lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail +___ +$code.=<<___ if ($shaext); + lea aesni_cbc_sha1_enc_shaext(%rip),%r10 + cmp %r10,%rbx + jb .Lseh_no_shaext + lea (%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea 168(%rax),%rax # adjust stack pointer + jmp .Lcommon_seh_tail +.Lseh_no_shaext: +___ +$code.=<<___; lea 96(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx @@ -1199,6 +1958,11 @@ $code.=<<___ if ($avx); .rva .LSEH_end_aesni_cbc_sha1_enc_avx .rva .LSEH_info_aesni_cbc_sha1_enc_avx ___ +$code.=<<___ if ($shaext); + .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext + .rva .LSEH_end_aesni_cbc_sha1_enc_shaext + .rva .LSEH_info_aesni_cbc_sha1_enc_shaext +___ $code.=<<___; .section .xdata .align 8 @@ -1213,6 +1977,12 @@ $code.=<<___ if ($avx); .rva ssse3_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ +$code.=<<___ if ($shaext); +.LSEH_info_aesni_cbc_sha1_enc_shaext: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] +___ } #################################################################### @@ -1223,28 +1993,65 @@ sub rex { $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); - push @opcode,$rex|0x40 if($rex); + unshift @opcode,$rex|0x40 if($rex); +} + +sub sha1rnds4 { + if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x3a,0xcc); + rex(\@opcode,$3,$2); + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } else { + return "sha1rnds4\t".@_[0]; + } +} + +sub sha1op38 { + my $instr = shift; + my %opcodelet = ( + "sha1nexte" => 0xc8, + "sha1msg1" => 0xc9, + "sha1msg2" => 0xca ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } } sub aesni { my $line=shift; - my @opcode=(0x66); + my @opcode=(0x0f,0x38); if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { my %opcodelet = ( - "aesenc" => 0xdc, "aesenclast" => 0xdd + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); rex(\@opcode,$3,$2); - push @opcode,0x0f,0x38,$opcodelet{$1}; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M + unshift @opcode,0x66; return ".byte\t".join(',',@opcode); } return $line; } -$code =~ s/\`([^\`]*)\`/eval($1)/gem; -$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or + s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or + s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo; -print $code; + print $_,"\n"; +} close STDOUT; diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl new file mode 100755 index 000000000000..19b0433b3b1b --- /dev/null +++ b/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -0,0 +1,1708 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# January 2013 +# +# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled +# in http://download.intel.com/design/intarch/papers/323686.pdf, is +# that since AESNI-CBC encrypt exhibit *very* low instruction-level +# parallelism, interleaving it with another algorithm would allow to +# utilize processor resources better and achieve better performance. +# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and +# AESNI code is weaved into it. As SHA256 dominates execution time, +# stitch performance does not depend on AES key length. Below are +# performance numbers in cycles per processed byte, less is better, +# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched +# subroutine: +# +# AES-128/-192/-256+SHA256 this(**)gain +# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% +# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% +# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% +# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# +# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that +# Westmere is omitted from loop, this is because gain was not +# estimated high enough to justify the effort; +# (**) these are EVP-free results, results obtained with 'speed +# -evp aes-256-cbc-hmac-sha256' will vary by percent or two; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=12); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +$shaext=$avx; ### set to zero if compiling for 1.0.1 +$avx=1 if (!$shaext && $avx); + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$func="aesni_cbc_sha256_enc"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +######################################################################## +# void aesni_cbc_sha256_enc(const void *inp, +# void *out, +# size_t length, +# const AES_KEY *key, +# unsigned char *iv, +# SHA256_CTX *ctx, +# const void *in0); +($inp, $out, $len, $key, $ivp, $ctx, $in0) = +("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +$Tbl="%rbp"; + +$_inp="16*$SZ+0*8(%rsp)"; +$_out="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$_key="16*$SZ+3*8(%rsp)"; +$_ivp="16*$SZ+4*8(%rsp)"; +$_ctx="16*$SZ+5*8(%rsp)"; +$_in0="16*$SZ+6*8(%rsp)"; +$_rsp="16*$SZ+7*8(%rsp)"; +$framesz=16*$SZ+8*8; + +$code=<<___; +.text + +.extern OPENSSL_ia32cap_P +.globl $func +.type $func,\@abi-omnipotent +.align 16 +$func: +___ + if ($avx) { +$code.=<<___; + lea OPENSSL_ia32cap_P(%rip),%r11 + mov \$1,%eax + cmp \$0,`$win64?"%rcx":"%rdi"` + je .Lprobe + mov 0(%r11),%eax + mov 4(%r11),%r10 +___ +$code.=<<___ if ($shaext); + bt \$61,%r10 # check for SHA + jc ${func}_shaext +___ +$code.=<<___; + mov %r10,%r11 + shr \$32,%r11 + + test \$`1<<11`,%r10d # check for XOP + jnz ${func}_xop +___ +$code.=<<___ if ($avx>1); + and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 + cmp \$`1<<8|1<<5|1<<3`,%r11d + je ${func}_avx2 +___ +$code.=<<___; + and \$`1<<30`,%eax # mask "Intel CPU" bit + and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits + or %eax,%r10d + cmp \$`1<<28|1<<9|1<<30`,%r10d + je ${func}_avx + ud2 +___ + } +$code.=<<___; + xor %eax,%eax + cmp \$0,`$win64?"%rcx":"%rdi"` + je .Lprobe + ud2 +.Lprobe: + ret +.size $func,.-$func + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 + .long 0,0,0,0, 0,0,0,0 + .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 +___ + +###################################################################### +# SIMD code paths +# +{{{ +($iv,$inout,$roundkey,$temp, + $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); + +$aesni_cbc_idx=0; +@aesni_cbc_block = ( +## &vmovdqu ($roundkey,"0x00-0x80($inp)");' +## &vmovdqu ($inout,($inp)); +## &mov ($_inp,$inp); + + '&vpxor ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', + + '&vpxor ($inout,$inout,$iv);', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', + + '&vaesenclast ($temp,$inout,$roundkey);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', + + '&vpand ($iv,$temp,$mask10);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', + + '&vaesenclast ($temp,$inout,$roundkey);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', + + '&vpand ($temp,$temp,$mask12);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', + + '&vpor ($iv,$iv,$temp);'. + ' &vaesenclast ($temp,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' + +## &mov ($inp,$_inp); +## &mov ($out,$_out); +## &vpand ($temp,$temp,$mask14); +## &vpor ($iv,$iv,$temp); +## &vmovdqu ($iv,($out,$inp); +## &lea (inp,16($inp)); +); + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&xor ($a0,$e)', + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + @aesni_cbc_block[$aesni_cbc_idx++]. + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&xor ($a2,$b)', # a^b, b^c in next round + + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&add ($d,$h)', # d+=h + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. # h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +if ($avx) {{ +###################################################################### +# XOP code path +# +$code.=<<___; +.type ${func}_xop,\@function,6 +.align 64 +${func}_xop: +.Lxop_shortcut: + mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`$framesz+$win64*16*10`,%rsp + and \$-64,%rsp # align stack frame + + shl \$6,$len + sub $inp,$out # re-bias + sub $inp,$in0 + add $inp,$len # end of input + + #mov $inp,$_inp # saved later + mov $out,$_out + mov $len,$_end + #mov $key,$_key # remains resident in $inp register + mov $ivp,$_ivp + mov $ctx,$_ctx + mov $in0,$_in0 + mov %r11,$_rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,`$framesz+16*0`(%rsp) + movaps %xmm7,`$framesz+16*1`(%rsp) + movaps %xmm8,`$framesz+16*2`(%rsp) + movaps %xmm9,`$framesz+16*3`(%rsp) + movaps %xmm10,`$framesz+16*4`(%rsp) + movaps %xmm11,`$framesz+16*5`(%rsp) + movaps %xmm12,`$framesz+16*6`(%rsp) + movaps %xmm13,`$framesz+16*7`(%rsp) + movaps %xmm14,`$framesz+16*8`(%rsp) + movaps %xmm15,`$framesz+16*9`(%rsp) +___ +$code.=<<___; +.Lprologue_xop: + vzeroall + + mov $inp,%r12 # borrow $a4 + lea 0x80($key),$inp # size optimization, reassign + lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 + mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 + mov $ctx,%r15 # borrow $a2 + mov $in0,%rsi # borrow $a3 + vmovdqu ($ivp),$iv # load IV + sub \$9,%r14 + + mov $SZ*0(%r15),$A + mov $SZ*1(%r15),$B + mov $SZ*2(%r15),$C + mov $SZ*3(%r15),$D + mov $SZ*4(%r15),$E + mov $SZ*5(%r15),$F + mov $SZ*6(%r15),$G + mov $SZ*7(%r15),$H + + vmovdqa 0x00(%r13,%r14,8),$mask14 + vmovdqa 0x10(%r13,%r14,8),$mask12 + vmovdqa 0x20(%r13,%r14,8),$mask10 + vmovdqu 0x00-0x80($inp),$roundkey + jmp .Lloop_xop +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +$code.=<<___; +.align 16 +.Lloop_xop: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00(%rsi,%r12),@X[0] + vmovdqu 0x10(%rsi,%r12),@X[1] + vmovdqu 0x20(%rsi,%r12),@X[2] + vmovdqu 0x30(%rsi,%r12),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + sub \$-16*2*$SZ,$Tbl # size optimization + vmovdqu (%r12),$inout # $a4 + mov %r12,$_inp # $a4 +___ +sub XOP_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t0,$t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t2,@X[3],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq ($t3,$t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t2,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpslldq ($t3,$t3,8); # 22 instructions + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + $aesni_cbc_idx=0; + for ($i=0,$j=0; $j<4; $j++) { + &XOP_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &mov ("%r12",$_inp); # borrow $a4 + &vpand ($temp,$temp,$mask14); + &mov ("%r15",$_out); # borrow $a2 + &vpor ($iv,$iv,$temp); + &vmovdqu ("(%r15,%r12)",$iv); # write output + &lea ("%r12","16(%r12)"); # inp++ + + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lxop_00_47"); + + &vmovdqu ($inout,"(%r12)"); + &mov ($_inp,"%r12"); + + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + } +$code.=<<___; + mov $_inp,%r12 # borrow $a4 + mov $_out,%r13 # borrow $a0 + mov $_ctx,%r15 # borrow $a2 + mov $_in0,%rsi # borrow $a3 + + vpand $mask14,$temp,$temp + mov $a1,$A + vpor $temp,$iv,$iv + vmovdqu $iv,(%r13,%r12) # write output + lea 16(%r12),%r12 # inp++ + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + add $SZ*7(%r15),$H + + cmp $_end,%r12 + + mov $A,$SZ*0(%r15) + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + + jb .Lloop_xop + + mov $_ivp,$ivp + mov $_rsp,%rsi + vmovdqu $iv,($ivp) # output IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps `$framesz+16*0`(%rsp),%xmm6 + movaps `$framesz+16*1`(%rsp),%xmm7 + movaps `$framesz+16*2`(%rsp),%xmm8 + movaps `$framesz+16*3`(%rsp),%xmm9 + movaps `$framesz+16*4`(%rsp),%xmm10 + movaps `$framesz+16*5`(%rsp),%xmm11 + movaps `$framesz+16*6`(%rsp),%xmm12 + movaps `$framesz+16*7`(%rsp),%xmm13 + movaps `$framesz+16*8`(%rsp),%xmm14 + movaps `$framesz+16*9`(%rsp),%xmm15 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_xop: + ret +.size ${func}_xop,.-${func}_xop +___ +###################################################################### +# AVX+shrd code path +# +local *ror = sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type ${func}_avx,\@function,6 +.align 64 +${func}_avx: +.Lavx_shortcut: + mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`$framesz+$win64*16*10`,%rsp + and \$-64,%rsp # align stack frame + + shl \$6,$len + sub $inp,$out # re-bias + sub $inp,$in0 + add $inp,$len # end of input + + #mov $inp,$_inp # saved later + mov $out,$_out + mov $len,$_end + #mov $key,$_key # remains resident in $inp register + mov $ivp,$_ivp + mov $ctx,$_ctx + mov $in0,$_in0 + mov %r11,$_rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,`$framesz+16*0`(%rsp) + movaps %xmm7,`$framesz+16*1`(%rsp) + movaps %xmm8,`$framesz+16*2`(%rsp) + movaps %xmm9,`$framesz+16*3`(%rsp) + movaps %xmm10,`$framesz+16*4`(%rsp) + movaps %xmm11,`$framesz+16*5`(%rsp) + movaps %xmm12,`$framesz+16*6`(%rsp) + movaps %xmm13,`$framesz+16*7`(%rsp) + movaps %xmm14,`$framesz+16*8`(%rsp) + movaps %xmm15,`$framesz+16*9`(%rsp) +___ +$code.=<<___; +.Lprologue_avx: + vzeroall + + mov $inp,%r12 # borrow $a4 + lea 0x80($key),$inp # size optimization, reassign + lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 + mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 + mov $ctx,%r15 # borrow $a2 + mov $in0,%rsi # borrow $a3 + vmovdqu ($ivp),$iv # load IV + sub \$9,%r14 + + mov $SZ*0(%r15),$A + mov $SZ*1(%r15),$B + mov $SZ*2(%r15),$C + mov $SZ*3(%r15),$D + mov $SZ*4(%r15),$E + mov $SZ*5(%r15),$F + mov $SZ*6(%r15),$G + mov $SZ*7(%r15),$H + + vmovdqa 0x00(%r13,%r14,8),$mask14 + vmovdqa 0x10(%r13,%r14,8),$mask12 + vmovdqa 0x20(%r13,%r14,8),$mask10 + vmovdqu 0x00-0x80($inp),$roundkey +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +$code.=<<___; + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00(%rsi,%r12),@X[0] + vmovdqu 0x10(%rsi,%r12),@X[1] + vmovdqu 0x20(%rsi,%r12),@X[2] + vmovdqu 0x30(%rsi,%r12),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + sub \$-16*2*$SZ,$Tbl # size optimization + vmovdqu (%r12),$inout # $a4 + mov %r12,$_inp # $a4 +___ +sub Xupdate_256_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] + '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] + '&vpsrld ($t2,$t0,$sigma0[0]);', + '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] + '&vpsrld ($t3,$t0,$sigma0[2])', + '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrld ($t2,$t3,$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) + '&vpsrlq ($t3,$t3,$sigma1[0]);', + '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) + '&vpshufd ($t2,$t2,0b10000100)', + '&vpsrldq ($t2,$t2,8)', + '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) + '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&vpsrld ($t2,$t3,$sigma1[2])', + '&vpsrlq ($t3,$t3,$sigma1[0])', + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufd ($t2,$t2,0b11101000)', + '&vpslldq ($t2,$t2,8)', + '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub AVX_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + $aesni_cbc_idx=0; + for ($i=0,$j=0; $j<4; $j++) { + &AVX_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &mov ("%r12",$_inp); # borrow $a4 + &vpand ($temp,$temp,$mask14); + &mov ("%r15",$_out); # borrow $a2 + &vpor ($iv,$iv,$temp); + &vmovdqu ("(%r15,%r12)",$iv); # write output + &lea ("%r12","16(%r12)"); # inp++ + + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lavx_00_47"); + + &vmovdqu ($inout,"(%r12)"); + &mov ($_inp,"%r12"); + + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + } +$code.=<<___; + mov $_inp,%r12 # borrow $a4 + mov $_out,%r13 # borrow $a0 + mov $_ctx,%r15 # borrow $a2 + mov $_in0,%rsi # borrow $a3 + + vpand $mask14,$temp,$temp + mov $a1,$A + vpor $temp,$iv,$iv + vmovdqu $iv,(%r13,%r12) # write output + lea 16(%r12),%r12 # inp++ + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + add $SZ*7(%r15),$H + + cmp $_end,%r12 + + mov $A,$SZ*0(%r15) + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + jb .Lloop_avx + + mov $_ivp,$ivp + mov $_rsp,%rsi + vmovdqu $iv,($ivp) # output IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps `$framesz+16*0`(%rsp),%xmm6 + movaps `$framesz+16*1`(%rsp),%xmm7 + movaps `$framesz+16*2`(%rsp),%xmm8 + movaps `$framesz+16*3`(%rsp),%xmm9 + movaps `$framesz+16*4`(%rsp),%xmm10 + movaps `$framesz+16*5`(%rsp),%xmm11 + movaps `$framesz+16*6`(%rsp),%xmm12 + movaps `$framesz+16*7`(%rsp),%xmm13 + movaps `$framesz+16*8`(%rsp),%xmm14 + movaps `$framesz+16*9`(%rsp),%xmm15 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx: + ret +.size ${func}_avx,.-${func}_avx +___ + +if ($avx>1) {{ +###################################################################### +# AVX2+BMI code path +# +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $PUSH8=8*2*$SZ; +use integer; + +sub bodyx_00_15 () { + # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] + '&and ($a4,$e)', # f&e + '&rorx ($a0,$e,$Sigma1[2])', + '&rorx ($a2,$e,$Sigma1[1])', + + '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past + '&lea ($h,"($h,$a4)")', + '&andn ($a4,$e,$g)', # ~e&g + '&xor ($a0,$a2)', + + '&rorx ($a1,$e,$Sigma1[0])', + '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) + '&xor ($a0,$a1)', # Sigma1(e) + '&mov ($a2,$a)', + + '&rorx ($a4,$a,$Sigma0[2])', + '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) + '&xor ($a2,$b)', # a^b, b^c in next round + '&rorx ($a1,$a,$Sigma0[1])', + + '&rorx ($a0,$a,$Sigma0[0])', + '&lea ($d,"($d,$h)")', # d+=h + '&and ($a3,$a2)', # (b^c)&(a^b) + @aesni_cbc_block[$aesni_cbc_idx++]. + '&xor ($a1,$a4)', + + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + '&xor ($a1,$a0)', # Sigma0(a) + '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) + '&mov ($a4,$e)', # copy of f in future + + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); + # and at the finish one has to $a+=$a1 +} + +$code.=<<___; +.type ${func}_avx2,\@function,6 +.align 64 +${func}_avx2: +.Lavx2_shortcut: + mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp + and \$-256*$SZ,%rsp # align stack frame + add \$`2*$SZ*($rounds-8)`,%rsp + + shl \$6,$len + sub $inp,$out # re-bias + sub $inp,$in0 + add $inp,$len # end of input + + #mov $inp,$_inp # saved later + #mov $out,$_out # kept in $offload + mov $len,$_end + #mov $key,$_key # remains resident in $inp register + mov $ivp,$_ivp + mov $ctx,$_ctx + mov $in0,$_in0 + mov %r11,$_rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,`$framesz+16*0`(%rsp) + movaps %xmm7,`$framesz+16*1`(%rsp) + movaps %xmm8,`$framesz+16*2`(%rsp) + movaps %xmm9,`$framesz+16*3`(%rsp) + movaps %xmm10,`$framesz+16*4`(%rsp) + movaps %xmm11,`$framesz+16*5`(%rsp) + movaps %xmm12,`$framesz+16*6`(%rsp) + movaps %xmm13,`$framesz+16*7`(%rsp) + movaps %xmm14,`$framesz+16*8`(%rsp) + movaps %xmm15,`$framesz+16*9`(%rsp) +___ +$code.=<<___; +.Lprologue_avx2: + vzeroall + + mov $inp,%r13 # borrow $a0 + vpinsrq \$1,$out,$offload,$offload + lea 0x80($key),$inp # size optimization, reassign + lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 + mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 + mov $ctx,%r15 # borrow $a2 + mov $in0,%rsi # borrow $a3 + vmovdqu ($ivp),$iv # load IV + lea -9(%r14),%r14 + + vmovdqa 0x00(%r12,%r14,8),$mask14 + vmovdqa 0x10(%r12,%r14,8),$mask12 + vmovdqa 0x20(%r12,%r14,8),$mask10 + + sub \$-16*$SZ,%r13 # inp++, size optimization + mov $SZ*0(%r15),$A + lea (%rsi,%r13),%r12 # borrow $a0 + mov $SZ*1(%r15),$B + cmp $len,%r13 # $_end + mov $SZ*2(%r15),$C + cmove %rsp,%r12 # next block or random data + mov $SZ*3(%r15),$D + mov $SZ*4(%r15),$E + mov $SZ*5(%r15),$F + mov $SZ*6(%r15),$G + mov $SZ*7(%r15),$H + vmovdqu 0x00-0x80($inp),$roundkey +___ + if ($SZ==4) { # SHA256 + my @X = map("%ymm$_",(0..3)); + my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); + +$code.=<<___; + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 + vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 + vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 + vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 + + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t3,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t3,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[2],@X[2] + lea -16*$SZ(%r13),%r13 + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + lea -$PUSH8(%rsp),%rsp + mov $B,$a3 + vmovdqa $t2,0x00(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x20(%rsp) + mov $F,$a4 + sub \$-16*2*$SZ,$Tbl # size optimization + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + vmovdqu (%r13),$inout + vpinsrq \$0,%r13,$offload,$offload +___ + +sub AVX2_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 96 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + $aesni_cbc_idx=0; + for ($i=0,$j=0; $j<4; $j++) { + &AVX2_256_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &vmovq ("%r13",$offload); # borrow $a0 + &vpextrq ("%r15",$offload,1); # borrow $a2 + &vpand ($temp,$temp,$mask14); + &vpor ($iv,$iv,$temp); + &vmovdqu ("(%r15,%r13)",$iv); # write output + &lea ("%r13","16(%r13)"); # inp++ + + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + &vmovdqu ($inout,"(%r13)"); + &vpinsrq ($offload,$offload,"%r13",0); + + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } + } +$code.=<<___; + vpextrq \$1,$offload,%r12 # $_out, borrow $a4 + vmovq $offload,%r13 # $_inp, borrow $a0 + mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 + add $a1,$A + lea `2*$SZ*($rounds-8)`(%rsp),$Tbl + + vpand $mask14,$temp,$temp + vpor $temp,$iv,$iv + vmovdqu $iv,(%r12,%r13) # write output + lea 16(%r13),%r13 + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + add $SZ*7(%r15),$H + + mov $A,$SZ*0(%r15) + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + + cmp `$PUSH8+2*8`($Tbl),%r13 # $_end + je .Ldone_avx2 + + xor $a1,$a1 + mov $B,$a3 + mov $F,$a4 + xor $C,$a3 # magic + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + vmovdqu (%r13),$inout + vpinsrq \$0,%r13,$offload,$offload +___ + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + my $base="+16($Tbl)"; + foreach(bodyx_00_15()) { eval; } + &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); + } +$code.=<<___; + vmovq $offload,%r13 # borrow $a0 + vpextrq \$1,$offload,%r15 # borrow $a2 + vpand $mask14,$temp,$temp + vpor $temp,$iv,$iv + lea -$PUSH8($Tbl),$Tbl + vmovdqu $iv,(%r15,%r13) # write output + lea 16(%r13),%r13 # inp++ + cmp %rsp,$Tbl + jae .Lower_avx2 + + mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 + lea 16*$SZ(%r13),%r13 + mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 + add $a1,$A + lea `2*$SZ*($rounds-8)`(%rsp),%rsp + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + lea (%rsi,%r13),%r12 + add $SZ*7(%r15),$H + + cmp $_end,%r13 + + mov $A,$SZ*0(%r15) + cmove %rsp,%r12 # next block or stale data + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + + jbe .Loop_avx2 + lea (%rsp),$Tbl + +.Ldone_avx2: + lea ($Tbl),%rsp + mov $_ivp,$ivp + mov $_rsp,%rsi + vmovdqu $iv,($ivp) # output IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps `$framesz+16*0`(%rsp),%xmm6 + movaps `$framesz+16*1`(%rsp),%xmm7 + movaps `$framesz+16*2`(%rsp),%xmm8 + movaps `$framesz+16*3`(%rsp),%xmm9 + movaps `$framesz+16*4`(%rsp),%xmm10 + movaps `$framesz+16*5`(%rsp),%xmm11 + movaps `$framesz+16*6`(%rsp),%xmm12 + movaps `$framesz+16*7`(%rsp),%xmm13 + movaps `$framesz+16*8`(%rsp),%xmm14 + movaps `$framesz+16*9`(%rsp),%xmm15 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx2: + ret +.size ${func}_avx2,.-${func}_avx2 +___ +}} +}} +{{ +my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my ($rounds,$Tbl)=("%r11d","%rbx"); + +my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); +my @rndkey=("%xmm4","%xmm5"); +my $r=0; +my $sn=0; + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); +my @MSG=map("%xmm$_",(10..13)); + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + movups `16*$n`($in0),$in # load input + xorps $rndkey0,$in +___ + $code.=<<___ if ($n); + movups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + xorps $in,$iv + movups `32+16*$k-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Laesenclast$sn + movups `32+16*($k+0)-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+1)-112`($key),$rndkey[0] + aesenc $rndkey[1],$iv + je .Laesenclast$sn + movups `32+16*($k+2)-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+3)-112`($key),$rndkey[0] + aesenc $rndkey[1],$iv +.Laesenclast$sn: + aesenclast $rndkey[0],$iv + movups 16-112($key),$rndkey[1] # forward reference + nop +___ + } else { + $code.=<<___; + movups `32+16*$k-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +if ($shaext) { +my $Tbl="%rax"; + +$code.=<<___; +.type ${func}_shaext,\@function,6 +.align 32 +${func}_shaext: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument +___ +$code.=<<___ if ($win64); + lea `-8-10*16`(%rsp),%rsp + movaps %xmm6,-8-10*16(%rax) + movaps %xmm7,-8-9*16(%rax) + movaps %xmm8,-8-8*16(%rax) + movaps %xmm9,-8-7*16(%rax) + movaps %xmm10,-8-6*16(%rax) + movaps %xmm11,-8-5*16(%rax) + movaps %xmm12,-8-4*16(%rax) + movaps %xmm13,-8-3*16(%rax) + movaps %xmm14,-8-2*16(%rax) + movaps %xmm15,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + mov 240($key),$rounds + sub $in0,$out + movups ($key),$rndkey0 # $key[0] + movups 16($key),$rndkey[0] # forward reference + lea 112($key),$key # size optimization + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + movdqa $ABEF,$ABEF_SAVE # offload +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + lea 0x40($inp),$inp +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256msg1 @MSG[1],@MSG[0] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + paddd $TMP,@MSG[0] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256msg1 @MSG[2],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + paddd $TMP,@MSG[1] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { + &$aesenc() if (($r%10)==0); +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256msg1 @MSG[3],@MSG[2] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 16-19... + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + paddd $TMP,@MSG[2] +___ + &$aesenc(); + &$aesenc() if ($r==19); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256msg1 @MSG[3],@MSG[2] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + paddd $TMP,@MSG[2] +___ + &$aesenc(); + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 14*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 15*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi +___ + &$aesenc(); + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + #pxor $CDGH,$rndkey0 # black magic +___ + while ($r<40) { &$aesenc(); } # remaining aesenc's +$code.=<<___; + #xorps $CDGH,$rndkey0 # black magic + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + + dec $len + movups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movups $iv,($ivp) # write IV + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps 0*16(%rsp),%xmm6 + movaps 1*16(%rsp),%xmm7 + movaps 2*16(%rsp),%xmm8 + movaps 3*16(%rsp),%xmm9 + movaps 4*16(%rsp),%xmm10 + movaps 5*16(%rsp),%xmm11 + movaps 6*16(%rsp),%xmm12 + movaps 7*16(%rsp),%xmm13 + movaps 8*16(%rsp),%xmm14 + movaps 9*16(%rsp),%xmm15 + lea 8+10*16(%rsp),%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.size ${func}_shaext,.-${func}_shaext +___ +} +}}}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64 && $avx) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HanderlData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue +___ +$code.=<<___ if ($shaext); + lea aesni_cbc_sha256_enc_shaext(%rip),%r10 + cmp %r10,%rbx + jb .Lnot_in_shaext + + lea (%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea 168(%rax),%rax # adjust stack pointer + jmp .Lin_prologue +.Lnot_in_shaext: +___ +$code.=<<___ if ($avx>1); + lea .Lavx2_shortcut(%rip),%r10 + cmp %r10,%rbx # context->Rip<avx2_shortcut + jb .Lnot_in_avx2 + + and \$-256*$SZ,%rax + add \$`2*$SZ*($rounds-8)`,%rax +.Lnot_in_avx2: +___ +$code.=<<___; + mov %rax,%rsi # put aside Rsp + mov 16*$SZ+7*8(%rax),%rax # pull $_rsp + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata + .rva .LSEH_begin_${func}_xop + .rva .LSEH_end_${func}_xop + .rva .LSEH_info_${func}_xop + + .rva .LSEH_begin_${func}_avx + .rva .LSEH_end_${func}_avx + .rva .LSEH_info_${func}_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_${func}_avx2 + .rva .LSEH_end_${func}_avx2 + .rva .LSEH_info_${func}_avx2 +___ +$code.=<<___ if ($shaext); + .rva .LSEH_begin_${func}_shaext + .rva .LSEH_end_${func}_shaext + .rva .LSEH_info_${func}_shaext +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_${func}_xop: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] + +.LSEH_info_${func}_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_${func}_avx2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] +___ +$code.=<<___ if ($shaext); +.LSEH_info_${func}_shaext: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] +___ +} + +#################################################################### +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + unshift @opcode,$rex|0x40 if($rex); +} + +{ + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + sub sha256op38 { + my $instr = shift; + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } + } +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; +print $code; +close STDOUT; diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index 3dc345b585f6..f67df8cf13da 100755 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -43,6 +43,17 @@ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS ECB +# Westmere 3.77/1.37 1.37 1.52 1.27 +# * Bridge 5.07/0.98 0.99 1.09 0.91 +# Haswell 4.44/0.80 0.97 1.03 0.72 +# Silvermont 5.77/3.56 3.67 4.03 3.46 +# Bulldozer 5.80/0.98 1.05 1.24 0.93 + $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) @@ -54,8 +65,11 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -if ($PREFIX eq "aesni") { $movekey=*movups; } -else { $movekey=*movups; } +&external_label("OPENSSL_ia32cap_P"); +&static_label("key_const"); + +if ($PREFIX eq "aesni") { $movekey=\&movups; } +else { $movekey=\&movups; } $len="eax"; $rounds="ecx"; @@ -170,7 +184,10 @@ sub aesni_generate1 # fully unrolled loop { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } + &pxor ($rndkey0,$rndkey0); # clear register bank + &pxor ($rndkey1,$rndkey1); &movups (&QWP(0,"eax"),$inout0); + &pxor ($inout0,$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); @@ -186,7 +203,10 @@ sub aesni_generate1 # fully unrolled loop { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } + &pxor ($rndkey0,$rndkey0); # clear register bank + &pxor ($rndkey1,$rndkey1); &movups (&QWP(0,"eax"),$inout0); + &pxor ($inout0,$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); @@ -196,37 +216,71 @@ sub aesni_generate1 # fully unrolled loop # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x, but it's unfeasible to accommodate it -# in XMM registers addreassable in 32-bit mode and therefore 6x is -# used instead... +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge, but it's unfeasible to accommodate such implementation +# in XMM registers addreassable in 32-bit mode and therefore maximum +# of 6x is used instead... + +sub aesni_generate2 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt2"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}2_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}2_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt2"); +} sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); + &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; @@ -248,27 +302,29 @@ sub aesni_generate4 &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); - &shr ($rounds,1); - &lea ($key,&DWP(32,$key)); + &shl ($rounds,4); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &data_byte (0x0f,0x1f,0x40,0x00); + &add ($rounds,16); &set_label("${p}4_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}4_loop")); eval"&aes${p} ($inout0,$rndkey1)"; @@ -289,43 +345,41 @@ sub aesni_generate6 &function_begin_B("_aesni_${p}rypt6"); &static_label("_aesni_${p}rypt6_enter"); &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); + &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); # pxor does better here - eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout2,$rndkey0); - eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout3,$rndkey0); - &dec ($rounds); - eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout4,$rndkey0); - eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout5,$rndkey0); - eval"&aes${p} ($inout4,$rndkey1)"; - &$movekey ($rndkey0,&QWP(0,$key)); - eval"&aes${p} ($inout5,$rndkey1)"; - &jmp (&label("_aesni_${p}rypt6_enter")); + &$movekey ($rndkey0,&QWP(0,$key,$rounds)); + &add ($rounds,16); + &jmp (&label("_aesni_${p}rypt6_inner")); &set_label("${p}6_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; + &set_label("_aesni_${p}rypt6_inner"); eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; - &set_label("_aesni_${p}rypt6_enter",16); - &$movekey ($rndkey1,&QWP(16,$key)); + &set_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; eval"&aes${p} ($inout4,$rndkey0)"; eval"&aes${p} ($inout5,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}6_loop")); eval"&aes${p} ($inout0,$rndkey1)"; @@ -343,6 +397,8 @@ sub aesni_generate6 &ret(); &function_end_B("_aesni_${p}rypt6"); } +&aesni_generate2("enc") if ($PREFIX eq "aesni"); +&aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); @@ -446,8 +502,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); @@ -547,8 +602,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); @@ -568,6 +622,14 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x30,$out),$inout3); &set_label("ecb_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); &function_end("aesni_ecb_encrypt"); ###################################################################### @@ -610,11 +672,13 @@ if ($PREFIX eq "aesni") { &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); - &shr ($rounds,1); + &shl ($rounds,4); + &mov ($rounds_,16); &lea ($key_,&DWP(0,$key)); &movdqa ($inout3,&QWP(0,"esp")); &movdqa ($inout0,$ivec); - &mov ($rounds_,$rounds); + &lea ($key,&DWP(32,$key,$rounds)); + &sub ($rounds_,$rounds); |