aboutsummaryrefslogtreecommitdiff
path: root/module/icp/asm-x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'module/icp/asm-x86_64')
-rw-r--r--module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman23
-rw-r--r--module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip1
-rw-r--r--module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl127
-rw-r--r--module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip1
-rw-r--r--module/icp/asm-x86_64/aes/aes_aesni.S748
-rw-r--r--module/icp/asm-x86_64/aes/aes_amd64.S906
-rw-r--r--module/icp/asm-x86_64/aes/aeskey.c580
-rw-r--r--module/icp/asm-x86_64/aes/aesopt.h770
-rw-r--r--module/icp/asm-x86_64/aes/aestab.h165
-rw-r--r--module/icp/asm-x86_64/aes/aestab2.h594
-rw-r--r--module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams36
-rw-r--r--module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip1
-rw-r--r--module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl177
-rw-r--r--module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip1
-rw-r--r--module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S1245
-rw-r--r--module/icp/asm-x86_64/modes/gcm_pclmulqdq.S254
-rw-r--r--module/icp/asm-x86_64/modes/ghash-x86_64.S714
-rw-r--r--module/icp/asm-x86_64/sha1/sha1-x86_64.S1353
-rw-r--r--module/icp/asm-x86_64/sha2/sha256_impl.S2063
-rw-r--r--module/icp/asm-x86_64/sha2/sha512_impl.S2088
20 files changed, 11847 insertions, 0 deletions
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
new file mode 100644
index 000000000000..48fea7bb333e
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
@@ -0,0 +1,23 @@
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+ 1. source code distributions include the above copyright notice, this
+ list of conditions and the following disclaimer;
+
+ 2. binary distributions include the above copyright notice, this list
+ of conditions and the following disclaimer in their documentation;
+
+ 3. the name of the copyright holder is not used to endorse products
+ built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..92c9e196a318
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,127 @@
+
+ LICENSE ISSUES
+ ==============
+
+ The OpenSSL toolkit stays under a dual license, i.e. both the conditions of
+ the OpenSSL License and the original SSLeay license apply to the toolkit.
+ See below for the actual license texts. Actually both licenses are BSD-style
+ Open Source licenses. In case of any license issues related to OpenSSL
+ please contact openssl-core@openssl.org.
+
+ OpenSSL License
+ ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the routines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/aes/aes_aesni.S b/module/icp/asm-x86_64/aes/aes_aesni.S
new file mode 100644
index 000000000000..4a80c62097ae
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aes_aesni.S
@@ -0,0 +1,748 @@
+/*
+ * ====================================================================
+ * Written by Intel Corporation for the OpenSSL project to add support
+ * for Intel AES-NI instructions. Rights for redistribution and usage
+ * in source and binary forms are granted according to the OpenSSL
+ * license.
+ *
+ * Author: Huang Ying <ying.huang at intel dot com>
+ * Vinodh Gopal <vinodh.gopal at intel dot com>
+ * Kahraman Akdemir
+ *
+ * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
+ * instructions that are going to be introduced in the next generation
+ * of Intel processor, as of 2009. These instructions enable fast and
+ * secure data encryption and decryption, using the Advanced Encryption
+ * Standard (AES), defined by FIPS Publication number 197. The
+ * architecture introduces six instructions that offer full hardware
+ * support for AES. Four of them support high performance data
+ * encryption and decryption, and the other two instructions support
+ * the AES key expansion procedure.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
+ * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
+ * Huang Ying of Intel to the openssl-dev mailing list under the subject
+ * of "Add support to Intel AES-NI instruction set for x86_64 platform".
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Renamed functions, reordered parameters, and changed return value
+ * to match OpenSolaris:
+ *
+ * OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return values for above are non-zero on error, 0 on success.
+ *
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * typedef struct aes_key_st {
+ * unsigned int rd_key[4 *(AES_MAXNR + 1)];
+ * int rounds;
+ * unsigned int pad[3];
+ * } AES_KEY;
+ * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
+ * (ks32) instead of 64-bit (ks64)).
+ * Number of rounds (aka round count) is at offset 240 of AES_KEY.
+ *
+ * OpenSolaris OS interface (#ifdefs removed for readability):
+ * int rijndael_key_setup_dec_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * int rijndael_key_setup_enc_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * Return values for above are 0 on error, number of rounds on success.
+ *
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
+ *
+ * typedef union {
+ * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+ * } aes_ks_t;
+ * typedef struct aes_key {
+ * aes_ks_t encr_ks, decr_ks;
+ * long double align128;
+ * int flags, nr, type;
+ * } aes_key_t;
+ *
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ *
+ * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
+ *
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) { /* lint-only stub: compiled only when lint/__lint is defined */
+} /* intentionally empty; the working routine is the assembly entry of the same name */
+/* ARGSUSED */
+void
+aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) { /* lint-only stub: compiled only when lint/__lint is defined */
+} /* intentionally empty; the working routine is the assembly entry of the same name */
+/* ARGSUSED */
+int
+rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) { /* lint-only stub: compiled only when lint/__lint is defined */
+ return (0); /* placeholder value; no key schedule is produced by this stub */
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) { /* lint-only stub: compiled only when lint/__lint is defined */
+ return (0); /* placeholder value; no key schedule is produced by this stub */
+}
+
+
+#elif defined(HAVE_AES) /* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
+ * _key_expansion_256a(), _key_expansion_256b()
+ *
+ * Helper functions called by rijndael_key_setup_enc_intel().
+ * Also used indirectly by rijndael_key_setup_dec_intel().
+ *
+ * Input:
+ * %xmm0 User-provided cipher key
+ * %xmm1 Round constant
+ * Output:
+ * (%rcx) AES key
+ */
+
+ENTRY_NP2(_key_expansion_128, _key_expansion_256a) /* one body, two entry names: emits one 16-byte round key from %xmm0 */
+_key_expansion_128_local: // label used by the AES-128 call sites in rijndael_key_setup_enc_intel
+_key_expansion_256a_local: // label used by the AES-256 call sites (updates the %xmm0 half)
+ pshufd $0b11111111, %xmm1, %xmm1 // broadcast dword 3 of the aeskeygenassist result to all four lanes
+ shufps $0b00010000, %xmm0, %xmm4 // xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}; relies on xmm4[0] == 0 (caller zeroes xmm4)
+ pxor %xmm4, %xmm0 // xmm0 = {w0, w1, w2^w1, w3^w0}
+ shufps $0b10001100, %xmm0, %xmm4 // xmm4 = {0, w0, w0, w1^w2}; lane 0 stays zero for the next call
+ pxor %xmm4, %xmm0 // xmm0 now holds the running (prefix) XOR of the previous four key words
+ pxor %xmm1, %xmm0 // fold in the broadcast keygenassist word -> new round key
+ movups %xmm0, (%rcx) // store the round key at the current schedule slot
+ add $0x10, %rcx // advance the slot pointer by one 16-byte round key
+ ret
+ nop // follows ret with no label, so it is never reached by fall-through
+SET_SIZE(_key_expansion_128)
+SET_SIZE(_key_expansion_256a)
+
+
+ENTRY_NP(_key_expansion_192a) /* AES-192 step that stores 32 bytes: state in %xmm0 (4 words) and low half of %xmm2 (2 words) */
+_key_expansion_192a_local: // label used by the AES-192 call sites in rijndael_key_setup_enc_intel
+ pshufd $0b01010101, %xmm1, %xmm1 // broadcast dword 1 of the aeskeygenassist result to all four lanes
+ shufps $0b00010000, %xmm0, %xmm4 // with the next three instructions: prefix XOR of xmm0; relies on xmm4[0] == 0
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0 // xmm0 = next four key words
+
+ movups %xmm2, %xmm5 // xmm5 = copy of the two extra key words, used for the shifted term
+ movups %xmm2, %xmm6 // xmm6 = copy of the OLD extra words, still to be written out below
+ pslldq $4, %xmm5 // shift the copy left by one dword (4 bytes)
+ pshufd $0b11111111, %xmm0, %xmm3 // broadcast the top word of the new xmm0
+ pxor %xmm3, %xmm2 // xmm2 ^= top word of new xmm0
+ pxor %xmm5, %xmm2 // xmm2 ^= (old xmm2 << 32): the two extra words are now updated
+
+ movups %xmm0, %xmm1 // xmm1 is free again; reuse it as a copy of the new xmm0
+ shufps $0b01000100, %xmm0, %xmm6 // xmm6 = {old xmm2[0], old xmm2[1], xmm0[0], xmm0[1]}
+ movups %xmm6, (%rcx) // first 16 bytes written by this step
+ shufps $0b01001110, %xmm2, %xmm1 // xmm1 = {xmm0[2], xmm0[3], new xmm2[0], new xmm2[1]}
+ movups %xmm1, 0x10(%rcx) // second 16 bytes written by this step
+ add $0x20, %rcx // this step produced 32 bytes of key schedule
+ ret
+SET_SIZE(_key_expansion_192a)
+
+
+ENTRY_NP(_key_expansion_192b) /* AES-192 step that stores 16 bytes (%xmm0 only); updated %xmm2 stays in the register */
+_key_expansion_192b_local: // label used by the AES-192 call sites in rijndael_key_setup_enc_intel
+ pshufd $0b01010101, %xmm1, %xmm1 // broadcast dword 1 of the aeskeygenassist result to all four lanes
+ shufps $0b00010000, %xmm0, %xmm4 // with the next three instructions: prefix XOR of xmm0; relies on xmm4[0] == 0
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0 // xmm0 = next four key words
+
+ movups %xmm2, %xmm5 // xmm5 = copy of the two extra key words
+ pslldq $4, %xmm5 // shift the copy left by one dword (4 bytes)
+ pshufd $0b11111111, %xmm0, %xmm3 // broadcast the top word of the new xmm0
+ pxor %xmm3, %xmm2 // xmm2 ^= top word of new xmm0
+ pxor %xmm5, %xmm2 // xmm2 ^= (old xmm2 << 32); not stored here, consumed by the following 192a step
+
+ movups %xmm0, (%rcx) // only xmm0 is written by this step
+ add $0x10, %rcx // this step produced 16 bytes of key schedule
+ ret
+SET_SIZE(_key_expansion_192b)
+
+
+ENTRY_NP(_key_expansion_256b) /* AES-256 step for the %xmm2 half of the key state; emits one 16-byte round key */
+_key_expansion_256b_local: // label used by the AES-256 call sites in rijndael_key_setup_enc_intel
+ pshufd $0b10101010, %xmm1, %xmm1 // broadcast dword 2 of the aeskeygenassist result to all four lanes
+ shufps $0b00010000, %xmm2, %xmm4 // with the next three instructions: prefix XOR of xmm2; relies on xmm4[0] == 0
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2 // fold in the broadcast keygenassist word -> new round key
+ movups %xmm2, (%rcx) // store the round key at the current schedule slot
+ add $0x10, %rcx // advance the slot pointer by one 16-byte round key
+ ret
+SET_SIZE(_key_expansion_256b)
+
+
+/*
+ * rijndael_key_setup_enc_intel()
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
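+ * on the stack.
+ * NOTE(review): no CR0.TS handling or %xmm save/restore is visible in the
+ * routine below; the two sentences above appear to describe an earlier
+ * version of this code -- confirm.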
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+#ifdef OPENSSL_INTERFACE /* build with the original Intel/OpenSSL names and argument order */
+#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
+#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key
+
+#define USERCIPHERKEY rdi /* P1, 64 bits: pointer to the caller-supplied key */
+#define KEYSIZE32 esi /* P2, 32 bits: key size in bits */
+#define KEYSIZE64 rsi /* P2, 64 bits: same register, full width */
+#define AESKEY rdx /* P3, 64 bits: pointer to the key schedule to fill */
+
+#else /* OpenSolaris Interface */
+#define AESKEY rdi /* P1, 64 bits: pointer to the key schedule to fill */
+#define USERCIPHERKEY rsi /* P2, 64 bits: pointer to the caller-supplied key */
+#define KEYSIZE32 edx /* P3, 32 bits: key size in bits */
+#define KEYSIZE64 rdx /* P3, 64 bits: same register, full width */
+#endif /* OPENSSL_INTERFACE */
+
+#define ROUNDS32 KEYSIZE32 /* temp: the key-size register is reused for the round count */
+#define ROUNDS64 KEYSIZE64 /* temp: 64-bit view of the same register */
+#define ENDAESKEY USERCIPHERKEY /* temp: the user-key register is reused as an end pointer by the decrypt key setup */
+
+ENTRY_NP(rijndael_key_setup_enc_intel) /* expand a 128/192/256-bit user key into the encryption key schedule */
+rijndael_key_setup_enc_intel_local: // label called by rijndael_key_setup_dec_intel
+ FRAME_BEGIN /* NOTE(review): macro defined outside this chunk; presumably sets up a frame pointer -- confirm */
+ // NULL pointer sanity check
+ test %USERCIPHERKEY, %USERCIPHERKEY
+ jz .Lenc_key_invalid_param
+ test %AESKEY, %AESKEY
+ jz .Lenc_key_invalid_param
+
+ movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes)
+ movups %xmm0, (%AESKEY) // round key 0 = first 16 key bytes (written before the key size is validated)
+ lea 0x10(%AESKEY), %rcx // rcx = next round-key slot; the helpers store through it and advance it
+ pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x
+
+ cmp $256, %KEYSIZE32
+ jnz .Lenc_key192 // not 256 bits: try 192 next
+
+ // AES 256: 14 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $14, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14
+#endif /* OPENSSL_INTERFACE */
+
+ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes)
+ movups %xmm2, (%rcx) // round key 1 = second 16 key bytes
+ add $0x10, %rcx
+
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local // updates xmm0 and stores it
+ aeskeygenassist $0x1, %xmm0, %xmm1 // same immediate; 256b uses dword 2 of the result
+ call _key_expansion_256b_local // updates xmm2 and stores it
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x2, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x4, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x8, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x10, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x20, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local // 13th generated key; with the 2 copied keys that is 15 = 14 + 1
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* Open Solaris Interface */
+ mov $14, %rax // return # rounds = 14
+#endif
+ FRAME_END
+ ret
+
+.align 4
+.Lenc_key192:
+ cmp $192, %KEYSIZE32
+ jnz .Lenc_key128 // not 192 bits: try 128 next
+
+ // AES 192: 12 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $12, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12
+#endif /* OPENSSL_INTERFACE */
+
+ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key (remaining 8 bytes; movq zeroes the upper half of xmm2)
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local // stores 32 bytes
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local // stores 16 bytes
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local // 4 * (32 + 16) = 192 bytes generated = round keys 1..12
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $12, %rax // return # rounds = 12
+#endif
+ FRAME_END
+ ret
+
+.align 4
+.Lenc_key128:
+ cmp $128, %KEYSIZE32
+ jnz .Lenc_key_invalid_key_bits // none of 256/192/128: reject
+
+ // AES 128: 10 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $10, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10
+#endif /* OPENSSL_INTERFACE */
+
+ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local // each call stores one 16-byte round key
+ aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local // 10th generated key = round key 10
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $10, %rax // return # rounds = 10
+#endif
+ FRAME_END
+ ret
+
+.Lenc_key_invalid_param:
+#ifdef OPENSSL_INTERFACE
+ mov $-1, %rax // user key or AES key pointer is NULL
+ FRAME_END
+ ret
+#else
+ /* FALLTHROUGH */
+#endif /* OPENSSL_INTERFACE */
+
+.Lenc_key_invalid_key_bits:
+#ifdef OPENSSL_INTERFACE
+ mov $-2, %rax // keysize is invalid
+#else /* Open Solaris Interface */
+ xor %rax, %rax // a key pointer is NULL or invalid keysize
+#endif /* OPENSSL_INTERFACE */
+ FRAME_END
+ ret
+ SET_SIZE(rijndael_key_setup_enc_intel)
+
+
+/*
+ * rijndael_key_setup_dec_intel()
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
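+ * on the stack.
+ * NOTE(review): no CR0.TS handling or %xmm save/restore is visible in the
+ * routine below; the two sentences above appear to describe an earlier
+ * version of this code -- confirm.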
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ * P1->P2, P2->P3, P3->P1
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+ENTRY_NP(rijndael_key_setup_dec_intel) /* build the decryption key schedule: encryption schedule, reversed, inner keys passed through aesimc */
+FRAME_BEGIN /* NOTE(review): macro defined outside this chunk; presumably sets up a frame pointer -- confirm */
+ // Generate round keys used for encryption
+ call rijndael_key_setup_enc_intel_local
+ test %rax, %rax
+#ifdef OPENSSL_INTERFACE
+ jnz .Ldec_key_exit // Failed if returned non-0
+#else /* OpenSolaris Interface */
+ jz .Ldec_key_exit // Failed if returned 0
+#endif /* OPENSSL_INTERFACE */
+
+ /*
+ * Convert round keys used for encryption
+ * to a form usable for decryption
+ */
+#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */
+ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14)
+ // (already set for OpenSSL)
+#endif
+
+ lea 0x10(%AESKEY), %rcx // rcx = address of round key 1 (start of the aesimc pass)
+ shl $4, %ROUNDS32 // rounds * 16 = byte offset of the last round key
+ add %AESKEY, %ROUNDS64 // ROUNDS64 = address of the last round key
+ mov %ROUNDS64, %ENDAESKEY // keep a copy: exclusive end of the aesimc pass
+
+.align 4
+.Ldec_key_reorder_loop: // swap round keys pairwise from both ends, reversing the schedule in place
+ movups (%AESKEY), %xmm0
+ movups (%ROUNDS64), %xmm1
+ movups %xmm0, (%ROUNDS64)
+ movups %xmm1, (%AESKEY)
+ lea 0x10(%AESKEY), %AESKEY // low pointer moves up one key
+ lea -0x10(%ROUNDS64), %ROUNDS64 // high pointer moves down one key
+ cmp %AESKEY, %ROUNDS64
+ ja .Ldec_key_reorder_loop // unsigned: continue while high > low; the middle key stays in place
+
+.align 4
+.Ldec_key_inv_loop: // transforms round keys 1 .. rounds-1; the first and last keys are left unchanged
+ movups (%rcx), %xmm0
+ // Convert an encryption round key to a form usable for decryption
+ // with the "AES Inverse Mix Columns" instruction
+ aesimc %xmm0, %xmm1
+ movups %xmm1, (%rcx)
+ lea 0x10(%rcx), %rcx
+ cmp %ENDAESKEY, %rcx
+ jnz .Ldec_key_inv_loop // stop when rcx reaches the last round key
+
+.Ldec_key_exit:
+ // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
+ // OpenSSL: rax = 0 for OK, or non-zero for error
+ FRAME_END
+ ret
+ SET_SIZE(rijndael_key_setup_dec_intel)
+
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
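+ * on the stack.
+ * NOTE(review): no CR0.TS handling or %xmm save/restore is visible in the
+ * routine below; the two sentences above appear to describe an earlier
+ * version of this code -- confirm.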
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key)
+ */
+
+#ifdef OPENSSL_INTERFACE /* build with the original Intel/OpenSSL names and argument order */
+#define aes_encrypt_intel intel_AES_encrypt
+#define aes_decrypt_intel intel_AES_decrypt
+
+#define INP rdi /* P1, 64 bits: pointer to the 16-byte input block */
+#define OUTP rsi /* P2, 64 bits: pointer to the 16-byte output block */
+#define KEYP rdx /* P3, 64 bits: pointer to the key schedule */
+
+/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
+#define NROUNDS32 ecx /* temporary, 32 bits */
+#define NROUNDS cl /* temporary, 8 bits: low byte of NROUNDS32, used by the cmp */
+
+#else /* OpenSolaris Interface */
+#define KEYP rdi /* P1, 64 bits: pointer to the key schedule */
+#define NROUNDS esi /* P2, 32 bits: round count passed by the caller */
+#define INP rdx /* P3, 64 bits: pointer to the 16-byte input block */
+#define OUTP rcx /* P4, 64 bits: pointer to the 16-byte output block */
+#endif /* OPENSSL_INTERFACE */
+
+#define STATE xmm0 /* temporary, 128 bits: the block being transformed */
+#define KEY xmm1 /* temporary, 128 bits: the current round key */
+
+
+ENTRY_NP(aes_encrypt_intel) /* encrypt one 16-byte block; clobbers STATE (xmm0), KEY (xmm1), KEYP and flags */
+
+ movups (%INP), %STATE // input
+ movups (%KEYP), %KEY // round key 0
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP // bias KEYP so the shared tail below can use fixed offsets
+ cmp $12, %NROUNDS
+ jb .Lenc128 // fewer than 12 rounds (unsigned): 10 aes steps remain
+ lea 0x20(%KEYP), %KEYP // lea leaves the flags alone, so je still tests the cmp above
+ je .Lenc192 // exactly 12 rounds
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP // any count above 12 falls through to here
+ movups -0x60(%KEYP), %KEY // round keys 1 and 2 (schedule offsets 0x10, 0x20)
+ aesenc %KEY, %STATE
+ movups -0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc192:
+ // AES 192 and 256
+ movups -0x40(%KEYP), %KEY // two more rounds ahead of the shared tail
+ aesenc %KEY, %STATE
+ movups -0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc128:
+ // AES 128, 192, and 256
+ movups -0x20(%KEYP), %KEY // shared tail: the last 10 round keys, at KEYP-0x20 .. KEYP+0x70
+ aesenc %KEY, %STATE
+ movups -0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups (%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x70(%KEYP), %KEY
+ aesenclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ ret
+ SET_SIZE(aes_encrypt_intel)
+
+
+/*
+ * aes_decrypt_intel()
+ * Decrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
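+ * on the stack.
+ * NOTE(review): no CR0.TS handling or %xmm save/restore is visible in the
+ * routine below; the two sentences above appear to describe an earlier
+ * version of this code -- confirm.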
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t ct[4], uint32_t pt[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ */
+ENTRY_NP(aes_decrypt_intel) /* decrypt one 16-byte block; clobbers STATE (xmm0), KEY (xmm1), KEYP and flags */
+
+ movups (%INP), %STATE // input
+ movups (%KEYP), %KEY // first key of the decryption schedule
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP // bias KEYP so the shared tail below can use fixed offsets
+ cmp $12, %NROUNDS
+ jb .Ldec128 // fewer than 12 rounds (unsigned): 10 aes steps remain
+ lea 0x20(%KEYP), %KEYP // lea leaves the flags alone, so je still tests the cmp above
+ je .Ldec192 // exactly 12 rounds
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP // any count above 12 falls through to here
+ movups -0x60(%KEYP), %KEY // schedule offsets 0x10 and 0x20
+ aesdec %KEY, %STATE
+ movups -0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec192:
+ // AES 192 and 256
+ movups -0x40(%KEYP), %KEY // two more rounds ahead of the shared tail
+ aesdec %KEY, %STATE
+ movups -0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec128:
+ // AES 128, 192, and 256
+ movups -0x20(%KEYP), %KEY // shared tail: the last 10 round keys, at KEYP-0x20 .. KEYP+0x70
+ aesdec %KEY, %STATE
+ movups -0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups (%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x70(%KEYP), %KEY
+ aesdeclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ ret
+ SET_SIZE(aes_decrypt_intel)
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__ /* ELF targets only */
+.section .note.GNU-stack,"",%progbits /* empty marker section: GNU toolchain convention for "this object does not need an executable stack" */
+#endif
diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S
new file mode 100644
index 000000000000..9db3a3179230
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aes_amd64.S
@@ -0,0 +1,906 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue 20/12/2007
+ *
+ * I am grateful to Dag Arne Osvik for many discussions of the techniques that
+ * can be used to optimise AES assembler code on AMD64/EM64T architectures.
+ * Some of the techniques used in this implementation are the result of
+ * suggestions made by him for which I am most grateful.
+ *
+ * An AES implementation for AMD64 processors using the YASM assembler. This
+ * implementation provides only encryption, decryption and hence requires key
+ * scheduling support in C. It uses 8k bytes of tables but its encryption and
+ * decryption performance is very close to that obtained using large tables.
+ * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
+ * which are as follows:
+ * ms windows gnu/linux/opensolaris os
+ *
+ * in_blk rcx rdi
+ * out_blk rdx rsi
+ * context (cx) r8 rdx
+ *
+ * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
+ * registers rdi - on both
+ *
+ * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
+ * registers - rdi on both
+ *
+ * The convention used here is that for gnu/linux/opensolaris os.
+ *
+ * This code provides the standard AES block size (128 bits, 16 bytes) and the
+ * three standard AES key sizes (128, 192 and 256 bits). It has the same call
+ * interface as my C implementation. It uses the Microsoft C AMD64 calling
+ * conventions in which the three parameters are placed in rcx, rdx and r8
+ * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
+ *
+ * OpenSolaris Note:
+ * Modified to use GNU/Linux/Solaris calling conventions.
+ * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
+ *
+ * AES_RETURN aes_encrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+ * const aes_encrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+ * const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_encrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
+ * either bits or bytes.
+ *
+ * Comment in/out the following lines to obtain the desired subroutines. These
+ * selections MUST match those in the C header file aesopt.h
+ */
+#define AES_REV_DKS /* define if key decryption schedule is reversed */
+
+#define LAST_ROUND_TABLES /* define for the faster version using extra tables */
+
+/*
+ * The encryption key schedule has the following in memory layout where N is the
+ * number of rounds (10, 12 or 14):
+ *
+ * lo: | input key (round 0) | / each round is four 32-bit words
+ * | encryption round 1 |
+ * | encryption round 2 |
+ * ....
+ * | encryption round N-1 |
+ * hi: | encryption round N |
+ *
+ * The decryption key schedule is normally set up so that it has the same
+ * layout as above by actually reversing the order of the encryption key
+ * schedule in memory (this happens when AES_REV_DKS is set):
+ *
+ * lo: | decryption round 0 | = | encryption round N |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
+ * hi: | decryption round N | = | input key (round 0) |
+ *
+ * with rounds except the first and last modified using inv_mix_column()
+ * But if AES_REV_DKS is NOT set the order of keys is left as it is for
+ * encryption so that it has to be accessed in reverse when used for
+ * decryption (although the inverse mix column modifications are done)
+ *
+ * lo: | decryption round 0 | = | input key (round 0) |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * hi: | decryption round N | = | encryption round N |
+ *
+ * This layout is faster when the assembler key scheduling provided here
+ * is used.
+ *
+ * End of user defines
+ */
+
+/*
+ * ---------------------------------------------------------------------------
+ * OpenSolaris OS modifications
+ *
+ * This source originates from Brian Gladman file aes_amd64.asm
+ * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
+ * with these changes:
+ *
+ * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
+ * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION,
+ * AES_128, AES_192, AES_256, AES_VAR ifdefs.
+ *
+ * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
+ *
+ * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
+ *
+ * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
+ * (operands reversed, literals prefixed with "$", registers prefixed with "%",
+ * and "[register+offset]", addressing changed to "offset(register)",
+ * parenthesis in constant expressions "()" changed to square brackets "[]",
+ * "." removed from local (numeric) labels, and other changes.
+ * Examples:
+ * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax
+ * mov rax,(4*20h) mov $[4*0x20],%rax
+ * mov rax,[ebx+20h] mov 0x20(%ebx),%rax
+ * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax
+ * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax
+ *
+ * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 6. Renamed functions and reordered parameters to match OpenSolaris:
+ * Original Gladman interface:
+ * int aes_encrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ * int aes_decrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
+ * and a union type, inf., containing inf.l, a uint32_t and
+ * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is
+ * used and contains the key schedule length * 16 where key schedule length is
+ * 10, 12, or 14 bytes.
+ *
+ * OpenSolaris OS interface:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ */
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+/*
+ * Lint-only placeholder: an empty C definition standing in for the
+ * assembly routine of the same name defined later in this file.
+ */
+/* ARGSUSED */
+void
+aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) {
+}
+/*
+ * Lint-only placeholder: an empty C definition standing in for the
+ * assembly routine of the same name defined later in this file.
+ * Note the argument order here is ct (input) then pt (output).
+ */
+/* ARGSUSED */
+void
+aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) {
+}
+
+
+#else
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * Number of 32-bit words in the ks array of Gladman's context; the
+ * GLADMAN_INTERFACE entry code uses 4*KS_LENGTH as the byte offset of the
+ * field that follows ks.
+ */
+#define KS_LENGTH 60
+
+/*
+ * Aliases: 64-bit register name + "d" -> name of its 32-bit form
+ * (without the % prefix). They appear to be unused by the code in this file.
+ */
+#define raxd eax
+#define rdxd edx
+#define rcxd ecx
+#define rbxd ebx
+#define rsid esi
+#define rdid edi
+
+/* Aliases: 64-bit register name + "b" -> name of its low 8-bit form */
+#define raxb al
+#define rdxb dl
+#define rcxb cl
+#define rbxb bl
+#define rsib sil
+#define rdib dil
+
+// finite field multiplies by {02}, {04} and {08}
+// Each shifts x left and XORs in 0x11b (scaled by 1, 2 or 4) once for every
+// bit of x that the shift moves past bit 7. Square brackets are used for
+// grouping, as described in the syntax notes in the file header.
+
+#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
+#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
+#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
+
+// finite field multiplies required in table generation
+// ({03}, {09}, {0b}, {0d}, {0e}: XOR combinations of f8, f4, f2 and x)
+
+#define f3(x) [[f2(x)] ^ [x]]
+#define f9(x) [[f8(x)] ^ [x]]
+#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
+#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
+#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
+
+// macros for expanding S-box data
+// Each expands to the 8 comma-separated byte values of one table entry:
+// u8 - forward round entry (multiples {02},{01},{01},{03}, repeated)
+// v8 - inverse round entry ({0e},{09},{0d},{0b}, repeated, last byte = x)
+// w8 - last-round entry (x followed by three zero bytes, repeated)
+// tab_0..tab_3 read 4-byte windows at byte offsets 0, 3, 2 and 1 of an entry.
+
+#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
+#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
+#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0
+
+/*
+ * enc_vals(x): emit a table for encryption by applying the entry macro x
+ * (u8 or w8 in this file) to each of 256 byte constants, 32 rows of 8, in
+ * index order. The constants are the AES S-box values. With an 8-byte entry
+ * macro the result is 2048 bytes.
+ */
+#define enc_vals(x) \
+ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
+ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
+ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
+ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
+ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
+ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
+ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
+ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
+ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
+ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
+ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
+ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
+ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
+ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
+ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
+ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
+ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
+ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
+ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
+ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
+ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
+ .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
+ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
+ .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
+ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
+ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
+ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
+ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
+ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
+ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
+ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
+ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
+
+/*
+ * dec_vals(x): emit a table for decryption by applying the entry macro x
+ * (v8 or w8 in this file) to each of 256 byte constants, 32 rows of 8, in
+ * index order. The constants are the AES inverse S-box values. With an
+ * 8-byte entry macro the result is 2048 bytes.
+ */
+#define dec_vals(x) \
+ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
+ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
+ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
+ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
+ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
+ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
+ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
+ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
+ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
+ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
+ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
+ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
+ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
+ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
+ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
+ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
+ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
+ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
+ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
+ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
+ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
+ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
+ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
+ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
+ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
+ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
+ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
+ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
+ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
+ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
+ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
+ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
+
+#define tptr %rbp /* table pointer */
+#define kptr %r8 /* key schedule pointer */
+#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */
+// fk_ref(x, y): memory operand for 32-bit word y of the 16-byte round key
+// located x keys below address kptr + fofs (used by the forward rounds).
+#define fk_ref(x, y) -16*x+fofs+4*y(kptr)
+
+// ik_ref(x, y): the same for the inverse rounds. With AES_REV_DKS the key is
+// x keys below kptr + rofs; without it, x keys above kptr + rofs.
+#ifdef AES_REV_DKS
+#define rofs 128
+#define ik_ref(x, y) -16*x+rofs+4*y(kptr)
+
+#else
+#define rofs -128
+#define ik_ref(x, y) 16*x+rofs+4*y(kptr)
+#endif /* AES_REV_DKS */
+
+// tab_N(x): memory operand inside 8-byte table entry number x (x is a
+// 64-bit index register), starting at byte offset 0, 3, 2 or 1 for
+// tab_0..tab_3. tab_f and tab_i select byte offsets 1 and 7; they are used
+// only by the last-round macros built without LAST_ROUND_TABLES.
+#define tab_0(x) (tptr,x,8)
+#define tab_1(x) 3(tptr,x,8)
+#define tab_2(x) 2(tptr,x,8)
+#define tab_3(x) 1(tptr,x,8)
+#define tab_f(x) 1(tptr,x,8)
+#define tab_i(x) 7(tptr,x,8)
+
+/*
+ * ff_rnd(p1, p2, p3, p4, round): one normal forward round by table lookup.
+ *
+ * In: state columns 0..3 in %eax, %ebx, %ecx, %edx; tptr and kptr set up
+ * as tab_N() and fk_ref() expect.
+ * Work: p1..p4 (32-bit registers) are loaded with the four words of the
+ * round key fk_ref(round, 0..3). Every state byte then indexes the
+ * table and the looked-up word is XORed into an accumulator: byte n
+ * (0 = lowest) of column c goes through tab_n into column c - n,
+ * wrapping modulo 4.
+ * Out: p1..p4 are copied back to %eax, %ebx, %ecx, %edx.
+ * Clobbers: %esi, %edi (hence %rsi, %rdi), p1..p4 and the flags.
+ */
+#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p1; \
+ \
+ mov p1, %eax; \
+ mov p2, %ebx; \
+ mov p3, %ecx; \
+ mov p4, %edx
+
+/*
+ * fl_rnd(p1, p2, p3, p4, round): last forward round.
+ *
+ * In: state columns 0..3 in %eax, %ebx, %ecx, %edx; tptr and kptr set up
+ * as tab_N() and fk_ref() expect.
+ * Out: result left in p1..p4 (it is NOT copied back to %eax..%edx).
+ * Clobbers: %eax..%edx, %esi, %edi (hence %rsi, %rdi) and the flags; the
+ * LAST_ROUND_TABLES variant also advances tptr by 2048.
+ *
+ * The byte-to-column routing is the same as in ff_rnd; only the source of
+ * the looked-up value differs between the two variants below.
+ */
+#ifdef LAST_ROUND_TABLES
+
+/*
+ * Variant using a second table: tptr is moved 2048 bytes forward (one table
+ * of 256 entries of 8 bytes) and the same tab_0..tab_3 lookups are applied.
+ */
+#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
+ add $2048, tptr; \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p1
+
+#else
+
+/*
+ * Variant without the extra table: a single byte is fetched from offset
+ * tab_f of the main table entry, rotated left by 0, 8, 16 or 24 bits into
+ * its byte lane and XORed into the accumulator.
+ * The table fetches use movzbl: the source is memory, so the one-byte
+ * operand size has to be spelled out (a bare movzx leaves it ambiguous).
+ */
+#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ xor %esi, p1; \
+ rol $8, %edi; \
+ xor %edi, p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p3; \
+ xor %edi, p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ xor %esi, p2; \
+ rol $8, %edi; \
+ xor %edi, p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p4; \
+ xor %edi, p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ shr $16, %ecx; \
+ xor %esi, p3; \
+ rol $8, %edi; \
+ xor %edi, p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p1; \
+ xor %edi, p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ shr $16, %edx; \
+ xor %esi, p4; \
+ rol $8, %edi; \
+ xor %edi, p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzbl tab_f(%rsi), %esi; \
+ movzbl tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p2; \
+ xor %edi, p1
+
+#endif /* LAST_ROUND_TABLES */
+
+/*
+ * ii_rnd(p1, p2, p3, p4, round): one normal inverse round by table lookup.
+ *
+ * In: state columns 0..3 in %eax, %ebx, %ecx, %edx; tptr and kptr set up
+ * as tab_N() and ik_ref() expect.
+ * Work: p1..p4 (32-bit registers) are loaded with the four words of the
+ * round key ik_ref(round, 0..3). Every state byte then indexes the
+ * table and the looked-up word is XORed into an accumulator: byte n
+ * (0 = lowest) of column c goes through tab_n into column c + n,
+ * wrapping modulo 4 (the opposite direction to ff_rnd).
+ * Out: p1..p4 are copied back to %eax, %ebx, %ecx, %edx.
+ * Clobbers: %esi, %edi (hence %rsi, %rdi), p1..p4 and the flags.
+ */
+#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p3; \
+ \
+ mov p1, %eax; \
+ mov p2, %ebx; \
+ mov p3, %ecx; \
+ mov p4, %edx
+
+/*
+ * il_rnd(p1, p2, p3, p4, round): last inverse round.
+ *
+ * In: state columns 0..3 in %eax, %ebx, %ecx, %edx; tptr and kptr set up
+ * as tab_N() and ik_ref() expect.
+ * Out: result left in p1..p4 (it is NOT copied back to %eax..%edx).
+ * Clobbers: %eax..%edx, %esi, %edi (hence %rsi, %rdi) and the flags; the
+ * LAST_ROUND_TABLES variant also advances tptr by 2048.
+ *
+ * The byte-to-column routing is the same as in ii_rnd; only the source of
+ * the looked-up value differs between the two variants below.
+ */
+#ifdef LAST_ROUND_TABLES
+
+/*
+ * Variant using a second table: tptr is moved 2048 bytes forward (one table
+ * of 256 entries of 8 bytes) and the same tab_0..tab_3 lookups are applied.
+ */
+#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
+ add $2048, tptr; \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p3
+
+#else
+
+/*
+ * Variant without the extra table: a single byte is fetched from offset
+ * tab_i of the main table entry, rotated left by 0, 8, 16 or 24 bits into
+ * its byte lane and XORed into the accumulator.
+ * The table fetches use movzbl: the source is memory, so the one-byte
+ * operand size has to be spelled out (a bare movzx leaves it ambiguous).
+ */
+#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ shr $16, %eax; \
+ xor %esi, p1; \
+ rol $8, %edi; \
+ xor %edi, p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p3; \
+ xor %edi, p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ shr $16, %ebx; \
+ xor %esi, p2; \
+ rol $8, %edi; \
+ xor %edi, p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p4; \
+ xor %edi, p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ shr $16, %ecx; \
+ xor %esi, p3; \
+ rol $8, %edi; \
+ xor %edi, p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p1; \
+ xor %edi, p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ shr $16, %edx; \
+ xor %esi, p4; \
+ rol $8, %edi; \
+ xor %edi, p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzbl tab_i(%rsi), %esi; \
+ movzbl tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p2; \
+ xor %edi, p3
+
+#endif /* LAST_ROUND_TABLES */
+
+/*
+ * OpenSolaris OS:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_encrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ */
+/*
+ * Encryption lookup tables. The code in this file only ever reads them, so
+ * on ELF targets they are placed in read-only data instead of writable
+ * .data; other object formats keep the original .data placement.
+ * NOTE(review): as with the original .data directive, this relies on
+ * ENTRY_NP (defined outside this file) switching back to the text section.
+ */
+#ifdef __ELF__
+.section .rodata
+#else
+.data
+#endif
+.align 64
+enc_tab:
+ // Main table: 256 entries of 8 bytes (2048 bytes), built with u8()
+ enc_vals(u8)
+#ifdef LAST_ROUND_TABLES
+ // Last Round Tables:
+ // a second 2048-byte table, reached by fl_rnd via "add $2048, tptr"
+ enc_vals(w8)
+#endif
+
+
+ENTRY_NP(aes_encrypt_amd64)
+ // Encrypt one 16-byte block with the table-driven round macros.
+ // %rbx, %rbp and %r12 are saved on entry and restored on exit;
+ // %rsi, %rdi and %r8-%r11 are used as scratch.
+ // If the round count is not 10, 12 or 14 the output block is left
+ // untouched and %rax is set to -1; otherwise %rax is cleared.
+#ifdef GLADMAN_INTERFACE
+ // Original interface
+ sub $[4*8], %rsp // gnu/linux/opensolaris binary interface
+ mov %rsi, (%rsp) // output pointer (P2)
+ mov %rdx, %r8 // context (P3)
+
+ mov %rbx, 1*8(%rsp) // P1: input pointer in rdi
+ mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
+ mov %r12, 3*8(%rsp) // P3: context in r8
+ // One-byte load, size made explicit (memory-source movzx is ambiguous)
+ movzbl 4*KS_LENGTH(kptr), %esi // Get number of rounds * 16
+
+#else
+ // OpenSolaris OS interface
+ sub $[4*8], %rsp // Make room on stack to save registers
+ mov %rcx, (%rsp) // Save output pointer (P4) on stack
+ mov %rdi, %r8 // context (P1)
+ mov %rdx, %rdi // P3: save input pointer
+ shl $4, %esi // P2: esi = number of rounds * 16
+
+ mov %rbx, 1*8(%rsp) // Save registers
+ mov %rbp, 2*8(%rsp)
+ mov %r12, 3*8(%rsp)
+ // P1: context in r8
+ // P2: number of rounds * 16 in esi
+ // P3: input pointer in rdi
+ // P4: output pointer in (rsp)
+#endif /* GLADMAN_INTERFACE */
+
+ lea enc_tab(%rip), tptr
+ sub $fofs, kptr // bias kptr so fk_ref() displacements stay small
+
+ // Load input block into registers
+ mov (%rdi), %eax
+ mov 1*4(%rdi), %ebx
+ mov 2*4(%rdi), %ecx
+ mov 3*4(%rdi), %edx
+
+ // Round 0: XOR with the first 16 bytes of the key schedule
+ xor fofs(kptr), %eax
+ xor fofs+4(kptr), %ebx
+ xor fofs+8(kptr), %ecx
+ xor fofs+12(kptr), %edx
+
+ // Point kptr (still biased by fofs) at the last round key, so that
+ // fk_ref(n, ..) addresses the key n rounds before the last one
+ lea (kptr,%rsi), kptr
+ // Jump based on number of rounds * 16:
+ cmp $[10*16], %esi
+ je 3f
+ cmp $[12*16], %esi
+ je 2f
+ cmp $[14*16], %esi
+ je 1f
+ mov $-1, %rax // error
+ jmp 4f
+
+ // Perform normal forward rounds
+ // (entry at 1: for 14 rounds, 2: for 12 rounds, 3: for 10 rounds)
+1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 8)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 7)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 6)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 5)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 4)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 3)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 2)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 1)
+ fl_rnd(%r9d, %r10d, %r11d, %r12d, 0)
+
+ // Copy results
+ // (fl_rnd leaves them in r9d..r12d; output pointer was saved at (%rsp))
+ mov (%rsp), %rbx
+ mov %r9d, (%rbx)
+ mov %r10d, 4(%rbx)
+ mov %r11d, 8(%rbx)
+ mov %r12d, 12(%rbx)
+ xor %rax, %rax
+4: // Restore registers
+ mov 1*8(%rsp), %rbx
+ mov 2*8(%rsp), %rbp
+ mov 3*8(%rsp), %r12
+ add $[4*8], %rsp
+ ret
+
+ SET_SIZE(aes_encrypt_amd64)
+
+/*
+ * OpenSolaris OS:
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_decrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ */
+/*
+ * Decryption lookup tables. The code in this file only ever reads them, so
+ * on ELF targets they are placed in read-only data instead of writable
+ * .data; other object formats keep the original .data placement.
+ * NOTE(review): as with the original .data directive, this relies on
+ * ENTRY_NP (defined outside this file) switching back to the text section.
+ */
+#ifdef __ELF__
+.section .rodata
+#else
+.data
+#endif
+.align 64
+dec_tab:
+ // Main table: 256 entries of 8 bytes (2048 bytes), built with v8()
+ dec_vals(v8)
+#ifdef LAST_ROUND_TABLES
+ // Last Round Tables:
+ // a second 2048-byte table, reached by il_rnd via "add $2048, tptr"
+ dec_vals(w8)
+#endif
+
+
+ENTRY_NP(aes_decrypt_amd64)
+ // Decrypt one 16-byte block with the table-driven round macros.
+ // %rbx, %rbp and %r12 are saved on entry and restored on exit;
+ // %rsi, %rdi and %r8-%r11 are used as scratch.
+ // If the round count is not 10, 12 or 14 the output block is left
+ // untouched and %rax is set to -1; otherwise %rax is cleared.
+#ifdef GLADMAN_INTERFACE
+ // Original interface
+ sub $[4*8], %rsp // gnu/linux/opensolaris binary interface
+ mov %rsi, (%rsp) // output pointer (P2)
+ mov %rdx, %r8 // context (P3)
+
+ mov %rbx, 1*8(%rsp) // P1: input pointer in rdi
+ mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
+ mov %r12, 3*8(%rsp) // P3: context in r8
+ // One-byte load, size made explicit (memory-source movzx is ambiguous)
+ movzbl 4*KS_LENGTH(kptr), %esi // Get number of rounds * 16
+
+#else
+ // OpenSolaris OS interface
+ sub $[4*8], %rsp // Make room on stack to save registers
+ mov %rcx, (%rsp) // Save output pointer (P4) on stack
+ mov %rdi, %r8 // context (P1)
+ mov %rdx, %rdi // P3: save input pointer
+ shl $4, %esi // P2: esi = number of rounds * 16
+
+ mov %rbx, 1*8(%rsp) // Save registers
+ mov %rbp, 2*8(%rsp)
+ mov %r12, 3*8(%rsp)
+ // P1: context in r8
+ // P2: number of rounds * 16 in esi
+ // P3: input pointer in rdi
+ // P4: output pointer in (rsp)
+#endif /* GLADMAN_INTERFACE */
+
+ lea dec_tab(%rip), tptr
+ sub $rofs, kptr // bias kptr so ik_ref() displacements stay small
+
+ // Load input block into registers
+ mov (%rdi), %eax
+ mov 1*4(%rdi), %ebx
+ mov 2*4(%rdi), %ecx
+ mov 3*4(%rdi), %edx
+
+ // %rdi is free now that the input is loaded; use it to address the
+ // key for the initial XOR.
+#ifdef AES_REV_DKS
+ // Reversed schedule: first key is at the start; the rounds then
+ // index downwards from kptr + rounds*16
+ mov kptr, %rdi
+ lea (kptr,%rsi), kptr
+#else
+ // Schedule in encryption order: first key used is rounds*16 bytes
+ // in; kptr stays at the start and the rounds index upwards
+ lea (kptr,%rsi), %rdi
+#endif
+
+ xor rofs(%rdi), %eax
+ xor rofs+4(%rdi), %ebx
+ xor rofs+8(%rdi), %ecx
+ xor rofs+12(%rdi), %edx
+
+ // Jump based on number of rounds * 16:
+ cmp $[10*16], %esi
+ je 3f
+ cmp $[12*16], %esi
+ je 2f
+ cmp $[14*16], %esi
+ je 1f
+ mov $-1, %rax // error
+ jmp 4f
+
+ // Perform normal inverse rounds
+ // (entry at 1: for 14 rounds, 2: for 12 rounds, 3: for 10 rounds)
+1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 8)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 7)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 6)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 5)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 4)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 3)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 2)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 1)
+ il_rnd(%r9d, %r10d, %r11d, %r12d, 0)
+
+ // Copy results
+ // (il_rnd leaves them in r9d..r12d; output pointer was saved at (%rsp))
+ mov (%rsp), %rbx
+ mov %r9d, (%rbx)
+ mov %r10d, 4(%rbx)
+ mov %r11d, 8(%rbx)
+ mov %r12d, 12(%rbx)
+ xor %rax, %rax
+4: // Restore registers
+ mov 1*8(%rsp), %rbx
+ mov 2*8(%rsp), %rbp
+ mov 3*8(%rsp), %r12
+ add $[4*8], %rsp
+ ret
+
+ SET_SIZE(aes_decrypt_amd64)
+#endif /* lint || __lint */
+
+/*
+ * An empty .note.GNU-stack section marks this object as not needing an
+ * executable stack.
+ */
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/aes/aeskey.c b/module/icp/asm-x86_64/aes/aeskey.c
new file mode 100644
index 000000000000..c3d1f2990874
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aeskey.c
@@ -0,0 +1,580 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ */
+
+#include <aes/aes_impl.h>
+#include "aesopt.h"
+#include "aestab.h"
+#include "aestab2.h"
+
+/*
+ * Initialise the key schedule from the user supplied key. The key
+ * length can be specified in bytes, with legal values of 16, 24
+ * and 32, or in bits, with legal values of 128, 192 and 256. These
+ * values correspond with Nk values of 4, 6 and 8 respectively.
+ *
+ * The following macros implement a single cycle in the key
+ * schedule generation process. The number of cycles needed
+ * for each cx->n_col and nk value is:
+ *
+ * nk = 4 5 6 7 8
+ * ------------------------------
+ * cx->n_col = 4 10 9 8 7 7
+ * cx->n_col = 5 14 11 10 9 9
+ * cx->n_col = 6 19 15 12 11 11
+ * cx->n_col = 7 21 19 16 13 14
+ * cx->n_col = 8 29 23 19 17 14
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added header files aes_impl.h and aestab2.h
+ * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined)
+ * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C,
+ * AES_128, AES_192, AES_256, AES_VAR defines
+ * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void"
+ * 6. Changed N_COLS to MAX_AES_NB
+ * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with
+ * OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and
+ * rijndael_key_setup_dec_amd64
+ * 8. cstyled code and removed lint warnings
+ */
+
+/*
+ * Reduced-code-size build: ls_box and inv_mcol are redirected to the
+ * functions ls_sub() and im_sub(), which are only declared here, and both
+ * key-schedule unrolling options are switched off so that the loop forms
+ * of the schedule routines below are compiled instead.
+ */
+#if defined(REDUCE_CODE_SIZE)
+#define ls_box ls_sub
+ uint32_t ls_sub(const uint32_t t, const uint32_t n);
+#define inv_mcol im_sub
+ uint32_t im_sub(const uint32_t x);
+#ifdef ENC_KS_UNROLL
+#undef ENC_KS_UNROLL
+#endif
+#ifdef DEC_KS_UNROLL
+#undef DEC_KS_UNROLL
+#endif
+#endif /* REDUCE_CODE_SIZE */
+
+
+/*
+ * ke4(k, i): one cycle of the 128-bit encryption key schedule.
+ *
+ * Updates the four running words in the expanding scope's local array
+ * ss[] and stores them to k[4 * i + 4] .. k[4 * i + 7]. Each word is the
+ * XOR of its previous value with the word just produced; the first word
+ * additionally mixes in ls_box(ss[3], 3) and entry i of t_use(r, c).
+ * t_use() is supplied by the included headers (presumably the round
+ * constant table - confirm in aestab.h).
+ */
+#define ke4(k, i) \
+{ k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[4 * (i) + 5] = ss[1] ^= ss[0]; \
+ k[4 * (i) + 6] = ss[2] ^= ss[1]; \
+ k[4 * (i) + 7] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * Expand a 16-byte user key into the encryption key schedule.
+ *
+ * key	four 32-bit key words, read through word_in()
+ * rk	output schedule; words rk[0] .. rk[43] are written
+ *
+ * The first four schedule words are the key itself; ten ke4() cycles
+ * produce the rest. The local array must be called ss because ke4()
+ * refers to it by name.
+ */
+static void
+aes_encrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+	uint32_t ss[4];
+#ifndef ENC_KS_UNROLL
+	uint32_t cycle;
+#endif
+
+	/* Seed the running state and the schedule from the user key. */
+	ss[0] = word_in(key, 0);
+	rk[0] = ss[0];
+	ss[1] = word_in(key, 1);
+	rk[1] = ss[1];
+	ss[2] = word_in(key, 2);
+	rk[2] = ss[2];
+	ss[3] = word_in(key, 3);
+	rk[3] = ss[3];
+
+#ifdef ENC_KS_UNROLL
+	ke4(rk, 0);
+	ke4(rk, 1);
+	ke4(rk, 2);
+	ke4(rk, 3);
+	ke4(rk, 4);
+	ke4(rk, 5);
+	ke4(rk, 6);
+	ke4(rk, 7);
+	ke4(rk, 8);
+#else
+	for (cycle = 0; cycle < 9; cycle++)
+		ke4(rk, cycle);
+#endif /* ENC_KS_UNROLL */
+
+	/* The tenth cycle is issued the same way in both builds. */
+	ke4(rk, 9);
+}
+
+
+/*
+ * kef6(k, i): first four words of one 192-bit encryption key schedule
+ * cycle, stored to k[6 * i + 6] .. k[6 * i + 9]. Operates on the local
+ * array ss[] of the expanding scope and mixes ls_box(ss[5], 3) and
+ * entry i of t_use(r, c) into the first word.
+ */
+#define kef6(k, i) \
+{ k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[6 * (i) + 7] = ss[1] ^= ss[0]; \
+ k[6 * (i) + 8] = ss[2] ^= ss[1]; \
+ k[6 * (i) + 9] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * ke6(k, i): a full six-word cycle - kef6() followed by the two
+ * remaining words at k[6 * i + 10] and k[6 * i + 11].
+ */
+#define ke6(k, i) \
+{ kef6(k, i); \
+ k[6 * (i) + 10] = ss[4] ^= ss[3]; \
+ k[6 * (i) + 11] = ss[5] ^= ss[4]; \
+}
+
+/*
+ * Expand a 24-byte user key into the encryption key schedule.
+ *
+ * key	six 32-bit key words, read through word_in()
+ * rk	output schedule; words rk[0] .. rk[51] are written
+ *
+ * Seven full ke6() cycles are followed by one kef6() cycle that only
+ * produces the four words still needed. The local array must be called
+ * ss because the cycle macros refer to it by name.
+ */
+static void
+aes_encrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+	uint32_t ss[6];
+#ifndef ENC_KS_UNROLL
+	uint32_t cycle;
+#endif
+
+	/* Seed the running state and the schedule from the user key. */
+	ss[0] = word_in(key, 0);
+	rk[0] = ss[0];
+	ss[1] = word_in(key, 1);
+	rk[1] = ss[1];
+	ss[2] = word_in(key, 2);
+	rk[2] = ss[2];
+	ss[3] = word_in(key, 3);
+	rk[3] = ss[3];
+	ss[4] = word_in(key, 4);
+	rk[4] = ss[4];
+	ss[5] = word_in(key, 5);
+	rk[5] = ss[5];
+
+#ifdef ENC_KS_UNROLL
+	ke6(rk, 0);
+	ke6(rk, 1);
+	ke6(rk, 2);
+	ke6(rk, 3);
+	ke6(rk, 4);
+	ke6(rk, 5);
+	ke6(rk, 6);
+#else
+	for (cycle = 0; cycle < 7; cycle++)
+		ke6(rk, cycle);
+#endif /* ENC_KS_UNROLL */
+
+	/* Final, shortened cycle: only four more words are stored. */
+	kef6(rk, 7);
+}
+
+
+
+/*
+ * kef8(k, i): first four words of one 256-bit encryption key schedule
+ * cycle, stored to k[8 * i + 8] .. k[8 * i + 11]. Operates on the local
+ * array ss[] of the expanding scope and mixes ls_box(ss[7], 3) and
+ * entry i of t_use(r, c) into the first word.
+ */
+#define kef8(k, i) \
+{ k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[8 * (i) + 9] = ss[1] ^= ss[0]; \
+ k[8 * (i) + 10] = ss[2] ^= ss[1]; \
+ k[8 * (i) + 11] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * ke8(k, i): a full eight-word cycle - kef8() followed by the second
+ * half at k[8 * i + 12] .. k[8 * i + 15]. The second half starts from
+ * ls_box(ss[3], 0) and uses no t_use() entry.
+ */
+#define ke8(k, i) \
+{ kef8(k, i); \
+ k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \
+ k[8 * (i) + 13] = ss[5] ^= ss[4]; \
+ k[8 * (i) + 14] = ss[6] ^= ss[5]; \
+ k[8 * (i) + 15] = ss[7] ^= ss[6]; \
+}
+
+/*
+ * Expand a 32-byte user key into the encryption key schedule.
+ *
+ * key	eight 32-bit key words, read through word_in()
+ * rk	output schedule; words rk[0] .. rk[59] are written
+ *
+ * Six full ke8() cycles are followed by one kef8() cycle that only
+ * produces the four words still needed. The local array must be called
+ * ss because the cycle macros refer to it by name.
+ */
+static void
+aes_encrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+	uint32_t ss[8];
+#ifndef ENC_KS_UNROLL
+	uint32_t cycle;
+#endif
+
+	/* Seed the running state and the schedule from the user key. */
+	ss[0] = word_in(key, 0);
+	rk[0] = ss[0];
+	ss[1] = word_in(key, 1);
+	rk[1] = ss[1];
+	ss[2] = word_in(key, 2);
+	rk[2] = ss[2];
+	ss[3] = word_in(key, 3);
+	rk[3] = ss[3];
+	ss[4] = word_in(key, 4);
+	rk[4] = ss[4];
+	ss[5] = word_in(key, 5);
+	rk[5] = ss[5];
+	ss[6] = word_in(key, 6);
+	rk[6] = ss[6];
+	ss[7] = word_in(key, 7);
+	rk[7] = ss[7];
+
+#ifdef ENC_KS_UNROLL
+	ke8(rk, 0);
+	ke8(rk, 1);
+	ke8(rk, 2);
+	ke8(rk, 3);
+	ke8(rk, 4);
+	ke8(rk, 5);
+#else
+	for (cycle = 0; cycle < 6; cycle++)
+		ke8(rk, cycle);
+#endif /* ENC_KS_UNROLL */
+
+	/* Final, shortened cycle: only four more words are stored. */
+	kef8(rk, 6);
+}
+
+
+/*
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * Returns the number of rounds for the given cipher key size (10, 12 or
+ * 14), or 0 if keyBits is not one of the supported sizes, in which case
+ * rk is not written.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk		AES key schedule 32-bit array to be initialized
+ * cipherKey	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
+    int keyBits)
+{
+	/* View the key as bytes without casting away its const qualifier. */
+	const unsigned char *key = (const unsigned char *)cipherKey;
+
+	switch (keyBits) {
+	case 128:
+		aes_encrypt_key128(key, rk);
+		return (10);
+	case 192:
+		aes_encrypt_key192(key, rk);
+		return (12);
+	case 256:
+		aes_encrypt_key256(key, rk);
+		return (14);
+	default: /* should never get here */
+		break;
+	}
+
+	/* Unsupported key size: report zero rounds. */
+	return (0);
+}
+
+
+/*
+ * v(n, i) maps logical schedule word index i to the array index at which
+ * the decryption round key word is stored, in forward or reverse order.
+ * With AES_REV_DKS the mapping is n - i + 2 * (i & 3): for i = 4 * g + j
+ * (j in 0..3) this is n - 4 * g + j, i.e. the four-word groups are laid
+ * out in reverse while the word order inside each group is kept.
+ * Without AES_REV_DKS it is the identity.
+ */
+
+#ifdef AES_REV_DKS
+#define v(n, i) ((n) - (i) + 2 * ((i) & 3))
+#else
+#define v(n, i) (i)
+#endif
+
+/*
+ * ff(x) is the transform applied to schedule words by the unrolled
+ * decryption key schedule macros: the identity when DEC_ROUND is
+ * NO_TABLES, inv_mcol(x) otherwise. In the latter case d_vars aliases
+ * dec_imvars when that macro is defined.
+ */
+#if DEC_ROUND == NO_TABLES
+#define ff(x) (x)
+#else
+#define ff(x) inv_mcol(x)
+#if defined(dec_imvars)
+#define d_vars dec_imvars
+#endif
+#endif /* DEC_ROUND == NO_TABLES */
+
+
+/*
+ * k4e(k, i): one forward expansion cycle of the 128-bit schedule, the
+ * same recurrence as ke4() but with every store routed through v(40, ...).
+ * Used by the loop (non-unrolled) build of aes_decrypt_key128(). Expects
+ * a local array ss[] in the expanding scope.
+ */
+#define k4e(k, i) \
+{ k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \
+ k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \
+ k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * Cycle macros for the unrolled 128-bit decryption key schedule.
+ * aes_decrypt_key128() issues kdf4 for cycle 0, kd4 for cycles 1..8 and
+ * kdl4 for cycle 9. All of them work on the local array ss[] of the
+ * expanding scope and store through v(40, ...).
+ *
+ * Two equivalent-purpose variants are kept; the "#if 1" arm is the one
+ * compiled. It uses ss[4] as scratch, so ss[] must have at least five
+ * elements. kdf4 and kd4 pass values through ff(); kdl4 stores the last
+ * four words without ff().
+ *
+ * The cycle argument is parenthesized wherever it takes part in
+ * arithmetic so that a non-literal argument expands correctly.
+ */
+#if 1
+
+#define kdf4(k, i) \
+{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
+ ss[1] = ss[1] ^ ss[3]; \
+ ss[2] = ss[2] ^ ss[3]; \
+ ss[4] = ls_box(ss[((i) + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[(i) % 4] ^= ss[4]; \
+ ss[4] ^= k[v(40, (4 * (i)))]; k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \
+}
+
+/*
+ * kd4: ff() is applied once to the cycle's temporary; each new word is
+ * that value XOR-accumulated with the word stored four positions earlier.
+ */
+#define kd4(k, i) \
+{ ss[4] = ls_box(ss[((i) + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[(i) % 4] ^= ss[4]; ss[4] = ff(ss[4]); \
+ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define kdl4(k, i) \
+{ ss[4] = ls_box(ss[((i) + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[(i) % 4] ^= ss[4]; \
+ k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
+ k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \
+ k[v(40, (4 * (i)) + 6)] = ss[0]; \
+ k[v(40, (4 * (i)) + 7)] = ss[1]; \
+}
+
+#else
+
+/* Alternative variant (not compiled): keeps ss[0..3] as plain words. */
+
+#define kdf4(k, i) \
+{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \
+}
+
+#define kd4(k, i) \
+{ ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[4]; \
+ ss[4] = ff(ss[4]); \
+ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define kdl4(k, i) \
+{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 4)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \
+}
+
+#endif
+
+/*
+ * Build the decryption key schedule for a 16-byte key in rk[].
+ *
+ * Every store goes through v(40, ...), so the round keys land in forward
+ * or reversed order depending on AES_REV_DKS. With DEC_KS_UNROLL the
+ * kdf4/kd4/kdl4 macros produce the final words directly. Otherwise the
+ * schedule is first expanded with k4e and, unless DEC_ROUND is NO_TABLES,
+ * inv_mcol() is then applied in place to rk[MAX_AES_NB] ..
+ * rk[10 * MAX_AES_NB - 1].
+ */
+static void
+aes_decrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+ /* Four running words plus ss[4], which the kd*4 macros use as scratch. */
+ uint32_t ss[5];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(40, (0))] = ss[0] = word_in(key, 0);
+ rk[v(40, (1))] = ss[1] = word_in(key, 1);
+ rk[v(40, (2))] = ss[2] = word_in(key, 2);
+ rk[v(40, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ /* First cycle, eight middle cycles, last cycle. */
+ kdf4(rk, 0); kd4(rk, 1);
+ kd4(rk, 2); kd4(rk, 3);
+ kd4(rk, 4); kd4(rk, 5);
+ kd4(rk, 6); kd4(rk, 7);
+ kd4(rk, 8); kdl4(rk, 9);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 10; ++i)
+ k4e(rk, i);
+#if !(DEC_ROUND == NO_TABLES)
+ /* Array indices here are raw, not mapped through v(). */
+ for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+
+/*
+ * Cycle macros for the 192-bit decryption key schedule. All of them work
+ * on the local array ss[] of the expanding scope and store through
+ * v(48, ...).
+ *
+ * k6ef / k6e	forward expansion (four / six words per cycle); used by the
+ *		loop build of aes_decrypt_key192().
+ * kdf6		first unrolled cycle: six words, each stored through ff().
+ * kd6		middle unrolled cycle: ff() is applied once to the cycle's
+ *		temporary in ss[6]; each new word is that value
+ *		XOR-accumulated with the word stored six positions earlier.
+ * kdl6		last unrolled cycle: four words, stored without ff().
+ */
+#define k6ef(k, i) \
+{ k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \
+ k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \
+ k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \
+}
+
+#define k6e(k, i) \
+{ k6ef(k, i); \
+ k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \
+ k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \
+}
+
+#define kdf6(k, i) \
+{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \
+ ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \
+ ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \
+}
+
+#define kd6(k, i) \
+{ ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
+ k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \
+ ss[4] ^= ss[3]; \
+ k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \
+ ss[5] ^= ss[4]; \
+ k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \
+}
+
+#define kdl6(k, i) \
+{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 6)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \
+}
+
+/*
+ * Build the decryption key schedule for a 24-byte key in rk[].
+ *
+ * Every store goes through v(48, ...), so the round keys land in forward
+ * or reversed order depending on AES_REV_DKS. With DEC_KS_UNROLL, key
+ * words 4 and 5 are stored through ff() and the kdf6/kd6/kdl6 macros
+ * produce the remaining words directly. Otherwise the schedule is first
+ * expanded with k6e/k6ef and, unless DEC_ROUND is NO_TABLES, inv_mcol()
+ * is then applied in place to rk[MAX_AES_NB] .. rk[12 * MAX_AES_NB - 1].
+ */
+static void
+aes_decrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+ /* Six running words plus ss[6], which kd6 uses as scratch. */
+ uint32_t ss[7];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(48, (0))] = ss[0] = word_in(key, 0);
+ rk[v(48, (1))] = ss[1] = word_in(key, 1);
+ rk[v(48, (2))] = ss[2] = word_in(key, 2);
+ rk[v(48, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ /* ss[] keeps the raw key word; only the stored copy goes through ff(). */
+ ss[4] = word_in(key, 4);
+ rk[v(48, (4))] = ff(ss[4]);
+ ss[5] = word_in(key, 5);
+ rk[v(48, (5))] = ff(ss[5]);
+ kdf6(rk, 0); kd6(rk, 1);
+ kd6(rk, 2); kd6(rk, 3);
+ kd6(rk, 4); kd6(rk, 5);
+ kd6(rk, 6); kdl6(rk, 7);
+#else
+ rk[v(48, (4))] = ss[4] = word_in(key, 4);
+ rk[v(48, (5))] = ss[5] = word_in(key, 5);
+ {
+ uint32_t i;
+
+ for (i = 0; i < 7; ++i)
+ k6e(rk, i);
+ k6ef(rk, 7);
+#if !(DEC_ROUND == NO_TABLES)
+ /* Array indices here are raw, not mapped through v(). */
+ for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+
+/*
+ * Cycle macros for the 256-bit decryption key schedule. All of them work
+ * on the local array ss[] of the expanding scope and store through
+ * v(56, ...).
+ *
+ * k8ef / k8e	forward expansion (four / eight words per cycle); used by
+ *		the loop build of aes_decrypt_key256().
+ * kdf8		first unrolled cycle: eight words, each stored through ff().
+ * kd8		middle unrolled cycle, in two halves of four words. Each
+ *		half computes a temporary in ss[8], applies ff() to it once
+ *		and XOR-accumulates it with the word stored eight positions
+ *		earlier.
+ * kdl8		last unrolled cycle: four words, stored without ff().
+ */
+#define k8ef(k, i) \
+{ k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \
+ k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \
+ k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \
+}
+
+#define k8e(k, i) \
+{ k8ef(k, i); \
+ k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \
+ k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \
+ k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \
+ k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \
+}
+
+#define kdf8(k, i) \
+{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \
+ ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \
+ ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \
+ ss[6] ^= ss[5]; k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \
+ ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \
+}
+
+#define kd8(k, i) \
+{ ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[8]; \
+ ss[8] = ff(ss[8]); \
+ k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \
+ ss[8] = ls_box(ss[3], 0); \
+ ss[4] ^= ss[8]; \
+ ss[8] = ff(ss[8]); \
+ k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \
+ ss[5] ^= ss[4]; \
+ k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \
+ ss[6] ^= ss[5]; \
+ k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \
+ ss[7] ^= ss[6]; \
+ k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \
+}
+
+#define kdl8(k, i) \
+{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 8)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \
+}
+
+/*
+ * Build the decryption key schedule for a 32-byte key in rk[].
+ *
+ * Every store goes through v(56, ...), so the round keys land in forward
+ * or reversed order depending on AES_REV_DKS. With DEC_KS_UNROLL, key
+ * words 4..7 are stored through ff() and the kdf8/kd8/kdl8 macros produce
+ * the remaining words directly. Otherwise the schedule is first expanded
+ * with k8e/k8ef and, unless DEC_ROUND is NO_TABLES, inv_mcol() is then
+ * applied in place to rk[MAX_AES_NB] .. rk[14 * MAX_AES_NB - 1].
+ */
+static void
+aes_decrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+ /* Eight running words plus ss[8], which kd8 uses as scratch. */
+ uint32_t ss[9];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(56, (0))] = ss[0] = word_in(key, 0);
+ rk[v(56, (1))] = ss[1] = word_in(key, 1);
+ rk[v(56, (2))] = ss[2] = word_in(key, 2);
+ rk[v(56, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ /* ss[] keeps the raw key word; only the stored copy goes through ff(). */
+ ss[4] = word_in(key, 4);
+ rk[v(56, (4))] = ff(ss[4]);
+ ss[5] = word_in(key, 5);
+ rk[v(56, (5))] = ff(ss[5]);
+ ss[6] = word_in(key, 6);
+ rk[v(56, (6))] = ff(ss[6]);
+ ss[7] = word_in(key, 7);
+ rk[v(56, (7))] = ff(ss[7]);
+ kdf8(rk, 0); kd8(rk, 1);
+ kd8(rk, 2); kd8(rk, 3);
+ kd8(rk, 4); kd8(rk, 5);
+ kdl8(rk, 6);
+#else
+ rk[v(56, (4))] = ss[4] = word_in(key, 4);
+ rk[v(56, (5))] = ss[5] = word_in(key, 5);
+ rk[v(56, (6))] = ss[6] = word_in(key, 6);
+ rk[v(56, (7))] = ss[7] = word_in(key, 7);
+ {
+ uint32_t i;
+
+ for (i = 0; i < 6; ++i)
+ k8e(rk, i);
+ k8ef(rk, 6);
+#if !(DEC_ROUND == NO_TABLES)
+ /* Array indices here are raw, not mapped through v(). */
+ for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+/*
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Returns the number of rounds for the given cipher key size (10, 12 or
+ * 14), or 0 if keyBits is not one of the supported sizes, in which case
+ * rk is not written.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk		AES key schedule 32-bit array to be initialized
+ * cipherKey	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[],
+    int keyBits)
+{
+	/* View the key as bytes without casting away its const qualifier. */
+	const unsigned char *key = (const unsigned char *)cipherKey;
+
+	switch (keyBits) {
+	case 128:
+		aes_decrypt_key128(key, rk);
+		return (10);
+	case 192:
+		aes_decrypt_key192(key, rk);
+		return (12);
+	case 256:
+		aes_decrypt_key256(key, rk);
+		return (14);
+	default: /* should never get here */
+		break;
+	}
+
+	/* Unsupported key size: report zero rounds. */
+	return (0);
+}
diff --git a/module/icp/asm-x86_64/aes/aesopt.h b/module/icp/asm-x86_64/aes/aesopt.h
new file mode 100644
index 000000000000..472111f96e59
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aesopt.h
@@ -0,0 +1,770 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the compilation options for AES (Rijndael) and code
+ * that is common across encryption, key scheduling and table generation.
+ *
+ * OPERATION
+ *
+ * These source code files implement the AES algorithm Rijndael designed by
+ * Joan Daemen and Vincent Rijmen. This version is designed for the standard
+ * block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
+ * and 32 bytes).
+ *
+ * This version is designed for flexibility and speed using operations on
+ * 32-bit words rather than operations on bytes. It can be compiled with
+ * either big or little endian internal byte order but is faster when the
+ * native byte order for the processor is used.
+ *
+ * THE CIPHER INTERFACE
+ *
+ * The cipher interface is implemented as an array of bytes in which lower
+ * AES bit sequence indexes map to higher numeric significance within bytes.
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added __cplusplus and _AESOPT_H header guards
+ * 2. Added header files sys/types.h and aes_impl.h
+ * 3. Added defines for AES_ENCRYPT, AES_DECRYPT, AES_REV_DKS, and ASM_AMD64_C
+ * 4. Moved defines for IS_BIG_ENDIAN, IS_LITTLE_ENDIAN, PLATFORM_BYTE_ORDER
+ * from brg_endian.h
+ * 5. Undefined VIA_ACE_POSSIBLE and ASSUME_VIA_ACE_PRESENT
+ * 6. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 7. Defined aes_sw32 as htonl() for byte swapping
+ * 8. Cstyled and hdrchk code
+ *
+ */
+
+#ifndef _AESOPT_H
+#define _AESOPT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <aes/aes_impl.h>
+
+/* SUPPORT FEATURES */
+#define AES_ENCRYPT /* if support for encryption is needed */
+#define AES_DECRYPT /* if support for decryption is needed */
+
+/*
+ * PLATFORM-SPECIFIC FEATURES
+ * The byte order is fixed to little-endian here rather than detected;
+ * ALGORITHM_BYTE_ORDER is later derived from PLATFORM_BYTE_ORDER.
+ */
+#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#define AES_REV_DKS /* define to reverse decryption key schedule */
+
+
+/*
+ * CONFIGURATION - THE USE OF DEFINES
+ * Later in this section there are a number of defines that control the
+ * operation of the code. In each section, the purpose of each define is
+ * explained so that the relevant form can be included or excluded by
+ * setting either 1's or 0's respectively on the branches of the related
+ * #if clauses. The following local defines should not be changed.
+ */
+
+/*
+ * Single-bit flags; they are OR-ed together into EFUNCS_IN_C,
+ * DFUNCS_IN_C and FUNCS_IN_C further down in this header.
+ */
+#define ENCRYPTION_IN_C 1
+#define DECRYPTION_IN_C 2
+#define ENC_KEYING_IN_C 4
+#define DEC_KEYING_IN_C 8
+
+/* Values for the *_ROUND and KEY_SCHED table options. */
+#define NO_TABLES 0
+#define ONE_TABLE 1
+#define FOUR_TABLES 4
+/* Values for ENC_UNROLL and DEC_UNROLL. */
+#define NONE 0
+#define PARTIAL 1
+#define FULL 2
+
+/* --- START OF USER CONFIGURED OPTIONS --- */
+
+/*
+ * 1. BYTE ORDER WITHIN 32 BIT WORDS
+ *
+ * The fundamental data processing units in Rijndael are 8-bit bytes. The
+ * input, output and key input are all enumerated arrays of bytes in which
+ * bytes are numbered starting at zero and increasing to one less than the
+ * number of bytes in the array in question. This enumeration is only used
+ * for naming bytes and does not imply any adjacency or order relationship
+ * from one byte to another. When these inputs and outputs are considered
+ * as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
+ * byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
+ * In this implementation bits are numbered from 0 to 7 starting at the
+ * numerically least significant end of each byte. Bit n represents 2^n.
+ *
+ * However, Rijndael can be implemented more efficiently using 32-bit
+ * words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
+ * into word[n]. While in principle these bytes can be assembled into words
+ * in any positions, this implementation only supports the two formats in
+ * which bytes in adjacent positions within words also have adjacent byte
+ * numbers. This order is called big-endian if the lowest numbered bytes
+ * in words have the highest numeric significance and little-endian if the
+ * opposite applies.
+ *
+ * This code can work in either order irrespective of the order used by the
+ * machine on which it runs. Normally the internal byte order will be set
+ * to the order of the processor on which the code is to be run but this
+ * define can be used to reverse this in special situations
+ *
+ * WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
+ * This define will hence be redefined later (in section 4) if necessary
+ */
+
+/*
+ * Active choice: follow PLATFORM_BYTE_ORDER. The "#elif 0" arms are the
+ * disabled alternatives; the #error is reached only if every arm is
+ * switched off.
+ */
+#if 1
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#error The algorithm byte order is not defined
+#endif
+
+/* 2. VIA ACE SUPPORT */
+
+/*
+ * && binds tighter than ||, so this reads as:
+ *   (GCC on i386) ||
+ *   (_WIN32 on _M_IX86, excluding _WIN64, _WIN32_WCE and _MSC_VER <= 800)
+ */
+#if defined(__GNUC__) && defined(__i386__) || \
+ defined(_WIN32) && defined(_M_IX86) && \
+ !(defined(_WIN64) || defined(_WIN32_WCE) || \
+ defined(_MSC_VER) && (_MSC_VER <= 800))
+#define VIA_ACE_POSSIBLE
+#endif
+
+/*
+ * Define this option if support for the VIA ACE is required. This uses
+ * inline assembler instructions and is only implemented for the Microsoft,
+ * Intel and GCC compilers. If VIA ACE is known to be present, then defining
+ * ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
+ * code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
+ * it is detected (both present and enabled) but the normal AES code will
+ * also be present.
+ *
+ * When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
+ * aligned; other input/output buffers do not need to be 16 byte aligned
+ * but there are very large performance gains if this can be arranged.
+ * VIA ACE also requires the decryption key schedule to be in reverse
+ * order (which later checks below ensure).
+ */
+
+/* VIA ACE is not used here for OpenSolaris: */
+#undef VIA_ACE_POSSIBLE
+#undef ASSUME_VIA_ACE_PRESENT
+
+/*
+ * Both optional enables below are switched off by their leading "0 &&"
+ * (and VIA_ACE_POSSIBLE has just been undefined as well).
+ */
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(ASSUME_VIA_ACE_PRESENT)
+#define ASSUME_VIA_ACE_PRESENT
+#endif
+
+
+/*
+ * 3. ASSEMBLER SUPPORT
+ *
+ * This define (which can be on the command line) enables the use of the
+ * assembler code routines for encryption, decryption and key scheduling
+ * as follows:
+ *
+ * ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
+ * encryption and decryption but with key scheduling in C
+ * ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for
+ * encryption, decryption and key scheduling
+ * ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
+ * encryption and decryption but with key scheduling in C
+ * ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
+ * encryption and decryption but with key scheduling in C
+ *
+ * Change one 'if 0' below to 'if 1' to select the version or define
+ * as a compilation option.
+ */
+
+/*
+ * Only the last arm is enabled: ASM_AMD64_C is defined here unless it was
+ * already defined (e.g. on the command line).
+ */
+#if 0 && !defined(ASM_X86_V1C)
+#define ASM_X86_V1C
+#elif 0 && !defined(ASM_X86_V2)
+#define ASM_X86_V2
+#elif 0 && !defined(ASM_X86_V2C)
+#define ASM_X86_V2C
+#elif 1 && !defined(ASM_AMD64_C)
+#define ASM_AMD64_C
+#endif
+
+/*
+ * && binds tighter than ||, so the check is:
+ *   (any ASM_X86_* selected and not _M_IX86) ||
+ *   (ASM_AMD64_C selected and neither _M_X64 nor __amd64 defined)
+ */
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2) || defined(ASM_X86_V2C)) && \
+ !defined(_M_IX86) || defined(ASM_AMD64_C) && !defined(_M_X64) && \
+ !defined(__amd64)
+#error Assembler code is only available for x86 and AMD64 systems
+#endif
+
+/*
+ * 4. FAST INPUT/OUTPUT OPERATIONS.
+ *
+ * On some machines it is possible to improve speed by transferring the
+ * bytes in the input and output arrays to and from the internal 32-bit
+ * variables by addressing these arrays as if they are arrays of 32-bit
+ * words. On some machines this will always be possible but there may
+ * be a large performance penalty if the byte arrays are not aligned on
+ * the normal word boundaries. On other machines this technique will
+ * lead to memory access errors when such 32-bit word accesses are not
+ * properly aligned. The option SAFE_IO avoids such problems but will
+ * often be slower on those machines that support misaligned access
+ * (especially so if care is taken to align the input and output byte
+ * arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
+ * assumed that access to byte arrays as if they are arrays of 32-bit
+ * words will not cause problems when such accesses are misaligned.
+ */
+/* SAFE_IO is enabled for every compiler that does not define _MSC_VER. */
+#if 1 && !defined(_MSC_VER)
+#define SAFE_IO
+#endif
+
+/*
+ * 5. LOOP UNROLLING
+ *
+ * The code for encryption and decryption cycles through a number of rounds
+ * that can be implemented either in a loop or by expanding the code into a
+ * long sequence of instructions, the latter producing a larger program but
+ * one that will often be much faster. The latter is called loop unrolling.
+ * There are also potential speed advantages in expanding two iterations in
+ * a loop with half the number of iterations, which is called partial loop
+ * unrolling. The following options allow partial or full loop unrolling
+ * to be set independently for encryption and decryption
+ */
+/* Encryption rounds: FULL unrolling is the enabled arm. */
+#if 1
+#define ENC_UNROLL FULL
+#elif 0
+#define ENC_UNROLL PARTIAL
+#else
+#define ENC_UNROLL NONE
+#endif
+
+/* Decryption rounds: FULL unrolling is the enabled arm. */
+#if 1
+#define DEC_UNROLL FULL
+#elif 0
+#define DEC_UNROLL PARTIAL
+#else
+#define DEC_UNROLL NONE
+#endif
+
+/* Unroll the encryption key schedule (enabled). */
+#if 1
+#define ENC_KS_UNROLL
+#endif
+
+/* Unroll the decryption key schedule (enabled). */
+#if 1
+#define DEC_KS_UNROLL
+#endif
+
+/*
+ * 6. FAST FINITE FIELD OPERATIONS
+ *
+ * If this section is included, tables are used to provide faster finite
+ * field arithmetic. This has no effect if FIXED_TABLES is defined.
+ */
+/* Enabled: FF_TABLES is always defined in this configuration. */
+#if 1
+#define FF_TABLES
+#endif
+
+/*
+ * 7. INTERNAL STATE VARIABLE FORMAT
+ *
+ * The internal state of Rijndael is stored in a number of local 32-bit
+ * word variables which can be defined either as an array or as individually
+ * named variables. Include this section if you want to store these local
+ * variables in arrays. Otherwise individual local variables will be used.
+ */
+/* Enabled: the s(x, c) state accessor below expands to x[c]. */
+#if 1
+#define ARRAYS
+#endif
+
+/*
+ * 8. FIXED OR DYNAMIC TABLES
+ *
+ * When this section is included the tables used by the code are compiled
+ * statically into the binary file. Otherwise the subroutine aes_init()
+ * must be called to compute them before the code is first used.
+ */
+/* Enabled unless building with _MSC_VER <= 800. */
+#if 1 && !(defined(_MSC_VER) && (_MSC_VER <= 800))
+#define FIXED_TABLES
+#endif
+
+/*
+ * 9. MASKING OR CASTING FROM LONGER VALUES TO BYTES
+ *
+ * In some systems it is better to mask longer values to extract bytes
+ * rather than using a cast. This option allows this choice.
+ */
+/* The cast form is disabled; the masking form in the #else arm is used. */
+#if 0
+#define to_byte(x) ((uint8_t)(x))
+#else
+#define to_byte(x) ((x) & 0xff)
+#endif
+
+/*
+ * 10. TABLE ALIGNMENT
+ *
+ * On some systems speed will be improved by aligning the AES large lookup
+ * tables on particular boundaries. This define should be set to a power of
+ * two giving the desired alignment. It can be left undefined if alignment
+ * is not needed. This option is specific to the Microsoft VC++ compiler -
+ * it seems to sometimes cause trouble for the VC++ version 6 compiler.
+ */
+
+/* Only defined when _MSC_VER is defined and at least 1300. */
+#if 1 && defined(_MSC_VER) && (_MSC_VER >= 1300)
+#define TABLE_ALIGN 32
+#endif
+
+/*
+ * 11. REDUCE CODE AND TABLE SIZE
+ *
+ * This replaces some expanded macros with function calls if ASM_X86_V2 or
+ * ASM_X86_V2C are defined
+ */
+
+/* Not triggered when only ASM_AMD64_C is selected. */
+#if 1 && (defined(ASM_X86_V2) || defined(ASM_X86_V2C))
+#define REDUCE_CODE_SIZE
+#endif
+
+/*
+ * 12. TABLE OPTIONS
+ *
+ * This cipher proceeds by repeating in a number of cycles known as rounds
+ * which are implemented by a round function which can optionally be speeded
+ * up using tables. The basic tables are 256 32-bit words, with either
+ * one or four tables being required for each round function depending on
+ * how much speed is required. Encryption and decryption round functions
+ * are different and the last encryption and decryption round functions are
+ * different again making four different round functions in all.
+ *
+ * This means that:
+ * 1. Normal encryption and decryption rounds can each use either 0, 1
+ * or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ * 2. The last encryption and decryption rounds can also use either 0, 1
+ * or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ *
+ * Include or exclude the appropriate definitions below to set the number
+ * of tables used by this implementation.
+ */
+
+/* In each group below the first ("#if 1") arm, FOUR_TABLES, is active. */
+#if 1 /* set tables for the normal encryption round */
+#define ENC_ROUND FOUR_TABLES
+#elif 0
+#define ENC_ROUND ONE_TABLE
+#else
+#define ENC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the last encryption round */
+#define LAST_ENC_ROUND FOUR_TABLES
+#elif 0
+#define LAST_ENC_ROUND ONE_TABLE
+#else
+#define LAST_ENC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the normal decryption round */
+#define DEC_ROUND FOUR_TABLES
+#elif 0
+#define DEC_ROUND ONE_TABLE
+#else
+#define DEC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the last decryption round */
+#define LAST_DEC_ROUND FOUR_TABLES
+#elif 0
+#define LAST_DEC_ROUND ONE_TABLE
+#else
+#define LAST_DEC_ROUND NO_TABLES
+#endif
+
+/*
+ * The decryption key schedule can be speeded up with tables in the same
+ * way that the round functions can. Include or exclude the following
+ * defines to set this requirement.
+ */
+#if 1
+#define KEY_SCHED FOUR_TABLES
+#elif 0
+#define KEY_SCHED ONE_TABLE
+#else
+#define KEY_SCHED NO_TABLES
+#endif
+
+/* ---- END OF USER CONFIGURED OPTIONS ---- */
+
+/* VIA ACE support is only available for VC++ and GCC */
+
+#if !defined(_MSC_VER) && !defined(__GNUC__)
+#if defined(ASSUME_VIA_ACE_PRESENT)
+#undef ASSUME_VIA_ACE_PRESENT
+#endif
+#if defined(USE_VIA_ACE_IF_PRESENT)
+#undef USE_VIA_ACE_IF_PRESENT
+#endif
+#endif
+
+/* ASSUME_VIA_ACE_PRESENT implies USE_VIA_ACE_IF_PRESENT. */
+#if defined(ASSUME_VIA_ACE_PRESENT) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+/* USE_VIA_ACE_IF_PRESENT forces the reversed decryption key schedule. */
+#if defined(USE_VIA_ACE_IF_PRESENT) && !defined(AES_REV_DKS)
+#define AES_REV_DKS
+#endif
+
+/* Assembler support requires the use of platform byte order */
+
+/* Override any differing ALGORITHM_BYTE_ORDER chosen in section 1. */
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2C) || defined(ASM_AMD64_C)) && \
+ (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
+#undef ALGORITHM_BYTE_ORDER
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#endif
+
+/*
+ * Each column of the state array is held in one 32-bit word. Where
+ * those words live is a build choice, and s(x, c) hides it: given a
+ * variable name x and a column number c it names the storage for that
+ * column. Without ARRAYS the columns are separate variables x0, x1,
+ * etc. (the name is formed by token pasting); with ARRAYS they are the
+ * elements x[c] of an array. A further form could map the columns to
+ * machine register names.
+ */
+
+#ifndef ARRAYS
+#define s(x, c) x##c
+#else
+#define s(x, c) x[c]
+#endif
+
+/*
+ * This implementation provides subroutines for encryption, decryption
+ * and for setting the three key lengths (separately) for encryption
+ * and decryption. Since not all functions are needed, masks are set
+ * up here to determine which will be implemented in C.
+ *
+ * EFUNCS_IN_C and DFUNCS_IN_C are built from the *_IN_C flag bits by two
+ * parallel ladders (first matching branch wins); FUNCS_IN_C is their OR
+ * and is what the later table-selection tests examine.
+ */
+
+#if !defined(AES_ENCRYPT) /* encryption not requested: nothing in C */
+#define EFUNCS_IN_C 0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+ defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define EFUNCS_IN_C ENC_KEYING_IN_C /* only the key schedule bit is set */
+#elif !defined(ASM_X86_V2) /* none of the builds tested above */
+#define EFUNCS_IN_C (ENCRYPTION_IN_C | ENC_KEYING_IN_C)
+#else /* ASM_X86_V2: no encryption bits are set */
+#define EFUNCS_IN_C 0
+#endif
+
+#if !defined(AES_DECRYPT) /* decryption not requested: nothing in C */
+#define DFUNCS_IN_C 0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+ defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define DFUNCS_IN_C DEC_KEYING_IN_C /* only the key schedule bit is set */
+#elif !defined(ASM_X86_V2) /* none of the builds tested above */
+#define DFUNCS_IN_C (DECRYPTION_IN_C | DEC_KEYING_IN_C)
+#else /* ASM_X86_V2: no decryption bits are set */
+#define DFUNCS_IN_C 0
+#endif
+
+#define FUNCS_IN_C (EFUNCS_IN_C | DFUNCS_IN_C) /* union of both masks */
+
+/* END OF CONFIGURATION OPTIONS */
+
+/*
+ * Disable or report errors on some combinations of options. In the
+ * checks below the options are adjusted silently (no #error is raised):
+ * the last-round table choice is lowered to match a normal round that
+ * uses fewer tables, and unrolling is turned off when the normal round
+ * uses no tables.
+ */
+
+#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND NO_TABLES /* follow ENC_ROUND down to no tables */
+#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND ONE_TABLE /* follow ENC_ROUND down to one table */
+#endif
+
+#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE
+#undef ENC_UNROLL
+#define ENC_UNROLL NONE /* no unrolling for table-free encryption rounds */
+#endif
+
+#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND NO_TABLES /* follow DEC_ROUND down to no tables */
+#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND ONE_TABLE /* follow DEC_ROUND down to one table */
+#endif
+
+#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE
+#undef DEC_UNROLL
+#define DEC_UNROLL NONE /* no unrolling for table-free decryption rounds */
+#endif
+
+/*
+ * aes_sw32(x): reverse the byte order of the 32-bit word x. It is used by
+ * word_in()/word_out() when the algorithm and platform byte orders differ,
+ * so it must swap unconditionally.
+ *
+ * htonl() reverses bytes only on a little-endian host (it is the identity
+ * on a big-endian one), so it may stand in for the swap only when
+ * PLATFORM_BYTE_ORDER is little-endian. Otherwise a native bswap32 or
+ * bswap_32 is used when one is defined as a macro, with a rotate-and-mask
+ * fallback built on brot(x, n), a 32-bit left rotate by n bits.
+ */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define aes_sw32 htonl
+#elif defined(bswap32)
+#define aes_sw32 bswap32
+#elif defined(bswap_32)
+#define aes_sw32 bswap_32
+#else
+#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
+#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00))
+#endif
+
+
+/*
+ * upr(x, n): rotates bytes within words by n positions, moving bytes to
+ * higher index positions with wrap around into low positions
+ * ups(x, n): moves bytes by n positions to higher index positions in
+ * words but without wrap around
+ * bval(x, n): extracts a byte from a word
+ * bytes2word(b0, b1, b2, b3): builds a word whose byte at index n (as
+ * seen by bval) is bn
+ *
+ * Byte index 0 is the least significant byte when ALGORITHM_BYTE_ORDER is
+ * little-endian and the most significant byte when it is big-endian, which
+ * is why the two sets of definitions shift in opposite directions.
+ *
+ * WARNING: The definitions given here are intended only for use with
+ * unsigned variables and with shift counts that are compile
+ * time constants. upr() also shifts by (32 - 8 * n), so n == 0 would
+ * shift a 32-bit value by 32 bits, which C leaves undefined.
+ */
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN) /* byte 0 is bits 0..7 */
+#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
+ ((uint32_t)(x) >> (32 - 8 * (n))))
+#define ups(x, n) ((uint32_t)(x) << (8 * (n)))
+#define bval(x, n) to_byte((x) >> (8 * (n)))
+#define bytes2word(b0, b1, b2, b3) \
+ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
+ ((uint32_t)(b1) << 8) | (b0))
+#endif
+
+#if (ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN) /* byte 0 is bits 24..31 */
+#define upr(x, n) (((uint32_t)(x) >> (8 * (n))) | \
+ ((uint32_t)(x) << (32 - 8 * (n))))
+#define ups(x, n) ((uint32_t)(x) >> (8 * (n)))
+#define bval(x, n) to_byte((x) >> (24 - 8 * (n)))
+#define bytes2word(b0, b1, b2, b3) \
+ (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \
+ ((uint32_t)(b2) << 8) | (b3))
+#endif
+
+/*
+ * word_in(x, c): read 32-bit word number c from the buffer at x
+ * word_out(x, c, v): store v as 32-bit word number c of the buffer at x
+ *
+ * With SAFE_IO the buffer is accessed one byte at a time and the word is
+ * assembled or split with bytes2word()/bval(), so no uint32_t access is
+ * made through x. Otherwise x is accessed as a uint32_t array, directly
+ * when the algorithm and platform byte orders agree and through
+ * aes_sw32() when they differ.
+ *
+ * The word index c is parenthesised in every expansion so that expression
+ * arguments such as word_in(p, i + 1) address the intended word.
+ */
+#if defined(SAFE_IO)
+#define word_in(x, c) bytes2word(((const uint8_t *)(x) + 4 * (c))[0], \
+ ((const uint8_t *)(x) + 4 * (c))[1], \
+ ((const uint8_t *)(x) + 4 * (c))[2], \
+ ((const uint8_t *)(x) + 4 * (c))[3])
+#define word_out(x, c, v) { ((uint8_t *)(x) + 4 * (c))[0] = bval(v, 0); \
+ ((uint8_t *)(x) + 4 * (c))[1] = bval(v, 1); \
+ ((uint8_t *)(x) + 4 * (c))[2] = bval(v, 2); \
+ ((uint8_t *)(x) + 4 * (c))[3] = bval(v, 3); }
+#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER)
+#define word_in(x, c) (*((uint32_t *)(x) + (c)))
+#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = (v))
+#else
+#define word_in(x, c) aes_sw32(*((uint32_t *)(x) + (c)))
+#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = aes_sw32(v))
+#endif
+
+/* the finite field modular polynomial and elements */
+
+#define WPOLY 0x011b /* nine-bit form of the polynomial */
+#define BPOLY 0x1b /* low eight bits of WPOLY */
+
+/*
+ * multiply four bytes in GF(2^8) by 'x' {02} in parallel
+ *
+ * m1 selects the top bit of each of the four bytes and m2 the low seven
+ * bits. gf_mulx() shifts the low seven bits of every byte left by one
+ * and XORs BPOLY into each byte whose top bit was set: (x & m1) >> 7
+ * leaves a 1 in the low bit of exactly those bytes, and multiplying by
+ * BPOLY turns each such 1 into 0x1b.
+ */
+
+#define m1 0x80808080
+#define m2 0x7f7f7f7f
+#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
+
+/*
+ * The following defines provide alternative definitions of gf_mulx that might
+ * give improved performance if a fast 32-bit multiply is not available. Note
+ * that a temporary variable u needs to be defined where gf_mulx is used.
+ *
+ * #define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ \
+ * ((u >> 3) | (u >> 6))
+ * #define m4 (0x01010101 * BPOLY)
+ * #define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) \
+ * & m4)
+ */
+
+/* Work out which tables are needed for the different options */
+
+/*
+ * The ASM_X86_V1C build overrides the user configuration: every round
+ * type and the key schedule are forced to FOUR_TABLES. Each option is
+ * undefined first (a no-op when it is not defined) and then redefined, so
+ * all five overrides take effect unconditionally, KEY_SCHED included.
+ */
+#if defined(ASM_X86_V1C)
+#undef ENC_ROUND
+#define ENC_ROUND FOUR_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND FOUR_TABLES
+#undef DEC_ROUND
+#define DEC_ROUND FOUR_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND FOUR_TABLES
+#undef KEY_SCHED
+#define KEY_SCHED FOUR_TABLES
+#endif
+
+#if (FUNCS_IN_C & ENCRYPTION_IN_C) || defined(ASM_X86_V1C) /* encryption */
+#if ENC_ROUND == ONE_TABLE
+#define FT1_SET /* one forward normal-round table */
+#elif ENC_ROUND == FOUR_TABLES
+#define FT4_SET /* four forward normal-round tables */
+#else
+#define SBX_SET /* no round tables: request the forward S box */
+#endif
+#if LAST_ENC_ROUND == ONE_TABLE
+#define FL1_SET /* one forward last-round table */
+#elif LAST_ENC_ROUND == FOUR_TABLES
+#define FL4_SET /* four forward last-round tables */
+#elif !defined(SBX_SET) /* avoid defining SBX_SET twice */
+#define SBX_SET
+#endif
+#endif
+
+#if (FUNCS_IN_C & DECRYPTION_IN_C) || defined(ASM_X86_V1C) /* decryption */
+#if DEC_ROUND == ONE_TABLE
+#define IT1_SET /* one inverse normal-round table */
+#elif DEC_ROUND == FOUR_TABLES
+#define IT4_SET /* four inverse normal-round tables */
+#else
+#define ISB_SET /* no round tables: request the inverse S box */
+#endif
+#if LAST_DEC_ROUND == ONE_TABLE
+#define IL1_SET /* one inverse last-round table */
+#elif LAST_DEC_ROUND == FOUR_TABLES
+#define IL4_SET /* four inverse last-round tables */
+#elif !defined(ISB_SET) /* avoid defining ISB_SET twice */
+#define ISB_SET
+#endif
+#endif
+
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+ defined(ASM_X86_V2C)))
+#if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C))
+#if KEY_SCHED == ONE_TABLE /* any key schedule is done in C */
+#if !defined(FL1_SET) && !defined(FL4_SET) /* no last-round table to share */
+#define LS1_SET
+#endif
+#elif KEY_SCHED == FOUR_TABLES
+#if !defined(FL4_SET) /* no four-table last-round set to share */
+#define LS4_SET
+#endif
+#elif !defined(SBX_SET) /* table-free key schedule uses the S box */
+#define SBX_SET
+#endif
+#endif
+#if (FUNCS_IN_C & DEC_KEYING_IN_C) /* decryption key schedule is done in C */
+#if KEY_SCHED == ONE_TABLE
+#define IM1_SET /* one table for inv_mcol() */
+#elif KEY_SCHED == FOUR_TABLES
+#define IM4_SET /* four tables for inv_mcol() */
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+#endif /* !(REDUCE_CODE_SIZE && (ASM_X86_V2 || ASM_X86_V2C)) */
+
+/*
+ * generic definitions of Rijndael macros that use tables
+ *
+ * In all three, vf(x, r, c) supplies the word to read for row r of
+ * column c and rf(r, c) supplies the index of the byte taken from it.
+ *
+ * no_table: looks each selected byte up in the byte table 'box' and
+ * packs the four results with bytes2word()
+ * one_table: looks each selected byte up in the single word table 'tab'
+ * and applies op(..., r) to the results for rows 1 to 3
+ * before XORing all four together
+ * four_tables: looks the byte for row r up in tab[r] and XORs the four
+ * results together
+ */
+
+#define no_table(x, box, vf, rf, c) bytes2word(\
+ box[bval(vf(x, 0, c), rf(0, c))], \
+ box[bval(vf(x, 1, c), rf(1, c))], \
+ box[bval(vf(x, 2, c), rf(2, c))], \
+ box[bval(vf(x, 3, c), rf(3, c))])
+
+#define one_table(x, op, tab, vf, rf, c) \
+ (tab[bval(vf(x, 0, c), rf(0, c))] \
+ ^ op(tab[bval(vf(x, 1, c), rf(1, c))], 1) \
+ ^ op(tab[bval(vf(x, 2, c), rf(2, c))], 2) \
+ ^ op(tab[bval(vf(x, 3, c), rf(3, c))], 3))
+
+#define four_tables(x, tab, vf, rf, c) \
+ (tab[0][bval(vf(x, 0, c), rf(0, c))] \
+ ^ tab[1][bval(vf(x, 1, c), rf(1, c))] \
+ ^ tab[2][bval(vf(x, 2, c), rf(2, c))] \
+ ^ tab[3][bval(vf(x, 3, c), rf(3, c))])
+
+/* vf1: every row reads the same word x; rf1: row r reads byte r */
+#define vf1(x, r, c) (x)
+#define rf1(r, c) (r)
+/*
+ * rf2: row r of column c reads byte (r - c) mod 4. Both arguments are
+ * parenthesised so that expression arguments expand as intended.
+ */
+#define rf2(r, c) ((8 + (r) - (c)) & 3)
+
+/*
+ * Perform forward and inverse column mix operation on four bytes in long word
+ * x in parallel. NOTE: x must be a simple variable, NOT an expression in
+ * these macros (x is expanded several times).
+ *
+ * The table-free forms assign to temporaries, so code that uses them must
+ * declare those first: dec_fmvars (g2) for fwd_mcol() and dec_imvars
+ * (g2, g4, g9) for inv_mcol(). The table forms define neither name.
+ *
+ * ls_box(x, c) uses the first table set that is defined, in the order
+ * FL4, LS4, FL1, LS1, and falls back to the byte S box otherwise.
+ *
+ * None of these macros is defined when REDUCE_CODE_SIZE is combined with
+ * ASM_X86_V2 or ASM_X86_V2C.
+ */
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+ defined(ASM_X86_V2C)))
+
+#if defined(FM4_SET) /* not currently used */
+#define fwd_mcol(x) four_tables(x, t_use(f, m), vf1, rf1, 0)
+#elif defined(FM1_SET) /* not currently used */
+#define fwd_mcol(x) one_table(x, upr, t_use(f, m), vf1, rf1, 0)
+#else /* computed form: needs the g2 temporary */
+#define dec_fmvars uint32_t g2
+#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \
+ upr((x), 2) ^ upr((x), 1))
+#endif
+
+#if defined(IM4_SET) /* four inverse mix column tables */
+#define inv_mcol(x) four_tables(x, t_use(i, m), vf1, rf1, 0)
+#elif defined(IM1_SET) /* one table, rotated with upr for rows 1 to 3 */
+#define inv_mcol(x) one_table(x, upr, t_use(i, m), vf1, rf1, 0)
+#else /* computed form: needs the g2, g4 and g9 temporaries */
+#define dec_imvars uint32_t g2, g4, g9
+#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = \
+ (x) ^ gf_mulx(g4), g4 ^= g9, \
+ (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ \
+ upr(g4, 2) ^ upr(g9, 1))
+#endif
+
+#if defined(FL4_SET) /* share the four forward last-round tables */
+#define ls_box(x, c) four_tables(x, t_use(f, l), vf1, rf2, c)
+#elif defined(LS4_SET) /* dedicated four-table key schedule set */
+#define ls_box(x, c) four_tables(x, t_use(l, s), vf1, rf2, c)
+#elif defined(FL1_SET) /* share the single forward last-round table */
+#define ls_box(x, c) one_table(x, upr, t_use(f, l), vf1, rf2, c)
+#elif defined(LS1_SET) /* dedicated single key schedule table */
+#define ls_box(x, c) one_table(x, upr, t_use(l, s), vf1, rf2, c)
+#else /* no word table available: byte S box lookups */
+#define ls_box(x, c) no_table(x, t_use(s, box), vf1, rf2, c)
+#endif
+
+#endif /* !(REDUCE_CODE_SIZE && (ASM_X86_V2 || ASM_X86_V2C)) */
+
+/* An ASM_X86_V1C build with decryption always requests the ISB table. */
+#if defined(ASM_X86_V1C) && defined(AES_DECRYPT)
+#ifndef ISB_SET
+#define ISB_SET
+#endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESOPT_H */
diff --git a/module/icp/asm-x86_64/aes/aestab.h b/module/icp/asm-x86_64/aes/aestab.h
new file mode 100644
index 000000000000..33cdb6c6f9fe
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aestab.h
@@ -0,0 +1,165 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the code for declaring the tables needed to implement
+ * AES. The file aesopt.h is assumed to be included before this header file.
+ * If there are no global variables, the definitions here can be used to put
+ * the AES tables in a structure so that a pointer can then be added to the
+ * AES context to pass them to the AES routines that need them. If this
+ * facility is used, the calling program has to ensure that this pointer is
+ * managed appropriately. In particular, the value of the t_dec(in, it) item
+ * in the table structure must be set to zero in order to ensure that the
+ * tables are initialised. In practice the three code sequences in aeskey.c
+ * that control the calls to aes_init() and the aes_init() routine itself will
+ * have to be changed for a specific implementation. If global variables are
+ * available it will generally be preferable to use them with the precomputed
+ * FIXED_TABLES option that uses static global tables.
+ *
+ * The following defines can be used to control the way the tables
+ * are defined, initialised and used in embedded environments that
+ * require special features for these purposes
+ *
+ * the 't_dec' construction is used to declare fixed table arrays
+ * the 't_set' construction is used to set fixed table values
+ * the 't_use' construction is used to access fixed table values
+ *
+ * 256 byte tables:
+ *
+ * t_xxx(s, box) => forward S box
+ * t_xxx(i, box) => inverse S box
+ *
+ * 256 32-bit word OR 4 x 256 32-bit word tables:
+ *
+ * t_xxx(f, n) => forward normal round
+ * t_xxx(f, l) => forward last round
+ * t_xxx(i, n) => inverse normal round
+ * t_xxx(i, l) => inverse last round
+ * t_xxx(l, s) => key schedule table (forward S box values)
+ * t_xxx(i, m) => key schedule table (inverse mix column)
+ *
+ * Other variables and tables:
+ *
+ * t_xxx(r, c) => the rcon table
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header file sys/types.h
+ * 3. Removed code defined for _MSC_VER
+ * 4. Changed all variables to "static const"
+ * 5. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 6. Cstyled and hdrchk code
+ */
+
+#ifndef _AESTAB_H
+#define _AESTAB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+#define t_dec(m, n) t_##m##n /* declare a table: t_dec(f, n) is t_fn */
+#define t_set(m, n) t_##m##n /* set a table value: same name mapping */
+#define t_use(m, n) t_##m##n /* access a table value: same name mapping */
+
+#if defined(DO_TABLES) && defined(FIXED_TABLES) /* tables with initialisers */
+#define d_1(t, n, b, e) static const t n[256] = b(e)
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256] = \
+ {b(e), b(f), b(g), b(h)}
+static const uint32_t t_dec(r, c)[RC_LENGTH] = rc_data(w0); /* rcon table */
+#else /* same declarations, but the b, e..h arguments are ignored */
+#define d_1(t, n, b, e) static const t n[256]
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256]
+static const uint32_t t_dec(r, c)[RC_LENGTH]; /* rcon table, no initialiser */
+#endif
+
+#if defined(SBX_SET) /* forward S box, 256 bytes */
+ d_1(uint8_t, t_dec(s, box), sb_data, h0);
+#endif
+#if defined(ISB_SET) /* inverse S box, 256 bytes */
+ d_1(uint8_t, t_dec(i, box), isb_data, h0);
+#endif
+
+#if defined(FT1_SET) /* forward normal round, one table */
+ d_1(uint32_t, t_dec(f, n), sb_data, u0);
+#endif
+#if defined(FT4_SET) /* forward normal round, four tables */
+ d_4(uint32_t, t_dec(f, n), sb_data, u0, u1, u2, u3);
+#endif
+
+#if defined(FL1_SET) /* forward last round, one table */
+ d_1(uint32_t, t_dec(f, l), sb_data, w0);
+#endif
+#if defined(FL4_SET) /* forward last round, four tables */
+ d_4(uint32_t, t_dec(f, l), sb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(IT1_SET) /* inverse normal round, one table */
+ d_1(uint32_t, t_dec(i, n), isb_data, v0);
+#endif
+#if defined(IT4_SET) /* inverse normal round, four tables */
+ d_4(uint32_t, t_dec(i, n), isb_data, v0, v1, v2, v3);
+#endif
+
+#if defined(IL1_SET) /* inverse last round, one table */
+ d_1(uint32_t, t_dec(i, l), isb_data, w0);
+#endif
+#if defined(IL4_SET) /* inverse last round, four tables */
+ d_4(uint32_t, t_dec(i, l), isb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(LS1_SET) /* key schedule table, one-table form */
+#if defined(FL1_SET) /* t_dec(f, l) is built from the same sb_data, w0 */
+#undef LS1_SET
+#else
+ d_1(uint32_t, t_dec(l, s), sb_data, w0);
+#endif
+#endif
+
+#if defined(LS4_SET) /* key schedule table, four-table form */
+#if defined(FL4_SET) /* t_dec(f, l) is built from the same sb_data, w0..w3 */
+#undef LS4_SET
+#else
+ d_4(uint32_t, t_dec(l, s), sb_data, w0, w1, w2, w3);
+#endif
+#endif
+
+#if defined(IM1_SET) /* key schedule table t_im, one-table form */
+ d_1(uint32_t, t_dec(i, m), mm_data, v0);
+#endif
+#if defined(IM4_SET) /* key schedule table t_im, four-table form */
+ d_4(uint32_t, t_dec(i, m), mm_data, v0, v1, v2, v3);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESTAB_H */
diff --git a/module/icp/asm-x86_64/aes/aestab2.h b/module/icp/asm-x86_64/aes/aestab2.h
new file mode 100644
index 000000000000..eb13f72b10d8
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aestab2.h
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AESTAB2_H
+#define _AESTAB2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * To create this file for OpenSolaris:
+ * 1. Compile and run tablegen.c, from aes-src-04-03-08.zip,
+ * after defining ASM_AMD64_C
+ * 2. mv aestab2.c aestab2.h
+ * 3. Add __cplusplus and _AESTAB2_H header guards
+ * 4. Add #include <aes/aes_impl.h>
+ * 5. Change "uint_32t" to "uint32_t"
+ * 6. Change all variables to "static const"
+ * 7. Cstyle and hdrchk this file
+ */
+
+#include <aes/aes_impl.h>
+
+static const uint32_t t_rc[RC_LENGTH] = /* the rcon table */
+{
+ 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+ 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+ 0x0000001b, 0x00000036 /* ten entries, each held in the low byte */
+};
+
+static const uint32_t t_ls[4][256] =
+{
+ {
+ 0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
+ 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
+ 0x00000030, 0x00000001, 0x00000067, 0x0000002b,
+ 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076,
+ 0x000000ca, 0x00000082, 0x000000c9, 0x0000007d,
+ 0x000000fa, 0x00000059, 0x00000047, 0x000000f0,
+ 0x000000ad, 0x000000d4, 0x000000a2, 0x000000af,
+ 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0,
+ 0x000000b7, 0x000000fd, 0x00000093, 0x00000026,
+ 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc,
+ 0x00000034, 0x000000a5, 0x000000e5, 0x000000f1,
+ 0x00000071, 0x000000d8, 0x00000031, 0x00000015,
+ 0x00000004, 0x000000c7, 0x00000023, 0x000000c3,
+ 0x00000018, 0x00000096, 0x00000005, 0x0000009a,
+ 0x00000007, 0x00000012, 0x00000080, 0x000000e2,
+ 0x000000eb, 0x00000027, 0x000000b2, 0x00000075,
+ 0x00000009, 0x00000083, 0x0000002c, 0x0000001a,
+ 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0,
+ 0x00000052, 0x0000003b, 0x000000d6, 0x000000b3,
+ 0x00000029, 0x000000e3, 0x0000002f, 0x00000084,
+ 0x00000053, 0x000000d1, 0x00000000, 0x000000ed,
+ 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b,
+ 0x0000006a, 0x000000cb, 0x000000be, 0x00000039,
+ 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf,
+ 0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb,
+ 0x00000043, 0x0000004d, 0x00000033, 0x00000085,
+ 0x00000045, 0x000000f9, 0x00000002, 0x0000007f,
+ 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8,
+ 0x00000051, 0x000000a3, 0x00000040, 0x0000008f,
+ 0x00000092, 0x0000009d, 0x00000038, 0x000000f5,
+ 0x000000bc, 0x000000b6, 0x000000da, 0x00000021,
+ 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2,
+ 0x000000cd, 0x0000000c, 0x00000013, 0x000000ec,
+ 0x0000005f, 0x00000097, 0x00000044, 0x00000017,
+ 0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d,
+ 0x00000064, 0x0000005d, 0x00000019, 0x00000073,
+ 0x00000060, 0x00000081, 0x0000004f, 0x000000dc,
+ 0x00000022, 0x0000002a, 0x00000090, 0x00000088,
+ 0x00000046, 0x000000ee, 0x000000b8, 0x00000014,
+ 0x000000de, 0x0000005e, 0x0000000b, 0x000000db,
+ 0x000000e0, 0x00000032, 0x0000003a, 0x0000000a,
+ 0x00000049, 0x00000006, 0x00000024, 0x0000005c,
+ 0x000000c2, 0x000000d3, 0x000000ac, 0x00000062,
+ 0x00000091, 0x00000095, 0x000000e4, 0x00000079,
+ 0x000000e7, 0x000000c8, 0x00000037, 0x0000006d,
+ 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9,
+ 0x0000006c, 0x00000056, 0x000000f4, 0x000000ea,
+ 0x00000065, 0x0000007a, 0x000000ae, 0x00000008,
+ 0x000000ba, 0x00000078, 0x00000025, 0x0000002e,
+ 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6,
+ 0x000000e8, 0x000000dd, 0x00000074, 0x0000001f,
+ 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a,
+ 0x00000070, 0x0000003e, 0x000000b5, 0x00000066,
+ 0x00000048, 0x00000003, 0x000000f6, 0x0000000e,
+ 0x00000061, 0x00000035, 0x00000057, 0x000000b9,
+ 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e,
+ 0x000000e1, 0x000000f8, 0x00000098, 0x00000011,
+ 0x00000069, 0x000000d9, 0x0000008e, 0x00000094,
+ 0x0000009b, 0x0000001e, 0x00000087, 0x000000e9,
+ 0x000000ce, 0x00000055, 0x00000028, 0x000000df,
+ 0x0000008c, 0x000000a1, 0x00000089, 0x0000000d,
+ 0x000000bf, 0x000000e6, 0x00000042, 0x00000068,
+ 0x00000041, 0x00000099, 0x0000002d, 0x0000000f,
+ 0x000000b0, 0x00000054, 0x000000bb, 0x00000016
+ },
+ {
+ 0x00006300, 0x00007c00, 0x00007700, 0x00007b00,
+ 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500,
+ 0x00003000, 0x00000100, 0x00006700, 0x00002b00,
+ 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600,
+ 0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00,
+ 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000,
+ 0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00,
+ 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000,
+ 0x0000b700, 0x0000fd00, 0x00009300, 0x00002600,
+ 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00,
+ 0x00003400, 0x0000a500, 0x0000e500, 0x0000f100,
+ 0x00007100, 0x0000d800, 0x00003100, 0x00001500,
+ 0x00000400, 0x0000c700, 0x00002300, 0x0000c300,
+ 0x00001800, 0x00009600, 0x00000500, 0x00009a00,
+ 0x00000700, 0x00001200, 0x00008000, 0x0000e200,
+ 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500,
+ 0x00000900, 0x00008300, 0x00002c00, 0x00001a00,
+ 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000,
+ 0x00005200, 0x00003b00, 0x0000d600, 0x0000b300,
+ 0x00002900, 0x0000e300, 0x00002f00, 0x00008400,
+ 0x00005300, 0x0000d100, 0x00000000, 0x0000ed00,
+ 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00,
+ 0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900,
+ 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00,
+ 0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00,
+ 0x00004300, 0x00004d00, 0x00003300, 0x00008500,
+ 0x00004500, 0x0000f900, 0x00000200, 0x00007f00,
+ 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800,
+ 0x00005100, 0x0000a300, 0x00004000, 0x00008f00,
+ 0x00009200, 0x00009d00, 0x00003800, 0x0000f500,
+ 0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100,
+ 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200,
+ 0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00,
+ 0x00005f00, 0x00009700, 0x00004400, 0x00001700,
+ 0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00,
+ 0x00006400, 0x00005d00, 0x00001900, 0x00007300,
+ 0x00006000, 0x00008100, 0x00004f00, 0x0000dc00,
+ 0x00002200, 0x00002a00, 0x00009000, 0x00008800,
+ 0x00004600, 0x0000ee00, 0x0000b800, 0x00001400,
+ 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00,
+ 0x0000e000, 0x00003200, 0x00003a00, 0x00000a00,
+ 0x00004900, 0x00000600, 0x00002400, 0x00005c00,
+ 0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200,
+ 0x00009100, 0x00009500, 0x0000e400, 0x00007900,
+ 0x0000e700, 0x0000c800, 0x00003700, 0x00006d00,
+ 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900,
+ 0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00,
+ 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800,
+ 0x0000ba00, 0x00007800, 0x00002500, 0x00002e00,
+ 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600,
+ 0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00,
+ 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00,
+ 0x00007000, 0x00003e00, 0x0000b500, 0x00006600,
+ 0x00004800, 0x00000300, 0x0000f600, 0x00000e00,
+ 0x00006100, 0x00003500, 0x00005700, 0x0000b900,
+ 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00,
+ 0x0000e100, 0x0000f800, 0x00009800, 0x00001100,
+ 0x00006900, 0x0000d900, 0x00008e00, 0x00009400,
+ 0x00009b00, 0x00001e00, 0x00008700, 0x0000e900,
+ 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00,
+ 0x00008c00, 0x0000a100, 0x00008900, 0x00000d00,
+ 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800,
+ 0x00004100, 0x00009900, 0x00002d00, 0x00000f00,
+ 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600
+ },
+ {
+ 0x00630000, 0x007c0000, 0x00770000, 0x007b0000,
+ 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000,
+ 0x00300000, 0x00010000, 0x00670000, 0x002b0000,
+ 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000,
+ 0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000,
+ 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000,
+ 0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000,
+ 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000,
+ 0x00b70000, 0x00fd0000, 0x00930000, 0x00260000,
+ 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000,
+ 0x00340000, 0x00a50000, 0x00e50000, 0x00f10000,
+ 0x00710000, 0x00d80000, 0x00310000, 0x00150000,
+ 0x00040000, 0x00c70000, 0x00230000, 0x00c30000,
+ 0x00180000, 0x00960000, 0x00050000, 0x009a0000,
+ 0x00070000, 0x00120000, 0x00800000, 0x00e20000,
+ 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000,
+ 0x00090000, 0x00830000, 0x002c0000, 0x001a0000,
+ 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000,
+ 0x00520000, 0x003b0000, 0x00d60000, 0x00b30000,
+ 0x00290000, 0x00e30000, 0x002f0000, 0x00840000,
+ 0x00530000, 0x00d10000, 0x00000000, 0x00ed0000,
+ 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000,
+ 0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000,
+ 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000,
+ 0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000,
+ 0x00430000, 0x004d0000, 0x00330000, 0x00850000,
+ 0x00450000, 0x00f90000, 0x00020000, 0x007f0000,
+ 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000,
+ 0x00510000, 0x00a30000, 0x00400000, 0x008f0000,
+ 0x00920000, 0x009d0000, 0x00380000, 0x00f50000,
+ 0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000,
+ 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000,
+ 0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000,
+ 0x005f0000, 0x00970000, 0x00440000, 0x00170000,
+ 0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000,
+ 0x00640000, 0x005d0000, 0x00190000, 0x00730000,
+ 0x00600000, 0x00810000, 0x004f0000, 0x00dc0000,
+ 0x00220000, 0x002a0000, 0x00900000, 0x00880000,
+ 0x00460000, 0x00ee0000, 0x00b80000, 0x00140000,
+ 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000,
+ 0x00e00000, 0x00320000, 0x003a0000, 0x000a0000,
+ 0x00490000, 0x00060000, 0x00240000, 0x005c0000,
+ 0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000,
+ 0x00910000, 0x00950000, 0x00e40000, 0x00790000,
+ 0x00e70000, 0x00c80000, 0x00370000, 0x006d0000,
+ 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000,
+ 0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000,
+ 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000,
+ 0x00ba0000, 0x00780000, 0x00250000, 0x002e0000,
+ 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000,
+ 0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000,
+ 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000,
+ 0x00700000, 0x003e0000, 0x00b50000, 0x00660000,
+ 0x00480000, 0x00030000, 0x00f60000, 0x000e0000,
+ 0x00610000, 0x00350000, 0x00570000, 0x00b90000,
+ 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000,
+ 0x00e10000, 0x00f80000, 0x00980000, 0x00110000,
+ 0x00690000, 0x00d90000, 0x008e0000, 0x00940000,
+ 0x009b0000, 0x001e0000, 0x00870000, 0x00e90000,
+ 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000,
+ 0x008c0000, 0x00a10000, 0x00890000, 0x000d0000,
+ 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000,
+ 0x00410000, 0x00990000, 0x002d0000, 0x000f0000,
+ 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000
+ },
+ {
+ 0x63000000, 0x7c000000, 0x77000000, 0x7b000000,
+ 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000,
+ 0x30000000, 0x01000000, 0x67000000, 0x2b000000,
+ 0xfe000000, 0xd7000000, 0xab000000, 0x76000000,
+ 0xca000000, 0x82000000, 0xc9000000, 0x7d000000,
+ 0xfa000000, 0x59000000, 0x47000000, 0xf0000000,
+ 0xad000000, 0xd4000000, 0xa2000000, 0xaf000000,
+ 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000,
+ 0xb7000000, 0xfd000000, 0x93000000, 0x26000000,
+ 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000,
+ 0x34000000, 0xa5000000, 0xe5000000, 0xf1000000,
+ 0x71000000, 0xd8000000, 0x31000000, 0x15000000,
+ 0x04000000, 0xc7000000, 0x23000000, 0xc3000000,
+ 0x18000000, 0x96000000, 0x05000000, 0x9a000000,
+ 0x07000000, 0x12000000, 0x80000000, 0xe2000000,
+ 0xeb000000, 0x27000000, 0xb2000000, 0x75000000,
+ 0x09000000, 0x83000000, 0x2c000000, 0x1a000000,
+ 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000,
+ 0x52000000, 0x3b000000, 0xd6000000, 0xb3000000,
+ 0x29000000, 0xe3000000, 0x2f000000, 0x84000000,
+ 0x53000000, 0xd1000000, 0x00000000, 0xed000000,
+ 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000,
+ 0x6a000000, 0xcb000000, 0xbe000000, 0x39000000,
+ 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000,
+ 0xd0000000, 0xef000000, 0xaa000000, 0xfb000000,
+ 0x43000000, 0x4d000000, 0x33000000, 0x85000000,
+ 0x45000000, 0xf9000000, 0x02000000, 0x7f000000,
+ 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000,
+ 0x51000000, 0xa3000000, 0x40000000, 0x8f000000,
+ 0x92000000, 0x9d000000, 0x38000000, 0xf5000000,
+ 0xbc000000, 0xb6000000, 0xda000000, 0x21000000,
+ 0x10000000, 0xff000000, 0xf3000000, 0xd2000000,
+ 0xcd000000, 0x0c000000, 0x13000000, 0xec000000,
+ 0x5f000000, 0x97000000, 0x44000000, 0x17000000,
+ 0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000,
+ 0x64000000, 0x5d000000, 0x19000000, 0x73000000,
+ 0x60000000, 0x81000000, 0x4f000000, 0xdc000000,
+ 0x22000000, 0x2a000000, 0x90000000, 0x88000000,
+ 0x46000000, 0xee000000, 0xb8000000, 0x14000000,
+ 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000,
+ 0xe0000000, 0x32000000, 0x3a000000, 0x0a000000,
+ 0x49000000, 0x06000000, 0x24000000, 0x5c000000,
+ 0xc2000000, 0xd3000000, 0xac000000, 0x62000000,
+ 0x91000000, 0x95000000, 0xe4000000, 0x79000000,
+ 0xe7000000, 0xc8000000, 0x37000000, 0x6d000000,
+ 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000,
+ 0x6c000000, 0x56000000, 0xf4000000, 0xea000000,
+ 0x65000000, 0x7a000000, 0xae000000, 0x08000000,
+ 0xba000000, 0x78000000, 0x25000000, 0x2e000000,
+ 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000,
+ 0xe8000000, 0xdd000000, 0x74000000, 0x1f000000,
+ 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000,
+ 0x70000000, 0x3e000000, 0xb5000000, 0x66000000,
+ 0x48000000, 0x03000000, 0xf6000000, 0x0e000000,
+ 0x61000000, 0x35000000, 0x57000000, 0xb9000000,
+ 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000,
+ 0xe1000000, 0xf8000000, 0x98000000, 0x11000000,
+ 0x69000000, 0xd9000000, 0x8e000000, 0x94000000,
+ 0x9b000000, 0x1e000000, 0x87000000, 0xe9000000,
+ 0xce000000, 0x55000000, 0x28000000, 0xdf000000,
+ 0x8c000000, 0xa1000000, 0x89000000, 0x0d000000,
+ 0xbf000000, 0xe6000000, 0x42000000, 0x68000000,
+ 0x41000000, 0x99000000, 0x2d000000, 0x0f000000,
+ 0xb0000000, 0x54000000, 0xbb000000, 0x16000000
+ }
+};
+
+static const uint32_t t_im[4][256] =
+{
+ {
+ 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
+ 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
+ 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
+ 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
+ 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
+ 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
+ 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
+ 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
+ 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
+ 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
+ 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
+ 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
+ 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
+ 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
+ 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
+ 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
+ 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
+ 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
+ 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
+ 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
+ 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
+ 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
+ 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
+ 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
+ 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
+ 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
+ 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
+ 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
+ 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
+ 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
+ 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
+ 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
+ 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
+ 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
+ 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
+ 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
+ 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
+ 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
+ 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
+ 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
+ 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
+ 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
+ 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
+ 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
+ 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
+ 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
+ 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
+ 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
+ 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
+ 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
+ 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
+ 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
+ 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
+ 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
+ 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
+ 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
+ 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
+ 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
+ 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
+ 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
+ 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
+ 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
+ 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
+ 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d
+ },
+ {
+ 0x00000000, 0x0d090e0b, 0x1a121c16, 0x171b121d,
+ 0x3424382c, 0x392d3627, 0x2e36243a, 0x233f2a31,
+ 0x68487058, 0x65417e53, 0x725a6c4e, 0x7f536245,
+ 0x5c6c4874, 0x5165467f, 0x467e5462, 0x4b775a69,
+ 0xd090e0b0, 0xdd99eebb, 0xca82fca6, 0xc78bf2ad,
+ 0xe4b4d89c, 0xe9bdd697, 0xfea6c48a, 0xf3afca81,
+ 0xb8d890e8, 0xb5d19ee3, 0xa2ca8cfe, 0xafc382f5,
+ 0x8cfca8c4, 0x81f5a6cf, 0x96eeb4d2, 0x9be7bad9,
+ 0xbb3bdb7b, 0xb632d570, 0xa129c76d, 0xac20c966,
+ 0x8f1fe357, 0x8216ed5c, 0x950dff41, 0x9804f14a,
+ 0xd373ab23, 0xde7aa528, 0xc961b735, 0xc468b93e,
+ 0xe757930f, 0xea5e9d04, 0xfd458f19, 0xf04c8112,
+ 0x6bab3bcb, 0x66a235c0, 0x71b927dd, 0x7cb029d6,
+ 0x5f8f03e7, 0x52860dec, 0x459d1ff1, 0x489411fa,
+ 0x03e34b93, 0x0eea4598, 0x19f15785, 0x14f8598e,
+ 0x37c773bf, 0x3ace7db4, 0x2dd56fa9, 0x20dc61a2,
+ 0x6d76adf6, 0x607fa3fd, 0x7764b1e0, 0x7a6dbfeb,
+ 0x595295da, 0x545b9bd1, 0x434089cc, 0x4e4987c7,
+ 0x053eddae, 0x0837d3a5, 0x1f2cc1b8, 0x1225cfb3,
+ 0x311ae582, 0x3c13eb89, 0x2b08f994, 0x2601f79f,
+ 0xbde64d46, 0xb0ef434d, 0xa7f45150, 0xaafd5f5b,
+ 0x89c2756a, 0x84cb7b61, 0x93d0697c, 0x9ed96777,
+ 0xd5ae3d1e, 0xd8a73315, 0xcfbc2108, 0xc2b52f03,
+ 0xe18a0532, 0xec830b39, 0xfb981924, 0xf691172f,
+ 0xd64d768d, 0xdb447886, 0xcc5f6a9b, 0xc1566490,
+ 0xe2694ea1, 0xef6040aa, 0xf87b52b7, 0xf5725cbc,
+ 0xbe0506d5, 0xb30c08de, 0xa4171ac3, 0xa91e14c8,
+ 0x8a213ef9, 0x872830f2, 0x903322ef, 0x9d3a2ce4,
+ 0x06dd963d, 0x0bd49836, 0x1ccf8a2b, 0x11c68420,
+ 0x32f9ae11, 0x3ff0a01a, 0x28ebb207, 0x25e2bc0c,
+ 0x6e95e665, 0x639ce86e, 0x7487fa73, 0x798ef478,
+ 0x5ab1de49, 0x57b8d042, 0x40a3c25f, 0x4daacc54,
+ 0xdaec41f7, 0xd7e54ffc, 0xc0fe5de1, 0xcdf753ea,
+ 0xeec879db, 0xe3c177d0, 0xf4da65cd, 0xf9d36bc6,
+ 0xb2a431af, 0xbfad3fa4, 0xa8b62db9, 0xa5bf23b2,
+ 0x86800983, 0x8b890788, 0x9c921595, 0x919b1b9e,
+ 0x0a7ca147, 0x0775af4c, 0x106ebd51, 0x1d67b35a,
+ 0x3e58996b, 0x33519760, 0x244a857d, 0x29438b76,
+ 0x6234d11f, 0x6f3ddf14, 0x7826cd09, 0x752fc302,
+ 0x5610e933, 0x5b19e738, 0x4c02f525, 0x410bfb2e,
+ 0x61d79a8c, 0x6cde9487, 0x7bc5869a, 0x76cc8891,
+ 0x55f3a2a0, 0x58faacab, 0x4fe1beb6, 0x42e8b0bd,
+ 0x099fead4, 0x0496e4df, 0x138df6c2, 0x1e84f8c9,
+ 0x3dbbd2f8, 0x30b2dcf3, 0x27a9ceee, 0x2aa0c0e5,
+ 0xb1477a3c, 0xbc4e7437, 0xab55662a, 0xa65c6821,
+ 0x85634210, 0x886a4c1b, 0x9f715e06, 0x9278500d,
+ 0xd90f0a64, 0xd406046f, 0xc31d1672, 0xce141879,
+ 0xed2b3248, 0xe0223c43, 0xf7392e5e, 0xfa302055,
+ 0xb79aec01, 0xba93e20a, 0xad88f017, 0xa081fe1c,
+ 0x83bed42d, 0x8eb7da26, 0x99acc83b, 0x94a5c630,
+ 0xdfd29c59, 0xd2db9252, 0xc5c0804f, 0xc8c98e44,
+ 0xebf6a475, 0xe6ffaa7e, 0xf1e4b863, 0xfcedb668,
+ 0x670a0cb1, 0x6a0302ba, 0x7d1810a7, 0x70111eac,
+ 0x532e349d, 0x5e273a96, 0x493c288b, 0x44352680,
+ 0x0f427ce9, 0x024b72e2, 0x155060ff, 0x18596ef4,
+ 0x3b6644c5, 0x366f4ace, 0x217458d3, 0x2c7d56d8,
+ 0x0ca1377a, 0x01a83971, 0x16b32b6c, 0x1bba2567,
+ 0x38850f56, 0x358c015d, 0x22971340, 0x2f9e1d4b,
+ 0x64e94722, 0x69e04929, 0x7efb5b34, 0x73f2553f,
+ 0x50cd7f0e, 0x5dc47105, 0x4adf6318, 0x47d66d13,
+ 0xdc31d7ca, 0xd138d9c1, 0xc623cbdc, 0xcb2ac5d7,
+ 0xe815efe6, 0xe51ce1ed, 0xf207f3f0, 0xff0efdfb,
+ 0xb479a792, 0xb970a999, 0xae6bbb84, 0xa362b58f,
+ 0x805d9fbe, 0x8d5491b5, 0x9a4f83a8, 0x97468da3
+ },
+ {
+ 0x00000000, 0x090e0b0d, 0x121c161a, 0x1b121d17,
+ 0x24382c34, 0x2d362739, 0x36243a2e, 0x3f2a3123,
+ 0x48705868, 0x417e5365, 0x5a6c4e72, 0x5362457f,
+ 0x6c48745c, 0x65467f51, 0x7e546246, 0x775a694b,
+ 0x90e0b0d0, 0x99eebbdd, 0x82fca6ca, 0x8bf2adc7,
+ 0xb4d89ce4, 0xbdd697e9, 0xa6c48afe, 0xafca81f3,
+ 0xd890e8b8, 0xd19ee3b5, 0xca8cfea2, 0xc382f5af,
+ 0xfca8c48c, 0xf5a6cf81, 0xeeb4d296, 0xe7bad99b,
+ 0x3bdb7bbb, 0x32d570b6, 0x29c76da1, 0x20c966ac,
+ 0x1fe3578f, 0x16ed5c82, 0x0dff4195, 0x04f14a98,
+ 0x73ab23d3, 0x7aa528de, 0x61b735c9, 0x68b93ec4,
+ 0x57930fe7, 0x5e9d04ea, 0x458f19fd, 0x4c8112f0,
+ 0xab3bcb6b, 0xa235c066, 0xb927dd71, 0xb029d67c,
+ 0x8f03e75f, 0x860dec52, 0x9d1ff145, 0x9411fa48,
+ 0xe34b9303, 0xea45980e, 0xf1578519, 0xf8598e14,
+ 0xc773bf37, 0xce7db43a, 0xd56fa92d, 0xdc61a220,
+ 0x76adf66d, 0x7fa3fd60, 0x64b1e077, 0x6dbfeb7a,
+ 0x5295da59, 0x5b9bd154, 0x4089cc43, 0x4987c74e,
+ 0x3eddae05, 0x37d3a508, 0x2cc1b81f, 0x25cfb312,
+ 0x1ae58231, 0x13eb893c, 0x08f9942b, 0x01f79f26,
+ 0xe64d46bd, 0xef434db0, 0xf45150a7, 0xfd5f5baa,
+ 0xc2756a89, 0xcb7b6184, 0xd0697c93, 0xd967779e,
+ 0xae3d1ed5, 0xa73315d8, 0xbc2108cf, 0xb52f03c2,
+ 0x8a0532e1, 0x830b39ec, 0x981924fb, 0x91172ff6,
+ 0x4d768dd6, 0x447886db, 0x5f6a9bcc, 0x566490c1,
+ 0x694ea1e2, 0x6040aaef, 0x7b52b7f8, 0x725cbcf5,
+ 0x0506d5be, 0x0c08deb3, 0x171ac3a4, 0x1e14c8a9,
+ 0x213ef98a, 0x2830f287, 0x3322ef90, 0x3a2ce49d,
+ 0xdd963d06, 0xd498360b, 0xcf8a2b1c, 0xc6842011,
+ 0xf9ae1132, 0xf0a01a3f, 0xebb20728, 0xe2bc0c25,
+ 0x95e6656e, 0x9ce86e63, 0x87fa7374, 0x8ef47879,
+ 0xb1de495a, 0xb8d04257, 0xa3c25f40, 0xaacc544d,
+ 0xec41f7da, 0xe54ffcd7, 0xfe5de1c0, 0xf753eacd,
+ 0xc879dbee, 0xc177d0e3, 0xda65cdf4, 0xd36bc6f9,
+ 0xa431afb2, 0xad3fa4bf, 0xb62db9a8, 0xbf23b2a5,
+ 0x80098386, 0x8907888b, 0x9215959c, 0x9b1b9e91,
+ 0x7ca1470a, 0x75af4c07, 0x6ebd5110, 0x67b35a1d,
+ 0x58996b3e, 0x51976033, 0x4a857d24, 0x438b7629,
+ 0x34d11f62, 0x3ddf146f, 0x26cd0978, 0x2fc30275,
+ 0x10e93356, 0x19e7385b, 0x02f5254c, 0x0bfb2e41,
+ 0xd79a8c61, 0xde94876c, 0xc5869a7b, 0xcc889176,
+ 0xf3a2a055, 0xfaacab58, 0xe1beb64f, 0xe8b0bd42,
+ 0x9fead409, 0x96e4df04, 0x8df6c213, 0x84f8c91e,
+ 0xbbd2f83d, 0xb2dcf330, 0xa9ceee27, 0xa0c0e52a,
+ 0x477a3cb1, 0x4e7437bc, 0x55662aab, 0x5c6821a6,
+ 0x63421085, 0x6a4c1b88, 0x715e069f, 0x78500d92,
+ 0x0f0a64d9, 0x06046fd4, 0x1d1672c3, 0x141879ce,
+ 0x2b3248ed, 0x223c43e0, 0x392e5ef7, 0x302055fa,
+ 0x9aec01b7, 0x93e20aba, 0x88f017ad, 0x81fe1ca0,
+ 0xbed42d83, 0xb7da268e, 0xacc83b99, 0xa5c63094,
+ 0xd29c59df, 0xdb9252d2, 0xc0804fc5, 0xc98e44c8,
+ 0xf6a475eb, 0xffaa7ee6, 0xe4b863f1, 0xedb668fc,
+ 0x0a0cb167, 0x0302ba6a, 0x1810a77d, 0x111eac70,
+ 0x2e349d53, 0x273a965e, 0x3c288b49, 0x35268044,
+ 0x427ce90f, 0x4b72e202, 0x5060ff15, 0x596ef418,
+ 0x6644c53b, 0x6f4ace36, 0x7458d321, 0x7d56d82c,
+ 0xa1377a0c, 0xa8397101, 0xb32b6c16, 0xba25671b,
+ 0x850f5638, 0x8c015d35, 0x97134022, 0x9e1d4b2f,
+ 0xe9472264, 0xe0492969, 0xfb5b347e, 0xf2553f73,
+ 0xcd7f0e50, 0xc471055d, 0xdf63184a, 0xd66d1347,
+ 0x31d7cadc, 0x38d9c1d1, 0x23cbdcc6, 0x2ac5d7cb,
+ 0x15efe6e8, 0x1ce1ede5, 0x07f3f0f2, 0x0efdfbff,
+ 0x79a792b4, 0x70a999b9, 0x6bbb84ae, 0x62b58fa3,
+ 0x5d9fbe80, 0x5491b58d, 0x4f83a89a, 0x468da397
+ },
+ {
+ 0x00000000, 0x0e0b0d09, 0x1c161a12, 0x121d171b,
+ 0x382c3424, 0x3627392d, 0x243a2e36, 0x2a31233f,
+ 0x70586848, 0x7e536541, 0x6c4e725a, 0x62457f53,
+ 0x48745c6c, 0x467f5165, 0x5462467e, 0x5a694b77,
+ 0xe0b0d090, 0xeebbdd99, 0xfca6ca82, 0xf2adc78b,
+ 0xd89ce4b4, 0xd697e9bd, 0xc48afea6, 0xca81f3af,
+ 0x90e8b8d8, 0x9ee3b5d1, 0x8cfea2ca, 0x82f5afc3,
+ 0xa8c48cfc, 0xa6cf81f5, 0xb4d296ee, 0xbad99be7,
+ 0xdb7bbb3b, 0xd570b632, 0xc76da129, 0xc966ac20,
+ 0xe3578f1f, 0xed5c8216, 0xff41950d, 0xf14a9804,
+ 0xab23d373, 0xa528de7a, 0xb735c961, 0xb93ec468,
+ 0x930fe757, 0x9d04ea5e, 0x8f19fd45, 0x8112f04c,
+ 0x3bcb6bab, 0x35c066a2, 0x27dd71b9, 0x29d67cb0,
+ 0x03e75f8f, 0x0dec5286, 0x1ff1459d, 0x11fa4894,
+ 0x4b9303e3, 0x45980eea, 0x578519f1, 0x598e14f8,
+ 0x73bf37c7, 0x7db43ace, 0x6fa92dd5, 0x61a220dc,
+ 0xadf66d76, 0xa3fd607f, 0xb1e07764, 0xbfeb7a6d,
+ 0x95da5952, 0x9bd1545b, 0x89cc4340, 0x87c74e49,
+ 0xddae053e, 0xd3a50837, 0xc1b81f2c, 0xcfb31225,
+ 0xe582311a, 0xeb893c13, 0xf9942b08, 0xf79f2601,
+ 0x4d46bde6, 0x434db0ef, 0x5150a7f4, 0x5f5baafd,
+ 0x756a89c2, 0x7b6184cb, 0x697c93d0, 0x67779ed9,
+ 0x3d1ed5ae, 0x3315d8a7, 0x2108cfbc, 0x2f03c2b5,
+ 0x0532e18a, 0x0b39ec83, 0x1924fb98, 0x172ff691,
+ 0x768dd64d, 0x7886db44, 0x6a9bcc5f, 0x6490c156,
+ 0x4ea1e269, 0x40aaef60, 0x52b7f87b, 0x5cbcf572,
+ 0x06d5be05, 0x08deb30c, 0x1ac3a417, 0x14c8a91e,
+ 0x3ef98a21, 0x30f28728, 0x22ef9033, 0x2ce49d3a,
+ 0x963d06dd, 0x98360bd4, 0x8a2b1ccf, 0x842011c6,
+ 0xae1132f9, 0xa01a3ff0, 0xb20728eb, 0xbc0c25e2,
+ 0xe6656e95, 0xe86e639c, 0xfa737487, 0xf478798e,
+ 0xde495ab1, 0xd04257b8, 0xc25f40a3, 0xcc544daa,
+ 0x41f7daec, 0x4ffcd7e5, 0x5de1c0fe, 0x53eacdf7,
+ 0x79dbeec8, 0x77d0e3c1, 0x65cdf4da, 0x6bc6f9d3,
+ 0x31afb2a4, 0x3fa4bfad, 0x2db9a8b6, 0x23b2a5bf,
+ 0x09838680, 0x07888b89, 0x15959c92, 0x1b9e919b,
+ 0xa1470a7c, 0xaf4c0775, 0xbd51106e, 0xb35a1d67,
+ 0x996b3e58, 0x97603351, 0x857d244a, 0x8b762943,
+ 0xd11f6234, 0xdf146f3d, 0xcd097826, 0xc302752f,
+ 0xe9335610, 0xe7385b19, 0xf5254c02, 0xfb2e410b,
+ 0x9a8c61d7, 0x94876cde, 0x869a7bc5, 0x889176cc,
+ 0xa2a055f3, 0xacab58fa, 0xbeb64fe1, 0xb0bd42e8,
+ 0xead4099f, 0xe4df0496, 0xf6c2138d, 0xf8c91e84,
+ 0xd2f83dbb, 0xdcf330b2, 0xceee27a9, 0xc0e52aa0,
+ 0x7a3cb147, 0x7437bc4e, 0x662aab55, 0x6821a65c,
+ 0x42108563, 0x4c1b886a, 0x5e069f71, 0x500d9278,
+ 0x0a64d90f, 0x046fd406, 0x1672c31d, 0x1879ce14,
+ 0x3248ed2b, 0x3c43e022, 0x2e5ef739, 0x2055fa30,
+ 0xec01b79a, 0xe20aba93, 0xf017ad88, 0xfe1ca081,
+ 0xd42d83be, 0xda268eb7, 0xc83b99ac, 0xc63094a5,
+ 0x9c59dfd2, 0x9252d2db, 0x804fc5c0, 0x8e44c8c9,
+ 0xa475ebf6, 0xaa7ee6ff, 0xb863f1e4, 0xb668fced,
+ 0x0cb1670a, 0x02ba6a03, 0x10a77d18, 0x1eac7011,
+ 0x349d532e, 0x3a965e27, 0x288b493c, 0x26804435,
+ 0x7ce90f42, 0x72e2024b, 0x60ff1550, 0x6ef41859,
+ 0x44c53b66, 0x4ace366f, 0x58d32174, 0x56d82c7d,
+ 0x377a0ca1, 0x397101a8, 0x2b6c16b3, 0x25671bba,
+ 0x0f563885, 0x015d358c, 0x13402297, 0x1d4b2f9e,
+ 0x472264e9, 0x492969e0, 0x5b347efb, 0x553f73f2,
+ 0x7f0e50cd, 0x71055dc4, 0x63184adf, 0x6d1347d6,
+ 0xd7cadc31, 0xd9c1d138, 0xcbdcc623, 0xc5d7cb2a,
+ 0xefe6e815, 0xe1ede51c, 0xf3f0f207, 0xfdfbff0e,
+ 0xa792b479, 0xa999b970, 0xbb84ae6b, 0xb58fa362,
+ 0x9fbe805d, 0x91b58d54, 0x83a89a4f, 0x8da39746
+ }
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESTAB2_H */
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
new file mode 100644
index 000000000000..0de1883dc81b
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
@@ -0,0 +1,36 @@
+Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain copyright notices,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+ * Neither the name of the CRYPTOGAMS nor the names of its
+ copyright holder and contributors may be used to endorse or
+ promote products derived from this software without specific
+ prior written permission.
+
+ALTERNATIVELY, provided that this notice is retained in full, this
+product may be distributed under the terms of the GNU General Public
+License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+those given above.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..49cc83d2ee29
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,177 @@
+
+ Apache License
+ Version 2.0, January 2004
+ https://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
new file mode 100644
index 000000000000..ed9f660fce5b
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
@@ -0,0 +1,1245 @@
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
+#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
+# and modified for ICP. Modifications are kept at a bare minimum to ease later
+# upstream merges.
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.extern gcm_avx_can_use_movbe
+
+.text
+
+#ifdef HAVE_MOVBE
+.type _aesni_ctr32_ghash_6x,@function // Internal helper (MOVBE variant): AES-CTR over 6 blocks (96 bytes) per pass, interleaved with carry-less multiplies that hash six blocks staged on the stack.
+.align 32 // The caller is outside this view and sets up every register - NOTE(review): confirm the roles listed on the next line against the caller.
+_aesni_ctr32_ghash_6x: // Uses: %rdi=in, %rsi=out, %rdx=blocks left, %rcx=round keys (biased +128), %ebp=round count, %r8=counter save slot, %r9=multiplier table (biased +32), %r10=byte tally, %r11=constants, %r14/%r15=hash-input cursor/limit, %xmm1=counter, %xmm7=first staged block, %xmm8=hash accumulator.
+ vmovdqu 32(%r11),%xmm2 // xmm2 = per-block counter increment, applied with byte-wise adds (presumably +1 in the last byte - confirm against the constant table)
+ subq $6,%rdx // account for the first batch of 6 blocks
+ vpxor %xmm4,%xmm4,%xmm4 // xmm4 = 0 (nothing pending from a previous pass)
+ vmovdqu 0-128(%rcx),%xmm15 // xmm15 = round key 0
+ vpaddb %xmm2,%xmm1,%xmm10 // xmm10..xmm14 = the five counter blocks that follow xmm1
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9 // xmm9 = first counter block with round key 0 applied
+ vmovdqu %xmm4,16+8(%rsp) // zero the stack slot that carries the high hash half between passes
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx // add 6<<24 to %ebx; on carry take the slow path that rebuilds the counters with 32-bit adds
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3 // xmm3 = first multiplier from the %r9 table
+ vpaddb %xmm2,%xmm14,%xmm1 // xmm1 = counter that starts the next batch
+ vpxor %xmm15,%xmm10,%xmm10 // round key 0 for the 2nd and 3rd block (4th..6th follow, interleaved below)
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8) // park the next-batch counter; xmm1 is reused as scratch until .Lenc_tail reloads it
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 // staged block in xmm7 times xmm3: first cross product
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2 // xmm2 = round key 1
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 // second cross product
+ xorq %r12,%r12 // r12 = 0; turned into the cursor step (0 or 96) below
+ cmpq %r14,%r15 // CF clear when %r15 >= %r14 (unsigned); consumed by setnc below, no flag writer in between
+
+ vaesenc %xmm2,%xmm9,%xmm9 // AES round 1 on all six blocks (interleaved with hashing)
+ vmovdqu 48+8(%rsp),%xmm0 // xmm0 = next staged block to hash
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 // low x low product
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b // r12 = 1 if %r15 >= %r14, else 0
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 // high x high product; xmm7 becomes the high accumulator
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3 // xmm3 = next multiplier
+ negq %r12 // r12 = all-ones or 0
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6 // xmm6 = middle accumulator (sum of cross products)
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8 // fold the pending reduction term from the previous pass (zero on the first pass) into xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4 // xmm4 = low accumulator
+ andq $0x60,%r12 // r12 = 96 or 0
+ vmovups 32-128(%rcx),%xmm15 // xmm15 = round key 2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14 // advance the hash-input cursor by 96 bytes while %r15 >= %r14; lea leaves flags alone
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 2
+ vpxor 16+8(%rsp),%xmm8,%xmm8 // fold the high half saved by the previous pass (zero on the first pass)
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0 // xmm0 = next staged block
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13 // byte-reversed load of cursor bytes 88..95
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12 // byte-reversed load of cursor bytes 80..87
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp) // stage the byte-reversed 6th block (cursor+80) for the next pass
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5 // xmm5 = next multiplier (table offset 32 is not read by this routine)
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15 // xmm15 = round key 3
+ vpxor %xmm1,%xmm6,%xmm6 // middle accumulator
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 3
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7 // high accumulator
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0 // xmm0 = next staged block
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4 // low accumulator
+ vmovdqu 64-32(%r9),%xmm1 // xmm1 = next multiplier
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15 // xmm15 = round key 4
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 4
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13 // byte-reversed load of the 5th block (cursor+64)
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0 // xmm0 = next staged block
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp) // stage it for the next pass (this slot was already consumed above)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2 // xmm2 = next multiplier (table offset 80 is not read by this routine)
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15 // xmm15 = round key 5
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 5
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13 // byte-reversed load of the 4th block (cursor+48)
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8 // add the last staged block to the running hash before its multiply
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3 // xmm3 = last multiplier, paired with the running hash in xmm8
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15 // xmm15 = round key 6
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 // running hash (xmm8) times xmm3
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 6
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13 // byte-reversed load of the 3rd block (cursor+32)
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 // xmm8 is consumed here; it is rebuilt in .Lenc_tail
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6 // middle accumulator complete
+
+ vmovups 112-128(%rcx),%xmm15 // xmm15 = round key 7
+ vpslldq $8,%xmm6,%xmm5 // low half of the middle term, shifted up to join the low accumulator
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3 // xmm3 = constant used by both reduction multiplies (presumably the field polynomial - confirm against the constant table)
+
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 7
+ vpxor %xmm8,%xmm7,%xmm7 // high accumulator complete (apart from the middle term's high half)
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4 // low accumulator complete
+ movbeq 24(%r14),%r13 // byte-reversed load of the 2nd block (cursor+16)
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0 // first reduction step: xmm0 = xmm4 with its halves swapped
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 // ... and xmm4 = product of xmm4 with the constant (XORed with xmm0 below)
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1 // xmm1 = round key 8
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9 // AES round 8
+ vmovups 144-128(%rcx),%xmm15 // xmm15 = round key 9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6 // high half of the middle term, shifted down to join the high accumulator
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4 // finish the first reduction step
+ movbeq 8(%r14),%r13 // byte-reversed load of the 1st block (cursor+0); stored to the stack in .Lenc_tail
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1 // xmm1 = round key 10
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
+ jb .Lenc_tail // fewer than 12 rounds: xmm15/xmm1 already hold the last two round keys
+
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9 // AES round 10
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15 // xmm15 = round key 11
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1 // xmm1 = round key 12
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail // fewer than 14 rounds: xmm15/xmm1 hold the last two round keys
+
+ vaesenc %xmm15,%xmm9,%xmm9 // AES round 11
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9 // AES round 12
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15 // xmm15 = round key 13
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1 // xmm1 = round key 14
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32: // slow path: rebuild the six counters with 32-bit adds between byte shuffles, then rejoin at .Lresume_ctr32
+ vmovdqu (%r11),%xmm0 // xmm0 = byte shuffle mask (presumably a full byte reversal - confirm against the constant table)
+ vpshufb %xmm0,%xmm1,%xmm6 // xmm6 = current counter block in shuffled form
+ vmovdqu 48(%r11),%xmm5 // xmm5 = dword increment used for each further step (presumably +2)
+ vpaddd 64(%r11),%xmm6,%xmm10 // xmm10 = xmm6 + constant at 64(%r11) (presumably +1)
+ vpaddd %xmm5,%xmm6,%xmm11 // xmm11 = xmm6 + xmm5
+ vmovdqu 0-32(%r9),%xmm3 // same table load the fast path performs before .Lresume_ctr32
+ vpaddd %xmm5,%xmm10,%xmm12 // xmm12 = xmm10 + xmm5
+ vpshufb %xmm0,%xmm10,%xmm10 // shuffle each finished counter back
+ vpaddd %xmm5,%xmm11,%xmm13 // xmm13 = xmm11 + xmm5
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10 // round key 0, as on the fast path
+ vpaddd %xmm5,%xmm12,%xmm14 // xmm14 = xmm12 + xmm5
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1 // xmm1 = xmm13 + xmm5 = counter that starts the next batch
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail: // entered with xmm15 = second-to-last round key, xmm1 = last round key
+ vaesenc %xmm15,%xmm9,%xmm9 // second-to-last AES round
+ vmovdqu %xmm7,16+8(%rsp) // save the high hash half; folded into xmm8 by the next pass or at .L6x_done
+ vpalignr $8,%xmm4,%xmm4,%xmm8 // second reduction step: xmm8 = xmm4 with its halves swapped
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 // ... xmm4 = product with the constant still held in xmm3; XORed into xmm8 later
+ vpxor 0(%rdi),%xmm1,%xmm2 // pre-XOR each input block with the last round key so vaesenclast yields input ^ keystream
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1 // reload the next-batch counter parked at .Lresume_ctr32
+
+ vaesenclast %xmm2,%xmm9,%xmm9 // final round; xmm9..xmm14 become the six output blocks
+ vmovdqu 32(%r11),%xmm2 // xmm2 = byte-wise counter increment again
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0 // xmm0,xmm5,xmm6,xmm7,xmm3 = the five counters following xmm1 for the next batch
+ movq %r13,112+8(%rsp) // stage the byte-reversed 1st block loaded before the round-count check
+ leaq 96(%rdi),%rdi // advance input by 96 bytes
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi // advance output by 96 bytes (stores below use negative offsets)
+ vmovdqu 0-128(%rcx),%xmm15 // xmm15 = round key 0 for the next batch
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10 // tally 96 more bytes
+ subq $0x6,%rdx // six fewer blocks remain
+ jc .L6x_done // borrow: leave the loop; this batch stays in xmm9..xmm14 and is NOT stored here
+
+ vmovups %xmm9,-96(%rsi) // store the six output blocks and move the next counters into place
+ vpxor %xmm15,%xmm1,%xmm9 // xmm9 = next first counter with round key 0 applied
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7 // xmm7 = first staged block for the next pass
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8 // fold the saved high half into the hash accumulator
+ vpxor %xmm4,%xmm8,%xmm8 // fold the pending reduction product; xmm8 is the hash state on return
+
+ .byte 0xf3,0xc3 // raw encoding of "rep ret" (F3 prefix + C3 near return)
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+#endif /* ifdef HAVE_MOVBE */
+
+/*
+ * _aesni_ctr32_ghash_no_movbe_6x
+ *
+ * Internal helper called from aesni_gcm_decrypt and aesni_gcm_encrypt when
+ * the MOVBE path is not taken. Each loop iteration runs the AES rounds on
+ * six counter blocks (%xmm9-%xmm14) interleaved with vpclmulqdq
+ * multiplications of six previously staged, byte-swapped blocks. Where the
+ * MOVBE variant uses movbeq, this variant uses movq followed by bswapq.
+ *
+ * Registers as used below (all set up by the caller):
+ * %rdi input pointer, advanced by 96 per iteration
+ * %rsi output pointer, advanced by 96 per iteration
+ * %rdx remaining 16-byte block count, reduced by 6 per iteration
+ * %rcx key schedule pointer biased by +128 (round key n is read from
+ * 16*n-128(%rcx))
+ * %r8 counter block; the next counter value is written back here
+ * %r9 biased pointer to the vpclmulqdq multiplier table, read at
+ * N-32(%r9) (presumably powers of the hash key - confirm in caller)
+ * %r10 running count of bytes processed (+0x60 per iteration)
+ * %r11 address of .Lbswap_mask; the constants after it are read at the
+ * fixed offsets 16, 32, 48 and 64
+ * %r14 pointer to the blocks that are byte-swapped and staged on the
+ * stack for the next iteration; %r15 is the limit it is compared to
+ * %ebx last four bytes of the counter block as loaded by the caller
+ * %ebp AES round count (10, 12 or 14)
+ * %xmm1 current counter block
+ * %xmm7 first of the six staged blocks; the other five are read from
+ * 48+8(%rsp) .. 112+8(%rsp)
+ * %xmm8 hash accumulator
+ *
+ * Stack slots are written as N+8(%rsp) because the return address pushed
+ * by the call sits between the slot N(%rsp) of the caller and this code.
+ *
+ * On return %xmm9-%xmm14 hold the last six output blocks, which have not
+ * been stored yet, and %xmm8 holds the updated hash accumulator.
+ * %r12 and %r13 are scratch.
+ */
+.type _aesni_ctr32_ghash_no_movbe_6x,@function
+.align 32
+_aesni_ctr32_ghash_no_movbe_6x:
+ // 32(%r11) is .Lone_msb: a byte-wise add of it increments the last
+ // byte of the counter block.
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x_nmb
+
+.align 32
+.Loop6x_nmb:
+ // 100663296 = 6 << 24. The caller loaded %ebx from bytes 12..15 of
+ // the counter block, so this adds 6 to the last counter byte. A carry
+ // means the byte-wise vpaddb increments would wrap; take the slow
+ // path that uses 32-bit adds instead.
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_nmb
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32_nmb:
+ // Save the next counter value; it is reloaded in .Lenc_tail_nmb.
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ // %r12 = (%r15 >= %r14) ? 0x60 : 0, computed branch-free with
+ // cmp/setnc/neg/and below, then added to %r14.
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ // movq + bswapq stands in for the movbeq used by the MOVBE variant.
+ // The two swapped halves of each 16-byte block from (%r14) are
+ // staged in the stack slots read by the next iteration.
+ movq 88(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 80(%r14),%r12
+ bswapq %r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 72(%r14),%r13
+ bswapq %r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 64(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 56(%r14),%r13
+ bswapq %r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 48(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 40(%r14),%r13
+ bswapq %r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 32(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ // 16(%r11) is the .Lpoly constant.
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movq 24(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 16(%r14),%r12
+ bswapq %r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ // The last staged block stays in %r13/%r12 and is written to the
+ // stack in .Lenc_tail_nmb.
+ movq 8(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movq 0(%r14),%r12
+ bswapq %r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ // From here %xmm15 holds the next-to-last round key and %xmm1 the
+ // last one for the 10-round case; longer schedules run two more
+ // rounds per step before reaching the tail.
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
+ jb .Lenc_tail_nmb
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail_nmb
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail_nmb
+
+.align 32
+.Lhandle_ctr32_nmb:
+ // Slow path for a wrapping counter byte: byte-swap the counter with
+ // .Lbswap_mask, step it with 32-bit adds of .Lone_lsb (64(%r11)) and
+ // .Ltwo_lsb (48(%r11)), then swap each result back.
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32_nmb
+
+.align 32
+.Lenc_tail_nmb:
+ // %xmm1 holds the last round key. XORing it with each input block
+ // first lets vaesenclast produce (keystream ^ input) directly.
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ // Reload the counter saved at .Lresume_ctr32_nmb and derive the
+ // counters for the next iteration with byte-wise adds of .Lone_msb.
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ // Account for 0x60 bytes done; a borrow from the block count means
+ // this was the final group of six.
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done_nmb
+
+ // Store the six finished blocks and move the precomputed counters
+ // into place for the next iteration.
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x_nmb
+
+.L6x_done_nmb:
+ // Fold the pending partial results into the accumulator %xmm8. The
+ // last six output blocks are left in %xmm9-%xmm14 for the caller.
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ // 0xf3,0xc3 encodes "rep ret".
+ .byte 0xf3,0xc3
+.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
+
+/*
+ * aesni_gcm_decrypt
+ *
+ * Bulk AES-GCM decryption in groups of six 16-byte blocks. The input
+ * blocks are byte-swapped and staged for hashing while the counter-mode
+ * keystream is XORed with them to produce the output.
+ *
+ * Arguments as used below (System V AMD64 order):
+ * %rdi input
+ * %rsi output
+ * %rdx length in bytes
+ * %rcx key schedule; the round count is read from offset 504
+ * %r8 counter block (16 bytes); updated by the helper
+ * %r9 hash state: the accumulator is read from and written back to
+ * offset 0, the multiplier table is read from offset 32 onwards
+ *
+ * Returns in %rax the number of bytes processed, a multiple of 0x60.
+ * If the length is below 0x60, returns 0 without touching anything.
+ *
+ * %rax keeps the entry %rsp for the epilogue; the callee-saved registers
+ * are restored relative to it.
+ */
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ // Load the counter block, reserve a 128-byte aligned scratch frame
+ // and bias %rcx / %r9 the way the helpers expect.
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ // Compare bits 7..11 of the key schedule address and of %rsp. If the
+ // frame lies less than 768 bytes above the key in that window, move
+ // %rsp down by the difference so the two do not alias.
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ // Stage the first six input blocks, byte-swapped, for hashing: the
+ // block at 80(%rdi) stays in %xmm7, the rest go to 48..112(%rsp).
+ // %r14 / %r15 are the staging pointer and its limit for the helper.
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+ testl $1,gcm_avx_can_use_movbe(%rip)
+#else
+ // sym@GOTPCREL(%rip) addresses the GOT slot, whose content is the
+ // ADDRESS of the variable. Testing the slot itself would test bit 0
+ // of an aligned address, which is always clear, so load the address
+ // and test the variable through it. %r12 was pushed in the prologue
+ // and holds no live value here; the no-MOVBE helper clears it before
+ // its first read.
+ movq gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12
+ testl $1,(%r12)
+#endif
+ jz 1f
+ call _aesni_ctr32_ghash_6x
+ jmp 2f
+1:
+#endif
+ call _aesni_ctr32_ghash_no_movbe_6x
+2:
+ // The helper leaves the final six output blocks in registers.
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ // Swap the accumulator back and store it at the original %r9
+ // (%r9 was advanced by 64 above).
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ // 0xf3,0xc3 encodes "rep ret".
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+/*
+ * _aesni_ctr32_6x
+ *
+ * Internal helper of aesni_gcm_encrypt: runs AES on six consecutive
+ * counter blocks, XORs the result with 96 bytes read from (%rdi) and
+ * stores 96 bytes at (%rsi). Both pointers are advanced by 96.
+ *
+ * In: %xmm1 current counter block
+ * %xmm0 byte-swap mask (only read on the .Lhandle_ctr32_2 path)
+ * %rcx key schedule pointer biased by +128
+ * %ebp AES round count (10, 12 or 14)
+ * %ebx last four bytes of the counter block as loaded by the caller
+ * %r11 address of .Lbswap_mask (constants read at 32, 48, 64)
+ * Out: %xmm9-%xmm14 the six output blocks (also stored to memory)
+ * %xmm1 the next counter block
+ * Scratch: %r12, %r13, %xmm2-%xmm6, %xmm8, %xmm15
+ */
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ // %r13 = rounds - 2 loop iterations; the last two rounds are done
+ // after the loop.
+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ // 100663296 = 6 << 24: add 6 to the last counter byte held in the
+ // top byte of %ebx. A carry means the byte-wise vpaddb increments
+ // below would wrap, so use the 32-bit path instead.
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ // One AES round on all six blocks per pass; %r12 walks the round
+ // keys and %xmm15 always holds the key for the next round.
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ // %xmm3 = last round key. XORing it with each input block first lets
+ // vaesenclast produce (keystream ^ input) directly.
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ // 0xf3,0xc3 encodes "rep ret".
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ // Wrapping counter byte: byte-swap the counter with the mask in
+ // %xmm0, step it with 32-bit adds of .Lone_lsb (64(%r11)) and
+ // .Ltwo_lsb (48(%r11)), swap each result back and apply round key 0.
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+/*
+ * aesni_gcm_encrypt
+ *
+ * Bulk AES-GCM encryption in groups of six 16-byte blocks. The first
+ * twelve blocks are encrypted up front with _aesni_ctr32_6x so that output
+ * is available to hash; the stitched helper then encrypts further groups
+ * while hashing earlier output, and the code after label 2 hashes the
+ * blocks that are still pending when the helper returns.
+ *
+ * Arguments as used below (System V AMD64 order):
+ * %rdi input
+ * %rsi output
+ * %rdx length in bytes
+ * %rcx key schedule; the round count is read from offset 504
+ * %r8 counter block (16 bytes); updated by the helper
+ * %r9 hash state: the accumulator is read from and written back to
+ * offset 0, the multiplier table is read from offset 32 onwards
+ *
+ * Returns in %rax the number of bytes processed (192 plus 0x60 per helper
+ * iteration). If the length is below 288, returns 0 without touching
+ * anything.
+ *
+ * %rax keeps the entry %rsp for the epilogue; the callee-saved registers
+ * are restored relative to it.
+ */
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ // Load the counter block, reserve a 128-byte aligned scratch frame
+ // and bias %rcx the way the helpers expect.
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
+
+ // Compare bits 7..11 of the key schedule address and of %rsp. If the
+ // frame lies less than 768 bytes above the key in that window, move
+ // %rsp down by the difference so the two do not alias.
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ // The blocks to hash are taken from the output buffer: %r14 is the
+ // staging pointer and %r15 its limit for the stitched helper.
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ // First six blocks: encrypt, then stage the byte-swapped output for
+ // hashing (%xmm7 plus the slots at 48..112(%rsp)).
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ // Second six blocks; their output is picked up from memory by the
+ // helper through %r14.
+ call _aesni_ctr32_6x
+
+ // Load the accumulator, bias %r9 by 64 and account for the twelve
+ // blocks (192 bytes) already produced.
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+ testl $1,gcm_avx_can_use_movbe(%rip)
+#else
+ // sym@GOTPCREL(%rip) addresses the GOT slot, whose content is the
+ // ADDRESS of the variable. Testing the slot itself would test bit 0
+ // of an aligned address, which is always clear, so load the address
+ // and test the variable through it. %r12 was pushed in the prologue
+ // and was only scratch in _aesni_ctr32_6x, so it holds no live value
+ // here; the no-MOVBE helper clears it before its first read.
+ movq gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12
+ testl $1,(%r12)
+#endif
+ jz 1f
+ call _aesni_ctr32_ghash_6x
+ jmp 2f
+1:
+#endif
+ call _aesni_ctr32_ghash_no_movbe_6x
+2:
+ // Store the last six output blocks left in %xmm9-%xmm14 and, in the
+ // same pass, byte-swap them for hashing. Then multiply the blocks
+ // still staged at 32..112(%rsp) and these six blocks by entries of
+ // the table at N-32(%r9), reducing with the .Lpoly constant at
+ // 16(%r11). The interleaving below is order-sensitive.
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ // Swap the accumulator back and store it at the original %r9
+ // (%r9 was advanced by 64 above).
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ // 0xf3,0xc3 encodes "rep ret".
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * Clear all vector registers.
+ * void clear_fpu_regs_avx(void);
+ *
+ * A single vzeroall zeroes every YMM register (and with it every XMM
+ * register), so no key or state material is left behind in them.
+ * Takes no arguments and returns nothing.
+ */
+.globl clear_fpu_regs_avx
+.type clear_fpu_regs_avx,@function
+.align 32
+clear_fpu_regs_avx:
+ vzeroall
+ ret
+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ *
+ * In: %rdi = src, %rsi = dst (neither needs to be aligned)
+ * Clobbers: %xmm0
+ *
+ * VEX-encoded instructions are used, like everywhere else in this file,
+ * so this routine never mixes legacy SSE encodings with AVX code. VEX
+ * memory operands have no alignment requirement, which lets the second
+ * load be folded into vpxor.
+ */
+.globl gcm_xor_avx
+.type gcm_xor_avx,@function
+.align 32
+gcm_xor_avx:
+ vmovdqu (%rdi), %xmm0
+ vpxor (%rsi), %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsi)
+ ret
+.size gcm_xor_avx,.-gcm_xor_avx
+
+/*
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ *
+ * Atomically flips bit 0 of the 32-bit value at (%rdi) and returns the
+ * new state: 0 if the value is zero after the flip, 1 otherwise.
+ *
+ * In: %rdi = address of the boolean_t
+ * Out: %eax = 0 or 1
+ */
+.globl atomic_toggle_boolean_nv
+.type atomic_toggle_boolean_nv,@function
+.align 32
+atomic_toggle_boolean_nv:
+ lock xorl $1, (%rdi) // ZF reflects the value after the flip
+ setnz %al // 1 when the new value is nonzero
+ movzbl %al, %eax // widen to the 32-bit return value
+ ret
+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
+
+/*
+ * Constant table. The code keeps the address of .Lbswap_mask in %r11 and
+ * reads the other entries at fixed offsets from it, so the order and the
+ * 16-byte size of each entry must not change:
+ * 0(%r11) .Lbswap_mask
+ * 16(%r11) .Lpoly
+ * 32(%r11) .Lone_msb
+ * 48(%r11) .Ltwo_lsb
+ * 64(%r11) .Lone_lsb
+ */
+.align 64
+.Lbswap_mask:
+// vpshufb control that reverses the 16 bytes of a register.
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+// Multiplier used by the vpclmulqdq reduction steps.
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+// Byte-wise add (vpaddb): +1 on the last byte of the counter block.
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+// 32-bit add (vpaddd) on a byte-swapped counter: +2.
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+// 32-bit add (vpaddd) on a byte-swapped counter: +1.
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+// ASCII: "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+// followed by a terminating NUL.
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
new file mode 100644
index 000000000000..59edc4c8d56c
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009 Intel Corporation
+ * All Rights Reserved.
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains an accelerated
+ * Galois Field Multiplication implementation.
+ *
+ * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
+ * carry-less multiplication. More information about PCLMULQDQ can be
+ * found at:
+ * http://software.intel.com/en-us/articles/
+ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as file galois_hash_asm.c from
+ * Intel Corporation dated September 21, 2009.
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
+ * definition for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Removed code to perform hashing. This is already done with C macro
+ * GHASH in gcm.c. For better performance, this removed code should be
+ * reintegrated in the future to replace the C GHASH macro.
+ *
+ * 5. Added code to byte swap 16-byte input and output.
+ *
+ * 6. Folded in comments from the original C source with embedded assembly
+ * (SB_w_shift_xor.c)
+ *
+ * 7. Renamed function and reordered parameters to match OpenSolaris:
+ * Intel interface:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ * unsigned char *d, int length)
+ * OpenSolaris OS interface:
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint) /* lint */
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
+}
+
+#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
+ */
+
+// static uint8_t byte_swap16_mask[] = {
+// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
+.data
+.align XMM_ALIGN
+.Lbyte_swap16_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+
+/*
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ *
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on P1 and P2 and place the result in P3.
+ *
+ * Byte swap the input and the output.
+ *
+ * Note: x_in, y, and res each point to a 16-byte number
+ * (an array of two 64-bit integers).
+ *
+ * Note2: For kernel code, caller is responsible for ensuring
+ * kpreempt_disable() has been called. This is because %xmm registers are
+ * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
+ * respectively, if TS is set on entry. Otherwise, if TS is not set,
+ * save and restore %xmm registers on the stack.
+ *
+ * Note3: Original Intel definition:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ * unsigned char *d, int length)
+ *
+ * Note4: Register/parameter mapping:
+ * Intel:
+ * Parameter 1: %rcx (copied to %xmm0) hk or x_in
+ * Parameter 2: %rdx (copied to %xmm1) s or y
+ * Parameter 3: %rdi (result) d or res
+ * OpenSolaris:
+ * Parameter 1: %rdi (copied to %xmm0) x_in
+ * Parameter 2: %rsi (copied to %xmm1) y
+ * Parameter 3: %rdx (result) res
+ */
+
+ENTRY_NP(gcm_mul_pclmulqdq)
+ //
+ // In: %rdi = x_in, %rsi = y, %rdx = res (16 bytes each, unaligned
+ // accesses are used throughout).
+ // Uses %rax and %xmm0-%xmm10; no %xmm register is saved or restored
+ // in this routine.
+ //
+ // Copy Parameters
+ //
+ movdqu (%rdi), %xmm0 // P1
+ movdqu (%rsi), %xmm1 // P2
+
+ //
+ // Byte swap 16-byte input
+ //
+ lea .Lbyte_swap16_mask(%rip), %rax
+ movups (%rax), %xmm10
+ pshufb %xmm10, %xmm0
+ pshufb %xmm10, %xmm1
+
+
+ //
+ // Multiply with the hash key
+ //
+ // Four 64x64-bit carry-less products; the immediate selects which
+ // 64-bit half of each operand is used.
+ //
+ movdqu %xmm0, %xmm3
+ pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
+
+ movdqu %xmm0, %xmm4
+ pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
+
+ movdqu %xmm0, %xmm5
+ pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
+ movdqu %xmm0, %xmm6
+ pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
+
+ pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
+
+ movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
+ psrldq $8, %xmm4 // shift xmm4 right by 64 bits
+ pslldq $8, %xmm5 // shift xmm5 left by 64 bits
+ pxor %xmm5, %xmm3
+ pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to compensate for the fact that the bits are reversed.
+ // pslld/psrld work on 32-bit lanes, so the bit shifted out of each
+ // lane is recovered with psrld $31 and moved into the next lane
+ // (pslldq $4); xmm9 carries the top bit of xmm3 into xmm6.
+ movdqu %xmm3, %xmm7
+ movdqu %xmm6, %xmm8
+ pslld $1, %xmm3
+ pslld $1, %xmm6
+ psrld $31, %xmm7
+ psrld $31, %xmm8
+ movdqu %xmm7, %xmm9
+ pslldq $4, %xmm8
+ pslldq $4, %xmm7
+ psrldq $12, %xmm9
+ por %xmm7, %xmm3
+ por %xmm8, %xmm6
+ por %xmm9, %xmm6
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ movdqu %xmm3, %xmm7
+ movdqu %xmm3, %xmm8
+ movdqu %xmm3, %xmm9
+ pslld $31, %xmm7 // packed left shift, << 31
+ pslld $30, %xmm8 // packed left shift, << 30
+ pslld $25, %xmm9 // packed left shift, << 25
+ pxor %xmm8, %xmm7 // xor the shifted versions
+ pxor %xmm9, %xmm7
+ movdqu %xmm7, %xmm8
+ pslldq $12, %xmm7
+ psrldq $4, %xmm8 // xmm8 is consumed in the second phase
+ pxor %xmm7, %xmm3 // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ movdqu %xmm3, %xmm2
+ movdqu %xmm3, %xmm4
+ movdqu %xmm3, %xmm5
+ psrld $1, %xmm2 // packed right shift, >> 1
+ psrld $2, %xmm4 // packed right shift, >> 2
+ psrld $7, %xmm5 // packed right shift, >> 7
+ pxor %xmm4, %xmm2 // xor the shifted versions
+ pxor %xmm5, %xmm2
+ pxor %xmm8, %xmm2
+ pxor %xmm2, %xmm3
+ pxor %xmm3, %xmm6 // the result is in xmm6
+
+ //
+ // Byte swap 16-byte result
+ //
+ pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
+
+ //
+ // Store the result
+ //
+ movdqu %xmm6, (%rdx) // P3
+
+
+ //
+ // Return
+ //
+ ret
+ SET_SIZE(gcm_mul_pclmulqdq)
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S
new file mode 100644
index 000000000000..90cc36b43a78
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S
@@ -0,0 +1,714 @@
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
+#
+# (*) comparison is not completely fair, because C results are
+# for vanilla "256B" implementation, while assembler results
+# are for "528B";-)
+# (**) it's a mystery [to me] why the Core2 result is not the same as
+# for Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter, ghash-x86.pl discusses why it makes less sense to
+# increase the aggregate factor. Then why increase it here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
+# Goldmont 1.08(+24%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
+#
+# Knights Landing achieves 1.09 cpb.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
+# and modified for ICP. Modifications are kept at a bare minimum to ease later
+# upstream merges.
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.text
+
+.globl gcm_gmult_clmul // exported: one carry-less multiply of the 16 bytes at (%rdi) by table data at (%rsi); result is written back to (%rdi). Presumably rdi = hash state Xi and rsi = key table (confirm against the C prototype).
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.cfi_startproc
+.L_gmult_clmul: // local entry label; gcm_gmult_avx jumps here
+ movdqu (%rdi),%xmm0 // xmm0 = 16-byte operand at (%rdi)
+ movdqa .Lbswap_mask(%rip),%xmm5 // xmm5 = byte-reversal mask (aligned load; the mask follows .align 64)
+ movdqu (%rsi),%xmm2 // xmm2 = 16 bytes at 0(%rsi)
+ movdqu 32(%rsi),%xmm4 // xmm4 = 16 bytes at 32(%rsi); only its low qword is used below
+.byte 102,15,56,0,197 // pshufb %xmm5,%xmm0 : reverse the byte order of the operand
+ movdqa %xmm0,%xmm1 // xmm1 = copy of operand for the high*high product
+ pshufd $78,%xmm0,%xmm3 // xmm3 = operand with its two qwords swapped
+ pxor %xmm0,%xmm3 // xmm3 = lo^hi in both qwords
+.byte 102,15,58,68,194,0 // pclmulqdq $0x00,%xmm2,%xmm0 : xmm0 = lo(xmm0) * lo(xmm2)
+.byte 102,15,58,68,202,17 // pclmulqdq $0x11,%xmm2,%xmm1 : xmm1 = hi(xmm1) * hi(xmm2)
+.byte 102,15,58,68,220,0 // pclmulqdq $0x00,%xmm4,%xmm3 : xmm3 = lo(xmm3) * lo(xmm4)
+ pxor %xmm0,%xmm3 // middle term ^= low product
+ pxor %xmm1,%xmm3 // middle term ^= high product
+ // Split the middle term across the two products so that xmm1:xmm0 holds the 256-bit result.
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3 // high qword of middle term -> low qword
+ pslldq $8,%xmm4 // low qword of middle term -> high qword
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ // Fold xmm1:xmm0 back into xmm0 with shifts and xors (presumably the reduction_alg9 mentioned in the file header), step 1.
+ movdqa %xmm0,%xmm4 // keep the unshifted low half
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ // Step 2: right-shift folds, then xor in the high half held in xmm1.
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0 // xmm0 = final 128-bit result
+.byte 102,15,56,0,197 // pshufb %xmm5,%xmm0 : restore the original byte order
+ movdqu %xmm0,(%rdi) // store the result over the input operand
+ .byte 0xf3,0xc3 // rep ret
+.cfi_endproc
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+
+.globl gcm_init_htab_avx // exported: builds a 192-byte table at (%rdi) from the 16-byte value at (%rsi); uses only r10 and xmm0-xmm6, no stack
+.type gcm_init_htab_avx,@function
+.align 32
+gcm_init_htab_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2 // xmm2 = 16-byte input value (H, per the comment below)
+ // KCF/ICP stores H in network byte order with the hi qword first
+ // so we need to swap all bytes, not the 2 qwords.
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vpshufb %xmm4,%xmm2,%xmm2
+ // Shift the 128-bit value left by one bit; if the top bit was set, xor in .L0x1c2_polynomial.
+
+ vpshufd $255,%xmm2,%xmm4 // broadcast the top dword to all four lanes
+ vpsrlq $63,%xmm2,%xmm3 // xmm3 = bit 63 of each qword (the carries)
+ vpsllq $1,%xmm2,%xmm2 // each qword << 1
+ vpxor %xmm5,%xmm5,%xmm5 // xmm5 = 0
+ vpcmpgtd %xmm4,%xmm5,%xmm5 // xmm5 = all-ones if the top dword was negative (top bit set), else 0
+ vpslldq $8,%xmm3,%xmm3 // move the low-qword carry into the high qword
+ vpor %xmm3,%xmm2,%xmm2 // complete the 128-bit left shift
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 // select the polynomial constant only when the mask is set
+ vpxor %xmm5,%xmm2,%xmm2 // xmm2 = adjusted multiplier used for every product below
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6 // xmm6 = high qword of xmm2 in both halves
+ vmovdqa %xmm2,%xmm0 // xmm0 = running value, starts as xmm2
+ vpxor %xmm2,%xmm6,%xmm6 // low qword of xmm6 = hi(xmm2)^lo(xmm2), for the middle product
+ movq $4,%r10 // four passes, each emitting one 48-byte table group
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5 // pack: low qword from xmm3, high qword from xmm4 (the hi^lo words computed at the end of the previous pass)
+ vmovdqu %xmm5,-16(%rdi) // store at offset 32 of the previous 48-byte group
+ vpunpckhqdq %xmm0,%xmm0,%xmm3 // xmm0 *= xmm2: three carry-less products followed by the shift/xor fold
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 // xmm1 = hi*hi
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 // xmm0 = lo*lo
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 // xmm3 = (hi^lo)*(hi^lo)
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3 // xmm3 = middle term
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1 // xmm1:xmm0 = 256-bit product
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0 // xmm0 = folded 128-bit result
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5 // xmm5 = current value, stored at offset 0 of this group
+ vpunpckhqdq %xmm0,%xmm0,%xmm3 // same multiply-and-fold sequence as above: xmm0 *= xmm2
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3 // qword-swapped copy of the first value
+ vpshufd $78,%xmm0,%xmm4 // qword-swapped copy of the second value
+ vpxor %xmm5,%xmm3,%xmm3 // xmm3 = hi^lo of the first value (both halves)
+ vmovdqu %xmm5,0(%rdi) // group offset 0: first value
+ vpxor %xmm0,%xmm4,%xmm4 // xmm4 = hi^lo of the second value (both halves)
+ vmovdqu %xmm0,16(%rdi) // group offset 16: second value
+ leaq 48(%rdi),%rdi // advance to the next 48-byte group
+ subq $1,%r10
+ jnz .Linit_loop_avx
+ // Last group: operand order is swapped relative to the in-loop vpalignr, so the two hi^lo qwords are stored in the opposite order.
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3 // rep ret
+.cfi_endproc
+.size gcm_init_htab_avx,.-gcm_init_htab_avx
+
+.globl gcm_gmult_avx // exported: no AVX-specific body; forwards to the code in gcm_gmult_clmul
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+.cfi_startproc
+ jmp .L_gmult_clmul // tail-jump with the argument registers untouched; the target returns to our caller
+.cfi_endproc
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx // exported: folds rcx bytes of input at (%rdx) into the 16-byte state at (%rdi) using the table at (%rsi); modifies rsi, rdx, rcx, r10 and xmm0-xmm15, no stack use
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+.cfi_startproc
+ vzeroupper
+ // NOTE(review): the block counters below only reach zero on exact multiples of 16, so rcx is presumably a nonzero multiple of 16 - confirm with callers.
+ vmovdqu (%rdi),%xmm10 // xmm10 = current 16-byte state
+ leaq .L0x1c2_polynomial(%rip),%r10 // r10 = address of the polynomial constant used by the folds below
+ leaq 64(%rsi),%rsi // bias the table pointer; entries are addressed as N-64(%rsi) from here on
+ vmovdqu .Lbswap_mask(%rip),%xmm13 // xmm13 = byte-reversal mask for the whole function
+ vpshufb %xmm13,%xmm10,%xmm10 // work on the byte-reversed state
+ cmpq $0x80,%rcx
+ jb .Lshort_avx // fewer than 128 bytes: use the block-at-a-time path
+ subq $0x80,%rcx
+ // First 128-byte batch: blocks are loaded from 112(%rdx) down to (%rdx), byte-reversed, and multiplied by table entries; xmm0/xmm3 collect lo products, xmm1/xmm4 hi products, xmm2/xmm5 middle products.
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7 // table offset 32 holds two packed hi^lo qwords (see gcm_init_htab_avx)
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 // $0x10 selects the high qword of the packed entry in xmm7
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15 // first block of the batch; its products are deferred until the state has been xored in
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx // input pointer past this batch
+ cmpq $0x80,%rcx
+ jb .Ltail_avx // no further full batch: finish this one in the tail
+
+ vpxor %xmm10,%xmm15,%xmm15 // fold the running state into the first block of the batch
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx: // each pass completes the previous batch (xmm15 products plus the fold via (%r10)) while loading and multiplying the next 128 bytes
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10 // xmm10 = summed lo products of the previous batch
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11 // xmm11 = summed hi products of the previous batch
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12 // xmm12 = summed middle products of the previous batch
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12 // middle term = mid ^ lo ^ hi
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11 // xmm11:xmm10 = 256-bit sum for the previous batch
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12 // xmm12 = xmm10 with its qwords swapped
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 // first fold: low qword of xmm10 times the high qword of .L0x1c2_polynomial
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 // second fold with the same constant
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15 // fold the finished previous-batch value (xmm12 ^ xmm10) into the first block of this batch
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx // keep looping while the subtraction did not borrow
+ // Undo the final borrow so rcx again holds the bytes still to be processed.
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx // xmm15 already includes the running value, so skip the xor at .Ltail_avx
+
+.align 32
+.Lshort_avx: // handles up to seven 16-byte blocks, walking backwards from the end of the remaining input
+ vmovdqu -16(%rdx,%rcx,1),%xmm14 // last 16-byte block
+ leaq (%rdx,%rcx,1),%rdx // rdx = end of input; blocks are addressed at negative offsets
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+ // Copying xmm0-2 into xmm3-5 makes the next vpxor of each pair produce zero, clearing the accumulators whatever they held.
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx // that was the only block
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7 // bring the high qword of the packed entry down for the next $0x00 product
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7 // loads only the upper qword of the table entry at offset 176
+ subq $0x10,%rcx
+ jmp .Ltail_avx // seventh block: unconditional, no further unrolled steps exist
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15 // fold the running state into the earliest pending block
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12 // xmm12 = .L0x1c2_polynomial
+ // Combine the three accumulator pairs into lo (xmm10), hi (xmm11) and middle (xmm5) sums.
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5 // middle term = mid ^ lo ^ hi
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11 // xmm11:xmm10 = 256-bit sum
+ // Two folds with the polynomial constant bring the sum back to 128 bits in xmm10.
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx // leftover bytes after the 8x path are handled by the short path
+
+ vpshufb %xmm13,%xmm10,%xmm10 // restore the byte order of the state
+ vmovdqu %xmm10,(%rdi) // write the updated state back
+ vzeroupper
+ .byte 0xf3,0xc3 // rep ret
+.cfi_endproc
+.size gcm_ghash_avx,.-gcm_ghash_avx
+.align 64 // constants below are emitted in the current section (.text; no section switch precedes them) and are reached RIP-relative
+.Lbswap_mask: // pshufb control that reverses all 16 bytes of a register
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial: // as a little-endian 128-bit value: low qword 1, high qword 0xc200000000000000
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask: // not referenced by the code in this file
+.long 7,0,7,0
+.L7_mask_poly: // not referenced by the code in this file
+.long 7,0,450,0
+.align 64
+.type .Lrem_4bit,@object
+.Lrem_4bit: // 16 qword entries; not referenced by the code in this file (presumably left over from the table-driven paths the generator also emits)
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type .Lrem_8bit,@object
+.Lrem_8bit: // 256 16-bit entries; not referenced by the code in this file
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+ // The bytes below are the NUL-terminated ASCII ident string "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>".
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/sha1/sha1-x86_64.S
new file mode 100644
index 000000000000..cb923784a730
--- /dev/null
+++ b/module/icp/asm-x86_64/sha1/sha1-x86_64.S
@@ -0,0 +1,1353 @@
+/*
+ * !/usr/bin/env perl
+ *
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. The module is, however, dual licensed under OpenSSL and
+ * CRYPTOGAMS licenses depending on where you obtain it. For further
+ * details see http://www.openssl.org/~appro/cryptogams/.
+ * ====================================================================
+ *
+ * sha1_block procedure for x86_64.
+ *
+ * It was brought to my attention that on EM64T compiler-generated code
+ * was far behind 32-bit assembler implementation. This is unlike on
+ * Opteron where compiler-generated code was only 15% behind 32-bit
+ * assembler, which originally made it hard to motivate the effort.
+ * There was suggestion to mechanically translate 32-bit code, but I
+ * dismissed it, reasoning that x86_64 offers enough register bank
+ * capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+ * implementation:-) However! While 64-bit code does perform better
+ * on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+ * x86_64 does offer larger *addressable* bank, but out-of-order core
+ * reaches for even more registers through dynamic aliasing, and EM64T
+ * core must have managed to run-time optimize even 32-bit code just as
+ * good as 64-bit one. Performance improvement is summarized in the
+ * following table:
+ *
+ * gcc 3.4 32-bit asm cycles/byte
+ * Opteron +45% +20% 6.8
+ * Xeon P4 +65% +0% 9.9
+ * Core2 +60% +10% 7.0
+ *
+ *
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha1-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).
+ *
+ */
+
+/*
+ * This file was generated by a perl script (sha1-x86_64.pl). The comments from
+ * the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sys/sha1.h>
+
+
+/* ARGSUSED */
+void
+sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks)
+{ /* intentionally empty: this stub is compiled only when lint/__lint is defined; the #else branch supplies the assembly ENTRY_NP(sha1_block_data_order) */
+}
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+ENTRY_NP(sha1_block_data_order)
+ push %rbx
+ push %rbp
+ push %r12
+ mov %rsp,%rax
+ mov %rdi,%r8 # reassigned argument
+ sub $72,%rsp
+ mov %rsi,%r9 # reassigned argument
+ and $-64,%rsp
+ mov %rdx,%r10 # reassigned argument
+ mov %rax,64(%rsp)
+
+ mov 0(%r8),%edx
+ mov 4(%r8),%esi
+ mov 8(%r8),%edi
+ mov 12(%r8),%ebp
+ mov 16(%r8),%r11d
+.align 4
+.Lloop:
+ mov 0(%r9),%eax
+ bswap %eax
+ mov %eax,0(%rsp)
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 4(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,4(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 8(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,8(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 12(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,12(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov %r12d,%ebx
+ mov 16(%r9),%eax
+ mov %ebp,%esi
+ xor %edx,%ebx
+ bswap %eax
+ rol $5,%esi
+ and %r11d,%ebx
+ mov %eax,16(%rsp)
+ add %esi,%edi
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ lea 0x5a827999(%eax,%edx),%esi
+ mov %r11d,%ebx
+ mov 20(%r9),%eax
+ mov %edi,%edx
+ xor %r12d,%ebx
+ bswap %eax
+ rol $5,%edx
+ and %ebp,%ebx
+ mov %eax,20(%rsp)
+ add %edx,%esi
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov %ebp,%ebx
+ mov 24(%r9),%eax
+ mov %esi,%r12d
+ xor %r11d,%ebx
+ bswap %eax
+ rol $5,%r12d
+ and %edi,%ebx
+ mov %eax,24(%rsp)
+ add %r12d,%edx
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 28(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,28(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 32(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,32(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 36(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,36(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov %r12d,%ebx
+ mov 40(%r9),%eax
+ mov %ebp,%esi
+ xor %edx,%ebx
+ bswap %eax
+ rol $5,%esi
+ and %r11d,%ebx
+ mov %eax,40(%rsp)
+ add %esi,%edi
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ lea 0x5a827999(%eax,%edx),%esi
+ mov %r11d,%ebx
+ mov 44(%r9),%eax
+ mov %edi,%edx
+ xor %r12d,%ebx
+ bswap %eax
+ rol $5,%edx
+ and %ebp,%ebx
+ mov %eax,44(%rsp)
+ add %edx,%esi
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov %ebp,%ebx
+ mov 48(%r9),%eax
+ mov %esi,%r12d
+ xor %r11d,%ebx
+ bswap %eax
+ rol $5,%r12d
+ and %edi,%ebx
+ mov %eax,48(%rsp)
+ add %r12d,%edx
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 52(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,52(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 56(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,56(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 60(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,60(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov 0(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 8(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%esi
+ xor 32(%rsp),%eax
+ and %r11d,%ebx
+ add %esi,%edi
+ xor 52(%rsp),%eax
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea 0x5a827999(%eax,%edx),%esi
+ mov 4(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 12(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edx
+ xor 36(%rsp),%eax
+ and %ebp,%ebx
+ add %edx,%esi
+ xor 56(%rsp),%eax
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov 8(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 16(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%r12d
+ xor 40(%rsp),%eax
+ and %edi,%ebx
+ add %r12d,%edx
+ xor 60(%rsp),%eax
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov 12(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%r11d
+ xor 44(%rsp),%eax
+ and %esi,%ebx
+ add %r11d,%r12d
+ xor 0(%rsp),%eax
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov 16(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%ebp
+ xor 48(%rsp),%eax
+ and %edx,%ebx
+ add %ebp,%r11d
+ xor 4(%rsp),%eax
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 20(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 28(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 52(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 8(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 24(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 32(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 56(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 12(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 28(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 36(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 60(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 16(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 32(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 40(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 0(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 20(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 36(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 44(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 4(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 24(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,36(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 40(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 48(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 8(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 28(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,40(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 44(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 52(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 12(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 32(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,44(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 48(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 56(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 16(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 36(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,48(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 52(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 60(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 20(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 40(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,52(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 56(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 0(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 24(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 44(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,56(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 60(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 4(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 28(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 48(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,60(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 0(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 8(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 32(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 52(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 4(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 12(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 36(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 56(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 8(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 16(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 40(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 60(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 12(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 44(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 0(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 16(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 48(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 4(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 20(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 28(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 52(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 8(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 24(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 32(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 56(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 12(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 28(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 36(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 60(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 16(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 32(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 40(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 0(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 20(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 36(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 44(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 4(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 24(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,36(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 40(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 48(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 8(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 28(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,40(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 44(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 52(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 12(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 32(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,44(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 48(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 56(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 16(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 36(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,48(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 52(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 60(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 20(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 40(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,52(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 56(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 0(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 24(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 44(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,56(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 60(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 4(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 28(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 48(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,60(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 0(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 8(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 32(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 52(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,0(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 4(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 12(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 36(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 56(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,4(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 8(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 16(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 40(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 60(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,8(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 12(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 20(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 44(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 0(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,12(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 16(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 24(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 48(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 4(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,16(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 20(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 28(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 52(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 8(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,20(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 24(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 32(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 56(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 12(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,24(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 28(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 36(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 60(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 16(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,28(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 32(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 40(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 0(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 20(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,32(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 36(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 44(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 4(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 24(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,36(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 40(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 48(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 8(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 28(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,40(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 44(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 52(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 12(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 32(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,44(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 48(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 56(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 16(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 36(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,48(%rsp)
+ add %ebx,%edx
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 52(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 60(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 40(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,52(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 56(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 0(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 44(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,56(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 60(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 4(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 28(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 48(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,60(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 0(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 8(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 32(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 52(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 4(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 12(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 36(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 56(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 8(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 16(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 40(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 60(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 12(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 20(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 44(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 0(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 16(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 24(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 48(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 4(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 20(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 28(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 52(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 8(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 24(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 32(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 56(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 12(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 28(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 36(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 60(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 16(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 32(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 40(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 0(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 20(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 36(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 44(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 4(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 24(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,36(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 40(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 48(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 8(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 28(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,40(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 44(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 52(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 12(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 32(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,44(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 48(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 56(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 16(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 36(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,48(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 52(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 60(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 20(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 40(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 56(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 0(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 24(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 44(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 60(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 4(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 28(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 48(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor %edi,%ebx
+ add %ebp,%r11d
+ rol $30,%edx
+ add %ebx,%r11d
+ // Update and save state information in SHA-1 context
+ add 0(%r8),%r11d
+ add 4(%r8),%r12d
+ add 8(%r8),%edx
+ add 12(%r8),%esi
+ add 16(%r8),%edi
+ mov %r11d,0(%r8)
+ mov %r12d,4(%r8)
+ mov %edx,8(%r8)
+ mov %esi,12(%r8)
+ mov %edi,16(%r8)
+
+ xchg %r11d,%edx # mov %r11d,%edx
+ xchg %r12d,%esi # mov %r12d,%esi
+ xchg %r11d,%edi # mov %edx,%edi
+ xchg %r12d,%ebp # mov %esi,%ebp
+ # mov %edi,%r11d
+ lea 64(%r9),%r9
+ sub $1,%r10
+ jnz .Lloop
+ mov 64(%rsp),%rsp
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+SET_SIZE(sha1_block_data_order)
+
+.data
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S
new file mode 100644
index 000000000000..766b75355f0b
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha256_impl.S
@@ -0,0 +1,2063 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 - >40%. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's very
+ * same instruction sequence used for both SHA-256 and SHA-512. In
+ * former case the instructions operate on 32-bit operands, while in
+ * latter - on 64-bit ones. All I had to do is to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{	/* Empty stub seen only when lint/__lint is defined; the real body is the assembly in the #else branch. */
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA256TransformBlocks)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%rbp # copy %rsp
+ shl $4,%rdx # num*16
+ sub $16*4+4*8,%rsp
+ lea (%rsi,%rdx,4),%rdx # inp+num*16*4
+ and $-64,%rsp # align stack frame
+ add $8,%rdi # Skip OpenSolaris field, "algotype"
+ mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg
+ mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg
+ mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg
+ mov %rbp,16*4+3*8(%rsp) # save copy of %rsp
+
+ #.picmeup %rbp
+ # The .picmeup pseudo-directive, from perlasm/x86_64-xlate.pl, puts
+ # the address of the "next" instruction into the target register
+ # (%rbp). This generates these 2 instructions:
+ lea .Llea(%rip),%rbp
+ #nop # .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+ lea K256-.(%rbp),%rbp
+
+ mov 4*0(%rdi),%eax
+ mov 4*1(%rdi),%ebx
+ mov 4*2(%rdi),%ecx
+ mov 4*3(%rdi),%edx
+ mov 4*4(%rdi),%r8d
+ mov 4*5(%rdi),%r9d
+ mov 4*6(%rdi),%r10d
+ mov 4*7(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xor %rdi,%rdi
+ mov 4*0(%rsi),%r12d
+ bswap %r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,0(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 4*1(%rsi),%r12d
+ bswap %r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,4(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 4*2(%rsi),%r12d
+ bswap %r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,8(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 4*3(%rsi),%r12d
+ bswap %r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,12(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 4*4(%rsi),%r12d
+ bswap %r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,16(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 4*5(%rsi),%r12d
+ bswap %r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,20(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 4*6(%rsi),%r12d
+ bswap %r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,24(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 4*7(%rsi),%r12d
+ bswap %r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,28(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ mov 4*8(%rsi),%r12d
+ bswap %r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,32(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 4*9(%rsi),%r12d
+ bswap %r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,36(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 4*10(%rsi),%r12d
+ bswap %r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,40(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 4*11(%rsi),%r12d
+ bswap %r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,44(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 4*12(%rsi),%r12d
+ bswap %r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,48(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 4*13(%rsi),%r12d
+ bswap %r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,52(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 4*14(%rsi),%r12d
+ bswap %r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,56(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 4*15(%rsi),%r12d
+ bswap %r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,60(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ mov 4(%rsp),%r13d
+ mov 56(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 36(%rsp),%r12d
+
+ add 0(%rsp),%r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,0(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 8(%rsp),%r13d
+ mov 60(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 40(%rsp),%r12d
+
+ add 4(%rsp),%r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,4(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 12(%rsp),%r13d
+ mov 0(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 44(%rsp),%r12d
+
+ add 8(%rsp),%r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,8(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 16(%rsp),%r13d
+ mov 4(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 48(%rsp),%r12d
+
+ add 12(%rsp),%r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,12(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 20(%rsp),%r13d
+ mov 8(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 52(%rsp),%r12d
+
+ add 16(%rsp),%r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,16(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 24(%rsp),%r13d
+ mov 12(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 56(%rsp),%r12d
+
+ add 20(%rsp),%r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,20(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 28(%rsp),%r13d
+ mov 16(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 60(%rsp),%r12d
+
+ add 24(%rsp),%r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,24(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 32(%rsp),%r13d
+ mov 20(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 0(%rsp),%r12d
+
+ add 28(%rsp),%r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,28(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ mov 36(%rsp),%r13d
+ mov 24(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 4(%rsp),%r12d
+
+ add 32(%rsp),%r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,32(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 40(%rsp),%r13d
+ mov 28(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 8(%rsp),%r12d
+
+ add 36(%rsp),%r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,36(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 44(%rsp),%r13d
+ mov 32(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 12(%rsp),%r12d
+
+ add 40(%rsp),%r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,40(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 48(%rsp),%r13d
+ mov 36(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 16(%rsp),%r12d
+
+ add 44(%rsp),%r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,44(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 52(%rsp),%r13d
+ mov 40(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 20(%rsp),%r12d
+
+ add 48(%rsp),%r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,48(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 56(%rsp),%r13d
+ mov 44(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 24(%rsp),%r12d
+
+ add 52(%rsp),%r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,52(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 60(%rsp),%r13d
+ mov 48(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 28(%rsp),%r12d
+
+ add 56(%rsp),%r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,56(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 0(%rsp),%r13d
+ mov 52(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 32(%rsp),%r12d
+
+ add 60(%rsp),%r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,60(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ cmp $64,%rdi
+ jb .Lrounds_16_xx
+
+ mov 16*4+0*8(%rsp),%rdi
+ lea 16*4(%rsi),%rsi
+
+ add 4*0(%rdi),%eax
+ add 4*1(%rdi),%ebx
+ add 4*2(%rdi),%ecx
+ add 4*3(%rdi),%edx
+ add 4*4(%rdi),%r8d
+ add 4*5(%rdi),%r9d
+ add 4*6(%rdi),%r10d
+ add 4*7(%rdi),%r11d
+
+ cmp 16*4+2*8(%rsp),%rsi
+
+ mov %eax,4*0(%rdi)
+ mov %ebx,4*1(%rdi)
+ mov %ecx,4*2(%rdi)
+ mov %edx,4*3(%rdi)
+ mov %r8d,4*4(%rdi)
+ mov %r9d,4*5(%rdi)
+ mov %r10d,4*6(%rdi)
+ mov %r11d,4*7(%rdi)
+ jb .Lloop
+
+ mov 16*4+3*8(%rsp),%rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+SET_SIZE(SHA256TransformBlocks)
+
+/*
+ * K256: table of 64 32-bit round constants (16 rows of 4 .long values).
+ * The round code adds one entry per round ("T1+=K[round]"), indexing the
+ * table by the round counter with a scale of 4 bytes.
+ *
+ * The visible round code only reads this table, so it is placed in
+ * .rodata instead of the writable .data section. The 64-byte alignment
+ * is kept, and .size is emitted to go with the existing .type so the
+ * symbol carries its object size.
+ */
+.section .rodata
+.align 64
+.type K256,@object
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits /* empty marker section: this object does not need an executable stack */
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S
new file mode 100644
index 000000000000..6e37618761b2
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha512_impl.S
@@ -0,0 +1,2088 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 - >40%. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's the very
+ * same instruction sequence used for both SHA-256 and SHA-512. In the
+ * former case the instructions operate on 32-bit operands, while in the
+ * latter - on 64-bit ones. All I had to do is to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void /* stub compiled only when lint or __lint is defined */
+SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{ /* empty on purpose: the working routine is the assembly in the #else branch */
+}
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA512TransformBlocks)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%rbp # copy %rsp
+ shl $4,%rdx # num*16
+ sub $16*8+4*8,%rsp
+ lea (%rsi,%rdx,8),%rdx # inp+num*16*8
+ and $-64,%rsp # align stack frame
+ add $8,%rdi # Skip OpenSolaris field, "algotype"
+ mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg
+ mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg
+ mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg
+ mov %rbp,16*8+3*8(%rsp) # save copy of %rsp
+
+ #.picmeup %rbp
+ # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+ # the address of the "next" instruction into the target register
+ # (%rbp). This generates these 2 instructions:
+ lea .Llea(%rip),%rbp
+ #nop # .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+ lea K512-.(%rbp),%rbp
+
+ mov 8*0(%rdi),%rax
+ mov 8*1(%rdi),%rbx
+ mov 8*2(%rdi),%rcx
+ mov 8*3(%rdi),%rdx
+ mov 8*4(%rdi),%r8
+ mov 8*5(%rdi),%r9
+ mov 8*6(%rdi),%r10
+ mov 8*7(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xor %rdi,%rdi
+ mov 8*0(%rsi),%r12
+ bswap %r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,0(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 8*1(%rsi),%r12
+ bswap %r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,8(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 8*2(%rsi),%r12
+ bswap %r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,16(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 8*3(%rsi),%r12
+ bswap %r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,24(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 8*4(%rsi),%r12
+ bswap %r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,32(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 8*5(%rsi),%r12
+ bswap %r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,40(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 8*6(%rsi),%r12
+ bswap %r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,48(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 8*7(%rsi),%r12
+ bswap %r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,56(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ mov 8*8(%rsi),%r12
+ bswap %r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,64(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 8*9(%rsi),%r12
+ bswap %r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,72(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 8*10(%rsi),%r12
+ bswap %r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,80(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 8*11(%rsi),%r12
+ bswap %r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,88(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 8*12(%rsi),%r12
+ bswap %r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,96(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 8*13(%rsi),%r12
+ bswap %r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,104(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 8*14(%rsi),%r12
+ bswap %r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,112(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 8*15(%rsi),%r12
+ bswap %r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,120(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ mov 8(%rsp),%r13
+ mov 112(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 72(%rsp),%r12
+
+ add 0(%rsp),%r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,0(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 16(%rsp),%r13
+ mov 120(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 80(%rsp),%r12
+
+ add 8(%rsp),%r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,8(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 24(%rsp),%r13
+ mov 0(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 88(%rsp),%r12
+
+ add 16(%rsp),%r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,16(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 32(%rsp),%r13
+ mov 8(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 96(%rsp),%r12
+
+ add 24(%rsp),%r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,24(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 40(%rsp),%r13
+ mov 16(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 104(%rsp),%r12
+
+ add 32(%rsp),%r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,32(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 48(%rsp),%r13
+ mov 24(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 112(%rsp),%r12
+
+ add 40(%rsp),%r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,40(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 56(%rsp),%r13
+ mov 32(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 120(%rsp),%r12
+
+ add 48(%rsp),%r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,48(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 64(%rsp),%r13
+ mov 40(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 0(%rsp),%r12
+
+ add 56(%rsp),%r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,56(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ mov 72(%rsp),%r13
+ mov 48(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 8(%rsp),%r12
+
+ add 64(%rsp),%r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,64(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 80(%rsp),%r13
+ mov 56(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 16(%rsp),%r12
+
+ add 72(%rsp),%r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,72(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 88(%rsp),%r13
+ mov 64(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 24(%rsp),%r12
+
+ add 80(%rsp),%r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,80(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 96(%rsp),%r13
+ mov 72(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 32(%rsp),%r12
+
+ add 88(%rsp),%r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,88(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 104(%rsp),%r13
+ mov 80(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 40(%rsp),%r12
+
+ add 96(%rsp),%r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,96(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 112(%rsp),%r13
+ mov 88(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 48(%rsp),%r12
+
+ add 104(%rsp),%r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,104(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 120(%rsp),%r13
+ mov 96(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 56(%rsp),%r12
+
+ add 112(%rsp),%r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,112(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 0(%rsp),%r13
+ mov 104(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 64(%rsp),%r12
+
+ add 120(%rsp),%r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,120(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ cmp $80,%rdi
+ jb .Lrounds_16_xx
+
+ mov 16*8+0*8(%rsp),%rdi
+ lea 16*8(%rsi),%rsi
+
+ add 8*0(%rdi),%rax
+ add 8*1(%rdi),%rbx
+ add 8*2(%rdi),%rcx
+ add 8*3(%rdi),%rdx
+ add 8*4(%rdi),%r8
+ add 8*5(%rdi),%r9
+ add 8*6(%rdi),%r10
+ add 8*7(%rdi),%r11
+
+ cmp 16*8+2*8(%rsp),%rsi
+
+ mov %rax,8*0(%rdi)
+ mov %rbx,8*1(%rdi)
+ mov %rcx,8*2(%rdi)
+ mov %rdx,8*3(%rdi)
+ mov %r8,8*4(%rdi)
+ mov %r9,8*5(%rdi)
+ mov %r10,8*6(%rdi)
+ mov %r11,8*7(%rdi)
+ jb .Lloop
+
+ mov 16*8+3*8(%rsp),%rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+SET_SIZE(SHA512TransformBlocks)
+
+/*
+ * K512: the 80 64-bit SHA-512 round constants (FIPS 180-4, section 4.2.3),
+ * stored K[0] first, two constants per .quad line.
+ *
+ * The round code in SHA512TransformBlocks consumes one entry per round via
+ * "add (%rbp,%rdi,8),%r12  # T1+=K[round]" and leaves the round loop once
+ * %rdi reaches 80.  NOTE(review): %rbp is presumably loaded with the address
+ * of K512 in the function prologue - confirm against the start of the
+ * function.
+ *
+ * The table is constant data (the round code only uses it as a source
+ * operand), so it is emitted into .rodata instead of the writable .data
+ * section.  The 64-byte alignment is kept, and .size records the extent of
+ * the object to go with its .type.
+ */
+.section .rodata
+.align 64
+.type K512,@object
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.size K512,.-K512
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif