20 files changed, 11847 insertions, 0 deletions
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
new file mode 100644
index 000000000000..48fea7bb333e
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
@@ -0,0 +1,23 @@
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..92c9e196a318
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,127 @@
+
+  LICENSE ISSUES
+  ==============
+
+  The OpenSSL toolkit stays under a dual license, i.e. both the conditions of
+  the OpenSSL License and the original SSLeay license apply to the toolkit.
+  See below for the actual license texts. Actually both licenses are BSD-style
+  Open Source licenses. In case of any license issues related to OpenSSL
+  please contact openssl-core@openssl.org.
+
+  OpenSSL License
+  ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the routines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/aes/aes_aesni.S b/module/icp/asm-x86_64/aes/aes_aesni.S
new file mode 100644
index 000000000000..4a80c62097ae
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aes_aesni.S
@@ -0,0 +1,748 @@
+/*
+ * ====================================================================
+ * Written by Intel Corporation for the OpenSSL project to add support
+ * for Intel AES-NI instructions. Rights for redistribution and usage
+ * in source and binary forms are granted according to the OpenSSL
+ * license.
+ *
+ *   Author: Huang Ying <ying.huang at intel dot com>
+ *           Vinodh Gopal <vinodh.gopal at intel dot com>
+ *           Kahraman Akdemir
+ *
+ * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
+ * instructions that are going to be introduced in the next generation
+ * of Intel processor, as of 2009. These instructions enable fast and
+ * secure data encryption and decryption, using the Advanced Encryption
+ * Standard (AES), defined by FIPS Publication number 197. The
+ * architecture introduces six instructions that offer full hardware
+ * support for AES. Four of them support high performance data
+ * encryption and decryption, and the other two instructions support
+ * the AES key expansion procedure.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
+ * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
+ * Huang Ying of Intel to the openssl-dev mailing list under the subject
+ * of "Add support to Intel AES-NI instruction set for x86_64 platform".
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Renamed functions, reordered parameters, and changed return value
+ * to match OpenSolaris:
+ *
+ * OpenSSL interface:
+ *	int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ *		const int bits, AES_KEY *key);
+ *	int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ *		const int bits, AES_KEY *key);
+ *	Return values for above are non-zero on error, 0 on success.
+ *
+ *	void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ *		const AES_KEY *key);
+ *	void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ *		const AES_KEY *key);
+ *	typedef struct aes_key_st {
+ *		unsigned int	rd_key[4 *(AES_MAXNR + 1)];
+ *		int		rounds;
+ *		unsigned int	pad[3];
+ *	} AES_KEY;
+ * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
+ * (ks32) instead of 64-bit (ks64)).
+ * Number of rounds (aka round count) is at offset 240 of AES_KEY.
+ *
+ * OpenSolaris OS interface (#ifdefs removed for readability):
+ *	int rijndael_key_setup_dec_intel(uint32_t rk[],
+ *		const uint32_t cipherKey[], uint64_t keyBits);
+ *	int rijndael_key_setup_enc_intel(uint32_t rk[],
+ *		const uint32_t cipherKey[], uint64_t keyBits);
+ *	Return values for above are 0 on error, number of rounds on success.
+ *
+ *	void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ *		const uint32_t pt[4], uint32_t ct[4]);
+ *	void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ *		const uint32_t pt[4], uint32_t ct[4]);
+ *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
+ *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
+ *
+ *	typedef union {
+ *		uint32_t	ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+ *	} aes_ks_t;
+ *	typedef struct aes_key {
+ *		aes_ks_t	encr_ks, decr_ks;
+ *		long double	align128;
+ *		int		flags, nr, type;
+ *	} aes_key_t;
+ *
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ *
+ * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
+ *
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
+    uint32_t ct[4]) {
+}	/* empty lint-only stub; the real routine is the assembly version */
+/* ARGSUSED */
+void
+aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
+    uint32_t pt[4]) {
+}	/* empty lint-only stub; the real routine is the assembly version */
+/* ARGSUSED */
+int
+rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+    uint64_t keyBits) {
+	return (0);	/* lint-only placeholder; the assembly version returns the round count */
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+   uint64_t keyBits) {
+	return (0);	/* lint-only placeholder; the assembly version returns the round count */
+}
+
+
+#elif defined(HAVE_AES)	/* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
+ * _key_expansion_256a(), _key_expansion_256b()
+ *
+ * Helper functions called by rijndael_key_setup_enc_intel().
+ * Also used indirectly by rijndael_key_setup_dec_intel().
+ *
+ * Input:
+ * %xmm0	User-provided cipher key
+ * %xmm1	Round constant
+ * Output:
+ * (%rcx)	AES key
+ */
+
+ENTRY_NP2(_key_expansion_128, _key_expansion_256a)
+_key_expansion_128_local:
+_key_expansion_256a_local:
+	pshufd	$0b11111111, %xmm1, %xmm1	// copy dword 3 of %xmm1 into all four dwords
+	shufps	$0b00010000, %xmm0, %xmm4	// %xmm4 = {0, 0, k1, k0}: k0..k3 are %xmm0's dwords, low to high; needs dword 0 of %xmm4 == 0
+	pxor	%xmm4, %xmm0			// %xmm0 = {k0, k1, k2^k1, k3^k0}
+	shufps	$0b10001100, %xmm0, %xmm4	// %xmm4 = {0, k0, k0, k1^k2}; dword 0 stays 0 for the next call
+	pxor	%xmm4, %xmm0			// %xmm0 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+	pxor	%xmm1, %xmm0			// fold the broadcast keygenassist dword into every lane
+	movups	%xmm0, (%rcx)			// store the new round key
+	add	$0x10, %rcx			// advance %rcx to the next 16-byte slot
+	ret
+	nop					// sits after ret with no label: not reached by fall-through
+SET_SIZE(_key_expansion_128)
+SET_SIZE(_key_expansion_256a)
+
+
+ENTRY_NP(_key_expansion_192a)
+_key_expansion_192a_local:
+	pshufd	$0b01010101, %xmm1, %xmm1	// copy dword 1 of %xmm1 into all four dwords
+	shufps	$0b00010000, %xmm0, %xmm4	// %xmm4 = {0, 0, k1, k0}: k0..k3 are %xmm0's dwords, low to high; needs dword 0 of %xmm4 == 0
+	pxor	%xmm4, %xmm0			// %xmm0 = {k0, k1, k2^k1, k3^k0}
+	shufps	$0b10001100, %xmm0, %xmm4	// %xmm4 = {0, k0, k0, k1^k2}; dword 0 stays 0 for the next call
+	pxor	%xmm4, %xmm0			// %xmm0 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+	pxor	%xmm1, %xmm0			// fold the broadcast keygenassist dword into every lane
+
+	movups	%xmm2, %xmm5			// %xmm5 = copy of %xmm2 to be shifted
+	movups	%xmm2, %xmm6			// %xmm6 = copy of the entry value of %xmm2, stored below
+	pslldq	$4, %xmm5			// shift the copy left by one dword (4 bytes)
+	pshufd	$0b11111111, %xmm0, %xmm3	// broadcast dword 3 of the updated %xmm0
+	pxor	%xmm3, %xmm2			// %xmm2 ^= broadcast dword
+	pxor	%xmm5, %xmm2			// %xmm2 ^= (entry %xmm2 shifted by one dword)
+
+	movups	%xmm0, %xmm1			// %xmm1 = copy of the updated %xmm0
+	shufps	$0b01000100, %xmm0, %xmm6	// %xmm6 = {entry %xmm2 dwords 0-1, new %xmm0 dwords 0-1}
+	movups	%xmm6, (%rcx)			// store first 16 bytes
+	shufps	$0b01001110, %xmm2, %xmm1	// %xmm1 = {new %xmm0 dwords 2-3, new %xmm2 dwords 0-1}
+	movups	%xmm1, 0x10(%rcx)		// store second 16 bytes
+	add	$0x20, %rcx			// advance %rcx past the 32 bytes just written
+	ret
+SET_SIZE(_key_expansion_192a)
+
+
+ENTRY_NP(_key_expansion_192b)
+_key_expansion_192b_local:
+	pshufd	$0b01010101, %xmm1, %xmm1	// copy dword 1 of %xmm1 into all four dwords
+	shufps	$0b00010000, %xmm0, %xmm4	// %xmm4 = {0, 0, k1, k0}: k0..k3 are %xmm0's dwords, low to high; needs dword 0 of %xmm4 == 0
+	pxor	%xmm4, %xmm0			// %xmm0 = {k0, k1, k2^k1, k3^k0}
+	shufps	$0b10001100, %xmm0, %xmm4	// %xmm4 = {0, k0, k0, k1^k2}; dword 0 stays 0 for the next call
+	pxor	%xmm4, %xmm0			// %xmm0 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+	pxor	%xmm1, %xmm0			// fold the broadcast keygenassist dword into every lane
+
+	movups	%xmm2, %xmm5			// %xmm5 = copy of %xmm2 to be shifted
+	pslldq	$4, %xmm5			// shift the copy left by one dword (4 bytes)
+	pshufd	$0b11111111, %xmm0, %xmm3	// broadcast dword 3 of the updated %xmm0
+	pxor	%xmm3, %xmm2			// %xmm2 ^= broadcast dword
+	pxor	%xmm5, %xmm2			// %xmm2 ^= (entry %xmm2 shifted by one dword)
+
+	movups	%xmm0, (%rcx)			// store only the updated %xmm0 (16 bytes)
+	add	$0x10, %rcx			// advance %rcx to the next 16-byte slot
+	ret
+SET_SIZE(_key_expansion_192b)
+
+
+ENTRY_NP(_key_expansion_256b)
+_key_expansion_256b_local:
+	pshufd	$0b10101010, %xmm1, %xmm1	// copy dword 2 of %xmm1 into all four dwords
+	shufps	$0b00010000, %xmm2, %xmm4	// %xmm4 = {0, 0, k1, k0}: k0..k3 are %xmm2's dwords, low to high; needs dword 0 of %xmm4 == 0
+	pxor	%xmm4, %xmm2			// %xmm2 = {k0, k1, k2^k1, k3^k0}
+	shufps	$0b10001100, %xmm2, %xmm4	// %xmm4 = {0, k0, k0, k1^k2}; dword 0 stays 0 for the next call
+	pxor	%xmm4, %xmm2			// %xmm2 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+	pxor	%xmm1, %xmm2			// fold the broadcast keygenassist dword into every lane
+	movups	%xmm2, (%rcx)			// store the new round key (this variant works on %xmm2, not %xmm0)
+	add	$0x10, %rcx			// advance %rcx to the next 16-byte slot
+	ret
+SET_SIZE(_key_expansion_256b)
+
+
+/*
+ * rijndael_key_setup_enc_intel()
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called.  This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
+ * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ *	uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ *	const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+#ifdef	OPENSSL_INTERFACE
+#define	rijndael_key_setup_enc_intel	intel_AES_set_encrypt_key	/* export under the OpenSSL name */
+#define	rijndael_key_setup_dec_intel	intel_AES_set_decrypt_key	/* export under the OpenSSL name */
+
+#define	USERCIPHERKEY		rdi	/* P1, 64 bits */
+#define	KEYSIZE32		esi	/* P2, 32 bits */
+#define	KEYSIZE64		rsi	/* P2, 64 bits */
+#define	AESKEY			rdx	/* P3, 64 bits */
+
+#else	/* OpenSolaris Interface */
+#define	AESKEY			rdi	/* P1, 64 bits */
+#define	USERCIPHERKEY		rsi	/* P2, 64 bits */
+#define	KEYSIZE32		edx	/* P3, 32 bits */
+#define	KEYSIZE64		rdx	/* P3, 64 bits */
+#endif	/* OPENSSL_INTERFACE */
+
+#define	ROUNDS32		KEYSIZE32	/* temp: round count, reuses the key-size register */
+#define	ROUNDS64		KEYSIZE64	/* temp: 64-bit view of the ROUNDS32 register */
+#define	ENDAESKEY		USERCIPHERKEY	/* temp: end-of-schedule pointer, reuses the user-key register */
+
+ENTRY_NP(rijndael_key_setup_enc_intel)
+rijndael_key_setup_enc_intel_local:
+	FRAME_BEGIN
+	// NULL pointer sanity check
+	test	%USERCIPHERKEY, %USERCIPHERKEY
+	jz	.Lenc_key_invalid_param		// user key pointer is NULL
+	test	%AESKEY, %AESKEY
+	jz	.Lenc_key_invalid_param		// key schedule pointer is NULL
+
+	movups	(%USERCIPHERKEY), %xmm0	// user key (first 16 bytes)
+	movups	%xmm0, (%AESKEY)		// round key 0 = first 16 key bytes
+	lea	0x10(%AESKEY), %rcx	// key addr
+	pxor	%xmm4, %xmm4		// xmm4 is assumed 0 in _key_expansion_x
+
+	cmp	$256, %KEYSIZE32
+	jnz	.Lenc_key192			// not 256 bits: try 192
+
+	// AES 256: 14 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+	mov	$14, %ROUNDS32
+	movl	%ROUNDS32, 240(%AESKEY)		// key.rounds = 14
+#endif	/* OPENSSL_INTERFACE */
+
+	movups	0x10(%USERCIPHERKEY), %xmm2	// other user key (2nd 16 bytes)
+	movups	%xmm2, (%rcx)			// round key 1 = second 16 key bytes
+	add	$0x10, %rcx			// %rcx -> slot for round key 2
+
+	aeskeygenassist $0x1, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local	// updates %xmm0, stores it, advances %rcx
+	aeskeygenassist $0x1, %xmm0, %xmm1	// fed from the %xmm0 just produced
+	call	_key_expansion_256b_local	// updates %xmm2, stores it, advances %rcx
+	aeskeygenassist $0x2, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local
+	aeskeygenassist $0x2, %xmm0, %xmm1
+	call	_key_expansion_256b_local
+	aeskeygenassist $0x4, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local
+	aeskeygenassist $0x4, %xmm0, %xmm1
+	call	_key_expansion_256b_local
+	aeskeygenassist $0x8, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local
+	aeskeygenassist $0x8, %xmm0, %xmm1
+	call	_key_expansion_256b_local
+	aeskeygenassist $0x10, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local
+	aeskeygenassist $0x10, %xmm0, %xmm1
+	call	_key_expansion_256b_local
+	aeskeygenassist $0x20, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local
+	aeskeygenassist $0x20, %xmm0, %xmm1
+	call	_key_expansion_256b_local
+	aeskeygenassist $0x40, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_256a_local	// 13th helper call: 15 round keys stored in total
+
+#ifdef	OPENSSL_INTERFACE
+	xor	%rax, %rax			// return 0 (OK)
+#else	/* Open Solaris Interface */
+	mov	$14, %rax			// return # rounds = 14
+#endif
+	FRAME_END
+	ret
+
+.align 4
+.Lenc_key192:
+	cmp	$192, %KEYSIZE32
+	jnz	.Lenc_key128			// not 192 bits: try 128
+
+	// AES 192: 12 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+	mov	$12, %ROUNDS32
+	movl	%ROUNDS32, 240(%AESKEY)	// key.rounds = 12
+#endif	/* OPENSSL_INTERFACE */
+
+	movq	0x10(%USERCIPHERKEY), %xmm2	// other user key (8 bytes at offset 16)
+	aeskeygenassist $0x1, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192a_local	// stores 32 bytes, advances %rcx by 0x20
+	aeskeygenassist $0x2, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192b_local	// stores 16 bytes, advances %rcx by 0x10
+	aeskeygenassist $0x4, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192a_local
+	aeskeygenassist $0x8, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192b_local
+	aeskeygenassist $0x10, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192a_local
+	aeskeygenassist $0x20, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192b_local
+	aeskeygenassist $0x40, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192a_local
+	aeskeygenassist $0x80, %xmm2, %xmm1	// expand the key
+	call	_key_expansion_192b_local
+
+#ifdef	OPENSSL_INTERFACE
+	xor	%rax, %rax			// return 0 (OK)
+#else	/* OpenSolaris Interface */
+	mov	$12, %rax			// return # rounds = 12
+#endif
+	FRAME_END
+	ret
+
+.align 4
+.Lenc_key128:
+	cmp $128, %KEYSIZE32
+	jnz .Lenc_key_invalid_key_bits		// not 128, 192 or 256 bits
+
+	// AES 128: 10 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+	mov	$10, %ROUNDS32
+	movl	%ROUNDS32, 240(%AESKEY)		// key.rounds = 10
+#endif	/* OPENSSL_INTERFACE */
+
+	aeskeygenassist $0x1, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local	// updates %xmm0, stores it, advances %rcx
+	aeskeygenassist $0x2, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x4, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x8, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x10, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x20, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x40, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x80, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x1b, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local
+	aeskeygenassist $0x36, %xmm0, %xmm1	// expand the key
+	call	_key_expansion_128_local	// 10th helper call: 11 round keys stored in total
+
+#ifdef	OPENSSL_INTERFACE
+	xor	%rax, %rax			// return 0 (OK)
+#else	/* OpenSolaris Interface */
+	mov	$10, %rax			// return # rounds = 10
+#endif
+	FRAME_END
+	ret
+
+.Lenc_key_invalid_param:
+#ifdef	OPENSSL_INTERFACE
+	mov	$-1, %rax	// user key or AES key pointer is NULL
+	FRAME_END
+	ret
+#else
+	/* FALLTHROUGH: OpenSolaris reports both error kinds as 0 */
+#endif	/* OPENSSL_INTERFACE */
+
+.Lenc_key_invalid_key_bits:
+#ifdef	OPENSSL_INTERFACE
+	mov	$-2, %rax	// keysize is invalid
+#else	/* Open Solaris Interface */
+	xor	%rax, %rax	// a key pointer is NULL or invalid keysize
+#endif	/* OPENSSL_INTERFACE */
+	FRAME_END
+	ret
+	SET_SIZE(rijndael_key_setup_enc_intel)
+
+
+/*
+ * rijndael_key_setup_dec_intel()
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called.  This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
+ * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ *	uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ * P1->P2, P2->P3, P3->P1
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ *	const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+ENTRY_NP(rijndael_key_setup_dec_intel)
+FRAME_BEGIN
+	// Generate round keys used for encryption
+	call	rijndael_key_setup_enc_intel_local
+	test	%rax, %rax
+#ifdef	OPENSSL_INTERFACE
+	jnz	.Ldec_key_exit	// Failed if returned non-0
+#else	/* OpenSolaris Interface */
+	jz	.Ldec_key_exit	// Failed if returned 0
+#endif	/* OPENSSL_INTERFACE */
+
+	/*
+	 * Convert round keys used for encryption
+	 * to a form usable for decryption
+	 */
+#ifndef	OPENSSL_INTERFACE		/* OpenSolaris Interface */
+	mov	%rax, %ROUNDS64		// set # rounds (10, 12, or 14)
+					// (already set for OpenSSL)
+#endif
+
+	lea	0x10(%AESKEY), %rcx	// key addr
+	shl	$4, %ROUNDS32		// rounds * 16 = byte offset of the last round key
+	add	%AESKEY, %ROUNDS64	// ROUNDS64 -> last round key
+	mov	%ROUNDS64, %ENDAESKEY	// keep that address as the end marker for the aesimc loop
+
+.align 4
+.Ldec_key_reorder_loop:
+	movups	(%AESKEY), %xmm0	// load the key at the front pointer
+	movups	(%ROUNDS64), %xmm1	// load the key at the back pointer
+	movups	%xmm0, (%ROUNDS64)	// swap the two, reversing the schedule in place
+	movups	%xmm1, (%AESKEY)
+	lea	0x10(%AESKEY), %AESKEY	// front pointer moves up one key
+	lea	-0x10(%ROUNDS64), %ROUNDS64	// back pointer moves down one key
+	cmp	%AESKEY, %ROUNDS64
+	ja	.Ldec_key_reorder_loop	// continue while back pointer > front pointer (unsigned)
+
+.align 4
+.Ldec_key_inv_loop:
+	movups	(%rcx), %xmm0
+	// Convert an encryption round key to a form usable for decryption
+	// with the "AES Inverse Mix Columns" instruction
+	aesimc	%xmm0, %xmm1
+	movups	%xmm1, (%rcx)
+	lea	0x10(%rcx), %rcx
+	cmp	%ENDAESKEY, %rcx
+	jnz	.Ldec_key_inv_loop	// stop at the last round key; first and last keys are not converted
+
+.Ldec_key_exit:
+	// OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
+	// OpenSSL: rax = 0 for OK, or non-zero for error
+	FRAME_END
+	ret
+	SET_SIZE(rijndael_key_setup_dec_intel)
+
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called.  This is because %xmm registers are not saved/restored.
+ * The OpenSolaris original cleared/set CR0.TS or saved/restored %xmm
+ * registers on the stack here; this code does neither, so %xmm0 and %xmm1
+ * are clobbered on return.
+ *
+ * Temporary register usage:
+ * %xmm0	State
+ * %xmm1	Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ *	const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ *	const AES_KEY *key)
+ */
+
+#ifdef	OPENSSL_INTERFACE
+#define	aes_encrypt_intel	intel_AES_encrypt	/* export under the OpenSSL name */
+#define	aes_decrypt_intel	intel_AES_decrypt	/* export under the OpenSSL name */
+
+#define	INP		rdi	/* P1, 64 bits */
+#define	OUTP		rsi	/* P2, 64 bits */
+#define	KEYP		rdx	/* P3, 64 bits */
+
+/* No NROUNDS parameter; the round count is loaded from offset 240 of KEYP into %ecx: */
+#define	NROUNDS32	ecx	/* temporary, 32 bits */
+#define	NROUNDS		cl	/* temporary,  8 bits */
+
+#else	/* OpenSolaris Interface */
+#define	KEYP		rdi	/* P1, 64 bits */
+#define	NROUNDS		esi	/* P2, 32 bits */
+#define	INP		rdx	/* P3, 64 bits */
+#define	OUTP		rcx	/* P4, 64 bits */
+#endif	/* OPENSSL_INTERFACE */
+
+#define	STATE		xmm0	/* temporary, 128 bits: block being processed */
+#define	KEY		xmm1	/* temporary, 128 bits: current round key */
+
+
+ENTRY_NP(aes_encrypt_intel)
+
+	movups	(%INP), %STATE			// input
+	movups	(%KEYP), %KEY			// key
+#ifdef	OPENSSL_INTERFACE
+	mov	240(%KEYP), %NROUNDS32		// round count
+#else	/* OpenSolaris Interface */
+	/* Round count is already present as P2 in %rsi/%esi */
+#endif	/* OPENSSL_INTERFACE */
+
+	pxor	%KEY, %STATE			// round 0
+	lea	0x30(%KEYP), %KEYP		// bias KEYP so the shared tail can use fixed offsets
+	cmp	$12, %NROUNDS
+	jb	.Lenc128			// fewer than 12 rounds: AES 128
+	lea	0x20(%KEYP), %KEYP		// two more round keys; lea leaves the cmp flags intact
+	je	.Lenc192			// exactly 12 rounds: AES 192
+
+	// AES 256
+	lea	0x20(%KEYP), %KEYP		// two more round keys again
+	movups	-0x60(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	-0x50(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+
+.align 4
+.Lenc192:
+	// AES 192 and 256
+	movups	-0x40(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	-0x30(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+
+.align 4
+.Lenc128:
+	// AES 128, 192, and 256
+	movups	-0x20(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	-0x10(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x10(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x20(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x30(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x40(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x50(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x60(%KEYP), %KEY
+	aesenc	%KEY, %STATE
+	movups	0x70(%KEYP), %KEY
+	aesenclast	 %KEY, %STATE		// last round
+	movups	%STATE, (%OUTP)			// output
+
+	ret
+	SET_SIZE(aes_encrypt_intel)
+
+
+/*
+ * aes_decrypt_intel()
+ * Decrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called.  This is because %xmm registers are not saved/restored.
+ * The OpenSolaris original cleared/set CR0.TS or saved/restored %xmm
+ * registers on the stack here; this code does neither, so %xmm0 and %xmm1
+ * are clobbered on return.
+ *
+ * Temporary register usage:
+ * %xmm0	State
+ * %xmm1	Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ *	const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ *	const AES_KEY *key);
+ */
+ENTRY_NP(aes_decrypt_intel)
+
+	movups	(%INP), %STATE			// input
+	movups	(%KEYP), %KEY			// key
+#ifdef	OPENSSL_INTERFACE
+	mov	240(%KEYP), %NROUNDS32		// round count
+#else	/* OpenSolaris Interface */
+	/* Round count is already present as P2 in %rsi/%esi */
+#endif	/* OPENSSL_INTERFACE */
+
+	pxor	%KEY, %STATE			// round 0
+	lea	0x30(%KEYP), %KEYP		// bias KEYP so the shared tail can use fixed offsets
+	cmp	$12, %NROUNDS
+	jb	.Ldec128			// fewer than 12 rounds: AES 128
+	lea	0x20(%KEYP), %KEYP		// two more round keys; lea leaves the cmp flags intact
+	je	.Ldec192			// exactly 12 rounds: AES 192
+
+	// AES 256
+	lea	0x20(%KEYP), %KEYP		// two more round keys again
+	movups	-0x60(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	-0x50(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+
+.align 4
+.Ldec192:
+	// AES 192 and 256
+	movups	-0x40(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	-0x30(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+
+.align 4
+.Ldec128:
+	// AES 128, 192, and 256
+	movups	-0x20(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	-0x10(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x10(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x20(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x30(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x40(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x50(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x60(%KEYP), %KEY
+	aesdec	%KEY, %STATE
+	movups	0x70(%KEYP), %KEY
+	aesdeclast	%KEY, %STATE		// last round
+	movups	%STATE, (%OUTP)			// output
+
+	ret
+	SET_SIZE(aes_decrypt_intel)
+
+#endif	/* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits	/* empty flags: object does not need an executable stack */
+#endif	/* __ELF__ */
diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S
new file mode 100644
index 000000000000..9db3a3179230
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aes_amd64.S
@@ -0,0 +1,906 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ *  1. source code distributions include the above copyright notice, this
+ *     list of conditions and the following disclaimer;
+ *
+ *  2. binary distributions include the above copyright notice, this list
+ *     of conditions and the following disclaimer in their documentation;
+ *
+ *  3. the name of the copyright holder is not used to endorse products
+ *     built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue 20/12/2007
+ *
+ * I am grateful to Dag Arne Osvik for many discussions of the techniques that
+ * can be used to optimise AES assembler code on AMD64/EM64T architectures.
+ * Some of the techniques used in this implementation are the result of
+ * suggestions made by him for which I am most grateful.
+ *
+ * An AES implementation for AMD64 processors using the YASM assembler.  This
+ * implementation provides only encryption and decryption, and hence requires
+ * key scheduling support in C. It uses 8k bytes of tables but its encryption
+ * and decryption performance is very close to that obtained using large tables.
+ * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
+ * which are as follows:
+ *               ms windows  gnu/linux/opensolaris os
+ *
+ *   in_blk          rcx     rdi
+ *   out_blk         rdx     rsi
+ *   context (cx)     r8     rdx
+ *
+ *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
+ *   registers       rdi      -      on both
+ *
+ *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
+ *   registers        -      rdi     on both
+ *
+ * The convention used here is that for gnu/linux/opensolaris os.
+ *
+ * This code provides the standard AES block size (128 bits, 16 bytes) and the
+ * three standard AES key sizes (128, 192 and 256 bits). It has the same call
+ * interface as my C implementation.  It uses the Microsoft C AMD64 calling
+ * conventions in which the three parameters are placed in  rcx, rdx and r8
+ * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
+ *
+ * OpenSolaris Note:
+ * Modified to use GNU/Linux/Solaris calling conventions.
+ * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
+ *
+ *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
+ *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
+ *
+ *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
+ *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
+ *
+ *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+ *                                            const aes_encrypt_ctx cx[1])/
+ *
+ *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+ *                                            const aes_decrypt_ctx cx[1])/
+ *
+ *     AES_RETURN aes_encrypt_key(const unsigned char key[],
+ *                           unsigned int len, const aes_encrypt_ctx cx[1])/
+ *
+ *     AES_RETURN aes_decrypt_key(const unsigned char key[],
+ *                           unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
+ * either bits or bytes.
+ *
+ * Comment in/out the following lines to obtain the desired subroutines. These
+ * selections MUST match those in the C header file aesopt.h
+ */
+#define	AES_REV_DKS	  /* define if key decryption schedule is reversed */
+
+#define	LAST_ROUND_TABLES /* define for the faster version using extra tables */
+
+/*
+ * The encryption key schedule has the following in memory layout where N is the
+ * number of rounds (10, 12 or 14):
+ *
+ * lo: | input key (round 0)  |  / each round is four 32-bit words
+ *     | encryption round 1   |
+ *     | encryption round 2   |
+ *     ....
+ *     | encryption round N-1 |
+ * hi: | encryption round N   |
+ *
+ * The decryption key schedule is normally set up so that it has the same
+ * layout as above by actually reversing the order of the encryption key
+ * schedule in memory (this happens when AES_REV_DKS is set):
+ *
+ * lo: | decryption round 0   | =              | encryption round N   |
+ *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
+ *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
+ *     ....                       ....
+ *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
+ * hi: | decryption round N   | =              | input key (round 0)  |
+ *
+ * with rounds except the first and last modified using inv_mix_column()
+ * But if AES_REV_DKS is NOT set the order of keys is left as it is for
+ * encryption so that it has to be accessed in reverse when used for
+ * decryption (although the inverse mix column modifications are done)
+ *
+ * lo: | decryption round 0   | =              | input key (round 0)  |
+ *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
+ *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
+ *     ....                       ....
+ *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * hi: | decryption round N   | =              | encryption round N   |
+ *
+ * This layout is faster when the assembler key scheduling provided here
+ * is used.
+ *
+ * End of user defines
+ */
+
+/*
+ * ---------------------------------------------------------------------------
+ * OpenSolaris OS modifications
+ *
+ * This source originates from Brian Gladman file aes_amd64.asm
+ * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
+ * with these changes:
+ *
+ * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
+ * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
+ * AES_128, AES_192, AES_256, AES_VAR ifdefs.
+ *
+ * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
+ *
+ * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
+ *
+ * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
+ * (operands reversed, literals prefixed with "$", registers prefixed with "%",
+ * and "[register+offset]", addressing changed to "offset(register)",
+ * parenthesis in constant expressions "()" changed to square brackets "[]",
+ * "." removed from  local (numeric) labels, and other changes.
+ * Examples:
+ * Intel/yasm/nasm Syntax	ATT/OpenSolaris Syntax
+ * mov	rax,(4*20h)		mov	$[4*0x20],%rax
+ * mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
+ * lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
+ * sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
+ *
+ * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 6. Renamed functions and reordered parameters to match OpenSolaris:
+ * Original Gladman interface:
+ *	int aes_encrypt(const unsigned char *in,
+ *		unsigned char *out, const aes_encrypt_ctx cx[1])/
+ *	int aes_decrypt(const unsigned char *in,
+ *		unsigned char *out, const aes_decrypt_ctx cx[1])/
+ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
+ * and a union type, inf., containing inf.l, a uint32_t and
+ * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
+ * used and contains the number of rounds * 16, where the number of rounds is
+ * 10, 12, or 14.
+ *
+ * OpenSolaris OS interface:
+ *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ *		const uint32_t pt[4], uint32_t ct[4])/
+ *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ *		const uint32_t ct[4], uint32_t pt[4])/
+ *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
+ *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ */
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+/*
+ * Lint-only placeholder for the assembly routine of the same name.  It is
+ * compiled only when lint or __lint is defined and does nothing.
+ */
+/* ARGSUSED */
+void
+aes_encrypt_amd64(const uint32_t rk[], int Nr,
+    const uint32_t pt[4], uint32_t ct[4])
+{
+}
+/*
+ * Lint-only placeholder for the assembly routine of the same name.  It is
+ * compiled only when lint or __lint is defined and does nothing.
+ */
+/* ARGSUSED */
+void
+aes_decrypt_amd64(const uint32_t rk[], int Nr,
+    const uint32_t ct[4], uint32_t pt[4])
+{
+}
+
+
+#else
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * Number of 32-bit words in the Gladman key schedule array.  4*KS_LENGTH is
+ * the byte offset at which the GLADMAN_INTERFACE code paths below read the
+ * "number of rounds * 16" value from the context.
+ */
+#define	KS_LENGTH	60
+
+/*
+ * 32-bit names for the low halves of the 64-bit registers.
+ * NOTE(review): presumably carried over from the yasm source; no use of these
+ * aliases is visible in the code below - confirm before removing.
+ */
+#define	raxd		eax
+#define	rdxd		edx
+#define	rcxd		ecx
+#define	rbxd		ebx
+#define	rsid		esi
+#define	rdid		edi
+
+/* 8-bit names for the low bytes of the same registers (same note applies) */
+#define	raxb		al
+#define	rdxb		dl
+#define	rcxb		cl
+#define	rbxb		bl
+#define	rsib		sil
+#define	rdib		dil
+
+/*
+ * Assemble-time arithmetic in the AES field GF(2^8).  Square brackets act as
+ * parentheses in these expressions (see change 4 in the notes above).  0x11b
+ * is the AES reduction polynomial x^8 + x^4 + x^3 + x + 1: each bit shifted
+ * out above bit 7 is cancelled by XORing in a suitably scaled copy of it, so
+ * every result fits in one byte.
+ */
+
+// finite field multiplies by {02}, {04} and {08}
+
+#define	f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
+#define	f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
+#define	f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
+
+// finite field multiplies required in table generation
+// (built by XORing the f2/f4/f8 results: {03} = {02}^{01}, {09} = {08}^{01},
+// {0b} = {08}^{02}^{01}, {0d} = {08}^{04}^{01}, {0e} = {08}^{04}^{02})
+
+#define	f3(x) [[f2(x)] ^ [x]]
+#define	f9(x) [[f8(x)] ^ [x]]
+#define	fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
+#define	fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
+#define	fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
+
+// macros for expanding S-box data
+//
+// Each macro turns one S-box value into the eight bytes of one table entry:
+//   u8 - forward round table entry
+//   v8 - inverse round table entry; its last byte is the plain value x
+//   w8 - last-round table entry: the plain value in bytes 0 and 4, zero
+//        elsewhere
+// The four-byte pattern is stored twice so that a 32-bit load at byte offset
+// 0, 1, 2 or 3 (see tab_0..tab_3) returns a byte-rotated copy of it.
+
+#define	u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
+#define	v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
+#define	w8(x) [x], 0, 0, 0, [x], 0, 0, 0
+
+/*
+ * enc_vals(x): emit the 256 entries of the AES forward S-box in index order,
+ * each one expanded by the macro x (u8 or w8 in this file) into eight .byte
+ * values, i.e. 2048 bytes per use.
+ */
+#define	enc_vals(x)	\
+   .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
+   .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
+   .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
+   .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
+   .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
+   .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
+   .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
+   .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
+   .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
+   .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
+   .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
+   .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
+   .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
+   .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
+   .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
+   .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
+   .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
+   .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
+   .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
+   .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
+   .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
+   .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
+   .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
+   .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
+   .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
+   .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
+   .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
+   .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
+   .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
+   .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
+   .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
+   .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
+
+/*
+ * dec_vals(x): emit the 256 entries of the AES inverse S-box in index order,
+ * each one expanded by the macro x (v8 or w8 in this file) into eight .byte
+ * values, i.e. 2048 bytes per use.
+ */
+#define	dec_vals(x) \
+   .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
+   .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
+   .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
+   .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
+   .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
+   .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
+   .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
+   .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
+   .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
+   .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
+   .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
+   .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
+   .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
+   .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
+   .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
+   .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
+   .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
+   .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
+   .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
+   .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
+   .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
+   .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
+   .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
+   .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
+   .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
+   .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
+   .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
+   .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
+   .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
+   .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
+   .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
+   .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
+
+#define	tptr	%rbp	/* table pointer */
+#define	kptr	%r8	/* key schedule pointer */
+#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
+/*
+ * fk_ref(x, y) addresses 32-bit word y of a forward round key.  The encrypt
+ * routine biases kptr by -fofs and then adds 16 * Nr, so fk_ref(x, y)
+ * resolves to byte offset 16 * (Nr - x) + 4 * y in the key schedule: x counts
+ * the rounds that remain and x == 0 selects the final round key.
+ * The macro arguments are not parenthesised; pass plain integer literals.
+ */
+#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)
+
+/*
+ * ik_ref(x, y) is the decryption counterpart.  With AES_REV_DKS the schedule
+ * is stored in the order it is consumed and the addressing matches fk_ref.
+ * Without it the decrypt routine leaves kptr biased by -rofs only, so
+ * ik_ref(x, y) resolves to byte offset 16 * x + 4 * y and the schedule is
+ * walked from its high end downwards.
+ */
+#ifdef	AES_REV_DKS
+#define	rofs		128
+#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)
+
+#else
+#define	rofs		-128
+#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
+#endif	/* AES_REV_DKS */
+
+/*
+ * Table operands.  x is a 64-bit register holding a byte index; entries are
+ * eight bytes apart.  tab_0..tab_3 are used as 32-bit operands starting at
+ * byte offsets 0, 3, 2 and 1 of the entry, which yields the stored
+ * four-byte pattern rotated by 0, 1, 2 and 3 byte positions.
+ * tab_f and tab_i are used as single-byte operands by the round variants
+ * that do not use the last-round tables: byte 1 of a u8 entry and byte 7 of
+ * a v8 entry both hold the plain S-box value.
+ */
+#define	tab_0(x)	(tptr,x,8)
+#define	tab_1(x)	3(tptr,x,8)
+#define	tab_2(x)	2(tptr,x,8)
+#define	tab_3(x)	1(tptr,x,8)
+#define	tab_f(x)	1(tptr,x,8)
+#define	tab_i(x)	7(tptr,x,8)
+
+/*
+ * ff_rnd: one full forward (encryption) round.
+ *
+ * In:   %eax, %ebx, %ecx, %edx hold state words 1..4; tptr addresses the
+ *       round table and kptr the biased key schedule.
+ * Out:  the new state, accumulated in p1..p4 and then copied back into
+ *       %eax..%edx.
+ * Uses: %rsi and %rdi as table indices; the flags are modified by shr/xor.
+ *
+ * p1..p4 start out as the round key words fk_ref(round, 0..3).  Each state
+ * word is then split into its four bytes, every byte indexes the table via
+ * tab_0..tab_3, and the word that is read is XORed into one of p1..p4:
+ * byte 0 of state word i goes to output word i, and bytes 1, 2 and 3 go to
+ * output words i-1, i-2 and i-3 (wrapping around within 1..4).
+ */
+#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
+	mov	fk_ref(round,0), p1; \
+	mov	fk_ref(round,1), p2; \
+	mov	fk_ref(round,2), p3; \
+	mov	fk_ref(round,3), p4; \
+ \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	shr	$16, %eax; \
+	xor	tab_0(%rsi), p1; \
+	xor	tab_1(%rdi), p4; \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	xor	tab_2(%rsi), p3; \
+	xor	tab_3(%rdi), p2; \
+ \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	shr	$16, %ebx; \
+	xor	tab_0(%rsi), p2; \
+	xor	tab_1(%rdi), p1; \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	xor	tab_2(%rsi), p4; \
+	xor	tab_3(%rdi), p3; \
+ \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	shr	$16, %ecx; \
+	xor	tab_0(%rsi), p3; \
+	xor	tab_1(%rdi), p2; \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	xor	tab_2(%rsi), p1; \
+	xor	tab_3(%rdi), p4; \
+ \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	shr	$16, %edx; \
+	xor	tab_0(%rsi), p4; \
+	xor	tab_1(%rdi), p3; \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	xor	tab_2(%rsi), p2; \
+	xor	tab_3(%rdi), p1; \
+ \
+	mov	p1, %eax; \
+	mov	p2, %ebx; \
+	mov	p3, %ecx; \
+	mov	p4, %edx
+
+#ifdef	LAST_ROUND_TABLES
+
+/*
+ * fl_rnd (LAST_ROUND_TABLES variant): final forward round.
+ *
+ * tptr is first advanced by 2048 bytes (256 entries of 8 bytes), which moves
+ * it from the table emitted by enc_vals(u8) to the one emitted right after
+ * it by enc_vals(w8).  The byte-to-output routing is the same as in ff_rnd,
+ * but a w8 entry only places the S-box value into one byte lane, so no
+ * column mixing takes place.
+ *
+ * The result is left in p1..p4 and is NOT copied back to %eax..%edx; tptr
+ * stays advanced.  %rsi, %rdi and the flags are modified.
+ */
+#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
+	add	$2048, tptr; \
+	mov	fk_ref(round,0), p1; \
+	mov	fk_ref(round,1), p2; \
+	mov	fk_ref(round,2), p3; \
+	mov	fk_ref(round,3), p4; \
+ \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	shr	$16, %eax; \
+	xor	tab_0(%rsi), p1; \
+	xor	tab_1(%rdi), p4; \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	xor	tab_2(%rsi), p3; \
+	xor	tab_3(%rdi), p2; \
+ \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	shr	$16, %ebx; \
+	xor	tab_0(%rsi), p2; \
+	xor	tab_1(%rdi), p1; \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	xor	tab_2(%rsi), p4; \
+	xor	tab_3(%rdi), p3; \
+ \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	shr	$16, %ecx; \
+	xor	tab_0(%rsi), p3; \
+	xor	tab_1(%rdi), p2; \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	xor	tab_2(%rsi), p1; \
+	xor	tab_3(%rdi), p4; \
+ \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	shr	$16, %edx; \
+	xor	tab_0(%rsi), p4; \
+	xor	tab_1(%rdi), p3; \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	xor	tab_2(%rsi), p2; \
+	xor	tab_3(%rdi), p1
+
+#else
+
+/*
+ * fl_rnd (variant without last-round tables): final forward round.
+ *
+ * LAST_ROUND_TABLES is defined near the top of this file, so this variant is
+ * only assembled if that define is removed.  Instead of a second table it
+ * reads the single S-box byte of each entry through tab_f and moves it into
+ * its byte lane with rol by 8, 16 or 24 before XORing it into p1..p4.  The
+ * byte-to-output routing matches ff_rnd.
+ *
+ * The result is left in p1..p4 and is not copied back to %eax..%edx.
+ * %rsi, %rdi and the flags are modified.
+ */
+#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
+	mov	fk_ref(round,0), p1; \
+	mov	fk_ref(round,1), p2; \
+	mov	fk_ref(round,2), p3; \
+	mov	fk_ref(round,3), p4; \
+ \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	shr	$16, %eax; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	xor	%esi, p1; \
+	rol	$8, %edi; \
+	xor	%edi, p4; \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p3; \
+	xor	%edi, p2; \
+ \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	shr	$16, %ebx; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	xor	%esi, p2; \
+	rol	$8, %edi; \
+	xor	%edi, p1; \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p4; \
+	xor	%edi, p3; \
+ \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	shr	$16, %ecx; \
+	xor	%esi, p3; \
+	rol	$8, %edi; \
+	xor	%edi, p2; \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p1; \
+	xor	%edi, p4; \
+ \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	shr	$16, %edx; \
+	xor	%esi, p4; \
+	rol	$8, %edi; \
+	xor	%edi, p3; \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	movzx	tab_f(%rsi), %esi; \
+	movzx	tab_f(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p2; \
+	xor	%edi, p1
+
+#endif	/* LAST_ROUND_TABLES */
+
+/*
+ * ii_rnd: one full inverse (decryption) round.
+ *
+ * In:   %eax, %ebx, %ecx, %edx hold state words 1..4; tptr addresses the
+ *       inverse round table and kptr the biased key schedule.
+ * Out:  the new state, accumulated in p1..p4 and then copied back into
+ *       %eax..%edx.
+ * Uses: %rsi and %rdi as table indices; the flags are modified by shr/xor.
+ *
+ * p1..p4 start out as the round key words ik_ref(round, 0..3).  The byte
+ * routing runs in the opposite direction to ff_rnd: byte 0 of state word i
+ * goes to output word i, and bytes 1, 2 and 3 go to output words i+1, i+2
+ * and i+3 (wrapping around within 1..4).
+ */
+#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
+	mov	ik_ref(round,0), p1; \
+	mov	ik_ref(round,1), p2; \
+	mov	ik_ref(round,2), p3; \
+	mov	ik_ref(round,3), p4; \
+ \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	shr	$16, %eax; \
+	xor	tab_0(%rsi), p1; \
+	xor	tab_1(%rdi), p2; \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	xor	tab_2(%rsi), p3; \
+	xor	tab_3(%rdi), p4; \
+ \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	shr	$16, %ebx; \
+	xor	tab_0(%rsi), p2; \
+	xor	tab_1(%rdi), p3; \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	xor	tab_2(%rsi), p4; \
+	xor	tab_3(%rdi), p1; \
+ \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	shr	$16, %ecx; \
+	xor	tab_0(%rsi), p3; \
+	xor	tab_1(%rdi), p4; \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	xor	tab_2(%rsi), p1; \
+	xor	tab_3(%rdi), p2; \
+ \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	shr	$16, %edx; \
+	xor	tab_0(%rsi), p4; \
+	xor	tab_1(%rdi), p1; \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	xor	tab_2(%rsi), p2; \
+	xor	tab_3(%rdi), p3; \
+ \
+	mov	p1, %eax; \
+	mov	p2, %ebx; \
+	mov	p3, %ecx; \
+	mov	p4, %edx
+
+#ifdef	LAST_ROUND_TABLES
+
+/*
+ * il_rnd (LAST_ROUND_TABLES variant): final inverse round.
+ *
+ * tptr is first advanced by 2048 bytes (256 entries of 8 bytes), which moves
+ * it from the table emitted by dec_vals(v8) to the one emitted right after
+ * it by dec_vals(w8).  The byte-to-output routing is the same as in ii_rnd,
+ * but a w8 entry only places the inverse S-box value into one byte lane.
+ *
+ * The result is left in p1..p4 and is NOT copied back to %eax..%edx; tptr
+ * stays advanced.  %rsi, %rdi and the flags are modified.
+ */
+#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
+	add	$2048, tptr; \
+	mov	ik_ref(round,0), p1; \
+	mov	ik_ref(round,1), p2; \
+	mov	ik_ref(round,2), p3; \
+	mov	ik_ref(round,3), p4; \
+ \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	shr	$16, %eax; \
+	xor	tab_0(%rsi), p1; \
+	xor	tab_1(%rdi), p2; \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	xor	tab_2(%rsi), p3; \
+	xor	tab_3(%rdi), p4; \
+ \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	shr	$16, %ebx; \
+	xor	tab_0(%rsi), p2; \
+	xor	tab_1(%rdi), p3; \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	xor	tab_2(%rsi), p4; \
+	xor	tab_3(%rdi), p1; \
+ \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	shr	$16, %ecx; \
+	xor	tab_0(%rsi), p3; \
+	xor	tab_1(%rdi), p4; \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	xor	tab_2(%rsi), p1; \
+	xor	tab_3(%rdi), p2; \
+ \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	shr	$16, %edx; \
+	xor	tab_0(%rsi), p4; \
+	xor	tab_1(%rdi), p1; \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	xor	tab_2(%rsi), p2; \
+	xor	tab_3(%rdi), p3
+
+#else
+
+/*
+ * il_rnd (variant without last-round tables): final inverse round.
+ *
+ * LAST_ROUND_TABLES is defined near the top of this file, so this variant is
+ * only assembled if that define is removed.  It reads the plain inverse
+ * S-box byte of each entry through tab_i and moves it into its byte lane
+ * with rol by 8, 16 or 24 before XORing it into p1..p4.  The byte-to-output
+ * routing matches ii_rnd.
+ *
+ * The result is left in p1..p4 and is not copied back to %eax..%edx.
+ * %rsi, %rdi and the flags are modified.
+ */
+#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
+	mov	ik_ref(round,0), p1; \
+	mov	ik_ref(round,1), p2; \
+	mov	ik_ref(round,2), p3; \
+	mov	ik_ref(round,3), p4; \
+ \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	shr	$16, %eax; \
+	xor	%esi, p1; \
+	rol	$8, %edi; \
+	xor	%edi, p2; \
+	movzx	%al, %esi; \
+	movzx	%ah, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p3; \
+	xor	%edi, p4; \
+ \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	shr	$16, %ebx; \
+	xor	%esi, p2; \
+	rol	$8, %edi; \
+	xor	%edi, p3; \
+	movzx	%bl, %esi; \
+	movzx	%bh, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p4; \
+	xor	%edi, p1; \
+ \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	shr	$16, %ecx; \
+	xor	%esi, p3; \
+	rol	$8, %edi; \
+	xor	%edi, p4; \
+	movzx	%cl, %esi; \
+	movzx	%ch, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p1; \
+	xor	%edi, p2; \
+ \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	shr	$16, %edx; \
+	xor	%esi, p4; \
+	rol	$8, %edi; \
+	xor	%edi, p1; \
+	movzx	%dl, %esi; \
+	movzx	%dh, %edi; \
+	movzx	tab_i(%rsi), %esi; \
+	movzx	tab_i(%rdi), %edi; \
+	rol	$16, %esi; \
+	rol	$24, %edi; \
+	xor	%esi, p2; \
+	xor	%edi, p3
+
+#endif	/* LAST_ROUND_TABLES */
+
+/*
+ * OpenSolaris OS:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ *	const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_encrypt(const unsigned char *in,
+ *	unsigned char *out, const aes_encrypt_ctx cx[1])/
+ */
+/*
+ * Encryption lookup tables.  They are never written, so on ELF targets they
+ * are placed in the read-only data section instead of writable .data.
+ * Layout: 256 eight-byte round-table entries, followed (when
+ * LAST_ROUND_TABLES is defined) by 256 eight-byte last-round entries.
+ */
+#ifdef	__ELF__
+.section	.rodata
+#else
+.data
+#endif
+.align	64
+enc_tab:
+	enc_vals(u8)
+#ifdef	LAST_ROUND_TABLES
+	// Last Round Tables:
+	enc_vals(w8)
+#endif
+
+
+/*
+ * Register use inside the routine:
+ *   %eax, %ebx, %ecx, %edx	the four 32-bit state words
+ *   %r9d, %r10d, %r11d, %r12d	round output, passed as p1..p4 to the macros
+ *   %rsi, %rdi			table indices inside the round macros
+ *   %rbp (tptr)		lookup table pointer
+ *   %r8 (kptr)			biased key schedule pointer
+ * %rbx, %rbp and %r12 are saved on the stack and restored before returning.
+ * No calls are made.  On exit %rax is 0, or -1 if the round count is not
+ * 10, 12 or 14 (in which case the output block is not written).
+ */
+ENTRY_NP(aes_encrypt_amd64)
+#ifdef	GLADMAN_INTERFACE
+	// Original interface
+	sub	$[4*8], %rsp	// gnu/linux/opensolaris binary interface
+	mov	%rsi, (%rsp)	// output pointer (P2)
+	mov	%rdx, %r8	// context (P3)
+
+	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
+	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
+	mov	%r12, 3*8(%rsp)	// P3: context in r8
+	movzx	4*KS_LENGTH(kptr), %esi	// Get number of rounds * 16
+
+#else
+	// OpenSolaris OS interface
+	sub	$[4*8], %rsp	// Make room on stack to save registers
+	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
+	mov	%rdi, %r8	// context (P1)
+	mov	%rdx, %rdi	// P3: save input pointer
+	shl	$4, %esi	// P2: esi = Nr * 16 (also clears upper rsi)
+
+	mov	%rbx, 1*8(%rsp)	// Save registers
+	mov	%rbp, 2*8(%rsp)
+	mov	%r12, 3*8(%rsp)
+	// P1: context in r8
+	// P2: number of rounds * 16 in esi
+	// P3: input pointer in rdi
+	// P4: output pointer in (rsp)
+#endif	/* GLADMAN_INTERFACE */
+
+	lea	enc_tab(%rip), tptr
+	sub	$fofs, kptr	// bias so that fk_ref() displacements stay small
+
+	// Load input block into registers
+	mov	(%rdi), %eax
+	mov	1*4(%rdi), %ebx
+	mov	2*4(%rdi), %ecx
+	mov	3*4(%rdi), %edx
+
+	// Round 0: XOR in the first round key (start of the key schedule)
+	xor	fofs(kptr), %eax
+	xor	fofs+4(kptr), %ebx
+	xor	fofs+8(kptr), %ecx
+	xor	fofs+12(kptr), %edx
+
+	// Point kptr (still biased by fofs) at the last round key, so that
+	// fk_ref(n, ...) selects the key n rounds before the last one.
+	lea	(kptr,%rsi), kptr
+	// Jump based on number of rounds * 16:
+	cmp	$[10*16], %esi
+	je	3f
+	cmp	$[12*16], %esi
+	je	2f
+	cmp	$[14*16], %esi
+	je	1f
+	mov	$-1, %rax	// error: unsupported round count
+	jmp	4f
+
+	// Perform normal forward rounds; the longer key sizes enter earlier
+1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
+	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
+	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)
+
+	// Copy results (fl_rnd leaves them in r9d..r12d)
+	mov	(%rsp), %rbx	// output pointer saved at entry
+	mov	%r9d, (%rbx)
+	mov	%r10d, 4(%rbx)
+	mov	%r11d, 8(%rbx)
+	mov	%r12d, 12(%rbx)
+	xor	%eax, %eax	// success; the 32-bit write clears all of rax
+4:	// Restore registers
+	mov	1*8(%rsp), %rbx
+	mov	2*8(%rsp), %rbp
+	mov	3*8(%rsp), %r12
+	add	$[4*8], %rsp
+	ret
+
+	SET_SIZE(aes_encrypt_amd64)
+
+/*
+ * OpenSolaris OS:
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ *	const uint32_t ct[4], uint32_t pt[4])/
+ *
+ * Original interface:
+ * int aes_decrypt(const unsigned char *in,
+ *	unsigned char *out, const aes_decrypt_ctx cx[1])/
+ */
+/*
+ * Decryption lookup tables.  They are never written, so on ELF targets they
+ * are placed in the read-only data section instead of writable .data.
+ * Layout: 256 eight-byte round-table entries, followed (when
+ * LAST_ROUND_TABLES is defined) by 256 eight-byte last-round entries.
+ */
+#ifdef	__ELF__
+.section	.rodata
+#else
+.data
+#endif
+.align	64
+dec_tab:
+	dec_vals(v8)
+#ifdef	LAST_ROUND_TABLES
+	// Last Round Tables:
+	dec_vals(w8)
+#endif
+
+
+/*
+ * Register use inside the routine:
+ *   %eax, %ebx, %ecx, %edx	the four 32-bit state words
+ *   %r9d, %r10d, %r11d, %r12d	round output, passed as p1..p4 to the macros
+ *   %rsi, %rdi			table indices inside the round macros
+ *   %rbp (tptr)		lookup table pointer
+ *   %r8 (kptr)			biased key schedule pointer
+ * %rbx, %rbp and %r12 are saved on the stack and restored before returning.
+ * No calls are made.  On exit %rax is 0, or -1 if the round count is not
+ * 10, 12 or 14 (in which case the output block is not written).
+ */
+ENTRY_NP(aes_decrypt_amd64)
+#ifdef	GLADMAN_INTERFACE
+	// Original interface
+	sub	$[4*8], %rsp	// gnu/linux/opensolaris binary interface
+	mov	%rsi, (%rsp)	// output pointer (P2)
+	mov	%rdx, %r8	// context (P3)
+
+	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
+	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
+	mov	%r12, 3*8(%rsp)	// P3: context in r8
+	movzx	4*KS_LENGTH(kptr), %esi	// Get number of rounds * 16
+
+#else
+	// OpenSolaris OS interface
+	sub	$[4*8], %rsp	// Make room on stack to save registers
+	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
+	mov	%rdi, %r8	// context (P1)
+	mov	%rdx, %rdi	// P3: save input pointer
+	shl	$4, %esi	// P2: esi = Nr * 16 (also clears upper rsi)
+
+	mov	%rbx, 1*8(%rsp)	// Save registers
+	mov	%rbp, 2*8(%rsp)
+	mov	%r12, 3*8(%rsp)
+	// P1: context in r8
+	// P2: number of rounds * 16 in esi
+	// P3: input pointer in rdi
+	// P4: output pointer in (rsp)
+#endif	/* GLADMAN_INTERFACE */
+
+	lea	dec_tab(%rip), tptr
+	sub	$rofs, kptr	// bias so that ik_ref() displacements stay small
+
+	// Load input block into registers
+	mov	(%rdi), %eax
+	mov	1*4(%rdi), %ebx
+	mov	2*4(%rdi), %ecx
+	mov	3*4(%rdi), %edx
+
+	// The input pointer is no longer needed; %rdi now selects the key
+	// used for round 0.
+#ifdef AES_REV_DKS
+	// Reversed schedule: round 0 key is at the start; kptr moves to the
+	// other end so that ik_ref() can count down towards it.
+	mov	kptr, %rdi
+	lea	(kptr,%rsi), kptr
+#else
+	// Schedule kept in encryption order: round 0 key is at offset Nr * 16
+	lea	(kptr,%rsi), %rdi
+#endif
+
+	// Round 0: XOR in the first decryption round key
+	xor	rofs(%rdi), %eax
+	xor	rofs+4(%rdi), %ebx
+	xor	rofs+8(%rdi), %ecx
+	xor	rofs+12(%rdi), %edx
+
+	// Jump based on number of rounds * 16:
+	cmp	$[10*16], %esi
+	je	3f
+	cmp	$[12*16], %esi
+	je	2f
+	cmp	$[14*16], %esi
+	je	1f
+	mov	$-1, %rax	// error: unsupported round count
+	jmp	4f
+
+	// Perform normal inverse rounds; the longer key sizes enter earlier
+1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
+	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
+	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)
+
+	// Copy results (il_rnd leaves them in r9d..r12d)
+	mov	(%rsp), %rbx	// output pointer saved at entry
+	mov	%r9d, (%rbx)
+	mov	%r10d, 4(%rbx)
+	mov	%r11d, 8(%rbx)
+	mov	%r12d, 12(%rbx)
+	xor	%eax, %eax	// success; the 32-bit write clears all of rax
+4:	// Restore registers
+	mov	1*8(%rsp), %rbx
+	mov	2*8(%rsp), %rbp
+	mov	3*8(%rsp), %r12
+	add	$[4*8], %rsp
+	ret
+
+	SET_SIZE(aes_decrypt_amd64)
+#endif	/* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/aes/aeskey.c b/module/icp/asm-x86_64/aes/aeskey.c
new file mode 100644
index 000000000000..c3d1f2990874
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aeskey.c
@@ -0,0 +1,580 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ *  1. source code distributions include the above copyright notice, this
+ *	 list of conditions and the following disclaimer;
+ *
+ *  2. binary distributions include the above copyright notice, this list
+ *	 of conditions and the following disclaimer in their documentation;
+ *
+ *  3. the name of the copyright holder is not used to endorse products
+ *	 built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ */
+
+#include <aes/aes_impl.h>
+#include "aesopt.h"
+#include "aestab.h"
+#include "aestab2.h"
+
+/*
+ *	Initialise the key schedule from the user supplied key. The key
+ *	length can be specified in bytes, with legal values of 16, 24
+ *	and 32, or in bits, with legal values of 128, 192 and 256. These
+ *	values correspond with Nk values of 4, 6 and 8 respectively.
+ *
+ *	The following macros implement a single cycle in the key
+ *	schedule generation process. The number of cycles needed
+ *	for each cx->n_col and nk value is:
+ *
+ *	nk =		4  5  6  7  8
+ *	------------------------------
+ *	cx->n_col = 4	10  9  8  7  7
+ *	cx->n_col = 5	14 11 10  9  9
+ *	cx->n_col = 6	19 15 12 11 11
+ *	cx->n_col = 7	21 19 16 13 14
+ *	cx->n_col = 8	29 23 19 17 14
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added header files aes_impl.h and aestab2.h
+ * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined)
+ * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C,
+ *	AES_128, AES_192, AES_256, AES_VAR defines
+ * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void"
+ * 6. Changed N_COLS to MAX_AES_NB
+ * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with
+ *	OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and
+ *	rijndael_key_setup_dec_amd64
+ * 8. cstyled code and removed lint warnings
+ */
+
+/*
+ * Code-size reduction: when REDUCE_CODE_SIZE is defined, ls_box and
+ * inv_mcol are redirected to the functions ls_sub and im_sub (only
+ * declared here), and unrolling of both key schedules is switched off
+ * so the loop forms of the key setup routines below are compiled.
+ */
+#if defined(REDUCE_CODE_SIZE)
+#define	ls_box ls_sub
+	uint32_t	ls_sub(const uint32_t t, const uint32_t n);
+#define	inv_mcol im_sub
+	uint32_t	im_sub(const uint32_t x);
+#ifdef ENC_KS_UNROLL
+#undef ENC_KS_UNROLL
+#endif
+#ifdef DEC_KS_UNROLL
+#undef DEC_KS_UNROLL
+#endif
+#endif	/* REDUCE_CODE_SIZE */
+
+
+/*
+ * ke4(k, i): one cycle of the 128-bit encryption key schedule.
+ *
+ * Relies on a local array ss[] in the caller holding the four most
+ * recently generated words.  ss[0] is updated with ls_box(ss[3], 3) and
+ * entry i of the t_use(r, c) table (presumably the round constants -
+ * confirm in aestab.h); each following word is then XORed with the word
+ * just produced.  The four new words are stored in k[4 * i + 4] through
+ * k[4 * i + 7].  The statement order matters because every line consumes
+ * the value written by the previous one.
+ */
+#define	ke4(k, i) \
+{	k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+	k[4 * (i) + 5] = ss[1] ^= ss[0]; \
+	k[4 * (i) + 6] = ss[2] ^= ss[1]; \
+	k[4 * (i) + 7] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * Build the 128-bit encryption key schedule.
+ *
+ * key	16 bytes of cipher key, read through word_in()
+ * rk	output schedule; words 0..43 are written (4 key words followed by
+ *	ten ke4() cycles of 4 words each)
+ */
+static void
+aes_encrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+	uint32_t	ss[4];
+
+	/* The first four schedule words are the cipher key itself. */
+	ss[0] = word_in(key, 0);
+	rk[0] = ss[0];
+	ss[1] = word_in(key, 1);
+	rk[1] = ss[1];
+	ss[2] = word_in(key, 2);
+	rk[2] = ss[2];
+	ss[3] = word_in(key, 3);
+	rk[3] = ss[3];
+
+	/* Cycles 0..8, either spelled out or driven by a counter. */
+#ifdef ENC_KS_UNROLL
+	ke4(rk, 0);
+	ke4(rk, 1);
+	ke4(rk, 2);
+	ke4(rk, 3);
+	ke4(rk, 4);
+	ke4(rk, 5);
+	ke4(rk, 6);
+	ke4(rk, 7);
+	ke4(rk, 8);
+#else
+	{
+		uint32_t	cycle = 0;
+
+		while (cycle < 9) {
+			ke4(rk, cycle);
+			++cycle;
+		}
+	}
+#endif	/* ENC_KS_UNROLL */
+
+	/* The tenth and final cycle is always emitted inline. */
+	ke4(rk, 9);
+}
+
+
+/*
+ * kef6(k, i): the first four words of cycle i of the 192-bit encryption
+ * key schedule, stored in k[6 * i + 6] through k[6 * i + 9].  Uses the
+ * caller's local ss[] (six running words); ss[0] is updated with
+ * ls_box(ss[5], 3) and entry i of the t_use(r, c) table, and each
+ * following word is XORed with the word just produced.  Used on its own
+ * for the last, partial cycle.
+ */
+#define	kef6(k, i) \
+{	k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+	k[6 * (i) + 7] = ss[1] ^= ss[0]; \
+	k[6 * (i) + 8] = ss[2] ^= ss[1]; \
+	k[6 * (i) + 9] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * ke6(k, i): a complete six-word cycle - kef6() followed by the two
+ * remaining words, stored in k[6 * i + 10] and k[6 * i + 11].
+ */
+#define	ke6(k, i) \
+{	kef6(k, i); \
+	k[6 * (i) + 10] = ss[4] ^= ss[3]; \
+	k[6 * (i) + 11] = ss[5] ^= ss[4]; \
+}
+
+/*
+ * Build the 192-bit encryption key schedule.
+ *
+ * key	24 bytes of cipher key, read through word_in()
+ * rk	output schedule; words 0..51 are written (6 key words, seven full
+ *	ke6() cycles of 6 words and one partial kef6() cycle of 4 words)
+ */
+static void
+aes_encrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+	uint32_t	ss[6];
+
+	/* The first six schedule words are the cipher key itself. */
+	ss[0] = word_in(key, 0);
+	rk[0] = ss[0];
+	ss[1] = word_in(key, 1);
+	rk[1] = ss[1];
+	ss[2] = word_in(key, 2);
+	rk[2] = ss[2];
+	ss[3] = word_in(key, 3);
+	rk[3] = ss[3];
+	ss[4] = word_in(key, 4);
+	rk[4] = ss[4];
+	ss[5] = word_in(key, 5);
+	rk[5] = ss[5];
+
+	/* Full cycles 0..6, either spelled out or driven by a counter. */
+#ifdef ENC_KS_UNROLL
+	ke6(rk, 0);
+	ke6(rk, 1);
+	ke6(rk, 2);
+	ke6(rk, 3);
+	ke6(rk, 4);
+	ke6(rk, 5);
+	ke6(rk, 6);
+#else
+	{
+		uint32_t	cycle = 0;
+
+		while (cycle < 7) {
+			ke6(rk, cycle);
+			++cycle;
+		}
+	}
+#endif	/* ENC_KS_UNROLL */
+
+	/* Only four more words are needed, so finish with the short form. */
+	kef6(rk, 7);
+}
+
+
+
+/*
+ * kef8(k, i): the first four words of cycle i of the 256-bit encryption
+ * key schedule, stored in k[8 * i + 8] through k[8 * i + 11].  Uses the
+ * caller's local ss[] (eight running words); ss[0] is updated with
+ * ls_box(ss[7], 3) and entry i of the t_use(r, c) table, and each
+ * following word is XORed with the word just produced.  Used on its own
+ * for the last, partial cycle.
+ */
+#define	kef8(k, i) \
+{	k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+	k[8 * (i) + 9] = ss[1] ^= ss[0]; \
+	k[8 * (i) + 10] = ss[2] ^= ss[1]; \
+	k[8 * (i) + 11] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * ke8(k, i): a complete eight-word cycle - kef8() followed by the second
+ * half, stored in k[8 * i + 12] through k[8 * i + 15].  The second half
+ * starts with an extra ls_box(ss[3], 0) step and uses no table entry.
+ */
+#define	ke8(k, i) \
+{   kef8(k, i); \
+	k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \
+	k[8 * (i) + 13] = ss[5] ^= ss[4]; \
+	k[8 * (i) + 14] = ss[6] ^= ss[5]; \
+	k[8 * (i) + 15] = ss[7] ^= ss[6]; \
+}
+
+/*
+ * Build the 256-bit encryption key schedule.
+ *
+ * key	32 bytes of cipher key, read through word_in()
+ * rk	output schedule; words 0..59 are written (8 key words, six full
+ *	ke8() cycles of 8 words and one partial kef8() cycle of 4 words)
+ */
+static void
+aes_encrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+	uint32_t	ss[8];
+
+	/* The first eight schedule words are the cipher key itself. */
+	ss[0] = word_in(key, 0);
+	rk[0] = ss[0];
+	ss[1] = word_in(key, 1);
+	rk[1] = ss[1];
+	ss[2] = word_in(key, 2);
+	rk[2] = ss[2];
+	ss[3] = word_in(key, 3);
+	rk[3] = ss[3];
+	ss[4] = word_in(key, 4);
+	rk[4] = ss[4];
+	ss[5] = word_in(key, 5);
+	rk[5] = ss[5];
+	ss[6] = word_in(key, 6);
+	rk[6] = ss[6];
+	ss[7] = word_in(key, 7);
+	rk[7] = ss[7];
+
+	/* Full cycles 0..5, either spelled out or driven by a counter. */
+#ifdef ENC_KS_UNROLL
+	ke8(rk, 0);
+	ke8(rk, 1);
+	ke8(rk, 2);
+	ke8(rk, 3);
+	ke8(rk, 4);
+	ke8(rk, 5);
+#else
+	{
+		uint32_t	cycle = 0;
+
+		while (cycle < 6) {
+			ke8(rk, cycle);
+			++cycle;
+		}
+	}
+#endif	/* ENC_KS_UNROLL */
+
+	/* Only four more words are needed, so finish with the short form. */
+	kef8(rk, 6);
+}
+
+
+/*
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size (10, 12 or
+ * 14), or 0 if keyBits is not a supported size, in which case rk is not
+ * written.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk		AES key schedule 32-bit array to be initialized
+ * cipherKey	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
+    int keyBits)
+{
+	/* View the key as bytes without casting away its const qualifier. */
+	const unsigned char	*key = (const unsigned char *)cipherKey;
+
+	switch (keyBits) {
+	case 128:
+		aes_encrypt_key128(key, rk);
+		return (10);
+	case 192:
+		aes_encrypt_key192(key, rk);
+		return (12);
+	case 256:
+		aes_encrypt_key256(key, rk);
+		return (14);
+	default: /* should never get here */
+		break;
+	}
+
+	/* Unsupported key size: tell the caller no schedule was built. */
+	return (0);
+}
+
+
+/* this is used to store the decryption round keys  */
+/* in forward or reverse order */
+
+/*
+ * v(n, i) maps decryption schedule word index i to the index it is
+ * stored at.  With AES_REV_DKS, writing i = 4 * q + r (r = i & 3) gives
+ * n - 4 * q + r: the four-word round keys are laid out in reverse order
+ * while the word order inside each round key is preserved.  The callers
+ * in this file pass n = 40, 48 or 56.  Without AES_REV_DKS the index is
+ * used unchanged.
+ */
+#ifdef AES_REV_DKS
+#define	v(n, i)  ((n) - (i) + 2 * ((i) & 3))
+#else
+#define	v(n, i)  (i)
+#endif
+
+/*
+ * ff(x) is the transform applied to decryption round key words by the
+ * unrolled schedules below: the identity when DEC_ROUND == NO_TABLES,
+ * inv_mcol(x) otherwise.  In the latter case d_vars is set to dec_imvars
+ * when that macro exists, so the key setup functions can declare
+ * whatever locals it expands to.
+ */
+#if DEC_ROUND == NO_TABLES
+#define	ff(x)   (x)
+#else
+#define	ff(x)   inv_mcol(x)
+#if defined(dec_imvars)
+#define	d_vars  dec_imvars
+#endif
+#endif	/* DEC_ROUND == NO_TABLES */
+
+
+/*
+ * k4e(k, i): one cycle of the 128-bit key schedule, identical in form to
+ * the encryption cycle except that the four new words are stored through
+ * v(40, ...), i.e. at the decryption schedule positions.  No ff()
+ * transform is applied here; the non-unrolled path of
+ * aes_decrypt_key128() applies inv_mcol() in a separate pass.
+ */
+#define	k4e(k, i) \
+{	k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+	k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \
+	k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \
+	k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * Unrolled 128-bit decryption key schedule cycles.  All three macros use
+ * the caller's local ss[] (five words; ss[4] is scratch) and store their
+ * results through v(40, ...):
+ *
+ *	kdf4	first cycle (i == 0); stores ff() of each new word
+ *	kd4	middle cycles; applies ff() to the per-cycle increment and
+ *		XORs it onto the words stored by the previous cycle
+ *	kdl4	last cycle; stores the words without ff()
+ *
+ * Two variants are provided; the "#if 1" below selects the first.  In
+ * that variant the macro argument is parenthesized wherever it is used
+ * in arithmetic so that an expression argument indexes ss[] correctly.
+ */
+#if 1
+
+#define	kdf4(k, i) \
+{	ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
+	ss[1] = ss[1] ^ ss[3]; \
+	ss[2] = ss[2] ^ ss[3]; \
+	ss[4] = ls_box(ss[((i) + 3) % 4], 3) ^ t_use(r, c)[i]; \
+	ss[(i) % 4] ^= ss[4]; \
+	ss[4] ^= k[v(40, (4 * (i)))];   k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \
+	ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \
+	ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \
+	ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \
+}
+
+#define	kd4(k, i) \
+{	ss[4] = ls_box(ss[((i) + 3) % 4], 3) ^ t_use(r, c)[i]; \
+	ss[(i) % 4] ^= ss[4]; ss[4] = ff(ss[4]); \
+	k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+	k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+	k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+	k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define	kdl4(k, i) \
+{	ss[4] = ls_box(ss[((i) + 3) % 4], 3) ^ t_use(r, c)[i]; \
+	ss[(i) % 4] ^= ss[4]; \
+	k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
+	k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \
+	k[v(40, (4 * (i)) + 6)] = ss[0]; \
+	k[v(40, (4 * (i)) + 7)] = ss[1]; \
+}
+
+#else
+
+#define	kdf4(k, i) \
+{	ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+	k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \
+	ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \
+	ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \
+	ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \
+}
+
+#define	kd4(k, i) \
+{	ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+	ss[0] ^= ss[4]; \
+	ss[4] = ff(ss[4]); \
+	k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+	ss[1] ^= ss[0]; \
+	k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+	ss[2] ^= ss[1]; \
+	k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+	ss[3] ^= ss[2]; \
+	k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define	kdl4(k, i) \
+{	ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+	k[v(40, (4 * (i)) + 4)] = ss[0]; \
+	ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \
+	ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \
+	ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \
+}
+
+#endif
+
+/*
+ * Build the 128-bit decryption key schedule.
+ *
+ * key	16 bytes of cipher key, read through word_in()
+ * rk	output schedule, indexed through v(40, ...) so the layout follows
+ *	the AES_REV_DKS setting
+ *
+ * With DEC_KS_UNROLL the kdf4/kd4/kdl4 macros produce the final words
+ * directly.  Otherwise the schedule is generated with k4e() and, unless
+ * DEC_ROUND == NO_TABLES, inv_mcol() is then applied in place to words
+ * MAX_AES_NB .. 10 * MAX_AES_NB - 1.
+ */
+static void
+aes_decrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+	/* ss[0..3] are the running schedule words; ss[4] is macro scratch. */
+	uint32_t	ss[5];
+#if defined(d_vars)
+	d_vars;
+#endif
+	/* The cipher key words are stored untransformed. */
+	rk[v(40, (0))] = ss[0] = word_in(key, 0);
+	rk[v(40, (1))] = ss[1] = word_in(key, 1);
+	rk[v(40, (2))] = ss[2] = word_in(key, 2);
+	rk[v(40, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+	/* First cycle, eight middle cycles, then the last cycle. */
+	kdf4(rk, 0); kd4(rk, 1);
+	kd4(rk, 2);  kd4(rk, 3);
+	kd4(rk, 4);  kd4(rk, 5);
+	kd4(rk, 6);  kd4(rk, 7);
+	kd4(rk, 8);  kdl4(rk, 9);
+#else
+	{
+		uint32_t	i;
+		for (i = 0; i < 10; ++i)
+			k4e(rk, i);
+#if !(DEC_ROUND == NO_TABLES)
+		/* Transform every stored word except the first MAX_AES_NB */
+		/* and the words from index 10 * MAX_AES_NB onwards. */
+		for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i)
+			rk[i] = inv_mcol(rk[i]);
+#endif
+	}
+#endif	/* DEC_KS_UNROLL */
+}
+
+
+
+/*
+ * 192-bit decryption key schedule cycles.  All macros use the caller's
+ * local ss[] (ss[0..5] are the running schedule words, ss[6] is scratch
+ * for kd6) and store their results through v(48, ...).
+ *
+ * k6ef(k, i): first four words of cycle i, stored untransformed.
+ */
+#define	k6ef(k, i) \
+{	k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+	k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \
+	k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \
+	k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \
+}
+
+/* k6e(k, i): a complete six-word cycle, stored untransformed. */
+#define	k6e(k, i) \
+{	k6ef(k, i); \
+	k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \
+	k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \
+}
+
+/*
+ * kdf6(k, i): first unrolled cycle.  Advances ss[0..5] and stores ff()
+ * of each new word.
+ */
+#define	kdf6(k, i) \
+{	ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+	k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \
+	ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \
+	ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \
+	ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \
+	ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \
+	ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \
+}
+
+/*
+ * kd6(k, i): middle unrolled cycle.  ss[6] starts as ff() of the cycle
+ * increment and is XORed in turn with each word stored by the previous
+ * cycle to give the new stored word, while ss[0..5] keep advancing.
+ */
+#define	kd6(k, i) \
+{	ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+	ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
+	k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \
+	ss[1] ^= ss[0]; \
+	k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \
+	ss[2] ^= ss[1]; \
+	k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \
+	ss[3] ^= ss[2]; \
+	k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \
+	ss[4] ^= ss[3]; \
+	k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \
+	ss[5] ^= ss[4]; \
+	k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \
+}
+
+/*
+ * kdl6(k, i): last unrolled cycle.  Produces only four words and stores
+ * them without ff().
+ */
+#define	kdl6(k, i) \
+{	ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+	k[v(48, (6 * (i)) + 6)] = ss[0]; \
+	ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \
+	ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \
+	ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \
+}
+
+/*
+ * Build the 192-bit decryption key schedule.
+ *
+ * key	24 bytes of cipher key, read through word_in()
+ * rk	output schedule, indexed through v(48, ...) so the layout follows
+ *	the AES_REV_DKS setting
+ *
+ * With DEC_KS_UNROLL the kdf6/kd6/kdl6 macros produce the final words
+ * directly.  Otherwise the schedule is generated with k6e()/k6ef() and,
+ * unless DEC_ROUND == NO_TABLES, inv_mcol() is then applied in place to
+ * words MAX_AES_NB .. 12 * MAX_AES_NB - 1.
+ */
+static void
+aes_decrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+	/* ss[0..5] are the running schedule words; ss[6] is macro scratch. */
+	uint32_t	ss[7];
+#if defined(d_vars)
+	d_vars;
+#endif
+	/* The first four key words are stored untransformed. */
+	rk[v(48, (0))] = ss[0] = word_in(key, 0);
+	rk[v(48, (1))] = ss[1] = word_in(key, 1);
+	rk[v(48, (2))] = ss[2] = word_in(key, 2);
+	rk[v(48, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+	/* Key words 4 and 5 are stored with ff() applied; ss[] keeps the */
+	/* raw values for the cycles that follow. */
+	ss[4] = word_in(key, 4);
+	rk[v(48, (4))] = ff(ss[4]);
+	ss[5] = word_in(key, 5);
+	rk[v(48, (5))] = ff(ss[5]);
+	/* First cycle, six middle cycles, then the short last cycle. */
+	kdf6(rk, 0); kd6(rk, 1);
+	kd6(rk, 2);  kd6(rk, 3);
+	kd6(rk, 4);  kd6(rk, 5);
+	kd6(rk, 6);  kdl6(rk, 7);
+#else
+	rk[v(48, (4))] = ss[4] = word_in(key, 4);
+	rk[v(48, (5))] = ss[5] = word_in(key, 5);
+	{
+		uint32_t	i;
+
+		/* Seven full cycles followed by one four-word cycle. */
+		for (i = 0; i < 7; ++i)
+			k6e(rk, i);
+		k6ef(rk, 7);
+#if !(DEC_ROUND == NO_TABLES)
+		/* Transform every stored word except the first MAX_AES_NB */
+		/* and the words from index 12 * MAX_AES_NB onwards. */
+		for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i)
+			rk[i] = inv_mcol(rk[i]);
+#endif
+	}
+#endif
+}
+
+
+
+/*
+ * 256-bit decryption key schedule cycles.  All macros use the caller's
+ * local ss[] (ss[0..7] are the running schedule words, ss[8] is scratch
+ * for kd8) and store their results through v(56, ...).
+ *
+ * k8ef(k, i): first four words of cycle i, stored untransformed.
+ */
+#define	k8ef(k, i) \
+{	k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+	k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \
+	k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \
+	k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \
+}
+
+/*
+ * k8e(k, i): a complete eight-word cycle, stored untransformed.  The
+ * second half starts with an extra ls_box(ss[3], 0) step.
+ */
+#define	k8e(k, i) \
+{	k8ef(k, i); \
+	k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \
+	k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \
+	k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \
+	k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \
+}
+
+/*
+ * kdf8(k, i): first unrolled cycle.  Advances ss[0..7] and stores ff()
+ * of each new word.
+ */
+#define	kdf8(k, i) \
+{	ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+	k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \
+	ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \
+	ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \
+	ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \
+	ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \
+	ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \
+	ss[6] ^= ss[5]; k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \
+	ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \
+}
+
+/*
+ * kd8(k, i): middle unrolled cycle, done in two halves.  In each half
+ * ss[8] starts as ff() of that half's increment and is XORed in turn
+ * with each word stored by the previous cycle to give the new stored
+ * word, while ss[0..7] keep advancing.
+ */
+#define	kd8(k, i) \
+{	ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+	ss[0] ^= ss[8]; \
+	ss[8] = ff(ss[8]); \
+	k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \
+	ss[1] ^= ss[0]; \
+	k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \
+	ss[2] ^= ss[1]; \
+	k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \
+	ss[3] ^= ss[2]; \
+	k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \
+	ss[8] = ls_box(ss[3], 0); \
+	ss[4] ^= ss[8]; \
+	ss[8] = ff(ss[8]); \
+	k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \
+	ss[5] ^= ss[4]; \
+	k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \
+	ss[6] ^= ss[5]; \
+	k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \
+	ss[7] ^= ss[6]; \
+	k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \
+}
+
+/*
+ * kdl8(k, i): last unrolled cycle.  Produces only four words and stores
+ * them without ff().
+ */
+#define	kdl8(k, i) \
+{	ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+	k[v(56, (8 * (i)) + 8)] = ss[0]; \
+	ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \
+	ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \
+	ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \
+}
+
+/*
+ * Build the 256-bit decryption key schedule.
+ *
+ * key	32 bytes of cipher key, read through word_in()
+ * rk	output schedule, indexed through v(56, ...) so the layout follows
+ *	the AES_REV_DKS setting
+ *
+ * With DEC_KS_UNROLL the kdf8/kd8/kdl8 macros produce the final words
+ * directly.  Otherwise the schedule is generated with k8e()/k8ef() and,
+ * unless DEC_ROUND == NO_TABLES, inv_mcol() is then applied in place to
+ * words MAX_AES_NB .. 14 * MAX_AES_NB - 1.
+ */
+static void
+aes_decrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+	/* ss[0..7] are the running schedule words; ss[8] is macro scratch. */
+	uint32_t	ss[9];
+#if defined(d_vars)
+	d_vars;
+#endif
+	/* The first four key words are stored untransformed. */
+	rk[v(56, (0))] = ss[0] = word_in(key, 0);
+	rk[v(56, (1))] = ss[1] = word_in(key, 1);
+	rk[v(56, (2))] = ss[2] = word_in(key, 2);
+	rk[v(56, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+	/* Key words 4..7 are stored with ff() applied; ss[] keeps the */
+	/* raw values for the cycles that follow. */
+	ss[4] = word_in(key, 4);
+	rk[v(56, (4))] = ff(ss[4]);
+	ss[5] = word_in(key, 5);
+	rk[v(56, (5))] = ff(ss[5]);
+	ss[6] = word_in(key, 6);
+	rk[v(56, (6))] = ff(ss[6]);
+	ss[7] = word_in(key, 7);
+	rk[v(56, (7))] = ff(ss[7]);
+	/* First cycle, five middle cycles, then the short last cycle. */
+	kdf8(rk, 0); kd8(rk, 1);
+	kd8(rk, 2);  kd8(rk, 3);
+	kd8(rk, 4);  kd8(rk, 5);
+	kdl8(rk, 6);
+#else
+	rk[v(56, (4))] = ss[4] = word_in(key, 4);
+	rk[v(56, (5))] = ss[5] = word_in(key, 5);
+	rk[v(56, (6))] = ss[6] = word_in(key, 6);
+	rk[v(56, (7))] = ss[7] = word_in(key, 7);
+	{
+		uint32_t	i;
+
+		/* Six full cycles followed by one four-word cycle. */
+		for (i = 0; i < 6; ++i)
+			k8e(rk,  i);
+		k8ef(rk,  6);
+#if !(DEC_ROUND == NO_TABLES)
+		/* Transform every stored word except the first MAX_AES_NB */
+		/* and the words from index 14 * MAX_AES_NB onwards. */
+		for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i)
+			rk[i] = inv_mcol(rk[i]);
+#endif
+	}
+#endif	/* DEC_KS_UNROLL */
+}
+
+
+/*
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size (10, 12 or
+ * 14), or 0 if keyBits is not a supported size, in which case rk is not
+ * written.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk		AES key schedule 32-bit array to be initialized
+ * cipherKey	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[],
+    int keyBits)
+{
+	/* View the key as bytes without casting away its const qualifier. */
+	const unsigned char	*key = (const unsigned char *)cipherKey;
+
+	switch (keyBits) {
+	case 128:
+		aes_decrypt_key128(key, rk);
+		return (10);
+	case 192:
+		aes_decrypt_key192(key, rk);
+		return (12);
+	case 256:
+		aes_decrypt_key256(key, rk);
+		return (14);
+	default: /* should never get here */
+		break;
+	}
+
+	/* Unsupported key size: tell the caller no schedule was built. */
+	return (0);
+}
diff --git a/module/icp/asm-x86_64/aes/aesopt.h b/module/icp/asm-x86_64/aes/aesopt.h
new file mode 100644
index 000000000000..472111f96e59
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aesopt.h
@@ -0,0 +1,770 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ *  1. source code distributions include the above copyright notice, this
+ *	list of conditions and the following disclaimer;
+ *
+ *  2. binary distributions include the above copyright notice, this list
+ *	of conditions and the following disclaimer in their documentation;
+ *
+ *  3. the name of the copyright holder is not used to endorse products
+ *	built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the compilation options for AES (Rijndael) and code
+ * that is common across encryption, key scheduling and table generation.
+ *
+ * OPERATION
+ *
+ * These source code files implement the AES algorithm Rijndael designed by
+ * Joan Daemen and Vincent Rijmen. This version is designed for the standard
+ * block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
+ * and 32 bytes).
+ *
+ * This version is designed for flexibility and speed using operations on
+ * 32-bit words rather than operations on bytes.  It can be compiled with
+ * either big or little endian internal byte order but is faster when the
+ * native byte order for the processor is used.
+ *
+ * THE CIPHER INTERFACE
+ *
+ * The cipher interface is implemented as an array of bytes in which lower
+ * AES bit sequence indexes map to higher numeric significance within bytes.
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added __cplusplus and _AESOPT_H header guards
+ * 2. Added header files sys/types.h and aes_impl.h
+ * 3. Added defines for AES_ENCRYPT, AES_DECRYPT, AES_REV_DKS, and ASM_AMD64_C
+ * 4. Moved defines for IS_BIG_ENDIAN, IS_LITTLE_ENDIAN, PLATFORM_BYTE_ORDER
+ *    from brg_endian.h
+ * 5. Undefined VIA_ACE_POSSIBLE and ASSUME_VIA_ACE_PRESENT
+ * 6. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 7. Defined aes_sw32 as htonl() for byte swapping
+ * 8. Cstyled and hdrchk code
+ *
+ */
+
+#ifndef _AESOPT_H
+#define	_AESOPT_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <aes/aes_impl.h>
+
+/*  SUPPORT FEATURES */
+#define	AES_ENCRYPT /* if support for encryption is needed */
+#define	AES_DECRYPT /* if support for decryption is needed */
+
+/*  PLATFORM-SPECIFIC FEATURES */
+#define	IS_BIG_ENDIAN		4321 /* byte 0 is most significant (mc68k) */
+#define	IS_LITTLE_ENDIAN	1234 /* byte 0 is least significant (i386) */
+#define	PLATFORM_BYTE_ORDER	IS_LITTLE_ENDIAN
+#define	AES_REV_DKS /* define to reverse decryption key schedule */
+
+
+/*
+ *  CONFIGURATION - THE USE OF DEFINES
+ *	Later in this section there are a number of defines that control the
+ *	operation of the code.  In each section, the purpose of each define is
+ *	explained so that the relevant form can be included or excluded by
+ *	setting either 1's or 0's respectively on the branches of the related
+ *	#if clauses.  The following local defines should not be changed.
+ */
+
+#define	ENCRYPTION_IN_C	1
+#define	DECRYPTION_IN_C	2
+#define	ENC_KEYING_IN_C	4
+#define	DEC_KEYING_IN_C	8
+
+#define	NO_TABLES	0
+#define	ONE_TABLE	1
+#define	FOUR_TABLES	4
+#define	NONE		0
+#define	PARTIAL		1
+#define	FULL		2
+
+/*  --- START OF USER CONFIGURED OPTIONS --- */
+
+/*
+ *  1. BYTE ORDER WITHIN 32 BIT WORDS
+ *
+ *	The fundamental data processing units in Rijndael are 8-bit bytes. The
+ *	input, output and key input are all enumerated arrays of bytes in which
+ *	bytes are numbered starting at zero and increasing to one less than the
+ *	number of bytes in the array in question. This enumeration is only used
+ *	for naming bytes and does not imply any adjacency or order relationship
+ *	from one byte to another. When these inputs and outputs are considered
+ *	as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
+ *	byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
+ *	In this implementation bits are numbered from 0 to 7 starting at the
+ *	numerically least significant end of each byte.  Bit n represents 2^n.
+ *
+ *	However, Rijndael can be implemented more efficiently using 32-bit
+ *	words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
+ *	into word[n]. While in principle these bytes can be assembled into words
+ *	in any positions, this implementation only supports the two formats in
+ *	which bytes in adjacent positions within words also have adjacent byte
+ *	numbers. This order is called big-endian if the lowest numbered bytes
+ *	in words have the highest numeric significance and little-endian if the
+ *	opposite applies.
+ *
+ *	This code can work in either order irrespective of the order used by the
+ *	machine on which it runs. Normally the internal byte order will be set
+ *	to the order of the processor on which the code is to be run but this
+ *	define can be used to reverse this in special situations.
+ *
+ *	WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
+ *	This define will hence be redefined later (in section 4) if necessary
+ */
+
+#if 1
+#define	ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#elif 0
+#define	ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0
+#define	ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#error The algorithm byte order is not defined
+#endif
+
+/*  2. VIA ACE SUPPORT */
+
+#if defined(__GNUC__) && defined(__i386__) || \
+	defined(_WIN32) && defined(_M_IX86) && \
+	!(defined(_WIN64) || defined(_WIN32_WCE) || \
+	defined(_MSC_VER) && (_MSC_VER <= 800))
+#define	VIA_ACE_POSSIBLE
+#endif
+
+/*
+ *  Define this option if support for the VIA ACE is required. This uses
+ *  inline assembler instructions and is only implemented for the Microsoft,
+ *  Intel and GCC compilers.  If VIA ACE is known to be present, then defining
+ *  ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
+ *  code.  If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
+ *  it is detected (both present and enabled) but the normal AES code will
+ *  also be present.
+ *
+ *  When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
+ *  aligned; other input/output buffers do not need to be 16 byte aligned
+ *  but there are very large performance gains if this can be arranged.
+ *  VIA ACE also requires the decryption key schedule to be in reverse
+ *  order (which later checks below ensure).
+ */
+
+/*  VIA ACE is not used here for OpenSolaris: */
+#undef	VIA_ACE_POSSIBLE
+#undef	ASSUME_VIA_ACE_PRESENT
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define	USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(ASSUME_VIA_ACE_PRESENT)
+#define	ASSUME_VIA_ACE_PRESENT
+#endif
+
+
+/*
+ *  3. ASSEMBLER SUPPORT
+ *
+ *	This define (which can be on the command line) enables the use of the
+ *	assembler code routines for encryption, decryption and key scheduling
+ *	as follows:
+ *
+ *	ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
+ *		encryption and decryption but with key scheduling in C
+ *	ASM_X86_V2  uses assembler (aes_x86_v2.asm) with compressed tables for
+ *		encryption, decryption and key scheduling
+ *	ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
+ *		encryption and decryption but with key scheduling in C
+ *	ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
+ *		encryption and decryption but with key scheduling in C
+ *
+ *	Change one 'if 0' below to 'if 1' to select the version or define
+ *	as a compilation option.
+ */
+
+#if 0 && !defined(ASM_X86_V1C)
+#define	ASM_X86_V1C
+#elif 0 && !defined(ASM_X86_V2)
+#define	ASM_X86_V2
+#elif 0 && !defined(ASM_X86_V2C)
+#define	ASM_X86_V2C
+#elif 1 && !defined(ASM_AMD64_C)
+#define	ASM_AMD64_C
+#endif
+
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2) || defined(ASM_X86_V2C)) && \
+	!defined(_M_IX86) || defined(ASM_AMD64_C) && !defined(_M_X64) && \
+	!defined(__amd64)
+#error Assembler code is only available for x86 and AMD64 systems
+#endif
+
+/*
+ *  4. FAST INPUT/OUTPUT OPERATIONS.
+ *
+ *	On some machines it is possible to improve speed by transferring the
+ *	bytes in the input and output arrays to and from the internal 32-bit
+ *	variables by addressing these arrays as if they are arrays of 32-bit
+ *	words.  On some machines this will always be possible but there may
+ *	be a large performance penalty if the byte arrays are not aligned on
+ *	the normal word boundaries. On other machines this technique will
+ *	lead to memory access errors when such 32-bit word accesses are not
+ *	properly aligned. The option SAFE_IO avoids such problems but will
+ *	often be slower on those machines that support misaligned access
+ *	(especially so if care is taken to align the input  and output byte
+ *	arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
+ *	assumed that access to byte arrays as if they are arrays of 32-bit
+ *	words will not cause problems when such accesses are misaligned.
+ */
+#if 1 && !defined(_MSC_VER)
+#define	SAFE_IO
+#endif
+
+/*
+ *  5. LOOP UNROLLING
+ *
+ *	The code for encryption and decryption cycles through a number of rounds
+ *	that can be implemented either in a loop or by expanding the code into a
+ *	long sequence of instructions, the latter producing a larger program but
+ *	one that will often be much faster. The latter is called loop unrolling.
+ *	There are also potential speed advantages in expanding two iterations in
+ *	a loop with half the number of iterations, which is called partial loop
+ *	unrolling.  The following options allow partial or full loop unrolling
+ *	to be set independently for encryption and decryption
+ */
+#if 1
+#define	ENC_UNROLL  FULL
+#elif 0
+#define	ENC_UNROLL  PARTIAL
+#else
+#define	ENC_UNROLL  NONE
+#endif
+
+#if 1
+#define	DEC_UNROLL  FULL
+#elif 0
+#define	DEC_UNROLL  PARTIAL
+#else
+#define	DEC_UNROLL  NONE
+#endif
+
+#if 1
+#define	ENC_KS_UNROLL
+#endif
+
+#if 1
+#define	DEC_KS_UNROLL
+#endif
+
+/*
+ *  6. FAST FINITE FIELD OPERATIONS
+ *
+ *	If this section is included, tables are used to provide faster finite
+ *	field arithmetic.  This has no effect if FIXED_TABLES is defined.
+ */
+#if 1
+#define	FF_TABLES
+#endif
+
+/*
+ *  7. INTERNAL STATE VARIABLE FORMAT
+ *
+ *	The internal state of Rijndael is stored in a number of local 32-bit
+ *	word variables which can be defined either as an array or as individual
+ *	names variables. Include this section if you want to store these local
+ *	variables in arrays. Otherwise individual local variables will be used.
+ */
+#if 1
+#define	ARRAYS
+#endif
+
+/*
+ *  8. FIXED OR DYNAMIC TABLES
+ *
+ *	When this section is included the tables used by the code are compiled
+ *	statically into the binary file.  Otherwise the subroutine aes_init()
+ *	must be called to compute them before the code is first used.
+ */
+#if 1 && !(defined(_MSC_VER) && (_MSC_VER <= 800))
+#define	FIXED_TABLES
+#endif
+
+/*
+ *  9. MASKING OR CASTING FROM LONGER VALUES TO BYTES
+ *
+ *	In some systems it is better to mask longer values to extract bytes
+ *	rather than using a cast. This option allows this choice.
+ */
+#if 0
+#define	to_byte(x)  ((uint8_t)(x))
+#else
+#define	to_byte(x)  ((x) & 0xff)
+#endif
+
+/*
+ *  10. TABLE ALIGNMENT
+ *
+ *	On some systems speed will be improved by aligning the AES large lookup
+ *	tables on particular boundaries. This define should be set to a power of
+ *	two giving the desired alignment. It can be left undefined if alignment
+ *	is not needed.  This option is specific to the Microsoft VC++ compiler -
+ *	it seems to sometimes cause trouble for the VC++ version 6 compiler.
+ */
+
+#if 1 && defined(_MSC_VER) && (_MSC_VER >= 1300)
+#define	TABLE_ALIGN 32
+#endif
+
+/*
+ *  11.  REDUCE CODE AND TABLE SIZE
+ *
+ *	This replaces some expanded macros with function calls if ASM_X86_V2 or
+ *	ASM_X86_V2C are defined
+ */
+
+#if 1 && (defined(ASM_X86_V2) || defined(ASM_X86_V2C))
+#define	REDUCE_CODE_SIZE
+#endif
+
+/*
+ *  12. TABLE OPTIONS
+ *
+ *	This cipher proceeds by repeating in a number of cycles known as rounds
+ *	which are implemented by a round function which can optionally be speeded
+ *	up using tables.  The basic tables are 256 32-bit words, with either
+ *	one or four tables being required for each round function depending on
+ *	how much speed is required. Encryption and decryption round functions
+ *	are different and the last encryption and decryption round functions are
+ *	different again making four different round functions in all.
+ *
+ *	This means that:
+ *	1. Normal encryption and decryption rounds can each use either 0, 1
+ *		or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ *	2. The last encryption and decryption rounds can also use either 0, 1
+ *		or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ *
+ *	Include or exclude the appropriate definitions below to set the number
+ *	of tables used by this implementation.
+ */
+
+#if 1   /* set tables for the normal encryption round */
+#define	ENC_ROUND   FOUR_TABLES
+#elif 0
+#define	ENC_ROUND   ONE_TABLE
+#else
+#define	ENC_ROUND   NO_TABLES
+#endif
+
+#if 1   /* set tables for the last encryption round */
+#define	LAST_ENC_ROUND  FOUR_TABLES
+#elif 0
+#define	LAST_ENC_ROUND  ONE_TABLE
+#else
+#define	LAST_ENC_ROUND  NO_TABLES
+#endif
+
+#if 1   /* set tables for the normal decryption round */
+#define	DEC_ROUND   FOUR_TABLES
+#elif 0
+#define	DEC_ROUND   ONE_TABLE
+#else
+#define	DEC_ROUND   NO_TABLES
+#endif
+
+#if 1   /* set tables for the last decryption round */
+#define	LAST_DEC_ROUND  FOUR_TABLES
+#elif 0
+#define	LAST_DEC_ROUND  ONE_TABLE
+#else
+#define	LAST_DEC_ROUND  NO_TABLES
+#endif
+
+/*
+ *  The decryption key schedule can be speeded up with tables in the same
+ *	way that the round functions can.  Include or exclude the following
+ *	defines to set this requirement.
+ */
+#if 1
+#define	KEY_SCHED   FOUR_TABLES
+#elif 0
+#define	KEY_SCHED   ONE_TABLE
+#else
+#define	KEY_SCHED   NO_TABLES
+#endif
+
+/*  ---- END OF USER CONFIGURED OPTIONS ---- */
+
+/* VIA ACE support is only available for VC++ and GCC */
+
+#if !defined(_MSC_VER) && !defined(__GNUC__)
+#if defined(ASSUME_VIA_ACE_PRESENT)
+#undef ASSUME_VIA_ACE_PRESENT
+#endif
+#if defined(USE_VIA_ACE_IF_PRESENT)
+#undef USE_VIA_ACE_IF_PRESENT
+#endif
+#endif
+
+#if defined(ASSUME_VIA_ACE_PRESENT) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define	USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if defined(USE_VIA_ACE_IF_PRESENT) && !defined(AES_REV_DKS)
+#define	AES_REV_DKS
+#endif
+
+/* Assembler support requires the use of platform byte order */
+
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2C) || defined(ASM_AMD64_C)) && \
+	(ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
+#undef  ALGORITHM_BYTE_ORDER
+#define	ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#endif
+
+/*
+ * In this implementation the columns of the state array are each held in
+ *	32-bit words. The state array can be held in various ways: in an array
+ *	of words, in a number of individual word variables or in a number of
+ *	processor registers. The following define maps a variable name x and
+ *	a column number c to the way the state array variable is to be held.
+ *	The first define below maps the state into an array x[c] whereas the
+ *	second form maps the state into a number of individual variables x0,
+ *	x1, etc.  Another form could map individual state columns to machine
+ *	register names.
+ */
+
+#if defined(ARRAYS)
+#define	s(x, c) x[c]
+#else	/* token pasting: s(x, 0) becomes x0 */
+#define	s(x, c) x##c
+#endif	/* ARRAYS */
+
+/*
+ *	This implementation provides subroutines for encryption, decryption
+ *	and for setting the three key lengths (separately) for encryption
+ *	and decryption. Since not all functions are needed, masks are set
+ *	up here to determine which will be implemented in C.
+ */
+
+#if !defined(AES_ENCRYPT)
+#define	EFUNCS_IN_C   0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+	defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define	EFUNCS_IN_C   ENC_KEYING_IN_C
+#elif !defined(ASM_X86_V2)
+#define	EFUNCS_IN_C   (ENCRYPTION_IN_C | ENC_KEYING_IN_C)
+#else	/* ASM_X86_V2 is defined */
+#define	EFUNCS_IN_C   0
+#endif	/* EFUNCS_IN_C */
+
+#if !defined(AES_DECRYPT)
+#define	DFUNCS_IN_C   0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+	defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define	DFUNCS_IN_C   DEC_KEYING_IN_C
+#elif !defined(ASM_X86_V2)
+#define	DFUNCS_IN_C   (DECRYPTION_IN_C | DEC_KEYING_IN_C)
+#else	/* ASM_X86_V2 is defined */
+#define	DFUNCS_IN_C   0
+#endif	/* DFUNCS_IN_C */
+
+#define	FUNCS_IN_C  (EFUNCS_IN_C | DFUNCS_IN_C)
+
+/* END OF CONFIGURATION OPTIONS */
+
+/* Disable or report errors on some combinations of options */
+
+#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES
+#undef  LAST_ENC_ROUND
+#define	LAST_ENC_ROUND  NO_TABLES
+#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES
+#undef  LAST_ENC_ROUND
+#define	LAST_ENC_ROUND  ONE_TABLE
+#endif	/* last round never uses more tables than ENC_ROUND */
+
+#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE
+#undef  ENC_UNROLL
+#define	ENC_UNROLL  NONE
+#endif	/* ENC_UNROLL is forced to NONE without encryption tables */
+
+#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES
+#undef  LAST_DEC_ROUND
+#define	LAST_DEC_ROUND  NO_TABLES
+#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES
+#undef  LAST_DEC_ROUND
+#define	LAST_DEC_ROUND  ONE_TABLE
+#endif	/* last round never uses more tables than DEC_ROUND */
+
+#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE
+#undef  DEC_UNROLL
+#define	DEC_UNROLL  NONE
+#endif	/* DEC_UNROLL is forced to NONE without decryption tables */
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define	aes_sw32	htonl
+#elif defined(bswap32)
+#define	aes_sw32	bswap32
+#elif defined(bswap_32)
+#define	aes_sw32	bswap_32
+#else	/* none of the above: rotate-and-mask fallback (brot rotates left) */
+#define	brot(x, n)  (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
+#define	aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00))
+#endif	/* aes_sw32 */
+
+
+/*
+ *	upr(x, n):  rotates bytes within words by n positions, moving bytes to
+ *		higher index positions with wrap around into low positions
+ *	ups(x, n):  moves bytes by n positions to higher index positions in
+ *		words but without wrap around
+ *	bval(x, n): extracts the byte at index n from a word
+ *	bytes2word(b0, b1, b2, b3): builds a word with bN at byte index N
+ *	WARNING:   The definitions given here are intended only for use with
+ *		unsigned variables and with shift counts that are compile
+ *		time constants
+ */
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define	upr(x, n)	(((uint32_t)(x) << (8 * (n))) | \
+			((uint32_t)(x) >> (32 - 8 * (n))))
+#define	ups(x, n)	((uint32_t)(x) << (8 * (n)))
+#define	bval(x, n)	to_byte((x) >> (8 * (n)))
+#define	bytes2word(b0, b1, b2, b3)  \
+		(((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
+		((uint32_t)(b1) << 8) | (b0))
+#endif	/* IS_LITTLE_ENDIAN */
+
+#if (ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define	upr(x, n)	(((uint32_t)(x) >> (8 * (n))) | \
+			((uint32_t)(x) << (32 - 8 * (n))))
+#define	ups(x, n)	((uint32_t)(x) >> (8 * (n)))
+#define	bval(x, n)	to_byte((x) >> (24 - 8 * (n)))
+#define	bytes2word(b0, b1, b2, b3)  \
+		(((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \
+		((uint32_t)(b2) << 8) | (b3))
+#endif	/* IS_BIG_ENDIAN */
+
+#if defined(SAFE_IO)	/* word c of buffer x, accessed one byte at a time */
+#define	word_in(x, c)	bytes2word(((const uint8_t *)(x) + 4 * (c))[0], \
+				((const uint8_t *)(x) + 4 * (c))[1], \
+				((const uint8_t *)(x) + 4 * (c))[2], \
+				((const uint8_t *)(x) + 4 * (c))[3])
+#define	word_out(x, c, v) { ((uint8_t *)(x) + 4 * (c))[0] = bval(v, 0); \
+			((uint8_t *)(x) + 4 * (c))[1] = bval(v, 1); \
+			((uint8_t *)(x) + 4 * (c))[2] = bval(v, 2); \
+			((uint8_t *)(x) + 4 * (c))[3] = bval(v, 3); }
+#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER)
+#define	word_in(x, c)	(*((uint32_t *)(x) + (c)))
+#define	word_out(x, c, v) (*((uint32_t *)(x) + (c)) = (v))
+#else	/* byte orders differ: 32-bit access through aes_sw32 */
+#define	word_in(x, c)	aes_sw32(*((uint32_t *)(x) + (c)))
+#define	word_out(x, c, v) (*((uint32_t *)(x) + (c)) = aes_sw32(v))
+#endif	/* SAFE_IO */
+
+/* the finite field modular polynomial and elements */
+
+#define	WPOLY   0x011b	/* x^8 + x^4 + x^3 + x + 1 */
+#define	BPOLY	0x1b	/* WPOLY without the x^8 term */
+
+/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
+
+#define	m1  0x80808080	/* top bit of each byte */
+#define	m2  0x7f7f7f7f	/* low seven bits of each byte */
+#define	gf_mulx(x)  ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
+
+/*
+ * The following defines provide alternative definitions of gf_mulx that might
+ * give improved performance if a fast 32-bit multiply is not available. Note
+ * that a temporary variable u needs to be defined where gf_mulx is used.
+ *
+ * #define	gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ \
+ *			((u >> 3) | (u >> 6))
+ * #define	m4  (0x01010101 * BPOLY)
+ * #define	gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) \
+ *			& m4)
+ */
+
+/* Work out which tables are needed for the different options   */
+
+#if defined(ASM_X86_V1C)	/* force FOUR_TABLES for every round type */
+#if defined(ENC_ROUND)
+#undef  ENC_ROUND
+#endif
+#define	ENC_ROUND   FOUR_TABLES
+#if defined(LAST_ENC_ROUND)
+#undef  LAST_ENC_ROUND
+#endif
+#define	LAST_ENC_ROUND  FOUR_TABLES
+#if defined(DEC_ROUND)
+#undef  DEC_ROUND
+#endif
+#define	DEC_ROUND   FOUR_TABLES
+#if defined(LAST_DEC_ROUND)
+#undef  LAST_DEC_ROUND
+#endif
+#define	LAST_DEC_ROUND  FOUR_TABLES
+#if defined(KEY_SCHED)	/* unlike the above, only overridden if already set */
+#undef  KEY_SCHED
+#define	KEY_SCHED   FOUR_TABLES
+#endif
+#endif	/* ASM_X86_V1C */
+
+#if (FUNCS_IN_C & ENCRYPTION_IN_C) || defined(ASM_X86_V1C)
+#if ENC_ROUND == ONE_TABLE
+#define	FT1_SET
+#elif ENC_ROUND == FOUR_TABLES
+#define	FT4_SET
+#else	/* neither table form: request the forward S box */
+#define	SBX_SET
+#endif	/* ENC_ROUND */
+#if LAST_ENC_ROUND == ONE_TABLE
+#define	FL1_SET
+#elif LAST_ENC_ROUND == FOUR_TABLES
+#define	FL4_SET
+#elif !defined(SBX_SET)
+#define	SBX_SET
+#endif	/* LAST_ENC_ROUND */
+#endif	/* encryption tables */
+
+#if (FUNCS_IN_C & DECRYPTION_IN_C) || defined(ASM_X86_V1C)
+#if DEC_ROUND == ONE_TABLE
+#define	IT1_SET
+#elif DEC_ROUND == FOUR_TABLES
+#define	IT4_SET
+#else	/* neither table form: request the inverse S box */
+#define	ISB_SET
+#endif	/* DEC_ROUND */
+#if LAST_DEC_ROUND == ONE_TABLE
+#define	IL1_SET
+#elif LAST_DEC_ROUND == FOUR_TABLES
+#define	IL4_SET
+#elif !defined(ISB_SET)
+#define	ISB_SET
+#endif	/* LAST_DEC_ROUND */
+#endif	/* decryption tables */
+
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+	defined(ASM_X86_V2C)))
+#if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C))
+#if KEY_SCHED == ONE_TABLE
+#if !defined(FL1_SET) && !defined(FL4_SET)
+#define	LS1_SET
+#endif	/* no forward last-round table already requested */
+#elif KEY_SCHED == FOUR_TABLES
+#if !defined(FL4_SET)
+#define	LS4_SET
+#endif	/* no four-table forward last-round set already requested */
+#elif !defined(SBX_SET)
+#define	SBX_SET
+#endif	/* KEY_SCHED */
+#endif	/* some key schedule is in C */
+#if (FUNCS_IN_C & DEC_KEYING_IN_C)
+#if KEY_SCHED == ONE_TABLE
+#define	IM1_SET
+#elif KEY_SCHED == FOUR_TABLES
+#define	IM4_SET
+#elif !defined(SBX_SET)
+#define	SBX_SET
+#endif	/* KEY_SCHED */
+#endif	/* decryption key schedule is in C */
+#endif	/* !(REDUCE_CODE_SIZE && (ASM_X86_V2 || ASM_X86_V2C)) */
+
+/* generic table macros: vf(x, r, c) selects the word, rf(r, c) the byte */
+
+#define	no_table(x, box, vf, rf, c) bytes2word(\
+	box[bval(vf(x, 0, c), rf(0, c))], \
+	box[bval(vf(x, 1, c), rf(1, c))], \
+	box[bval(vf(x, 2, c), rf(2, c))], \
+	box[bval(vf(x, 3, c), rf(3, c))])
+
+#define	one_table(x, op, tab, vf, rf, c) \
+	(tab[bval(vf(x, 0, c), rf(0, c))] \
+	^ op(tab[bval(vf(x, 1, c), rf(1, c))], 1) \
+	^ op(tab[bval(vf(x, 2, c), rf(2, c))], 2) \
+	^ op(tab[bval(vf(x, 3, c), rf(3, c))], 3))
+
+#define	four_tables(x, tab, vf, rf, c) \
+	(tab[0][bval(vf(x, 0, c), rf(0, c))] \
+	^ tab[1][bval(vf(x, 1, c), rf(1, c))] \
+	^ tab[2][bval(vf(x, 2, c), rf(2, c))] \
+	^ tab[3][bval(vf(x, 3, c), rf(3, c))])
+
+#define	vf1(x, r, c)	(x)	/* always the word x itself */
+#define	rf1(r, c)	(r)	/* byte index r, whatever c is */
+#define	rf2(r, c)	((8 + (r) - (c)) & 3)	/* byte index (r - c) mod 4 */
+
+/*
+ * Perform forward and inverse column mix operation on four bytes in long word
+ * x in parallel. NOTE: x must be a simple variable, NOT an expression in
+ * these macros.
+ */
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+	defined(ASM_X86_V2C)))
+
+#if defined(FM4_SET)	/* not currently used */
+#define	fwd_mcol(x)	four_tables(x, t_use(f, m), vf1, rf1, 0)
+#elif defined(FM1_SET)	/* not currently used */
+#define	fwd_mcol(x)	one_table(x, upr, t_use(f, m), vf1, rf1, 0)
+#else	/* computed form: assigns g2, which dec_fmvars declares */
+#define	dec_fmvars	uint32_t g2
+#define	fwd_mcol(x)	(g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \
+				upr((x), 2) ^ upr((x), 1))
+#endif	/* fwd_mcol */
+
+#if defined(IM4_SET)
+#define	inv_mcol(x)	four_tables(x, t_use(i, m), vf1, rf1, 0)
+#elif defined(IM1_SET)
+#define	inv_mcol(x)	one_table(x, upr, t_use(i, m), vf1, rf1, 0)
+#else	/* computed form: assigns g2, g4, g9, which dec_imvars declares */
+#define	dec_imvars	uint32_t g2, g4, g9
+#define	inv_mcol(x)	(g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = \
+				(x) ^ gf_mulx(g4), g4 ^= g9, \
+				(x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ \
+				upr(g4, 2) ^ upr(g9, 1))
+#endif	/* inv_mcol */
+
+#if defined(FL4_SET)
+#define	ls_box(x, c)	four_tables(x, t_use(f, l), vf1, rf2, c)
+#elif defined(LS4_SET)
+#define	ls_box(x, c)	four_tables(x, t_use(l, s), vf1, rf2, c)
+#elif defined(FL1_SET)
+#define	ls_box(x, c)	one_table(x, upr, t_use(f, l), vf1, rf2, c)
+#elif defined(LS1_SET)
+#define	ls_box(x, c)	one_table(x, upr, t_use(l, s), vf1, rf2, c)
+#else	/* no word table selected: byte lookups in the forward S box */
+#define	ls_box(x, c)	no_table(x, t_use(s, box), vf1, rf2, c)
+#endif	/* ls_box */
+
+#endif	/* !(REDUCE_CODE_SIZE && (ASM_X86_V2 || ASM_X86_V2C)) */
+
+#if defined(ASM_X86_V1C) && defined(AES_DECRYPT) && !defined(ISB_SET)
+#define	ISB_SET
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _AESOPT_H */
diff --git a/module/icp/asm-x86_64/aes/aestab.h b/module/icp/asm-x86_64/aes/aestab.h
new file mode 100644
index 000000000000..33cdb6c6f9fe
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aestab.h
@@ -0,0 +1,165 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ *  1. source code distributions include the above copyright notice, this
+ *     list of conditions and the following disclaimer;
+ *
+ *  2. binary distributions include the above copyright notice, this list
+ *     of conditions and the following disclaimer in their documentation;
+ *
+ *  3. the name of the copyright holder is not used to endorse products
+ *     built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the code for declaring the tables needed to implement
+ * AES. The file aesopt.h is assumed to be included before this header file.
+ * If there are no global variables, the definitions here can be used to put
+ * the AES tables in a structure so that a pointer can then be added to the
+ * AES context to pass them to the AES routines that need them.   If this
+ * facility is used, the calling program has to ensure that this pointer is
+ * managed appropriately.  In particular, the value of the t_dec(in, it) item
+ * in the table structure must be set to zero in order to ensure that the
+ * tables are initialised. In practice the three code sequences in aeskey.c
+ * that control the calls to aes_init() and the aes_init() routine itself will
+ * have to be changed for a specific implementation. If global variables are
+ * available it will generally be preferable to use them with the precomputed
+ * FIXED_TABLES option that uses static global tables.
+ *
+ * The following defines can be used to control the way the tables
+ * are defined, initialised and used in embedded environments that
+ * require special features for these purposes
+ *
+ *    the 't_dec' construction is used to declare fixed table arrays
+ *    the 't_set' construction is used to set fixed table values
+ *    the 't_use' construction is used to access fixed table values
+ *
+ *    256 byte tables:
+ *
+ *        t_xxx(s, box)    => forward S box
+ *        t_xxx(i, box)    => inverse S box
+ *
+ *    256 32-bit word OR 4 x 256 32-bit word tables:
+ *
+ *        t_xxx(f, n)      => forward normal round
+ *        t_xxx(f, l)      => forward last round
+ *        t_xxx(i, n)      => inverse normal round
+ *        t_xxx(i, l)      => inverse last round
+ *        t_xxx(l, s)      => key schedule table
+ *        t_xxx(i, m)      => key schedule table
+ *
+ *    Other variables and tables:
+ *
+ *        t_xxx(r, c)      => the rcon table
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header file sys/types.h
+ * 3. Removed code defined for _MSC_VER
+ * 4. Changed all variables to "static const"
+ * 5. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 6. Cstyled and hdrchk code
+ */
+
+#ifndef _AESTAB_H
+#define	_AESTAB_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+#define	t_dec(m, n) t_##m##n	/* e.g. t_dec(f, n) names t_fn */
+#define	t_set(m, n) t_##m##n
+#define	t_use(m, n) t_##m##n
+
+#if defined(DO_TABLES) && defined(FIXED_TABLES)
+#define	d_1(t, n, b, e)		 static const t n[256]    =   b(e)
+#define	d_4(t, n, b, e, f, g, h) static const t n[4][256] = \
+					{b(e), b(f), b(g), b(h)}
+static const uint32_t t_dec(r, c)[RC_LENGTH] = rc_data(w0);
+#else	/* declare the tables without initializers */
+#define	d_1(t, n, b, e)			static const t n[256]
+#define	d_4(t, n, b, e, f, g, h)	static const t n[4][256]
+static const uint32_t t_dec(r, c)[RC_LENGTH];
+#endif	/* DO_TABLES && FIXED_TABLES */
+
+#if defined(SBX_SET)	/* forward S box */
+	d_1(uint8_t, t_dec(s, box), sb_data, h0);
+#endif
+#if defined(ISB_SET)	/* inverse S box */
+	d_1(uint8_t, t_dec(i, box), isb_data, h0);
+#endif
+
+#if defined(FT1_SET)	/* forward normal round, one table */
+	d_1(uint32_t, t_dec(f, n), sb_data, u0);
+#endif
+#if defined(FT4_SET)	/* forward normal round, four tables */
+	d_4(uint32_t, t_dec(f, n), sb_data, u0, u1, u2, u3);
+#endif
+
+#if defined(FL1_SET)	/* forward last round, one table */
+	d_1(uint32_t, t_dec(f, l), sb_data, w0);
+#endif
+#if defined(FL4_SET)	/* forward last round, four tables */
+	d_4(uint32_t, t_dec(f, l), sb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(IT1_SET)	/* inverse normal round, one table */
+	d_1(uint32_t, t_dec(i, n), isb_data, v0);
+#endif
+#if defined(IT4_SET)	/* inverse normal round, four tables */
+	d_4(uint32_t, t_dec(i, n), isb_data, v0, v1, v2, v3);
+#endif
+
+#if defined(IL1_SET)	/* inverse last round, one table */
+	d_1(uint32_t, t_dec(i, l), isb_data, w0);
+#endif
+#if defined(IL4_SET)	/* inverse last round, four tables */
+	d_4(uint32_t, t_dec(i, l), isb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(LS1_SET)	/* key schedule table, one table */
+#if defined(FL1_SET)
+#undef  LS1_SET
+#else	/* same generator arguments as the FL1_SET table above */
+	d_1(uint32_t, t_dec(l, s), sb_data, w0);
+#endif
+#endif	/* LS1_SET */
+
+#if defined(LS4_SET)	/* key schedule table, four tables */
+#if defined(FL4_SET)
+#undef  LS4_SET
+#else	/* same generator arguments as the FL4_SET tables above */
+	d_4(uint32_t, t_dec(l, s), sb_data, w0, w1, w2, w3);
+#endif
+#endif	/* LS4_SET */
+
+#if defined(IM1_SET)	/* key schedule table (t_im), one table */
+	d_1(uint32_t, t_dec(i, m), mm_data, v0);
+#endif
+#if defined(IM4_SET)	/* key schedule table (t_im), four tables */
+	d_4(uint32_t, t_dec(i, m), mm_data, v0, v1, v2, v3);
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _AESTAB_H */
diff --git a/module/icp/asm-x86_64/aes/aestab2.h b/module/icp/asm-x86_64/aes/aestab2.h
new file mode 100644
index 000000000000..eb13f72b10d8
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aestab2.h
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AESTAB2_H
+#define	_AESTAB2_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * To create this file for OpenSolaris:
+ * 1. Compile and run tablegen.c, from aes-src-04-03-08.zip,
+ *	after defining ASM_AMD64_C
+ * 2. mv aestab2.c aestab2.h
+ * 3. Add __cplusplus and _AESTAB2_H header guards
+ * 4. Add #include <aes_impl.h>
+ * 5. Change "uint_32t" to "uint32_t"
+ * 6. Change all variables to "static const"
+ * 7. Cstyle and hdrchk this file
+ */
+
+#include <aes/aes_impl.h>
+
+static const uint32_t t_rc[RC_LENGTH] =	/* the rcon table */
+{
+	0x00000001, 0x00000002, 0x00000004, 0x00000008,
+	0x00000010, 0x00000020, 0x00000040, 0x00000080,
+	0x0000001b, 0x00000036
+};
+
+static const uint32_t t_ls[4][256] =
+{
+	{
+	0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
+	0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
+	0x00000030, 0x00000001, 0x00000067, 0x0000002b,
+	0x000000fe, 0x000000d7, 0x000000ab, 0x00000076,
+	0x000000ca, 0x00000082, 0x000000c9, 0x0000007d,
+	0x000000fa, 0x00000059, 0x00000047, 0x000000f0,
+	0x000000ad, 0x000000d4, 0x000000a2, 0x000000af,
+	0x0000009c, 0x000000a4, 0x00000072, 0x000000c0,
+	0x000000b7, 0x000000fd, 0x00000093, 0x00000026,
+	0x00000036, 0x0000003f, 0x000000f7, 0x000000cc,
+	0x00000034, 0x000000a5, 0x000000e5, 0x000000f1,
+	0x00000071, 0x000000d8, 0x00000031, 0x00000015,
+	0x00000004, 0x000000c7, 0x00000023, 0x000000c3,
+	0x00000018, 0x00000096, 0x00000005, 0x0000009a,
+	0x00000007, 0x00000012, 0x00000080, 0x000000e2,
+	0x000000eb, 0x00000027, 0x000000b2, 0x00000075,
+	0x00000009, 0x00000083, 0x0000002c, 0x0000001a,
+	0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0,
+	0x00000052, 0x0000003b, 0x000000d6, 0x000000b3,
+	0x00000029, 0x000000e3, 0x0000002f, 0x00000084,
+	0x00000053, 0x000000d1, 0x00000000, 0x000000ed,
+	0x00000020, 0x000000fc, 0x000000b1, 0x0000005b,
+	0x0000006a, 0x000000cb, 0x000000be, 0x00000039,
+	0x0000004a, 0x0000004c, 0x00000058, 0x000000cf,
+	0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb,
+	0x00000043, 0x0000004d, 0x00000033, 0x00000085,
+	0x00000045, 0x000000f9, 0x00000002, 0x0000007f,
+	0x00000050, 0x0000003c, 0x0000009f, 0x000000a8,
+	0x00000051, 0x000000a3, 0x00000040, 0x0000008f,
+	0x00000092, 0x0000009d, 0x00000038, 0x000000f5,
+	0x000000bc, 0x000000b6, 0x000000da, 0x00000021,
+	0x00000010, 0x000000ff, 0x000000f3, 0x000000d2,
+	0x000000cd, 0x0000000c, 0x00000013, 0x000000ec,
+	0x0000005f, 0x00000097, 0x00000044, 0x00000017,
+	0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d,
+	0x00000064, 0x0000005d, 0x00000019, 0x00000073,
+	0x00000060, 0x00000081, 0x0000004f, 0x000000dc,
+	0x00000022, 0x0000002a, 0x00000090, 0x00000088,
+	0x00000046, 0x000000ee, 0x000000b8, 0x00000014,
+	0x000000de, 0x0000005e, 0x0000000b, 0x000000db,
+	0x000000e0, 0x00000032, 0x0000003a, 0x0000000a,
+	0x00000049, 0x00000006, 0x00000024, 0x0000005c,
+	0x000000c2, 0x000000d3, 0x000000ac, 0x00000062,
+	0x00000091, 0x00000095, 0x000000e4, 0x00000079,
+	0x000000e7, 0x000000c8, 0x00000037, 0x0000006d,
+	0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9,
+	0x0000006c, 0x00000056, 0x000000f4, 0x000000ea,
+	0x00000065, 0x0000007a, 0x000000ae, 0x00000008,
+	0x000000ba, 0x00000078, 0x00000025, 0x0000002e,
+	0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6,
+	0x000000e8, 0x000000dd, 0x00000074, 0x0000001f,
+	0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a,
+	0x00000070, 0x0000003e, 0x000000b5, 0x00000066,
+	0x00000048, 0x00000003, 0x000000f6, 0x0000000e,
+	0x00000061, 0x00000035, 0x00000057, 0x000000b9,
+	0x00000086, 0x000000c1, 0x0000001d, 0x0000009e,
+	0x000000e1, 0x000000f8, 0x00000098, 0x00000011,
+	0x00000069, 0x000000d9, 0x0000008e, 0x00000094,
+	0x0000009b, 0x0000001e, 0x00000087, 0x000000e9,
+	0x000000ce, 0x00000055, 0x00000028, 0x000000df,
+	0x0000008c, 0x000000a1, 0x00000089, 0x0000000d,
+	0x000000bf, 0x000000e6, 0x00000042, 0x00000068,
+	0x00000041, 0x00000099, 0x0000002d, 0x0000000f,
+	0x000000b0, 0x00000054, 0x000000bb, 0x00000016
+	},
+	{
+	0x00006300, 0x00007c00, 0x00007700, 0x00007b00,
+	0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500,
+	0x00003000, 0x00000100, 0x00006700, 0x00002b00,
+	0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600,
+	0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00,
+	0x0000fa00, 0x00005900, 0x00004700, 0x0000f000,
+	0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00,
+	0x00009c00, 0x0000a400, 0x00007200, 0x0000c000,
+	0x0000b700, 0x0000fd00, 0x00009300, 0x00002600,
+	0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00,
+	0x00003400, 0x0000a500, 0x0000e500, 0x0000f100,
+	0x00007100, 0x0000d800, 0x00003100, 0x00001500,
+	0x00000400, 0x0000c700, 0x00002300, 0x0000c300,
+	0x00001800, 0x00009600, 0x00000500, 0x00009a00,
+	0x00000700, 0x00001200, 0x00008000, 0x0000e200,
+	0x0000eb00, 0x00002700, 0x0000b200, 0x00007500,
+	0x00000900, 0x00008300, 0x00002c00, 0x00001a00,
+	0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000,
+	0x00005200, 0x00003b00, 0x0000d600, 0x0000b300,
+	0x00002900, 0x0000e300, 0x00002f00, 0x00008400,
+	0x00005300, 0x0000d100, 0x00000000, 0x0000ed00,
+	0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00,
+	0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900,
+	0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00,
+	0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00,
+	0x00004300, 0x00004d00, 0x00003300, 0x00008500,
+	0x00004500, 0x0000f900, 0x00000200, 0x00007f00,
+	0x00005000, 0x00003c00, 0x00009f00, 0x0000a800,
+	0x00005100, 0x0000a300, 0x00004000, 0x00008f00,
+	0x00009200, 0x00009d00, 0x00003800, 0x0000f500,
+	0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100,
+	0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200,
+	0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00,
+	0x00005f00, 0x00009700, 0x00004400, 0x00001700,
+	0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00,
+	0x00006400, 0x00005d00, 0x00001900, 0x00007300,
+	0x00006000, 0x00008100, 0x00004f00, 0x0000dc00,
+	0x00002200, 0x00002a00, 0x00009000, 0x00008800,
+	0x00004600, 0x0000ee00, 0x0000b800, 0x00001400,
+	0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00,
+	0x0000e000, 0x00003200, 0x00003a00, 0x00000a00,
+	0x00004900, 0x00000600, 0x00002400, 0x00005c00,
+	0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200,
+	0x00009100, 0x00009500, 0x0000e400, 0x00007900,
+	0x0000e700, 0x0000c800, 0x00003700, 0x00006d00,
+	0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900,
+	0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00,
+	0x00006500, 0x00007a00, 0x0000ae00, 0x00000800,
+	0x0000ba00, 0x00007800, 0x00002500, 0x00002e00,
+	0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600,
+	0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00,
+	0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00,
+	0x00007000, 0x00003e00, 0x0000b500, 0x00006600,
+	0x00004800, 0x00000300, 0x0000f600, 0x00000e00,
+	0x00006100, 0x00003500, 0x00005700, 0x0000b900,
+	0x00008600, 0x0000c100, 0x00001d00, 0x00009e00,
+	0x0000e100, 0x0000f800, 0x00009800, 0x00001100,
+	0x00006900, 0x0000d900, 0x00008e00, 0x00009400,
+	0x00009b00, 0x00001e00, 0x00008700, 0x0000e900,
+	0x0000ce00, 0x00005500, 0x00002800, 0x0000df00,
+	0x00008c00, 0x0000a100, 0x00008900, 0x00000d00,
+	0x0000bf00, 0x0000e600, 0x00004200, 0x00006800,
+	0x00004100, 0x00009900, 0x00002d00, 0x00000f00,
+	0x0000b000, 0x00005400, 0x0000bb00, 0x00001600
+	},
+	{
+	0x00630000, 0x007c0000, 0x00770000, 0x007b0000,
+	0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000,
+	0x00300000, 0x00010000, 0x00670000, 0x002b0000,
+	0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000,
+	0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000,
+	0x00fa0000, 0x00590000, 0x00470000, 0x00f00000,
+	0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000,
+	0x009c0000, 0x00a40000, 0x00720000, 0x00c00000,
+	0x00b70000, 0x00fd0000, 0x00930000, 0x00260000,
+	0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000,
+	0x00340000, 0x00a50000, 0x00e50000, 0x00f10000,
+	0x00710000, 0x00d80000, 0x00310000, 0x00150000,
+	0x00040000, 0x00c70000, 0x00230000, 0x00c30000,
+	0x00180000, 0x00960000, 0x00050000, 0x009a0000,
+	0x00070000, 0x00120000, 0x00800000, 0x00e20000,
+	0x00eb0000, 0x00270000, 0x00b20000, 0x00750000,
+	0x00090000, 0x00830000, 0x002c0000, 0x001a0000,
+	0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000,
+	0x00520000, 0x003b0000, 0x00d60000, 0x00b30000,
+	0x00290000, 0x00e30000, 0x002f0000, 0x00840000,
+	0x00530000, 0x00d10000, 0x00000000, 0x00ed0000,
+	0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000,
+	0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000,
+	0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000,
+	0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000,
+	0x00430000, 0x004d0000, 0x00330000, 0x00850000,
+	0x00450000, 0x00f90000, 0x00020000, 0x007f0000,
+	0x00500000, 0x003c0000, 0x009f0000, 0x00a80000,
+	0x00510000, 0x00a30000, 0x00400000, 0x008f0000,
+	0x00920000, 0x009d0000, 0x00380000, 0x00f50000,
+	0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000,
+	0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000,
+	0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000,
+	0x005f0000, 0x00970000, 0x00440000, 0x00170000,
+	0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000,
+	0x00640000, 0x005d0000, 0x00190000, 0x00730000,
+	0x00600000, 0x00810000, 0x004f0000, 0x00dc0000,
+	0x00220000, 0x002a0000, 0x00900000, 0x00880000,
+	0x00460000, 0x00ee0000, 0x00b80000, 0x00140000,
+	0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000,
+	0x00e00000, 0x00320000, 0x003a0000, 0x000a0000,
+	0x00490000, 0x00060000, 0x00240000, 0x005c0000,
+	0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000,
+	0x00910000, 0x00950000, 0x00e40000, 0x00790000,
+	0x00e70000, 0x00c80000, 0x00370000, 0x006d0000,
+	0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000,
+	0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000,
+	0x00650000, 0x007a0000, 0x00ae0000, 0x00080000,
+	0x00ba0000, 0x00780000, 0x00250000, 0x002e0000,
+	0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000,
+	0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000,
+	0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000,
+	0x00700000, 0x003e0000, 0x00b50000, 0x00660000,
+	0x00480000, 0x00030000, 0x00f60000, 0x000e0000,
+	0x00610000, 0x00350000, 0x00570000, 0x00b90000,
+	0x00860000, 0x00c10000, 0x001d0000, 0x009e0000,
+	0x00e10000, 0x00f80000, 0x00980000, 0x00110000,
+	0x00690000, 0x00d90000, 0x008e0000, 0x00940000,
+	0x009b0000, 0x001e0000, 0x00870000, 0x00e90000,
+	0x00ce0000, 0x00550000, 0x00280000, 0x00df0000,
+	0x008c0000, 0x00a10000, 0x00890000, 0x000d0000,
+	0x00bf0000, 0x00e60000, 0x00420000, 0x00680000,
+	0x00410000, 0x00990000, 0x002d0000, 0x000f0000,
+	0x00b00000, 0x00540000, 0x00bb0000, 0x00160000
+	},
+	{
+	0x63000000, 0x7c000000, 0x77000000, 0x7b000000,
+	0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000,
+	0x30000000, 0x01000000, 0x67000000, 0x2b000000,
+	0xfe000000, 0xd7000000, 0xab000000, 0x76000000,
+	0xca000000, 0x82000000, 0xc9000000, 0x7d000000,
+	0xfa000000, 0x59000000, 0x47000000, 0xf0000000,
+	0xad000000, 0xd4000000, 0xa2000000, 0xaf000000,
+	0x9c000000, 0xa4000000, 0x72000000, 0xc0000000,
+	0xb7000000, 0xfd000000, 0x93000000, 0x26000000,
+	0x36000000, 0x3f000000, 0xf7000000, 0xcc000000,
+	0x34000000, 0xa5000000, 0xe5000000, 0xf1000000,
+	0x71000000, 0xd8000000, 0x31000000, 0x15000000,
+	0x04000000, 0xc7000000, 0x23000000, 0xc3000000,
+	0x18000000, 0x96000000, 0x05000000, 0x9a000000,
+	0x07000000, 0x12000000, 0x80000000, 0xe2000000,
+	0xeb000000, 0x27000000, 0xb2000000, 0x75000000,
+	0x09000000, 0x83000000, 0x2c000000, 0x1a000000,
+	0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000,
+	0x52000000, 0x3b000000, 0xd6000000, 0xb3000000,
+	0x29000000, 0xe3000000, 0x2f000000, 0x84000000,
+	0x53000000, 0xd1000000, 0x00000000, 0xed000000,
+	0x20000000, 0xfc000000, 0xb1000000, 0x5b000000,
+	0x6a000000, 0xcb000000, 0xbe000000, 0x39000000,
+	0x4a000000, 0x4c000000, 0x58000000, 0xcf000000,
+	0xd0000000, 0xef000000, 0xaa000000, 0xfb000000,
+	0x43000000, 0x4d000000, 0x33000000, 0x85000000,
+	0x45000000, 0xf9000000, 0x02000000, 0x7f000000,
+	0x50000000, 0x3c000000, 0x9f000000, 0xa8000000,
+	0x51000000, 0xa3000000, 0x40000000, 0x8f000000,
+	0x92000000, 0x9d000000, 0x38000000, 0xf5000000,
+	0xbc000000, 0xb6000000, 0xda000000, 0x21000000,
+	0x10000000, 0xff000000, 0xf3000000, 0xd2000000,
+	0xcd000000, 0x0c000000, 0x13000000, 0xec000000,
+	0x5f000000, 0x97000000, 0x44000000, 0x17000000,
+	0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000,
+	0x64000000, 0x5d000000, 0x19000000, 0x73000000,
+	0x60000000, 0x81000000, 0x4f000000, 0xdc000000,
+	0x22000000, 0x2a000000, 0x90000000, 0x88000000,
+	0x46000000, 0xee000000, 0xb8000000, 0x14000000,
+	0xde000000, 0x5e000000, 0x0b000000, 0xdb000000,
+	0xe0000000, 0x32000000, 0x3a000000, 0x0a000000,
+	0x49000000, 0x06000000, 0x24000000, 0x5c000000,
+	0xc2000000, 0xd3000000, 0xac000000, 0x62000000,
+	0x91000000, 0x95000000, 0xe4000000, 0x79000000,
+	0xe7000000, 0xc8000000, 0x37000000, 0x6d000000,
+	0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000,
+	0x6c000000, 0x56000000, 0xf4000000, 0xea000000,
+	0x65000000, 0x7a000000, 0xae000000, 0x08000000,
+	0xba000000, 0x78000000, 0x25000000, 0x2e000000,
+	0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000,
+	0xe8000000, 0xdd000000, 0x74000000, 0x1f000000,
+	0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000,
+	0x70000000, 0x3e000000, 0xb5000000, 0x66000000,
+	0x48000000, 0x03000000, 0xf6000000, 0x0e000000,
+	0x61000000, 0x35000000, 0x57000000, 0xb9000000,
+	0x86000000, 0xc1000000, 0x1d000000, 0x9e000000,
+	0xe1000000, 0xf8000000, 0x98000000, 0x11000000,
+	0x69000000, 0xd9000000, 0x8e000000, 0x94000000,
+	0x9b000000, 0x1e000000, 0x87000000, 0xe9000000,
+	0xce000000, 0x55000000, 0x28000000, 0xdf000000,
+	0x8c000000, 0xa1000000, 0x89000000, 0x0d000000,
+	0xbf000000, 0xe6000000, 0x42000000, 0x68000000,
+	0x41000000, 0x99000000, 0x2d000000, 0x0f000000,
+	0xb0000000, 0x54000000, 0xbb000000, 0x16000000
+	}
+};
+
+static const uint32_t t_im[4][256] =
+{
+	{
+	0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
+	0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
+	0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
+	0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
+	0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
+	0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
+	0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
+	0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
+	0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
+	0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
+	0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
+	0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
+	0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
+	0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
+	0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
+	0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
+	0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
+	0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
+	0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
+	0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
+	0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
+	0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
+	0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
+	0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
+	0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
+	0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
+	0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
+	0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
+	0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
+	0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
+	0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
+	0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
+	0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
+	0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
+	0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
+	0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
+	0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
+	0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
+	0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
+	0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
+	0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
+	0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
+	0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
+	0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
+	0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
+	0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
+	0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
+	0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
+	0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
+	0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
+	0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
+	0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
+	0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
+	0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
+	0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
+	0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
+	0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
+	0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
+	0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
+	0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
+	0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
+	0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
+	0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
+	0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d
+	},
+	{
+	0x00000000, 0x0d090e0b, 0x1a121c16, 0x171b121d,
+	0x3424382c, 0x392d3627, 0x2e36243a, 0x233f2a31,
+	0x68487058, 0x65417e53, 0x725a6c4e, 0x7f536245,
+	0x5c6c4874, 0x5165467f, 0x467e5462, 0x4b775a69,
+	0xd090e0b0, 0xdd99eebb, 0xca82fca6, 0xc78bf2ad,
+	0xe4b4d89c, 0xe9bdd697, 0xfea6c48a, 0xf3afca81,
+	0xb8d890e8, 0xb5d19ee3, 0xa2ca8cfe, 0xafc382f5,
+	0x8cfca8c4, 0x81f5a6cf, 0x96eeb4d2, 0x9be7bad9,
+	0xbb3bdb7b, 0xb632d570, 0xa129c76d, 0xac20c966,
+	0x8f1fe357, 0x8216ed5c, 0x950dff41, 0x9804f14a,
+	0xd373ab23, 0xde7aa528, 0xc961b735, 0xc468b93e,
+	0xe757930f, 0xea5e9d04, 0xfd458f19, 0xf04c8112,
+	0x6bab3bcb, 0x66a235c0, 0x71b927dd, 0x7cb029d6,
+	0x5f8f03e7, 0x52860dec, 0x459d1ff1, 0x489411fa,
+	0x03e34b93, 0x0eea4598, 0x19f15785, 0x14f8598e,
+	0x37c773bf, 0x3ace7db4, 0x2dd56fa9, 0x20dc61a2,
+	0x6d76adf6, 0x607fa3fd, 0x7764b1e0, 0x7a6dbfeb,
+	0x595295da, 0x545b9bd1, 0x434089cc, 0x4e4987c7,
+	0x053eddae, 0x0837d3a5, 0x1f2cc1b8, 0x1225cfb3,
+	0x311ae582, 0x3c13eb89, 0x2b08f994, 0x2601f79f,
+	0xbde64d46, 0xb0ef434d, 0xa7f45150, 0xaafd5f5b,
+	0x89c2756a, 0x84cb7b61, 0x93d0697c, 0x9ed96777,
+	0xd5ae3d1e, 0xd8a73315, 0xcfbc2108, 0xc2b52f03,
+	0xe18a0532, 0xec830b39, 0xfb981924, 0xf691172f,
+	0xd64d768d, 0xdb447886, 0xcc5f6a9b, 0xc1566490,
+	0xe2694ea1, 0xef6040aa, 0xf87b52b7, 0xf5725cbc,
+	0xbe0506d5, 0xb30c08de, 0xa4171ac3, 0xa91e14c8,
+	0x8a213ef9, 0x872830f2, 0x903322ef, 0x9d3a2ce4,
+	0x06dd963d, 0x0bd49836, 0x1ccf8a2b, 0x11c68420,
+	0x32f9ae11, 0x3ff0a01a, 0x28ebb207, 0x25e2bc0c,
+	0x6e95e665, 0x639ce86e, 0x7487fa73, 0x798ef478,
+	0x5ab1de49, 0x57b8d042, 0x40a3c25f, 0x4daacc54,
+	0xdaec41f7, 0xd7e54ffc, 0xc0fe5de1, 0xcdf753ea,
+	0xeec879db, 0xe3c177d0, 0xf4da65cd, 0xf9d36bc6,
+	0xb2a431af, 0xbfad3fa4, 0xa8b62db9, 0xa5bf23b2,
+	0x86800983, 0x8b890788, 0x9c921595, 0x919b1b9e,
+	0x0a7ca147, 0x0775af4c, 0x106ebd51, 0x1d67b35a,
+	0x3e58996b, 0x33519760, 0x244a857d, 0x29438b76,
+	0x6234d11f, 0x6f3ddf14, 0x7826cd09, 0x752fc302,
+	0x5610e933, 0x5b19e738, 0x4c02f525, 0x410bfb2e,
+	0x61d79a8c, 0x6cde9487, 0x7bc5869a, 0x76cc8891,
+	0x55f3a2a0, 0x58faacab, 0x4fe1beb6, 0x42e8b0bd,
+	0x099fead4, 0x0496e4df, 0x138df6c2, 0x1e84f8c9,
+	0x3dbbd2f8, 0x30b2dcf3, 0x27a9ceee, 0x2aa0c0e5,
+	0xb1477a3c, 0xbc4e7437, 0xab55662a, 0xa65c6821,
+	0x85634210, 0x886a4c1b, 0x9f715e06, 0x9278500d,
+	0xd90f0a64, 0xd406046f, 0xc31d1672, 0xce141879,
+	0xed2b3248, 0xe0223c43, 0xf7392e5e, 0xfa302055,
+	0xb79aec01, 0xba93e20a, 0xad88f017, 0xa081fe1c,
+	0x83bed42d, 0x8eb7da26, 0x99acc83b, 0x94a5c630,
+	0xdfd29c59, 0xd2db9252, 0xc5c0804f, 0xc8c98e44,
+	0xebf6a475, 0xe6ffaa7e, 0xf1e4b863, 0xfcedb668,
+	0x670a0cb1, 0x6a0302ba, 0x7d1810a7, 0x70111eac,
+	0x532e349d, 0x5e273a96, 0x493c288b, 0x44352680,
+	0x0f427ce9, 0x024b72e2, 0x155060ff, 0x18596ef4,
+	0x3b6644c5, 0x366f4ace, 0x217458d3, 0x2c7d56d8,
+	0x0ca1377a, 0x01a83971, 0x16b32b6c, 0x1bba2567,
+	0x38850f56, 0x358c015d, 0x22971340, 0x2f9e1d4b,
+	0x64e94722, 0x69e04929, 0x7efb5b34, 0x73f2553f,
+	0x50cd7f0e, 0x5dc47105, 0x4adf6318, 0x47d66d13,
+	0xdc31d7ca, 0xd138d9c1, 0xc623cbdc, 0xcb2ac5d7,
+	0xe815efe6, 0xe51ce1ed, 0xf207f3f0, 0xff0efdfb,
+	0xb479a792, 0xb970a999, 0xae6bbb84, 0xa362b58f,
+	0x805d9fbe, 0x8d5491b5, 0x9a4f83a8, 0x97468da3
+	},
+	{
+	0x00000000, 0x090e0b0d, 0x121c161a, 0x1b121d17,
+	0x24382c34, 0x2d362739, 0x36243a2e, 0x3f2a3123,
+	0x48705868, 0x417e5365, 0x5a6c4e72, 0x5362457f,
+	0x6c48745c, 0x65467f51, 0x7e546246, 0x775a694b,
+	0x90e0b0d0, 0x99eebbdd, 0x82fca6ca, 0x8bf2adc7,
+	0xb4d89ce4, 0xbdd697e9, 0xa6c48afe, 0xafca81f3,
+	0xd890e8b8, 0xd19ee3b5, 0xca8cfea2, 0xc382f5af,
+	0xfca8c48c, 0xf5a6cf81, 0xeeb4d296, 0xe7bad99b,
+	0x3bdb7bbb, 0x32d570b6, 0x29c76da1, 0x20c966ac,
+	0x1fe3578f, 0x16ed5c82, 0x0dff4195, 0x04f14a98,
+	0x73ab23d3, 0x7aa528de, 0x61b735c9, 0x68b93ec4,
+	0x57930fe7, 0x5e9d04ea, 0x458f19fd, 0x4c8112f0,
+	0xab3bcb6b, 0xa235c066, 0xb927dd71, 0xb029d67c,
+	0x8f03e75f, 0x860dec52, 0x9d1ff145, 0x9411fa48,
+	0xe34b9303, 0xea45980e, 0xf1578519, 0xf8598e14,
+	0xc773bf37, 0xce7db43a, 0xd56fa92d, 0xdc61a220,
+	0x76adf66d, 0x7fa3fd60, 0x64b1e077, 0x6dbfeb7a,
+	0x5295da59, 0x5b9bd154, 0x4089cc43, 0x4987c74e,
+	0x3eddae05, 0x37d3a508, 0x2cc1b81f, 0x25cfb312,
+	0x1ae58231, 0x13eb893c, 0x08f9942b, 0x01f79f26,
+	0xe64d46bd, 0xef434db0, 0xf45150a7, 0xfd5f5baa,
+	0xc2756a89, 0xcb7b6184, 0xd0697c93, 0xd967779e,
+	0xae3d1ed5, 0xa73315d8, 0xbc2108cf, 0xb52f03c2,
+	0x8a0532e1, 0x830b39ec, 0x981924fb, 0x91172ff6,
+	0x4d768dd6, 0x447886db, 0x5f6a9bcc, 0x566490c1,
+	0x694ea1e2, 0x6040aaef, 0x7b52b7f8, 0x725cbcf5,
+	0x0506d5be, 0x0c08deb3, 0x171ac3a4, 0x1e14c8a9,
+	0x213ef98a, 0x2830f287, 0x3322ef90, 0x3a2ce49d,
+	0xdd963d06, 0xd498360b, 0xcf8a2b1c, 0xc6842011,
+	0xf9ae1132, 0xf0a01a3f, 0xebb20728, 0xe2bc0c25,
+	0x95e6656e, 0x9ce86e63, 0x87fa7374, 0x8ef47879,
+	0xb1de495a, 0xb8d04257, 0xa3c25f40, 0xaacc544d,
+	0xec41f7da, 0xe54ffcd7, 0xfe5de1c0, 0xf753eacd,
+	0xc879dbee, 0xc177d0e3, 0xda65cdf4, 0xd36bc6f9,
+	0xa431afb2, 0xad3fa4bf, 0xb62db9a8, 0xbf23b2a5,
+	0x80098386, 0x8907888b, 0x9215959c, 0x9b1b9e91,
+	0x7ca1470a, 0x75af4c07, 0x6ebd5110, 0x67b35a1d,
+	0x58996b3e, 0x51976033, 0x4a857d24, 0x438b7629,
+	0x34d11f62, 0x3ddf146f, 0x26cd0978, 0x2fc30275,
+	0x10e93356, 0x19e7385b, 0x02f5254c, 0x0bfb2e41,
+	0xd79a8c61, 0xde94876c, 0xc5869a7b, 0xcc889176,
+	0xf3a2a055, 0xfaacab58, 0xe1beb64f, 0xe8b0bd42,
+	0x9fead409, 0x96e4df04, 0x8df6c213, 0x84f8c91e,
+	0xbbd2f83d, 0xb2dcf330, 0xa9ceee27, 0xa0c0e52a,
+	0x477a3cb1, 0x4e7437bc, 0x55662aab, 0x5c6821a6,
+	0x63421085, 0x6a4c1b88, 0x715e069f, 0x78500d92,
+	0x0f0a64d9, 0x06046fd4, 0x1d1672c3, 0x141879ce,
+	0x2b3248ed, 0x223c43e0, 0x392e5ef7, 0x302055fa,
+	0x9aec01b7, 0x93e20aba, 0x88f017ad, 0x81fe1ca0,
+	0xbed42d83, 0xb7da268e, 0xacc83b99, 0xa5c63094,
+	0xd29c59df, 0xdb9252d2, 0xc0804fc5, 0xc98e44c8,
+	0xf6a475eb, 0xffaa7ee6, 0xe4b863f1, 0xedb668fc,
+	0x0a0cb167, 0x0302ba6a, 0x1810a77d, 0x111eac70,
+	0x2e349d53, 0x273a965e, 0x3c288b49, 0x35268044,
+	0x427ce90f, 0x4b72e202, 0x5060ff15, 0x596ef418,
+	0x6644c53b, 0x6f4ace36, 0x7458d321, 0x7d56d82c,
+	0xa1377a0c, 0xa8397101, 0xb32b6c16, 0xba25671b,
+	0x850f5638, 0x8c015d35, 0x97134022, 0x9e1d4b2f,
+	0xe9472264, 0xe0492969, 0xfb5b347e, 0xf2553f73,
+	0xcd7f0e50, 0xc471055d, 0xdf63184a, 0xd66d1347,
+	0x31d7cadc, 0x38d9c1d1, 0x23cbdcc6, 0x2ac5d7cb,
+	0x15efe6e8, 0x1ce1ede5, 0x07f3f0f2, 0x0efdfbff,
+	0x79a792b4, 0x70a999b9, 0x6bbb84ae, 0x62b58fa3,
+	0x5d9fbe80, 0x5491b58d, 0x4f83a89a, 0x468da397
+	},
+	{
+	0x00000000, 0x0e0b0d09, 0x1c161a12, 0x121d171b,
+	0x382c3424, 0x3627392d, 0x243a2e36, 0x2a31233f,
+	0x70586848, 0x7e536541, 0x6c4e725a, 0x62457f53,
+	0x48745c6c, 0x467f5165, 0x5462467e, 0x5a694b77,
+	0xe0b0d090, 0xeebbdd99, 0xfca6ca82, 0xf2adc78b,
+	0xd89ce4b4, 0xd697e9bd, 0xc48afea6, 0xca81f3af,
+	0x90e8b8d8, 0x9ee3b5d1, 0x8cfea2ca, 0x82f5afc3,
+	0xa8c48cfc, 0xa6cf81f5, 0xb4d296ee, 0xbad99be7,
+	0xdb7bbb3b, 0xd570b632, 0xc76da129, 0xc966ac20,
+	0xe3578f1f, 0xed5c8216, 0xff41950d, 0xf14a9804,
+	0xab23d373, 0xa528de7a, 0xb735c961, 0xb93ec468,
+	0x930fe757, 0x9d04ea5e, 0x8f19fd45, 0x8112f04c,
+	0x3bcb6bab, 0x35c066a2, 0x27dd71b9, 0x29d67cb0,
+	0x03e75f8f, 0x0dec5286, 0x1ff1459d, 0x11fa4894,
+	0x4b9303e3, 0x45980eea, 0x578519f1, 0x598e14f8,
+	0x73bf37c7, 0x7db43ace, 0x6fa92dd5, 0x61a220dc,
+	0xadf66d76, 0xa3fd607f, 0xb1e07764, 0xbfeb7a6d,
+	0x95da5952, 0x9bd1545b, 0x89cc4340, 0x87c74e49,
+	0xddae053e, 0xd3a50837, 0xc1b81f2c, 0xcfb31225,
+	0xe582311a, 0xeb893c13, 0xf9942b08, 0xf79f2601,
+	0x4d46bde6, 0x434db0ef, 0x5150a7f4, 0x5f5baafd,
+	0x756a89c2, 0x7b6184cb, 0x697c93d0, 0x67779ed9,
+	0x3d1ed5ae, 0x3315d8a7, 0x2108cfbc, 0x2f03c2b5,
+	0x0532e18a, 0x0b39ec83, 0x1924fb98, 0x172ff691,
+	0x768dd64d, 0x7886db44, 0x6a9bcc5f, 0x6490c156,
+	0x4ea1e269, 0x40aaef60, 0x52b7f87b, 0x5cbcf572,
+	0x06d5be05, 0x08deb30c, 0x1ac3a417, 0x14c8a91e,
+	0x3ef98a21, 0x30f28728, 0x22ef9033, 0x2ce49d3a,
+	0x963d06dd, 0x98360bd4, 0x8a2b1ccf, 0x842011c6,
+	0xae1132f9, 0xa01a3ff0, 0xb20728eb, 0xbc0c25e2,
+	0xe6656e95, 0xe86e639c, 0xfa737487, 0xf478798e,
+	0xde495ab1, 0xd04257b8, 0xc25f40a3, 0xcc544daa,
+	0x41f7daec, 0x4ffcd7e5, 0x5de1c0fe, 0x53eacdf7,
+	0x79dbeec8, 0x77d0e3c1, 0x65cdf4da, 0x6bc6f9d3,
+	0x31afb2a4, 0x3fa4bfad, 0x2db9a8b6, 0x23b2a5bf,
+	0x09838680, 0x07888b89, 0x15959c92, 0x1b9e919b,
+	0xa1470a7c, 0xaf4c0775, 0xbd51106e, 0xb35a1d67,
+	0x996b3e58, 0x97603351, 0x857d244a, 0x8b762943,
+	0xd11f6234, 0xdf146f3d, 0xcd097826, 0xc302752f,
+	0xe9335610, 0xe7385b19, 0xf5254c02, 0xfb2e410b,
+	0x9a8c61d7, 0x94876cde, 0x869a7bc5, 0x889176cc,
+	0xa2a055f3, 0xacab58fa, 0xbeb64fe1, 0xb0bd42e8,
+	0xead4099f, 0xe4df0496, 0xf6c2138d, 0xf8c91e84,
+	0xd2f83dbb, 0xdcf330b2, 0xceee27a9, 0xc0e52aa0,
+	0x7a3cb147, 0x7437bc4e, 0x662aab55, 0x6821a65c,
+	0x42108563, 0x4c1b886a, 0x5e069f71, 0x500d9278,
+	0x0a64d90f, 0x046fd406, 0x1672c31d, 0x1879ce14,
+	0x3248ed2b, 0x3c43e022, 0x2e5ef739, 0x2055fa30,
+	0xec01b79a, 0xe20aba93, 0xf017ad88, 0xfe1ca081,
+	0xd42d83be, 0xda268eb7, 0xc83b99ac, 0xc63094a5,
+	0x9c59dfd2, 0x9252d2db, 0x804fc5c0, 0x8e44c8c9,
+	0xa475ebf6, 0xaa7ee6ff, 0xb863f1e4, 0xb668fced,
+	0x0cb1670a, 0x02ba6a03, 0x10a77d18, 0x1eac7011,
+	0x349d532e, 0x3a965e27, 0x288b493c, 0x26804435,
+	0x7ce90f42, 0x72e2024b, 0x60ff1550, 0x6ef41859,
+	0x44c53b66, 0x4ace366f, 0x58d32174, 0x56d82c7d,
+	0x377a0ca1, 0x397101a8, 0x2b6c16b3, 0x25671bba,
+	0x0f563885, 0x015d358c, 0x13402297, 0x1d4b2f9e,
+	0x472264e9, 0x492969e0, 0x5b347efb, 0x553f73f2,
+	0x7f0e50cd, 0x71055dc4, 0x63184adf, 0x6d1347d6,
+	0xd7cadc31, 0xd9c1d138, 0xcbdcc623, 0xc5d7cb2a,
+	0xefe6e815, 0xe1ede51c, 0xf3f0f207, 0xfdfbff0e,
+	0xa792b479, 0xa999b970, 0xbb84ae6b, 0xb58fa362,
+	0x9fbe805d, 0x91b58d54, 0x83a89a4f, 0x8da39746
+	}
+};
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _AESTAB2_H */
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
new file mode 100644
index 000000000000..0de1883dc81b
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
@@ -0,0 +1,36 @@
+Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+      *	Redistributions of source code must retain copyright notices,
+	this list of conditions and the following disclaimer.
+
+      *	Redistributions in binary form must reproduce the above
+	copyright notice, this list of conditions and the following
+	disclaimer in the documentation and/or other materials
+	provided with the distribution.
+
+      *	Neither the name of the CRYPTOGAMS nor the names of its
+	copyright holder and contributors may be used to endorse or
+	promote products derived from this software without specific
+	prior written permission.
+
+ALTERNATIVELY, provided that this notice is retained in full, this
+product may be distributed under the terms of the GNU General Public
+License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+those given above.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..49cc83d2ee29
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,177 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        https://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
new file mode 100644
index 000000000000..ed9f660fce5b
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
@@ -0,0 +1,1245 @@
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
+#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
+# and modified for ICP. Modifications are kept at a bare minimum to ease later
+# upstream merges.
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.extern gcm_avx_can_use_movbe
+
+.text
+
+#ifdef HAVE_MOVBE
+.type	_aesni_ctr32_ghash_6x,@function
+.align	32
+_aesni_ctr32_ghash_6x:	// per pass: AES-CTR on 6 counter blocks (xmm9..xmm14) interleaved with carry-less multiplies over 6 blocks read via r14; MOVBE variant
+	vmovdqu	32(%r11),%xmm2	// r11 = .Lbswap_mask (set by the callers), so +32 is .Lone_msb
+	subq	$6,%rdx	// rdx = 16-byte blocks left (callers pass len >> 4); account for the first pass
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15	// first round key; the callers biased rcx by +128
+	vpaddb	%xmm2,%xmm1,%xmm10	// xmm10..xmm14 = counter+1..+5, byte add touching only byte 15
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9	// xmm9 = counter ^ first round key
+	vmovdqu	%xmm4,16+8(%rsp)	// zero the slot the caller sees as 16(%rsp); +8 skips the return address
+	jmp	.Loop6x
+
+.align	32
+.Loop6x:
+	addl	$100663296,%ebx	// 100663296 = 6 << 24; ebx was loaded from counter bytes 12..15, so its top byte is counter byte 15
+	jc	.Lhandle_ctr32	// byte 15 wrapped: redo the six increments with 32-bit adds
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1	// xmm1 = counter+6, first counter of the next pass
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+	vmovdqu	%xmm1,(%r8)	// write the next counter block back through r8
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+	xorq	%r12,%r12
+	cmpq	%r14,%r15	// CF = (r15 < r14), unsigned
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b	// r12 = 1 while r14 <= r15
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12	// r12 = 96 or 0
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14	// advance r14 by 96 bytes unless it is already past r15
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13	// byte-swapping loads of the 16 bytes at 80(%r14) ...
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)	// ... stored byte-reversed at 32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13	// bytes 64..79 of the r14 window -> 48+8(%rsp), byte-reversed
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13	// bytes 48..63 of the r14 window -> 64+8(%rsp), byte-reversed
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13	// bytes 32..47 of the r14 window -> 80+8(%rsp), byte-reversed
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3	// 16(%r11) is .Lpoly
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13	// bytes 16..31 of the r14 window -> 96+8(%rsp), byte-reversed
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13	// bytes 0..15 of the r14 window; stored to 112+8(%rsp) in .Lenc_tail
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
+	jb	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	cmpl	$14,%ebp	// ICP does not zero key schedule.
+	jb	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail
+
+.align	32
+.Lhandle_ctr32:	// slow path: byte 15 of the counter wrapped, so add in byte-reversed form
+	vmovdqu	(%r11),%xmm0	// xmm0 = .Lbswap_mask
+	vpshufb	%xmm0,%xmm1,%xmm6	// xmm6 = byte-reversed counter
+	vmovdqu	48(%r11),%xmm5	// xmm5 = .Ltwo_lsb
+	vpaddd	64(%r11),%xmm6,%xmm10	// + .Lone_lsb -> counter+1
+	vpaddd	%xmm5,%xmm6,%xmm11	// counter+2
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12	// counter+3
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13	// counter+4
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14	// counter+5
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1	// counter+6, first counter of the next pass
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2	// xmm1 = last round key; pre-XOR it with the input so vaesenclast yields the output block
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1	// reload the next counter block saved at .Lresume_ctr32
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2	// .Lone_msb again; xmm2 was reused for round keys
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0	// xmm0,5,6,7,3 = next counter+1..+5
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi	// input advances by 96 bytes
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi	// output advances by 96 bytes; results are stored at negative offsets
+	vmovdqu	0-128(%rcx),%xmm15	// first round key for the next pass
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10	// r10 += 96; the callers return r10 in rax
+	subq	$0x6,%rdx
+	jc	.L6x_done	// no further full batch of 6 blocks: leave xmm9..xmm14 for the caller to store
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x
+
+.L6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3	// encoding of rep ret
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+#endif /* ifdef HAVE_MOVBE */
+
+.type	_aesni_ctr32_ghash_no_movbe_6x,@function
+.align	32
+_aesni_ctr32_ghash_no_movbe_6x:	// same routine as the MOVBE variant, with each movbeq replaced by movq + bswapq
+	vmovdqu	32(%r11),%xmm2	// r11 = .Lbswap_mask (set by the callers), so +32 is .Lone_msb
+	subq	$6,%rdx	// rdx = 16-byte blocks left (callers pass len >> 4); account for the first pass
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15	// first round key; the callers biased rcx by +128
+	vpaddb	%xmm2,%xmm1,%xmm10	// xmm10..xmm14 = counter+1..+5, byte add touching only byte 15
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9	// xmm9 = counter ^ first round key
+	vmovdqu	%xmm4,16+8(%rsp)	// zero the slot the caller sees as 16(%rsp); +8 skips the return address
+	jmp	.Loop6x_nmb
+
+.align	32
+.Loop6x_nmb:
+	addl	$100663296,%ebx	// 100663296 = 6 << 24; ebx was loaded from counter bytes 12..15, so its top byte is counter byte 15
+	jc	.Lhandle_ctr32_nmb	// byte 15 wrapped: redo the six increments with 32-bit adds
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1	// xmm1 = counter+6, first counter of the next pass
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32_nmb:
+	vmovdqu	%xmm1,(%r8)	// write the next counter block back through r8
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+	xorq	%r12,%r12
+	cmpq	%r14,%r15	// CF = (r15 < r14), unsigned
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b	// r12 = 1 while r14 <= r15
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12	// r12 = 96 or 0
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14	// advance r14 by 96 bytes unless it is already past r15
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	88(%r14),%r13	// load + bswapq of the 16 bytes at 80(%r14) ...
+	bswapq	%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	80(%r14),%r12
+	bswapq	%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)	// ... stored byte-reversed at 32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	72(%r14),%r13	// bytes 64..79 of the r14 window -> 48+8(%rsp), byte-reversed
+	bswapq	%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	64(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	56(%r14),%r13	// bytes 48..63 of the r14 window -> 64+8(%rsp), byte-reversed
+	bswapq	%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	48(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	40(%r14),%r13	// bytes 32..47 of the r14 window -> 80+8(%rsp), byte-reversed
+	bswapq	%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	32(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3	// 16(%r11) is .Lpoly
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movq	24(%r14),%r13	// bytes 16..31 of the r14 window -> 96+8(%rsp), byte-reversed
+	bswapq	%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	16(%r14),%r12
+	bswapq	%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movq	8(%r14),%r13	// bytes 0..15 of the r14 window; stored to 112+8(%rsp) in .Lenc_tail_nmb
+	bswapq	%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movq	0(%r14),%r12
+	bswapq	%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
+	jb	.Lenc_tail_nmb
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	cmpl	$14,%ebp	// ICP does not zero key schedule.
+	jb	.Lenc_tail_nmb
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail_nmb
+
+.align	32
+.Lhandle_ctr32_nmb:	// slow path: byte 15 of the counter wrapped, so add in byte-reversed form
+	vmovdqu	(%r11),%xmm0	// xmm0 = .Lbswap_mask
+	vpshufb	%xmm0,%xmm1,%xmm6	// xmm6 = byte-reversed counter
+	vmovdqu	48(%r11),%xmm5	// xmm5 = .Ltwo_lsb
+	vpaddd	64(%r11),%xmm6,%xmm10	// + .Lone_lsb -> counter+1
+	vpaddd	%xmm5,%xmm6,%xmm11	// counter+2
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12	// counter+3
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13	// counter+4
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14	// counter+5
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1	// counter+6, first counter of the next pass
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32_nmb
+
+.align	32
+.Lenc_tail_nmb:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2	// xmm1 = last round key; pre-XOR it with the input so vaesenclast yields the output block
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1	// reload the next counter block saved at .Lresume_ctr32_nmb
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2	// .Lone_msb again; xmm2 was reused for round keys
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0	// xmm0,5,6,7,3 = next counter+1..+5
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi	// input advances by 96 bytes
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi	// output advances by 96 bytes; results are stored at negative offsets
+	vmovdqu	0-128(%rcx),%xmm15	// first round key for the next pass
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10	// r10 += 96; the callers return r10 in rax
+	subq	$0x6,%rdx
+	jc	.L6x_done_nmb	// no further full batch of 6 blocks: leave xmm9..xmm14 for the caller to store
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x_nmb
+
+.L6x_done_nmb:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3	// encoding of rep ret
+.size	_aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
+
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,@function
+.align	32
+aesni_gcm_decrypt:	// rdi=in, rsi=out, rdx=len in bytes, rcx=key schedule, r8=counter block, r9=GHASH state; rax=bytes processed
+.cfi_startproc
+	xorq	%r10,%r10	// r10 = processed byte count, copied to rax on exit
+	cmpq	$0x60,%rdx
+	jb	.Lgcm_dec_abort	// under 96 bytes (one 6-block batch): touch nothing, return 0
+
+	leaq	(%rsp),%rax	// rax keeps the entry rsp for the epilogue
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1	// xmm1 = current counter block
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx	// ebx = counter bytes 12..15 as loaded (little-endian)
+	leaq	.Lbswap_mask(%rip),%r11	// r11 = base of the constant table
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r9),%xmm8	// xmm8 = hash value Xi
+	andq	$-128,%rsp	// 128-byte aligned scratch frame
+	vmovdqu	(%r11),%xmm0	// xmm0 = byte-reversal mask
+	leaq	128(%rcx),%rcx	// bias the key pointer; round keys are then addressed as N-128(%rcx)
+	leaq	32+32(%r9),%r9	// bias the hash-state pointer by 64
+	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14	// bits 7..11 of (key schedule - 128)
+	andq	%rsp,%r15	// bits 7..11 of the frame address
+	subq	%r14,%r15
+	jc	.Ldec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Ldec_no_key_aliasing
+	subq	%r15,%rsp	// frame offset was 0..767 above the key offset: move the frame down
+.Ldec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7	// byte-reverse the first six input blocks: block 5 stays in xmm7, blocks 4..0 go to 48..112(%rsp)
+	leaq	(%rdi),%r14	// r14 = start of the data to hash (the input)
+	vmovdqu	64(%rdi),%xmm4
+	leaq	-192(%rdi,%rdx,1),%r15	// r15 = in + len - 192, limit for advancing r14
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx	// rdx = number of 16-byte blocks
+	xorq	%r10,%r10
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+	testl	$1,gcm_avx_can_use_movbe(%rip)
+#else
+	movq	gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12	// the GOT slot holds the address of the flag, so fetch it first
+	testl	$1,(%r12)	// r12 is dead here: it was saved above and both 6x helpers write it before reading it
+#endif
+	jz	1f
+	call	_aesni_ctr32_ghash_6x
+	jmp	2f
+1:
+#endif
+	call	_aesni_ctr32_ghash_no_movbe_6x
+2:
+	vmovups	%xmm9,-96(%rsi)	// store the final six output blocks left in registers by the helper
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8	// byte-reverse the hash value back
+	vmovdqu	%xmm8,-64(%r9)	// -64 undoes the bias: store to the original r9
+
+	vzeroupper
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lgcm_dec_abort:
+	movq	%r10,%rax	// return the number of bytes processed
+	.byte	0xf3,0xc3	// encoding of rep ret
+.cfi_endproc
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type	_aesni_ctr32_6x,@function
+.align	32
+_aesni_ctr32_6x:	// AES-CTR over six blocks: reads 96 bytes at rdi, writes 96 bytes at rsi, advances both; no hashing
+	vmovdqu	0-128(%rcx),%xmm4	// xmm4 = first round key (the caller biased rcx by +128)
+	vmovdqu	32(%r11),%xmm2	// r11 = .Lbswap_mask in the caller, so +32 is .Lone_msb
+	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
+	vmovups	16-128(%rcx),%xmm15	// xmm15 = second round key
+	leaq	32-128(%rcx),%r12	// r12 walks the remaining round keys
+	vpxor	%xmm4,%xmm1,%xmm9	// xmm9 = counter ^ first round key
+	addl	$100663296,%ebx	// 100663296 = 6 << 24: bump the image of counter byte 15 by 6
+	jc	.Lhandle_ctr32_2	// byte 15 wrapped: use the 32-bit add path
+	vpaddb	%xmm2,%xmm1,%xmm10	// xmm10..xmm14 = counter+1..+5, each XORed with the first round key
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1	// xmm1 = counter+6 for the next call
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+
+.align	16
+.Loop_ctr32:	// r13 iterations: one vaesenc round on all six blocks, then fetch the next round key
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	.Loop_ctr32
+
+	vmovdqu	(%r12),%xmm3	// xmm3 = last round key
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4	// last round key ^ input, so vaesenclast yields the output block directly
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)	// outputs also stay in xmm9..xmm14 for the caller
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	.byte	0xf3,0xc3	// encoding of rep ret
+.align	32
+.Lhandle_ctr32_2:	// slow path; relies on xmm0 = byte-reversal mask loaded by the caller
+	vpshufb	%xmm0,%xmm1,%xmm6	// xmm6 = byte-reversed counter
+	vmovdqu	48(%r11),%xmm5	// xmm5 = .Ltwo_lsb
+	vpaddd	64(%r11),%xmm6,%xmm10	// + .Lone_lsb -> counter+1
+	vpaddd	%xmm5,%xmm6,%xmm11	// counter+2
+	vpaddd	%xmm5,%xmm10,%xmm12	// counter+3
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13	// counter+4
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14	// counter+5
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1	// counter+6 for the next call
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,@function
+.align	32
+aesni_gcm_encrypt:	// rdi=in, rsi=out, rdx=len in bytes, rcx=key schedule, r8=counter block, r9=GHASH state; rax=bytes processed
+.cfi_startproc
+	xorq	%r10,%r10	// r10 = processed byte count, copied to rax on exit
+	cmpq	$288,%rdx
+	jb	.Lgcm_enc_abort	// under 288 bytes (three 6-block batches): touch nothing, return 0
+
+	leaq	(%rsp),%rax	// rax keeps the entry rsp for the epilogue
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1	// xmm1 = current counter block
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx	// ebx = counter bytes 12..15 as loaded (little-endian)
+	leaq	.Lbswap_mask(%rip),%r11	// r11 = base of the constant table
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx	// bias the key pointer; round keys are then addressed as N-128(%rcx)
+	vmovdqu	(%r11),%xmm0	// xmm0 = byte-reversal mask
+	andq	$-128,%rsp	// 128-byte aligned scratch frame
+	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
+
+	andq	%r15,%r14	// bits 7..11 of (key schedule - 128)
+	andq	%rsp,%r15	// bits 7..11 of the frame address
+	subq	%r14,%r15
+	jc	.Lenc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Lenc_no_key_aliasing
+	subq	%r15,%rsp	// frame offset was 0..767 above the key offset: move the frame down
+.Lenc_no_key_aliasing:
+
+	leaq	(%rsi),%r14	// r14 = start of the data to hash (the output being produced)
+	leaq	-192(%rsi,%rdx,1),%r15	// r15 = out + len - 192, limit for advancing r14
+	shrq	$4,%rdx	// rdx = number of 16-byte blocks
+
+	call	_aesni_ctr32_6x	// first six blocks, encryption only
+	vpshufb	%xmm0,%xmm9,%xmm8	// byte-reverse those six outputs: 0..4 go to 112..48(%rsp), block 5 stays in xmm7
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x	// second six blocks, encryption only
+
+	vmovdqu	(%r9),%xmm8	// xmm8 = hash value Xi
+	leaq	32+32(%r9),%r9	// bias the hash-state pointer by 64
+	subq	$12,%rdx	// twelve blocks are already encrypted
+	movq	$192,%r10	// ... which is 192 bytes processed so far
+	vpshufb	%xmm0,%xmm8,%xmm8
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+	testl	$1,gcm_avx_can_use_movbe(%rip)
+#else
+	movq	gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12	// the GOT slot holds the address of the flag, so fetch it first
+	testl	$1,(%r12)	// r12 is dead here: _aesni_ctr32_6x is done with it and both 6x helpers write it before reading it
+#endif
+	jz	1f
+	call	_aesni_ctr32_ghash_6x
+	jmp	2f
+1:
+#endif
+	call	_aesni_ctr32_ghash_no_movbe_6x
+2:
+	vmovdqu	32(%rsp),%xmm7	// tail: multiply-accumulate the six byte-reversed blocks at 32..112(%rsp), then the six still in xmm9..xmm14
+	vmovdqu	(%r11),%xmm0	// xmm0 = byte-reversal mask again
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)	// store the final six output blocks while byte-reversing copies for hashing
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8	// fold the running hash value into the block at 112(%rsp)
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8	// 16(%r11) is .Lpoly
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3	// xmm3 = .Lpoly for the two closing multiply steps
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	vpshufb	(%r11),%xmm8,%xmm8	// byte-reverse the hash value back
+	vmovdqu	%xmm8,-64(%r9)	// -64 undoes the bias: store to the original r9
+
+	vzeroupper
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lgcm_enc_abort:
+	movq	%r10,%rax	// return the number of bytes processed
+	.byte	0xf3,0xc3	// encoding of rep ret
+.cfi_endproc
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * void clear_fpu_regs_avx(void);
+ * Zeroes the vector registers with a single vzeroall.
+ */
+.globl	clear_fpu_regs_avx
+.type	clear_fpu_regs_avx,@function
+.p2align	5
+clear_fpu_regs_avx:
+	vzeroall
+	ret
+.size	clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * dst[0..15] ^= src[0..15]; neither pointer has to be 16-byte aligned.
+ * xmm0 and xmm1 are used as scratch, so kernel callers must have saved
+ * the FPU state before calling this routine.
+ */
+.globl	gcm_xor_avx
+.type	gcm_xor_avx,@function
+.p2align	5
+gcm_xor_avx:
+	movdqu	(%rsi), %xmm1
+	movdqu	(%rdi), %xmm0
+	pxor	%xmm1, %xmm0
+	movdqu	%xmm0, (%rsi)
+	ret
+.size	gcm_xor_avx,.-gcm_xor_avx
+
+/*
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ *
+ * Atomically flips bit 0 of the 32-bit value at the given address and
+ * returns 1 if the value is non-zero afterwards, 0 otherwise. The result
+ * is taken branch-free from the ZF produced by the locked xor.
+ */
+.globl	atomic_toggle_boolean_nv
+.type	atomic_toggle_boolean_nv,@function
+.p2align	5
+atomic_toggle_boolean_nv:
+	xorl	%eax, %eax
+	lock xorl	$1, (%rdi)
+	setnz	%al
+	ret
+.size	atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
+
+.p2align	6
+.Lbswap_mask:	// table base, kept in r11 by the routines above; vpshufb mask reversing all 16 bytes
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:	// base+16: multiplier for the vpclmulqdq steps that read 16(%r11)
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:	// base+32: vpaddb increment touching only byte 15
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:	// base+48: vpaddd increment of 2 on a byte-reversed counter
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:	// base+64: vpaddd increment of 1 on a byte-reversed counter
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+.p2align	6
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
new file mode 100644
index 000000000000..59edc4c8d56c
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009 Intel Corporation
+ * All Rights Reserved.
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions.  This file contains an accelerated
+ * Galois Field Multiplication implementation.
+ *
+ * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
+ * carry-less multiplication. More information about PCLMULQDQ can be
+ * found at:
+ * http://software.intel.com/en-us/articles/
+ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as file galois_hash_asm.c from
+ * Intel Corporation dated September 21, 2009.
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
+ * definition for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Removed code to perform hashing.  This is already done with C macro
+ * GHASH in gcm.c.  For better performance, this removed code should be
+ * reintegrated in the future to replace the C GHASH macro.
+ *
+ * 5. Added code to byte swap 16-byte input and output.
+ *
+ * 6. Folded in comments from the original C source with embedded assembly
+ * (SB_w_shift_xor.c)
+ *
+ * 7. Renamed function and reordered parameters to match OpenSolaris:
+ * Intel interface:
+ *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ *		unsigned char *d, int length)
+ * OpenSolaris OS interface:
+ *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint)	/* lint */
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
+}	/* empty stub: only compiled when lint or __lint is defined */
+
+#elif defined(HAVE_PCLMULQDQ)	/* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
+ * (result byte i = source byte 15 - i).  C equivalent of the table:
+ */
+// static uint8_t byte_swap16_mask[] = {
+//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+.data			// NOTE(review): only ever read in this file; .rodata would also do
+.align XMM_ALIGN
+.Lbyte_swap16_mask:
+	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+
+/*
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ *
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on P1 and P2 and place the result in P3.
+ *
+ * Byte swap the input and the output.
+ *
+ * Note: x_in, y, and res all point to a block of 16-byte numbers
+ * (an array of two 64-bit integers).
+ *
+ * Note2: For kernel code, caller is responsible for ensuring
+ * kpreempt_disable() has been called.  This is because %xmm registers are
+ * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
+ * respectively, if TS is set on entry.  Otherwise, if TS is not set,
+ * save and restore %xmm registers on the stack.
+ *
+ * Note3: Original Intel definition:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ *	unsigned char *d, int length)
+ *
+ * Note4: Register/parameter mapping:
+ * Intel:
+ *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
+ *	Parameter 2: %rdx (copied to %xmm1)	s or y
+ *	Parameter 3: %rdi (result)		d or res
+ * OpenSolaris:
+ *	Parameter 1: %rdi (copied to %xmm0)	x_in
+ *	Parameter 2: %rsi (copied to %xmm1)	y
+ *	Parameter 3: %rdx (result)		res
+ */
+
+ENTRY_NP(gcm_mul_pclmulqdq)
+	// In: %rdi = x_in, %rsi = y, %rdx = res.  Clobbers %rax, %xmm0-%xmm10;
+	// nothing is saved/restored and CR0.TS is not touched in this body.
+	// Copy Parameters (unaligned 16-byte loads)
+	movdqu	(%rdi), %xmm0	// P1
+	movdqu	(%rsi), %xmm1	// P2
+
+	//
+	// Byte swap 16-byte input
+	//
+	lea	.Lbyte_swap16_mask(%rip), %rax
+	movups	(%rax), %xmm10	// xmm10 = swap mask, reused for the result
+	pshufb	%xmm10, %xmm0
+	pshufb	%xmm10, %xmm1
+
+	//
+	// Multiply with the hash key: four 64x64 carry-less products of the
+	// halves of a = xmm0 (a1:a0) and b = xmm1 (b1:b0)
+	//
+	movdqu	%xmm0, %xmm3
+	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
+
+	movdqu	%xmm0, %xmm4
+	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
+
+	movdqu	%xmm0, %xmm5
+	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
+	movdqu	%xmm0, %xmm6
+	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1
+
+	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0
+
+	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
+	psrldq	$8, %xmm4	// shift xmm4 right by 64 bits
+	pslldq	$8, %xmm5	// shift xmm5 left by 64 bits
+	pxor	%xmm5, %xmm3
+	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
+				// of the carry-less multiplication of
+				// xmm0 by xmm1.
+
+	// We shift the result of the multiplication by one bit position
+	// to the left to account for the fact that the bits are reversed.
+	movdqu	%xmm3, %xmm7
+	movdqu	%xmm6, %xmm8
+	pslld	$1, %xmm3
+	pslld	$1, %xmm6
+	psrld	$31, %xmm7	// xmm7 = bit shifted out of each dword of xmm3
+	psrld	$31, %xmm8	// xmm8 = bit shifted out of each dword of xmm6
+	movdqu	%xmm7, %xmm9
+	pslldq	$4, %xmm8	// move each carry up into the next dword
+	pslldq	$4, %xmm7
+	psrldq	$12, %xmm9	// carry out of xmm3's top dword -> xmm6's low dword
+	por	%xmm7, %xmm3
+	por	%xmm8, %xmm6
+	por	%xmm9, %xmm6
+
+	//
+	// First phase of the reduction
+	//
+	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+	// independently.
+	movdqu	%xmm3, %xmm7
+	movdqu	%xmm3, %xmm8
+	movdqu	%xmm3, %xmm9
+	pslld	$31, %xmm7	// packed (per-dword) left shift, << 31
+	pslld	$30, %xmm8	// packed (per-dword) left shift, << 30
+	pslld	$25, %xmm9	// packed (per-dword) left shift, << 25
+	pxor	%xmm8, %xmm7	// xor the shifted versions
+	pxor	%xmm9, %xmm7
+	movdqu	%xmm7, %xmm8
+	pslldq	$12, %xmm7	// keep only the low dword, moved to the top
+	psrldq	$4, %xmm8	// upper three dwords, saved for the second phase
+	pxor	%xmm7, %xmm3	// first phase of the reduction complete
+
+	//
+	// Second phase of the reduction
+	//
+	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+	// shift operations.
+	movdqu	%xmm3, %xmm2
+	movdqu	%xmm3, %xmm4
+	movdqu	%xmm3, %xmm5
+	psrld	$1, %xmm2	// packed (per-dword) right shift, >> 1
+	psrld	$2, %xmm4	// packed (per-dword) right shift, >> 2
+	psrld	$7, %xmm5	// packed (per-dword) right shift, >> 7
+	pxor	%xmm4, %xmm2	// xor the shifted versions
+	pxor	%xmm5, %xmm2
+	pxor	%xmm8, %xmm2	// add the bits saved in the first phase
+	pxor	%xmm2, %xmm3
+	pxor	%xmm3, %xmm6	// the result is in xmm6
+
+	//
+	// Byte swap 16-byte result
+	//
+	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask
+
+	//
+	// Store the result
+	//
+	movdqu	%xmm6, (%rdx)	// P3
+
+
+	//
+	// Return
+	//
+	ret
+	SET_SIZE(gcm_mul_pclmulqdq)
+
+#endif	/* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S
new file mode 100644
index 000000000000..90cc36b43a78
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S
@@ -0,0 +1,714 @@
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.4.x(*)	assembler
+#
+# P4		28.6		14.0		+100%
+# Opteron	19.3		7.7		+150%
+# Core2		17.8		8.1(**)		+120%
+# Atom		31.6		16.8		+88%
+# VIA Nano	21.8		10.1		+115%
+#
+# (*)	comparison is not completely fair, because C results are
+#	for vanilla "256B" implementation, while assembler results
+#	are for "528B";-)
+# (**)	it's mystery [to me] why Core2 result is not same as for
+#	Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere	1.78(+13%)
+# Sandy Bridge	1.80(+8%)
+# Ivy Bridge	1.80(+7%)
+# Haswell	0.55(+93%) (if system doesn't support AVX)
+# Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Skylake	0.44(+110%)(if system doesn't support AVX)
+# Bulldozer	1.49(+27%)
+# Silvermont	2.88(+13%)
+# Knights L	2.12(-)    (if system doesn't support AVX)
+# Goldmont	1.08(+24%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
+#
+# Knights Landing achieves 1.09 cpb.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
+# and modified for ICP. Modifications are kept at a bare minimum to ease later
+# upstream merges.
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.text
+
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.cfi_startproc
+.L_gmult_clmul:				// also the target of gcm_gmult_avx's jump
+	movdqu	(%rdi),%xmm0		// xmm0 = 16-byte block at %rdi (updated in place)
+	movdqa	.Lbswap_mask(%rip),%xmm5	// aligned load: label follows .align 64
+	movdqu	(%rsi),%xmm2		// table offset 0 (presumably H from gcm_init_htab_avx)
+	movdqu	32(%rsi),%xmm4		// table offset 32 (presumably the packed hi^lo terms)
+.byte	102,15,56,0,197			// pshufb %xmm5,%xmm0: byte-reverse the block
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3		// swap the two qwords
+	pxor	%xmm0,%xmm3		// xmm3 = lo ^ hi in both halves
+.byte	102,15,58,68,194,0		// pclmulqdq $0x00,%xmm2,%xmm0: lo * lo
+.byte	102,15,58,68,202,17		// pclmulqdq $0x11,%xmm2,%xmm1: hi * hi
+.byte	102,15,58,68,220,0		// pclmulqdq $0x00,%xmm4,%xmm3: middle term
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3		// Karatsuba: middle ^= lo ^ hi
+	// fold the middle term into the 256-bit product <xmm1:xmm0>
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	// reduction, first phase (per-qword left shifts by 5, 1 and 57)
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	// reduction, second phase (per-qword right shifts by 1, 5 and 1);
+	// the final 128-bit result is accumulated in xmm0
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197			// pshufb %xmm5,%xmm0: back to memory byte order
+	movdqu	%xmm0,(%rdi)
+	.byte	0xf3,0xc3		// rep ret
+.cfi_endproc
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+
+.globl	gcm_init_htab_avx
+.type	gcm_init_htab_avx,@function
+.align	32
+gcm_init_htab_avx:
+.cfi_startproc
+	vzeroupper
+	// In: %rsi = 16-byte hash key H; %rdi = table to fill (4 groups x 48 bytes)
+	vmovdqu	(%rsi),%xmm2
+	// KCF/ICP stores H in network byte order with the hi qword first
+	// so we need to swap all bytes, not the 2 qwords.
+	vmovdqu	.Lbswap_mask(%rip),%xmm4
+	vpshufb	%xmm4,%xmm2,%xmm2
+
+	// H <<= 1 over the full 128 bits, remembering whether the top bit was set
+	vpshufd	$255,%xmm2,%xmm4		// broadcast the uppermost dword
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5	// all-ones if that dword was negative
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+	// if the top bit was set, xor in the 0x1c2 polynomial constant
+	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+	// xmm2 = adjusted H, xmm6 = H.hi ^ H.lo, xmm0 = current power of H
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10			// four groups, i.e. powers 1..8
+	jmp	.Linit_start_avx
+.align	32
+.Linit_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5	// pack the previous group's two hi^lo qwords
+	vmovdqu	%xmm5,-16(%rdi)			// third slot of the previous group
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0		// xmm0 = next odd power of H
+.Linit_start_avx:
+	vmovdqa	%xmm0,%xmm5			// keep the odd power for the stores below
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0		// xmm0 = next even power of H
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3		// hi ^ lo of the odd power
+	vmovdqu	%xmm5,0(%rdi)			// slot 0: odd power
+	vpxor	%xmm0,%xmm4,%xmm4		// hi ^ lo of the even power
+	vmovdqu	%xmm0,16(%rdi)			// slot 1: even power
+	leaq	48(%rdi),%rdi			// advance to the next 48-byte group
+	subq	$1,%r10
+	jnz	.Linit_loop_avx
+	// the last group's third slot is packed with its two qwords swapped
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	.byte	0xf3,0xc3			// rep ret
+.cfi_endproc
+.size	gcm_init_htab_avx,.-gcm_init_htab_avx
+
+.globl	gcm_gmult_avx
+.type	gcm_gmult_avx,@function
+.align	32
+gcm_gmult_avx:
+.cfi_startproc
+	jmp	.L_gmult_clmul		// tail jump into gcm_gmult_clmul's body; %rdi/%rsi pass through
+.cfi_endproc
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+.globl	gcm_ghash_avx
+.type	gcm_ghash_avx,@function
+.align	32
+gcm_ghash_avx:
+.cfi_startproc
+	vzeroupper
+	// In: %rdi = 16-byte hash state, %rsi = key table, %rdx = input, %rcx = length
+	vmovdqu	(%rdi),%xmm10			// xmm10 = running hash
+	leaq	.L0x1c2_polynomial(%rip),%r10	// r10 -> reduction constant
+	leaq	64(%rsi),%rsi			// bias the table pointer by 64 bytes
+	vmovdqu	.Lbswap_mask(%rip),%xmm13	// xmm13 = byte-reversal mask
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	.Lshort_avx			// fewer than 128 bytes: skip the 8-block pass
+	subq	$0x80,%rcx
+	// First 128 bytes: start the products for eight blocks, highest address first
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+	// xmm0/xmm3 collect lo*lo, xmm1/xmm4 hi*hi, xmm2/xmm5 the (hi^lo) middle products
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	.Ltail_avx			// no further full 128-byte chunk
+	// at least one more 128-byte chunk: fold the running hash into block 0
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10	// reduction step against the 0x1c2 constant
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10	// second reduction step
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15		// block 0 now carries the reduced hash
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	.Loop8x_avx			// no borrow: another full chunk remains
+	// borrow: undo the last subtraction so %rcx is the leftover byte count
+	addq	$0x80,%rcx
+	jmp	.Ltail_no_xor_avx
+	// Short path: assumes %rcx is a non-zero multiple of 16 (not checked here)
+.align	32
+.Lshort_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14		// last 16-byte block of the input
+	leaq	(%rdx,%rcx,1),%rdx		// %rdx -> end of input
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+	// x ^ x = 0: these copies make the next vpxor of each pair clear xmm3-xmm5
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7			// move the high qword down for the next $0x00 multiply
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7		// only the high qword of the last (flipped) slot
+	subq	$0x10,%rcx
+	jmp	.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15		// fold the running hash into the pending block
+.Ltail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12			// xmm12 = 0x1c2 reduction constant
+
+	vpxor	%xmm0,%xmm3,%xmm10		// xmm10 = sum of lo*lo products
+	vpxor	%xmm1,%xmm4,%xmm11		// xmm11 = sum of hi*hi products
+	vpxor	%xmm2,%xmm5,%xmm5		// xmm5  = sum of middle products
+	// Karatsuba: middle ^= lo ^ hi, then split it across xmm10 and xmm11
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+	// two-step reduction of <xmm11:xmm10>; the result lands in xmm10
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+	// leftover input (fewer than 128 bytes)? go through the short path
+	cmpq	$0,%rcx
+	jne	.Lshort_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10		// back to memory byte order
+	vmovdqu	%xmm10,(%rdi)			// store the updated hash state
+	vzeroupper
+	.byte	0xf3,0xc3			// rep ret
+.cfi_endproc
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+.align	64
+.Lbswap_mask:		// shuffle indices 15..0: reverses all 16 bytes
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:	// byte 0 = 1, byte 15 = 0xc2; used for key setup and reduction
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:		// not referenced by any routine in this file
+.long	7,0,7,0
+.L7_mask_poly:		// not referenced by any routine in this file
+.long	7,0,450,0
+.align	64
+.type	.Lrem_4bit,@object
+.Lrem_4bit:		// not referenced in this file; presumably left over from the 4-bit code path
+.long	0,0,0,471859200,0,943718400,0,610271232
+.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type	.Lrem_8bit,@object
+.Lrem_8bit:		// likewise not referenced in this file
+.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+/* The bytes below are ASCII "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" + NUL. */
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/sha1/sha1-x86_64.S
new file mode 100644
index 000000000000..cb923784a730
--- /dev/null
+++ b/module/icp/asm-x86_64/sha1/sha1-x86_64.S
@@ -0,0 +1,1353 @@
+/*
+ * !/usr/bin/env perl
+ *
+ *  ====================================================================
+ *  Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ *  project. The module is, however, dual licensed under OpenSSL and
+ *  CRYPTOGAMS licenses depending on where you obtain it. For further
+ *  details see http://www.openssl.org/~appro/cryptogams/.
+ *  ====================================================================
+ *
+ *  sha1_block procedure for x86_64.
+ *
+ *  It was brought to my attention that on EM64T compiler-generated code
+ *  was far behind 32-bit assembler implementation. This is unlike on
+ *  Opteron where compiler-generated code was only 15% behind 32-bit
+ *  assembler, which originally made it hard to motivate the effort.
+ *  There was a suggestion to mechanically translate 32-bit code, but I
+ *  dismissed it, reasoning that x86_64 offers enough register bank
+ *  capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+ *  implementation:-) However! While 64-bit code does perform better
+ *  on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+ *  x86_64 does offer larger *addressable* bank, but out-of-order core
+ *  reaches for even more registers through dynamic aliasing, and EM64T
+ *  core must have managed to run-time optimize even 32-bit code just as
+ *  good as 64-bit one. Performance improvement is summarized in the
+ *  following table:
+ *
+ * 		gcc 3.4		32-bit asm	cycles/byte
+ *  Opteron	+45%		+20%		6.8
+ *  Xeon P4	+65%		+0%		9.9
+ *  Core2		+60%		+10%		7.0
+ *
+ *
+ *  OpenSolaris OS modifications
+ *
+ *  Sun elects to use this software under the BSD license.
+ *
+ *  This source originates from OpenSSL file sha1-x86_64.pl at
+ *  ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ *  (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ *  1. Added perl "use strict" and declared variables.
+ *
+ *  2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ *  /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ *  3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ *  assemblers).
+ *
+ */
+
+/*
+ * This file was generated by a perl script (sha1-x86_64.pl). The comments from
+ * the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sys/sha1.h>
+
+
+/* ARGSUSED */
+void
+sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks)
+{	/* deliberately empty: this stub is compiled only under lint; the assembly routine in the #else branch is the real body */
+}
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+ENTRY_NP(sha1_block_data_order)	/* rdi = ctx, rsi = input, rdx = block count (lint prototype order; assumes SysV AMD64 argument registers) */
+	push	%rbx	/* rbx, rbp and r12 are overwritten below, so save them first */
+	push	%rbp
+	push	%r12
+	mov	%rsp,%rax	/* rax = stack pointer to return to before the pops */
+	mov	%rdi,%r8	# r8 = ctx; rdi is reused for working state
+	sub	$72,%rsp	/* room for a 64-byte word buffer plus the saved rsp */
+	mov	%rsi,%r9	# r9 = input pointer; rsi is reused for working state
+	and	$-64,%rsp	/* round the frame down to a 64-byte boundary */
+	mov	%rdx,%r10	# r10 = remaining block count; rdx is reused for working state
+	mov	%rax,64(%rsp)	/* saved rsp sits directly above the 64-byte buffer */
+
+	mov	0(%r8),%edx	/* load five 32-bit words from ctx into edx, esi, edi, ebp, r11d */
+	mov	4(%r8),%esi
+	mov	8(%r8),%edi
+	mov	12(%r8),%ebp
+	mov	16(%r8),%r11d
+.align	4
+.Lloop:	/* one pass per 64-byte input block; working state is in edx, esi, edi, ebp, r11d */
+	mov	0(%r9),%eax	/* rounds 0-15: each word is loaded from the input, byte-swapped, and copied to the stack buffer */
+	bswap	%eax
+	mov	%eax,0(%rsp)
+	lea	0x5a827999(%eax,%r11d),%r12d	/* round 0: r12d = 0x5a827999 + word + r11d */
+	mov	%edi,%ebx
+	mov	4(%r9),%eax	/* fetch the word for the next round early */
+	mov	%edx,%r11d
+	xor	%ebp,%ebx
+	bswap	%eax
+	rol	$5,%r11d	/* r11d = edx rotated left by 5 */
+	and	%esi,%ebx
+	mov	%eax,4(%rsp)
+	add	%r11d,%r12d
+	xor	%ebp,%ebx	/* ebx = ((edi ^ ebp) & esi) ^ ebp */
+	rol	$30,%esi
+	add	%ebx,%r12d	/* r12d now holds the new leading state word; register roles rotate each round */
+	lea	0x5a827999(%eax,%ebp),%r11d	/* round 1 */
+	mov	%esi,%ebx
+	mov	8(%r9),%eax
+	mov	%r12d,%ebp
+	xor	%edi,%ebx
+	bswap	%eax
+	rol	$5,%ebp
+	and	%edx,%ebx
+	mov	%eax,8(%rsp)
+	add	%ebp,%r11d
+	xor	%edi,%ebx
+	rol	$30,%edx
+	add	%ebx,%r11d
+	lea	0x5a827999(%eax,%edi),%ebp	/* round 2 */
+	mov	%edx,%ebx
+	mov	12(%r9),%eax
+	mov	%r11d,%edi
+	xor	%esi,%ebx
+	bswap	%eax
+	rol	$5,%edi
+	and	%r12d,%ebx
+	mov	%eax,12(%rsp)
+	add	%edi,%ebp
+	xor	%esi,%ebx
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	lea	0x5a827999(%eax,%esi),%edi	/* round 3 */
+	mov	%r12d,%ebx
+	mov	16(%r9),%eax
+	mov	%ebp,%esi
+	xor	%edx,%ebx
+	bswap	%eax
+	rol	$5,%esi
+	and	%r11d,%ebx
+	mov	%eax,16(%rsp)
+	add	%esi,%edi
+	xor	%edx,%ebx
+	rol	$30,%r11d
+	add	%ebx,%edi
+	lea	0x5a827999(%eax,%edx),%esi	/* round 4 */
+	mov	%r11d,%ebx
+	mov	20(%r9),%eax
+	mov	%edi,%edx
+	xor	%r12d,%ebx
+	bswap	%eax
+	rol	$5,%edx
+	and	%ebp,%ebx
+	mov	%eax,20(%rsp)
+	add	%edx,%esi
+	xor	%r12d,%ebx
+	rol	$30,%ebp
+	add	%ebx,%esi
+	lea	0x5a827999(%eax,%r12d),%edx	/* round 5 */
+	mov	%ebp,%ebx
+	mov	24(%r9),%eax
+	mov	%esi,%r12d
+	xor	%r11d,%ebx
+	bswap	%eax
+	rol	$5,%r12d
+	and	%edi,%ebx
+	mov	%eax,24(%rsp)
+	add	%r12d,%edx
+	xor	%r11d,%ebx
+	rol	$30,%edi
+	add	%ebx,%edx
+	lea	0x5a827999(%eax,%r11d),%r12d	/* round 6 (register assignment repeats every six rounds) */
+	mov	%edi,%ebx
+	mov	28(%r9),%eax
+	mov	%edx,%r11d
+	xor	%ebp,%ebx
+	bswap	%eax
+	rol	$5,%r11d
+	and	%esi,%ebx
+	mov	%eax,28(%rsp)
+	add	%r11d,%r12d
+	xor	%ebp,%ebx
+	rol	$30,%esi
+	add	%ebx,%r12d
+	lea	0x5a827999(%eax,%ebp),%r11d	/* round 7 */
+	mov	%esi,%ebx
+	mov	32(%r9),%eax
+	mov	%r12d,%ebp
+	xor	%edi,%ebx
+	bswap	%eax
+	rol	$5,%ebp
+	and	%edx,%ebx
+	mov	%eax,32(%rsp)
+	add	%ebp,%r11d
+	xor	%edi,%ebx
+	rol	$30,%edx
+	add	%ebx,%r11d
+	lea	0x5a827999(%eax,%edi),%ebp	/* round 8 */
+	mov	%edx,%ebx
+	mov	36(%r9),%eax
+	mov	%r11d,%edi
+	xor	%esi,%ebx
+	bswap	%eax
+	rol	$5,%edi
+	and	%r12d,%ebx
+	mov	%eax,36(%rsp)
+	add	%edi,%ebp
+	xor	%esi,%ebx
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	lea	0x5a827999(%eax,%esi),%edi	/* round 9 */
+	mov	%r12d,%ebx
+	mov	40(%r9),%eax
+	mov	%ebp,%esi
+	xor	%edx,%ebx
+	bswap	%eax
+	rol	$5,%esi
+	and	%r11d,%ebx
+	mov	%eax,40(%rsp)
+	add	%esi,%edi
+	xor	%edx,%ebx
+	rol	$30,%r11d
+	add	%ebx,%edi
+	lea	0x5a827999(%eax,%edx),%esi	/* round 10 */
+	mov	%r11d,%ebx
+	mov	44(%r9),%eax
+	mov	%edi,%edx
+	xor	%r12d,%ebx
+	bswap	%eax
+	rol	$5,%edx
+	and	%ebp,%ebx
+	mov	%eax,44(%rsp)
+	add	%edx,%esi
+	xor	%r12d,%ebx
+	rol	$30,%ebp
+	add	%ebx,%esi
+	lea	0x5a827999(%eax,%r12d),%edx	/* round 11 */
+	mov	%ebp,%ebx
+	mov	48(%r9),%eax
+	mov	%esi,%r12d
+	xor	%r11d,%ebx
+	bswap	%eax
+	rol	$5,%r12d
+	and	%edi,%ebx
+	mov	%eax,48(%rsp)
+	add	%r12d,%edx
+	xor	%r11d,%ebx
+	rol	$30,%edi
+	add	%ebx,%edx
+	lea	0x5a827999(%eax,%r11d),%r12d	/* round 12 */
+	mov	%edi,%ebx
+	mov	52(%r9),%eax
+	mov	%edx,%r11d
+	xor	%ebp,%ebx
+	bswap	%eax
+	rol	$5,%r11d
+	and	%esi,%ebx
+	mov	%eax,52(%rsp)
+	add	%r11d,%r12d
+	xor	%ebp,%ebx
+	rol	$30,%esi
+	add	%ebx,%r12d
+	lea	0x5a827999(%eax,%ebp),%r11d	/* round 13 */
+	mov	%esi,%ebx
+	mov	56(%r9),%eax
+	mov	%r12d,%ebp
+	xor	%edi,%ebx
+	bswap	%eax
+	rol	$5,%ebp
+	and	%edx,%ebx
+	mov	%eax,56(%rsp)
+	add	%ebp,%r11d
+	xor	%edi,%ebx
+	rol	$30,%edx
+	add	%ebx,%r11d
+	lea	0x5a827999(%eax,%edi),%ebp	/* round 14 */
+	mov	%edx,%ebx
+	mov	60(%r9),%eax	/* last of the sixteen input words */
+	mov	%r11d,%edi
+	xor	%esi,%ebx
+	bswap	%eax
+	rol	$5,%edi
+	and	%r12d,%ebx
+	mov	%eax,60(%rsp)
+	add	%edi,%ebp
+	xor	%esi,%ebx
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	lea	0x5a827999(%eax,%esi),%edi	/* round 15 */
+	mov	0(%rsp),%eax	/* from here words are derived from the buffer: slot 0 ^ slot 2 ^ slot 8 ^ slot 13 (slot i = 4*i(%rsp)) */
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	8(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%esi
+	xor	32(%rsp),%eax
+	and	%r11d,%ebx
+	add	%esi,%edi
+	xor	52(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax	/* rotate the xor result left by 1 */
+	mov	%eax,0(%rsp)	/* and overwrite slot 0 with it (the buffer is reused circularly) */
+	lea	0x5a827999(%eax,%edx),%esi	/* round 16: same constant and and/xor mixing as the earlier rounds, but the word comes from the stack buffer */
+	mov	4(%rsp),%eax	/* next word: slot 1 ^ slot 3 ^ slot 9 ^ slot 14, rotated left by 1 (slot i = 4*i(%rsp)) */
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	12(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edx
+	xor	36(%rsp),%eax
+	and	%ebp,%ebx
+	add	%edx,%esi
+	xor	56(%rsp),%eax
+	xor	%r12d,%ebx	/* ebx = ((r11d ^ r12d) & ebp) ^ r12d */
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax
+	mov	%eax,4(%rsp)
+	lea	0x5a827999(%eax,%r12d),%edx	/* round 17 */
+	mov	8(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	16(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%r12d
+	xor	40(%rsp),%eax
+	and	%edi,%ebx
+	add	%r12d,%edx
+	xor	60(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	mov	%eax,8(%rsp)
+	lea	0x5a827999(%eax,%r11d),%r12d	/* round 18 */
+	mov	12(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	20(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%r11d
+	xor	44(%rsp),%eax
+	and	%esi,%ebx
+	add	%r11d,%r12d
+	xor	0(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,12(%rsp)
+	lea	0x5a827999(%eax,%ebp),%r11d	/* round 19: last round using 0x5a827999 */
+	mov	16(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	24(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%ebp
+	xor	48(%rsp),%eax
+	and	%edx,%ebx
+	add	%ebp,%r11d
+	xor	4(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,16(%rsp)
+	lea	0x6ed9eba1(%eax,%edi),%ebp	/* round 20: the additive constant changes to 0x6ed9eba1 for rounds 20-39 */
+	mov	20(%rsp),%eax	/* word update keeps the same pattern: slots j, j+2, j+8, j+13 (mod 16), rotated left by 1 */
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	28(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	52(%rsp),%eax
+	xor	%esi,%ebx	/* ebx = edx ^ r12d ^ esi: a plain three-way xor replaces the and/xor mix */
+	add	%edi,%ebp
+	xor	8(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,20(%rsp)
+	lea	0x6ed9eba1(%eax,%esi),%edi	/* round 21 */
+	mov	24(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	32(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	56(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	12(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,24(%rsp)
+	lea	0x6ed9eba1(%eax,%edx),%esi	/* round 22 */
+	mov	28(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	36(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%edx
+	xor	60(%rsp),%eax
+	xor	%r12d,%ebx
+	add	%edx,%esi
+	xor	16(%rsp),%eax
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax
+	mov	%eax,28(%rsp)
+	lea	0x6ed9eba1(%eax,%r12d),%edx	/* round 23 */
+	mov	32(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	40(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%r12d
+	xor	0(%rsp),%eax
+	xor	%r11d,%ebx
+	add	%r12d,%edx
+	xor	20(%rsp),%eax
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	mov	%eax,32(%rsp)
+	lea	0x6ed9eba1(%eax,%r11d),%r12d	/* round 24 */
+	mov	36(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	44(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	4(%rsp),%eax
+	xor	%ebp,%ebx
+	add	%r11d,%r12d
+	xor	24(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,36(%rsp)
+	lea	0x6ed9eba1(%eax,%ebp),%r11d	/* round 25 */
+	mov	40(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	48(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	8(%rsp),%eax
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	xor	28(%rsp),%eax
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,40(%rsp)
+	lea	0x6ed9eba1(%eax,%edi),%ebp	/* round 26 */
+	mov	44(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	52(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	12(%rsp),%eax
+	xor	%esi,%ebx
+	add	%edi,%ebp
+	xor	32(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,44(%rsp)
+	lea	0x6ed9eba1(%eax,%esi),%edi	/* round 27 */
+	mov	48(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	56(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	16(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	36(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,48(%rsp)
+	lea	0x6ed9eba1(%eax,%edx),%esi	/* round 28 */
+	mov	52(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	60(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%edx
+	xor	20(%rsp),%eax
+	xor	%r12d,%ebx
+	add	%edx,%esi
+	xor	40(%rsp),%eax
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax
+	mov	%eax,52(%rsp)
+	lea	0x6ed9eba1(%eax,%r12d),%edx	/* round 29 */
+	mov	56(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	0(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%r12d
+	xor	24(%rsp),%eax
+	xor	%r11d,%ebx
+	add	%r12d,%edx
+	xor	44(%rsp),%eax
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	mov	%eax,56(%rsp)
+	lea	0x6ed9eba1(%eax,%r11d),%r12d	/* round 30 */
+	mov	60(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	4(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	28(%rsp),%eax
+	xor	%ebp,%ebx
+	add	%r11d,%r12d
+	xor	48(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,60(%rsp)
+	lea	0x6ed9eba1(%eax,%ebp),%r11d	/* round 31 */
+	mov	0(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	8(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	32(%rsp),%eax
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	xor	52(%rsp),%eax
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,0(%rsp)
+	lea	0x6ed9eba1(%eax,%edi),%ebp	/* round 32 */
+	mov	4(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	12(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	36(%rsp),%eax
+	xor	%esi,%ebx
+	add	%edi,%ebp
+	xor	56(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,4(%rsp)
+	lea	0x6ed9eba1(%eax,%esi),%edi	/* round 33 */
+	mov	8(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	16(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	40(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	60(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,8(%rsp)
+	lea	0x6ed9eba1(%eax,%edx),%esi	/* round 34 */
+	mov	12(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	20(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%edx
+	xor	44(%rsp),%eax
+	xor	%r12d,%ebx
+	add	%edx,%esi
+	xor	0(%rsp),%eax
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax
+	mov	%eax,12(%rsp)
+	lea	0x6ed9eba1(%eax,%r12d),%edx	/* round 35 */
+	mov	16(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	24(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%r12d
+	xor	48(%rsp),%eax
+	xor	%r11d,%ebx
+	add	%r12d,%edx
+	xor	4(%rsp),%eax
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	mov	%eax,16(%rsp)
+	lea	0x6ed9eba1(%eax,%r11d),%r12d	/* round 36 */
+	mov	20(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	28(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	52(%rsp),%eax
+	xor	%ebp,%ebx
+	add	%r11d,%r12d
+	xor	8(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,20(%rsp)
+	lea	0x6ed9eba1(%eax,%ebp),%r11d	/* round 37 */
+	mov	24(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	32(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	56(%rsp),%eax
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	xor	12(%rsp),%eax
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,24(%rsp)
+	lea	0x6ed9eba1(%eax,%edi),%ebp	/* round 38 */
+	mov	28(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	36(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	60(%rsp),%eax
+	xor	%esi,%ebx
+	add	%edi,%ebp
+	xor	16(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,28(%rsp)
+	lea	0x6ed9eba1(%eax,%esi),%edi	/* round 39: last round using 0x6ed9eba1 */
+	mov	32(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	40(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	0(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	20(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,32(%rsp)
+	lea	-0x70e44324(%eax,%edx),%esi	/* round 40: constant for rounds 40-59; -0x70e44324 is 0x8f1bbcdc as an unsigned 32-bit value */
+	mov	36(%rsp),%eax	/* word update: slots j, j+2, j+8, j+13 (mod 16) of the stack buffer, rotated left by 1 */
+	mov	%ebp,%ebx
+	mov	%ebp,%ecx	/* ecx serves as a second temporary in these rounds (it is not saved by the prologue) */
+	xor	44(%rsp),%eax
+	mov	%edi,%edx
+	and	%r11d,%ebx
+	xor	4(%rsp),%eax
+	or	%r11d,%ecx
+	rol	$5,%edx
+	xor	24(%rsp),%eax
+	and	%r12d,%ecx
+	add	%edx,%esi
+	rol	$1,%eax
+	or	%ecx,%ebx	/* ebx = (ebp & r11d) | ((ebp | r11d) & r12d) */
+	rol	$30,%ebp
+	mov	%eax,36(%rsp)
+	add	%ebx,%esi
+	lea	-0x70e44324(%eax,%r12d),%edx	/* round 41 */
+	mov	40(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edi,%ecx
+	xor	48(%rsp),%eax
+	mov	%esi,%r12d
+	and	%ebp,%ebx
+	xor	8(%rsp),%eax
+	or	%ebp,%ecx
+	rol	$5,%r12d
+	xor	28(%rsp),%eax
+	and	%r11d,%ecx
+	add	%r12d,%edx
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edi
+	mov	%eax,40(%rsp)
+	add	%ebx,%edx
+	lea	-0x70e44324(%eax,%r11d),%r12d	/* round 42 */
+	mov	44(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%esi,%ecx
+	xor	52(%rsp),%eax
+	mov	%edx,%r11d
+	and	%edi,%ebx
+	xor	12(%rsp),%eax
+	or	%edi,%ecx
+	rol	$5,%r11d
+	xor	32(%rsp),%eax
+	and	%ebp,%ecx
+	add	%r11d,%r12d
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%esi
+	mov	%eax,44(%rsp)
+	add	%ebx,%r12d
+	lea	-0x70e44324(%eax,%ebp),%r11d	/* round 43 */
+	mov	48(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%edx,%ecx
+	xor	56(%rsp),%eax
+	mov	%r12d,%ebp
+	and	%esi,%ebx
+	xor	16(%rsp),%eax
+	or	%esi,%ecx
+	rol	$5,%ebp
+	xor	36(%rsp),%eax
+	and	%edi,%ecx
+	add	%ebp,%r11d
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edx
+	mov	%eax,48(%rsp)
+	add	%ebx,%r11d
+	lea	-0x70e44324(%eax,%edi),%ebp	/* round 44 */
+	mov	52(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%r12d,%ecx
+	xor	60(%rsp),%eax
+	mov	%r11d,%edi
+	and	%edx,%ebx
+	xor	20(%rsp),%eax
+	or	%edx,%ecx
+	rol	$5,%edi
+	xor	40(%rsp),%eax
+	and	%esi,%ecx
+	add	%edi,%ebp
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%r12d
+	mov	%eax,52(%rsp)
+	add	%ebx,%ebp
+	lea	-0x70e44324(%eax,%esi),%edi	/* round 45 */
+	mov	56(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%r11d,%ecx
+	xor	0(%rsp),%eax
+	mov	%ebp,%esi
+	and	%r12d,%ebx
+	xor	24(%rsp),%eax
+	or	%r12d,%ecx
+	rol	$5,%esi
+	xor	44(%rsp),%eax
+	and	%edx,%ecx
+	add	%esi,%edi
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%r11d
+	mov	%eax,56(%rsp)
+	add	%ebx,%edi
+	lea	-0x70e44324(%eax,%edx),%esi	/* round 46 */
+	mov	60(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%ebp,%ecx
+	xor	4(%rsp),%eax
+	mov	%edi,%edx
+	and	%r11d,%ebx
+	xor	28(%rsp),%eax
+	or	%r11d,%ecx
+	rol	$5,%edx
+	xor	48(%rsp),%eax
+	and	%r12d,%ecx
+	add	%edx,%esi
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%ebp
+	mov	%eax,60(%rsp)
+	add	%ebx,%esi
+	lea	-0x70e44324(%eax,%r12d),%edx	/* round 47 */
+	mov	0(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edi,%ecx
+	xor	8(%rsp),%eax
+	mov	%esi,%r12d
+	and	%ebp,%ebx
+	xor	32(%rsp),%eax
+	or	%ebp,%ecx
+	rol	$5,%r12d
+	xor	52(%rsp),%eax
+	and	%r11d,%ecx
+	add	%r12d,%edx
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edi
+	mov	%eax,0(%rsp)
+	add	%ebx,%edx
+	lea	-0x70e44324(%eax,%r11d),%r12d	/* round 48 */
+	mov	4(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%esi,%ecx
+	xor	12(%rsp),%eax
+	mov	%edx,%r11d
+	and	%edi,%ebx
+	xor	36(%rsp),%eax
+	or	%edi,%ecx
+	rol	$5,%r11d
+	xor	56(%rsp),%eax
+	and	%ebp,%ecx
+	add	%r11d,%r12d
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%esi
+	mov	%eax,4(%rsp)
+	add	%ebx,%r12d
+	lea	-0x70e44324(%eax,%ebp),%r11d	/* round 49 */
+	mov	8(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%edx,%ecx
+	xor	16(%rsp),%eax
+	mov	%r12d,%ebp
+	and	%esi,%ebx
+	xor	40(%rsp),%eax
+	or	%esi,%ecx
+	rol	$5,%ebp
+	xor	60(%rsp),%eax
+	and	%edi,%ecx
+	add	%ebp,%r11d
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edx
+	mov	%eax,8(%rsp)
+	add	%ebx,%r11d
+	lea	-0x70e44324(%eax,%edi),%ebp	/* round 50 */
+	mov	12(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%r12d,%ecx
+	xor	20(%rsp),%eax
+	mov	%r11d,%edi
+	and	%edx,%ebx
+	xor	44(%rsp),%eax
+	or	%edx,%ecx
+	rol	$5,%edi
+	xor	0(%rsp),%eax
+	and	%esi,%ecx
+	add	%edi,%ebp
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%r12d
+	mov	%eax,12(%rsp)
+	add	%ebx,%ebp
+	lea	-0x70e44324(%eax,%esi),%edi	/* round 51 */
+	mov	16(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%r11d,%ecx
+	xor	24(%rsp),%eax
+	mov	%ebp,%esi
+	and	%r12d,%ebx
+	xor	48(%rsp),%eax
+	or	%r12d,%ecx
+	rol	$5,%esi
+	xor	4(%rsp),%eax
+	and	%edx,%ecx
+	add	%esi,%edi
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%r11d
+	mov	%eax,16(%rsp)
+	add	%ebx,%edi
+	lea	-0x70e44324(%eax,%edx),%esi	/* round 52 */
+	mov	20(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%ebp,%ecx
+	xor	28(%rsp),%eax
+	mov	%edi,%edx
+	and	%r11d,%ebx
+	xor	52(%rsp),%eax
+	or	%r11d,%ecx
+	rol	$5,%edx
+	xor	8(%rsp),%eax
+	and	%r12d,%ecx
+	add	%edx,%esi
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%ebp
+	mov	%eax,20(%rsp)
+	add	%ebx,%esi
+	lea	-0x70e44324(%eax,%r12d),%edx	/* round 53 */
+	mov	24(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edi,%ecx
+	xor	32(%rsp),%eax
+	mov	%esi,%r12d
+	and	%ebp,%ebx
+	xor	56(%rsp),%eax
+	or	%ebp,%ecx
+	rol	$5,%r12d
+	xor	12(%rsp),%eax
+	and	%r11d,%ecx
+	add	%r12d,%edx
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edi
+	mov	%eax,24(%rsp)
+	add	%ebx,%edx
+	lea	-0x70e44324(%eax,%r11d),%r12d	/* round 54 */
+	mov	28(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%esi,%ecx
+	xor	36(%rsp),%eax
+	mov	%edx,%r11d
+	and	%edi,%ebx
+	xor	60(%rsp),%eax
+	or	%edi,%ecx
+	rol	$5,%r11d
+	xor	16(%rsp),%eax
+	and	%ebp,%ecx
+	add	%r11d,%r12d
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%esi
+	mov	%eax,28(%rsp)
+	add	%ebx,%r12d
+	lea	-0x70e44324(%eax,%ebp),%r11d	/* round 55 */
+	mov	32(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%edx,%ecx
+	xor	40(%rsp),%eax
+	mov	%r12d,%ebp
+	and	%esi,%ebx
+	xor	0(%rsp),%eax
+	or	%esi,%ecx
+	rol	$5,%ebp
+	xor	20(%rsp),%eax
+	and	%edi,%ecx
+	add	%ebp,%r11d
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edx
+	mov	%eax,32(%rsp)
+	add	%ebx,%r11d
+	lea	-0x70e44324(%eax,%edi),%ebp	/* round 56 */
+	mov	36(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%r12d,%ecx
+	xor	44(%rsp),%eax
+	mov	%r11d,%edi
+	and	%edx,%ebx
+	xor	4(%rsp),%eax
+	or	%edx,%ecx
+	rol	$5,%edi
+	xor	24(%rsp),%eax
+	and	%esi,%ecx
+	add	%edi,%ebp
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%r12d
+	mov	%eax,36(%rsp)
+	add	%ebx,%ebp
+	lea	-0x70e44324(%eax,%esi),%edi	/* round 57 */
+	mov	40(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%r11d,%ecx
+	xor	48(%rsp),%eax
+	mov	%ebp,%esi
+	and	%r12d,%ebx
+	xor	8(%rsp),%eax
+	or	%r12d,%ecx
+	rol	$5,%esi
+	xor	28(%rsp),%eax
+	and	%edx,%ecx
+	add	%esi,%edi
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%r11d
+	mov	%eax,40(%rsp)
+	add	%ebx,%edi
+	lea	-0x70e44324(%eax,%edx),%esi	/* round 58 */
+	mov	44(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%ebp,%ecx
+	xor	52(%rsp),%eax
+	mov	%edi,%edx
+	and	%r11d,%ebx
+	xor	12(%rsp),%eax
+	or	%r11d,%ecx
+	rol	$5,%edx
+	xor	32(%rsp),%eax
+	and	%r12d,%ecx
+	add	%edx,%esi
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%ebp
+	mov	%eax,44(%rsp)
+	add	%ebx,%esi
+	lea	-0x70e44324(%eax,%r12d),%edx	/* round 59: last round using this constant */
+	mov	48(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edi,%ecx
+	xor	56(%rsp),%eax
+	mov	%esi,%r12d
+	and	%ebp,%ebx
+	xor	16(%rsp),%eax
+	or	%ebp,%ecx
+	rol	$5,%r12d
+	xor	36(%rsp),%eax
+	and	%r11d,%ecx
+	add	%r12d,%edx
+	rol	$1,%eax
+	or	%ecx,%ebx
+	rol	$30,%edi
+	mov	%eax,48(%rsp)
+	add	%ebx,%edx
+	lea	-0x359d3e2a(%eax,%r11d),%r12d	/* round 60: constant for rounds 60-79; -0x359d3e2a is 0xca62c1d6 as an unsigned 32-bit value */
+	mov	52(%rsp),%eax	/* word update: slots j, j+2, j+8, j+13 (mod 16) of the stack buffer, rotated left by 1 */
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	60(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	20(%rsp),%eax
+	xor	%ebp,%ebx	/* ebx = edi ^ esi ^ ebp: these rounds mix with a plain three-way xor */
+	add	%r11d,%r12d
+	xor	40(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,52(%rsp)
+	lea	-0x359d3e2a(%eax,%ebp),%r11d	/* round 61 */
+	mov	56(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	0(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	24(%rsp),%eax
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	xor	44(%rsp),%eax
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,56(%rsp)
+	lea	-0x359d3e2a(%eax,%edi),%ebp	/* round 62 */
+	mov	60(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	4(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	28(%rsp),%eax
+	xor	%esi,%ebx
+	add	%edi,%ebp
+	xor	48(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,60(%rsp)
+	lea	-0x359d3e2a(%eax,%esi),%edi	/* round 63 */
+	mov	0(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	8(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	32(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	52(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,0(%rsp)
+	lea	-0x359d3e2a(%eax,%edx),%esi	/* round 64 */
+	mov	4(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	12(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%edx
+	xor	36(%rsp),%eax
+	xor	%r12d,%ebx
+	add	%edx,%esi
+	xor	56(%rsp),%eax
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax
+	mov	%eax,4(%rsp)
+	lea	-0x359d3e2a(%eax,%r12d),%edx	/* round 65 */
+	mov	8(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	16(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%r12d
+	xor	40(%rsp),%eax
+	xor	%r11d,%ebx
+	add	%r12d,%edx
+	xor	60(%rsp),%eax
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	mov	%eax,8(%rsp)
+	lea	-0x359d3e2a(%eax,%r11d),%r12d	/* round 66 */
+	mov	12(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	20(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	44(%rsp),%eax
+	xor	%ebp,%ebx
+	add	%r11d,%r12d
+	xor	0(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,12(%rsp)
+	lea	-0x359d3e2a(%eax,%ebp),%r11d	/* round 67 */
+	mov	16(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	24(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	48(%rsp),%eax
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	xor	4(%rsp),%eax
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,16(%rsp)
+	lea	-0x359d3e2a(%eax,%edi),%ebp	/* round 68 */
+	mov	20(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	28(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	52(%rsp),%eax
+	xor	%esi,%ebx
+	add	%edi,%ebp
+	xor	8(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,20(%rsp)
+	lea	-0x359d3e2a(%eax,%esi),%edi	/* round 69 */
+	mov	24(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	32(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	56(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	12(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,24(%rsp)
+	lea	-0x359d3e2a(%eax,%edx),%esi	/* round 70 */
+	mov	28(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	36(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%edx
+	xor	60(%rsp),%eax
+	xor	%r12d,%ebx
+	add	%edx,%esi
+	xor	16(%rsp),%eax
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax
+	mov	%eax,28(%rsp)
+	lea	-0x359d3e2a(%eax,%r12d),%edx	/* round 71 */
+	mov	32(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	40(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%r12d
+	xor	0(%rsp),%eax
+	xor	%r11d,%ebx
+	add	%r12d,%edx
+	xor	20(%rsp),%eax
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	mov	%eax,32(%rsp)
+	lea	-0x359d3e2a(%eax,%r11d),%r12d	/* round 72 */
+	mov	36(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	44(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	4(%rsp),%eax
+	xor	%ebp,%ebx
+	add	%r11d,%r12d
+	xor	24(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	mov	%eax,36(%rsp)
+	lea	-0x359d3e2a(%eax,%ebp),%r11d	/* round 73 */
+	mov	40(%rsp),%eax
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	48(%rsp),%eax
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	8(%rsp),%eax
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	xor	28(%rsp),%eax
+	rol	$30,%edx
+	add	%ebx,%r11d
+	rol	$1,%eax
+	mov	%eax,40(%rsp)
+	lea	-0x359d3e2a(%eax,%edi),%ebp	/* round 74 */
+	mov	44(%rsp),%eax
+	mov	%edx,%ebx
+	mov	%r11d,%edi
+	xor	52(%rsp),%eax
+	xor	%r12d,%ebx
+	rol	$5,%edi
+	xor	12(%rsp),%eax
+	xor	%esi,%ebx
+	add	%edi,%ebp
+	xor	32(%rsp),%eax
+	rol	$30,%r12d
+	add	%ebx,%ebp
+	rol	$1,%eax
+	mov	%eax,44(%rsp)
+	lea	-0x359d3e2a(%eax,%esi),%edi	/* round 75 */
+	mov	48(%rsp),%eax
+	mov	%r12d,%ebx
+	mov	%ebp,%esi
+	xor	56(%rsp),%eax
+	xor	%r11d,%ebx
+	rol	$5,%esi
+	xor	16(%rsp),%eax
+	xor	%edx,%ebx
+	add	%esi,%edi
+	xor	36(%rsp),%eax
+	rol	$30,%r11d
+	add	%ebx,%edi
+	rol	$1,%eax
+	mov	%eax,48(%rsp)	/* last word written back to the stack buffer */
+	lea	-0x359d3e2a(%eax,%edx),%esi	/* round 76 */
+	mov	52(%rsp),%eax
+	mov	%r11d,%ebx
+	mov	%edi,%edx
+	xor	60(%rsp),%eax
+	xor	%ebp,%ebx
+	rol	$5,%edx
+	xor	20(%rsp),%eax
+	xor	%r12d,%ebx
+	add	%edx,%esi
+	xor	40(%rsp),%eax
+	rol	$30,%ebp
+	add	%ebx,%esi
+	rol	$1,%eax	/* from here the derived word stays in eax only; no later round reads this slot again */
+	lea	-0x359d3e2a(%eax,%r12d),%edx	/* round 77 */
+	mov	56(%rsp),%eax
+	mov	%ebp,%ebx
+	mov	%esi,%r12d
+	xor	0(%rsp),%eax
+	xor	%edi,%ebx
+	rol	$5,%r12d
+	xor	24(%rsp),%eax
+	xor	%r11d,%ebx
+	add	%r12d,%edx
+	xor	44(%rsp),%eax
+	rol	$30,%edi
+	add	%ebx,%edx
+	rol	$1,%eax
+	lea	-0x359d3e2a(%eax,%r11d),%r12d	/* round 78 */
+	mov	60(%rsp),%eax
+	mov	%edi,%ebx
+	mov	%edx,%r11d
+	xor	4(%rsp),%eax
+	xor	%esi,%ebx
+	rol	$5,%r11d
+	xor	28(%rsp),%eax
+	xor	%ebp,%ebx
+	add	%r11d,%r12d
+	xor	48(%rsp),%eax
+	rol	$30,%esi
+	add	%ebx,%r12d
+	rol	$1,%eax
+	lea	-0x359d3e2a(%eax,%ebp),%r11d	/* round 79: final round, so no further word is derived */
+	mov	%esi,%ebx
+	mov	%r12d,%ebp
+	xor	%edx,%ebx
+	rol	$5,%ebp
+	xor	%edi,%ebx
+	add	%ebp,%r11d
+	rol	$30,%edx
+	add	%ebx,%r11d	/* round outputs are now in r11d, r12d, edx, esi, edi */
+	// Update and save state information in SHA-1 context
+	add	0(%r8),%r11d	/* add the five words currently in ctx to the registers left by the last round */
+	add	4(%r8),%r12d
+	add	8(%r8),%edx
+	add	12(%r8),%esi
+	add	16(%r8),%edi
+	mov	%r11d,0(%r8)	/* write the five sums back to ctx */
+	mov	%r12d,4(%r8)
+	mov	%edx,8(%r8)
+	mov	%esi,12(%r8)
+	mov	%edi,16(%r8)
+	/* The xchg sequence below leaves edx, esi, edi, ebp, r11d equal to the words just stored at 0, 4, 8, 12, 16(%r8), i.e. the layout the loop body expects on entry. */
+	xchg	%r11d,%edx	# mov	%r11d,%edx
+	xchg	%r12d,%esi	# mov	%r12d,%esi
+	xchg	%r11d,%edi	# mov	%edx,%edi
+	xchg	%r12d,%ebp	# mov	%esi,%ebp
+			# mov	%edi,%r11d
+	lea	64(%r9),%r9	/* advance the input pointer by one 64-byte block (lea leaves flags alone) */
+	sub	$1,%r10	/* one fewer block remaining; sets ZF when the count reaches zero */
+	jnz	.Lloop	/* NOTE(review): the count is only tested here, after a block has been processed; an initial count of 0 would wrap and keep looping - confirm callers never pass 0 */
+	mov	64(%rsp),%rsp	/* reload the stack pointer that was stored at 64(%rsp) on entry */
+	pop	%r12	/* restore the saved registers in reverse push order */
+	pop	%rbp
+	pop	%rbx
+	ret
+SET_SIZE(sha1_block_data_order)
+
+.data	/* the identification string below is emitted into the data section */
+.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"	/* unlabeled, so nothing in this file refers to it by symbol */
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits	/* empty marker section with no flags: tells the linker this object does not need an executable stack */
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S
new file mode 100644
index 000000000000..766b75355f0b
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha256_impl.S
@@ -0,0 +1,2063 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 - >40%. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's very
+ * same instruction sequence used for both SHA-256 and SHA-512. In
+ * former case the instructions operate on 32-bit operands, while in
+ * latter - on 64-bit ones. All I had to do is to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).  Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{	/* intentionally empty: lint-only stub (built when lint/__lint is defined) */
+}	/* the real routine is the ENTRY_NP assembly in the #else branch */
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA256TransformBlocks)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	mov	%rsp,%rbp		# copy %rsp
+	shl	$4,%rdx		# num*16
+	sub	$16*4+4*8,%rsp
+	lea	(%rsi,%rdx,4),%rdx	# inp+num*16*4
+	and	$-64,%rsp		# align stack frame
+	add	$8,%rdi		# Skip OpenSolaris field, "algotype"
+	mov	%rdi,16*4+0*8(%rsp)		# save ctx, 1st arg
+	mov	%rsi,16*4+1*8(%rsp)		# save inp, 2nd arg
+	mov	%rdx,16*4+2*8(%rsp)		# save end pointer, "3rd" arg
+	mov	%rbp,16*4+3*8(%rsp)		# save copy of %rsp
+
+	#.picmeup %rbp
+	# The .picmeup pseudo-directive, from perlasm/x86_64-xlate.pl, puts
+	# the address of the "next" instruction into the target register
+	# (%rbp).  This generates these 2 instructions:
+	lea	.Llea(%rip),%rbp
+	#nop	# .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+	lea	K256-.(%rbp),%rbp
+
+	mov	4*0(%rdi),%eax
+	mov	4*1(%rdi),%ebx
+	mov	4*2(%rdi),%ecx
+	mov	4*3(%rdi),%edx
+	mov	4*4(%rdi),%r8d
+	mov	4*5(%rdi),%r9d
+	mov	4*6(%rdi),%r10d
+	mov	4*7(%rdi),%r11d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	xor	%rdi,%rdi
+	mov	4*0(%rsi),%r12d
+	bswap	%r12d
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+	mov	%r9d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r10d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r8d,%r15d			# (f^g)&e
+	mov	%r12d,0(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r10d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11d,%r12d			# T1+=h
+
+	mov	%eax,%r11d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+
+	ror	$2,%r11d
+	ror	$13,%r13d
+	mov	%eax,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r11d
+	ror	$9,%r13d
+	or	%ecx,%r14d			# a|c
+
+	xor	%r13d,%r11d			# h=Sigma0(a)
+	and	%ecx,%r15d			# a&c
+	add	%r12d,%edx			# d+=T1
+
+	and	%ebx,%r14d			# (a|c)&b
+	add	%r12d,%r11d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r11d			# h+=Maj(a,b,c)
+	mov	4*1(%rsi),%r12d
+	bswap	%r12d
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+	mov	%r8d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r9d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%edx,%r15d			# (f^g)&e
+	mov	%r12d,4(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r9d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10d,%r12d			# T1+=h
+
+	mov	%r11d,%r10d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+
+	ror	$2,%r10d
+	ror	$13,%r13d
+	mov	%r11d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r10d
+	ror	$9,%r13d
+	or	%ebx,%r14d			# a|c
+
+	xor	%r13d,%r10d			# h=Sigma0(a)
+	and	%ebx,%r15d			# a&c
+	add	%r12d,%ecx			# d+=T1
+
+	and	%eax,%r14d			# (a|c)&b
+	add	%r12d,%r10d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r10d			# h+=Maj(a,b,c)
+	mov	4*2(%rsi),%r12d
+	bswap	%r12d
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+	mov	%edx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r8d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ecx,%r15d			# (f^g)&e
+	mov	%r12d,8(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r8d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9d,%r12d			# T1+=h
+
+	mov	%r10d,%r9d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+
+	ror	$2,%r9d
+	ror	$13,%r13d
+	mov	%r10d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r9d
+	ror	$9,%r13d
+	or	%eax,%r14d			# a|c
+
+	xor	%r13d,%r9d			# h=Sigma0(a)
+	and	%eax,%r15d			# a&c
+	add	%r12d,%ebx			# d+=T1
+
+	and	%r11d,%r14d			# (a|c)&b
+	add	%r12d,%r9d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r9d			# h+=Maj(a,b,c)
+	mov	4*3(%rsi),%r12d
+	bswap	%r12d
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+	mov	%ecx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%edx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ebx,%r15d			# (f^g)&e
+	mov	%r12d,12(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%edx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8d,%r12d			# T1+=h
+
+	mov	%r9d,%r8d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+
+	ror	$2,%r8d
+	ror	$13,%r13d
+	mov	%r9d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r8d
+	ror	$9,%r13d
+	or	%r11d,%r14d			# a|c
+
+	xor	%r13d,%r8d			# h=Sigma0(a)
+	and	%r11d,%r15d			# a&c
+	add	%r12d,%eax			# d+=T1
+
+	and	%r10d,%r14d			# (a|c)&b
+	add	%r12d,%r8d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r8d			# h+=Maj(a,b,c)
+	mov	4*4(%rsi),%r12d
+	bswap	%r12d
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+	mov	%ebx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ecx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%eax,%r15d			# (f^g)&e
+	mov	%r12d,16(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ecx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%edx,%r12d			# T1+=h
+
+	mov	%r8d,%edx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+
+	ror	$2,%edx
+	ror	$13,%r13d
+	mov	%r8d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%edx
+	ror	$9,%r13d
+	or	%r10d,%r14d			# a|c
+
+	xor	%r13d,%edx			# h=Sigma0(a)
+	and	%r10d,%r15d			# a&c
+	add	%r12d,%r11d			# d+=T1
+
+	and	%r9d,%r14d			# (a|c)&b
+	add	%r12d,%edx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%edx			# h+=Maj(a,b,c)
+	mov	4*5(%rsi),%r12d
+	bswap	%r12d
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+	mov	%eax,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ebx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r11d,%r15d			# (f^g)&e
+	mov	%r12d,20(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ebx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ecx,%r12d			# T1+=h
+
+	mov	%edx,%ecx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+
+	ror	$2,%ecx
+	ror	$13,%r13d
+	mov	%edx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ecx
+	ror	$9,%r13d
+	or	%r9d,%r14d			# a|c
+
+	xor	%r13d,%ecx			# h=Sigma0(a)
+	and	%r9d,%r15d			# a&c
+	add	%r12d,%r10d			# d+=T1
+
+	and	%r8d,%r14d			# (a|c)&b
+	add	%r12d,%ecx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ecx			# h+=Maj(a,b,c)
+	mov	4*6(%rsi),%r12d
+	bswap	%r12d
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+	mov	%r11d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%eax,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r10d,%r15d			# (f^g)&e
+	mov	%r12d,24(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%eax,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ebx,%r12d			# T1+=h
+
+	mov	%ecx,%ebx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+
+	ror	$2,%ebx
+	ror	$13,%r13d
+	mov	%ecx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ebx
+	ror	$9,%r13d
+	or	%r8d,%r14d			# a|c
+
+	xor	%r13d,%ebx			# h=Sigma0(a)
+	and	%r8d,%r15d			# a&c
+	add	%r12d,%r9d			# d+=T1
+
+	and	%edx,%r14d			# (a|c)&b
+	add	%r12d,%ebx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ebx			# h+=Maj(a,b,c)
+	mov	4*7(%rsi),%r12d
+	bswap	%r12d
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+	mov	%r10d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r11d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r9d,%r15d			# (f^g)&e
+	mov	%r12d,28(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r11d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%eax,%r12d			# T1+=h
+
+	mov	%ebx,%eax
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+
+	ror	$2,%eax
+	ror	$13,%r13d
+	mov	%ebx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%eax
+	ror	$9,%r13d
+	or	%edx,%r14d			# a|c
+
+	xor	%r13d,%eax			# h=Sigma0(a)
+	and	%edx,%r15d			# a&c
+	add	%r12d,%r8d			# d+=T1
+
+	and	%ecx,%r14d			# (a|c)&b
+	add	%r12d,%eax			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%eax			# h+=Maj(a,b,c)
+	mov	4*8(%rsi),%r12d
+	bswap	%r12d
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+	mov	%r9d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r10d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r8d,%r15d			# (f^g)&e
+	mov	%r12d,32(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r10d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11d,%r12d			# T1+=h
+
+	mov	%eax,%r11d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+
+	ror	$2,%r11d
+	ror	$13,%r13d
+	mov	%eax,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r11d
+	ror	$9,%r13d
+	or	%ecx,%r14d			# a|c
+
+	xor	%r13d,%r11d			# h=Sigma0(a)
+	and	%ecx,%r15d			# a&c
+	add	%r12d,%edx			# d+=T1
+
+	and	%ebx,%r14d			# (a|c)&b
+	add	%r12d,%r11d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r11d			# h+=Maj(a,b,c)
+	mov	4*9(%rsi),%r12d
+	bswap	%r12d
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+	mov	%r8d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r9d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%edx,%r15d			# (f^g)&e
+	mov	%r12d,36(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r9d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10d,%r12d			# T1+=h
+
+	mov	%r11d,%r10d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+
+	ror	$2,%r10d
+	ror	$13,%r13d
+	mov	%r11d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r10d
+	ror	$9,%r13d
+	or	%ebx,%r14d			# a|c
+
+	xor	%r13d,%r10d			# h=Sigma0(a)
+	and	%ebx,%r15d			# a&c
+	add	%r12d,%ecx			# d+=T1
+
+	and	%eax,%r14d			# (a|c)&b
+	add	%r12d,%r10d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r10d			# h+=Maj(a,b,c)
+	mov	4*10(%rsi),%r12d
+	bswap	%r12d
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+	mov	%edx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r8d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ecx,%r15d			# (f^g)&e
+	mov	%r12d,40(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r8d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9d,%r12d			# T1+=h
+
+	mov	%r10d,%r9d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+
+	ror	$2,%r9d
+	ror	$13,%r13d
+	mov	%r10d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r9d
+	ror	$9,%r13d
+	or	%eax,%r14d			# a|c
+
+	xor	%r13d,%r9d			# h=Sigma0(a)
+	and	%eax,%r15d			# a&c
+	add	%r12d,%ebx			# d+=T1
+
+	and	%r11d,%r14d			# (a|c)&b
+	add	%r12d,%r9d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r9d			# h+=Maj(a,b,c)
+	mov	4*11(%rsi),%r12d
+	bswap	%r12d
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+	mov	%ecx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%edx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ebx,%r15d			# (f^g)&e
+	mov	%r12d,44(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%edx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8d,%r12d			# T1+=h
+
+	mov	%r9d,%r8d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+
+	ror	$2,%r8d
+	ror	$13,%r13d
+	mov	%r9d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r8d
+	ror	$9,%r13d
+	or	%r11d,%r14d			# a|c
+
+	xor	%r13d,%r8d			# h=Sigma0(a)
+	and	%r11d,%r15d			# a&c
+	add	%r12d,%eax			# d+=T1
+
+	and	%r10d,%r14d			# (a|c)&b
+	add	%r12d,%r8d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r8d			# h+=Maj(a,b,c)
+	mov	4*12(%rsi),%r12d
+	bswap	%r12d
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+	mov	%ebx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ecx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%eax,%r15d			# (f^g)&e
+	mov	%r12d,48(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ecx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%edx,%r12d			# T1+=h
+
+	mov	%r8d,%edx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+
+	ror	$2,%edx
+	ror	$13,%r13d
+	mov	%r8d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%edx
+	ror	$9,%r13d
+	or	%r10d,%r14d			# a|c
+
+	xor	%r13d,%edx			# h=Sigma0(a)
+	and	%r10d,%r15d			# a&c
+	add	%r12d,%r11d			# d+=T1
+
+	and	%r9d,%r14d			# (a|c)&b
+	add	%r12d,%edx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%edx			# h+=Maj(a,b,c)
+	mov	4*13(%rsi),%r12d
+	bswap	%r12d
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+	mov	%eax,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ebx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r11d,%r15d			# (f^g)&e
+	mov	%r12d,52(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ebx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ecx,%r12d			# T1+=h
+
+	mov	%edx,%ecx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+
+	ror	$2,%ecx
+	ror	$13,%r13d
+	mov	%edx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ecx
+	ror	$9,%r13d
+	or	%r9d,%r14d			# a|c
+
+	xor	%r13d,%ecx			# h=Sigma0(a)
+	and	%r9d,%r15d			# a&c
+	add	%r12d,%r10d			# d+=T1
+
+	and	%r8d,%r14d			# (a|c)&b
+	add	%r12d,%ecx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ecx			# h+=Maj(a,b,c)
+	mov	4*14(%rsi),%r12d
+	bswap	%r12d
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+	mov	%r11d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%eax,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r10d,%r15d			# (f^g)&e
+	mov	%r12d,56(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%eax,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ebx,%r12d			# T1+=h
+
+	mov	%ecx,%ebx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+
+	ror	$2,%ebx
+	ror	$13,%r13d
+	mov	%ecx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ebx
+	ror	$9,%r13d
+	or	%r8d,%r14d			# a|c
+
+	xor	%r13d,%ebx			# h=Sigma0(a)
+	and	%r8d,%r15d			# a&c
+	add	%r12d,%r9d			# d+=T1
+
+	and	%edx,%r14d			# (a|c)&b
+	add	%r12d,%ebx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ebx			# h+=Maj(a,b,c)
+	mov	4*15(%rsi),%r12d
+	bswap	%r12d
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+	mov	%r10d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r11d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r9d,%r15d			# (f^g)&e
+	mov	%r12d,60(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r11d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%eax,%r12d			# T1+=h
+
+	mov	%ebx,%eax
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+
+	ror	$2,%eax
+	ror	$13,%r13d
+	mov	%ebx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%eax
+	ror	$9,%r13d
+	or	%edx,%r14d			# a|c
+
+	xor	%r13d,%eax			# h=Sigma0(a)
+	and	%edx,%r15d			# a&c
+	add	%r12d,%r8d			# d+=T1
+
+	and	%ecx,%r14d			# (a|c)&b
+	add	%r12d,%eax			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%eax			# h+=Maj(a,b,c)
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	mov	4(%rsp),%r13d
+	mov	56(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	36(%rsp),%r12d
+
+	add	0(%rsp),%r12d
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+	mov	%r9d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r10d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r8d,%r15d			# (f^g)&e
+	mov	%r12d,0(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r10d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11d,%r12d			# T1+=h
+
+	mov	%eax,%r11d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+
+	ror	$2,%r11d
+	ror	$13,%r13d
+	mov	%eax,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r11d
+	ror	$9,%r13d
+	or	%ecx,%r14d			# a|c
+
+	xor	%r13d,%r11d			# h=Sigma0(a)
+	and	%ecx,%r15d			# a&c
+	add	%r12d,%edx			# d+=T1
+
+	and	%ebx,%r14d			# (a|c)&b
+	add	%r12d,%r11d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r11d			# h+=Maj(a,b,c)
+	mov	8(%rsp),%r13d
+	mov	60(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	40(%rsp),%r12d
+
+	add	4(%rsp),%r12d
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+	mov	%r8d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r9d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%edx,%r15d			# (f^g)&e
+	mov	%r12d,4(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r9d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10d,%r12d			# T1+=h
+
+	mov	%r11d,%r10d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+
+	ror	$2,%r10d
+	ror	$13,%r13d
+	mov	%r11d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r10d
+	ror	$9,%r13d
+	or	%ebx,%r14d			# a|c
+
+	xor	%r13d,%r10d			# h=Sigma0(a)
+	and	%ebx,%r15d			# a&c
+	add	%r12d,%ecx			# d+=T1
+
+	and	%eax,%r14d			# (a|c)&b
+	add	%r12d,%r10d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r10d			# h+=Maj(a,b,c)
+	mov	12(%rsp),%r13d
+	mov	0(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	44(%rsp),%r12d
+
+	add	8(%rsp),%r12d
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+	mov	%edx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r8d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ecx,%r15d			# (f^g)&e
+	mov	%r12d,8(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r8d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9d,%r12d			# T1+=h
+
+	mov	%r10d,%r9d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+
+	ror	$2,%r9d
+	ror	$13,%r13d
+	mov	%r10d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r9d
+	ror	$9,%r13d
+	or	%eax,%r14d			# a|c
+
+	xor	%r13d,%r9d			# h=Sigma0(a)
+	and	%eax,%r15d			# a&c
+	add	%r12d,%ebx			# d+=T1
+
+	and	%r11d,%r14d			# (a|c)&b
+	add	%r12d,%r9d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r9d			# h+=Maj(a,b,c)
+	mov	16(%rsp),%r13d
+	mov	4(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	48(%rsp),%r12d
+
+	add	12(%rsp),%r12d
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+	mov	%ecx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%edx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ebx,%r15d			# (f^g)&e
+	mov	%r12d,12(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%edx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8d,%r12d			# T1+=h
+
+	mov	%r9d,%r8d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+
+	ror	$2,%r8d
+	ror	$13,%r13d
+	mov	%r9d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r8d
+	ror	$9,%r13d
+	or	%r11d,%r14d			# a|c
+
+	xor	%r13d,%r8d			# h=Sigma0(a)
+	and	%r11d,%r15d			# a&c
+	add	%r12d,%eax			# d+=T1
+
+	and	%r10d,%r14d			# (a|c)&b
+	add	%r12d,%r8d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r8d			# h+=Maj(a,b,c)
+	mov	20(%rsp),%r13d
+	mov	8(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	52(%rsp),%r12d
+
+	add	16(%rsp),%r12d
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+	mov	%ebx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ecx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%eax,%r15d			# (f^g)&e
+	mov	%r12d,16(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ecx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%edx,%r12d			# T1+=h
+
+	mov	%r8d,%edx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+
+	ror	$2,%edx
+	ror	$13,%r13d
+	mov	%r8d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%edx
+	ror	$9,%r13d
+	or	%r10d,%r14d			# a|c
+
+	xor	%r13d,%edx			# h=Sigma0(a)
+	and	%r10d,%r15d			# a&c
+	add	%r12d,%r11d			# d+=T1
+
+	and	%r9d,%r14d			# (a|c)&b
+	add	%r12d,%edx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%edx			# h+=Maj(a,b,c)
+	mov	24(%rsp),%r13d
+	mov	12(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	56(%rsp),%r12d
+
+	add	20(%rsp),%r12d
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+	mov	%eax,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ebx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r11d,%r15d			# (f^g)&e
+	mov	%r12d,20(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ebx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ecx,%r12d			# T1+=h
+
+	mov	%edx,%ecx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+
+	ror	$2,%ecx
+	ror	$13,%r13d
+	mov	%edx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ecx
+	ror	$9,%r13d
+	or	%r9d,%r14d			# a|c
+
+	xor	%r13d,%ecx			# h=Sigma0(a)
+	and	%r9d,%r15d			# a&c
+	add	%r12d,%r10d			# d+=T1
+
+	and	%r8d,%r14d			# (a|c)&b
+	add	%r12d,%ecx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ecx			# h+=Maj(a,b,c)
+	mov	28(%rsp),%r13d
+	mov	16(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	60(%rsp),%r12d
+
+	add	24(%rsp),%r12d
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+	mov	%r11d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%eax,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r10d,%r15d			# (f^g)&e
+	mov	%r12d,24(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%eax,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ebx,%r12d			# T1+=h
+
+	mov	%ecx,%ebx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+
+	ror	$2,%ebx
+	ror	$13,%r13d
+	mov	%ecx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ebx
+	ror	$9,%r13d
+	or	%r8d,%r14d			# a|c
+
+	xor	%r13d,%ebx			# h=Sigma0(a)
+	and	%r8d,%r15d			# a&c
+	add	%r12d,%r9d			# d+=T1
+
+	and	%edx,%r14d			# (a|c)&b
+	add	%r12d,%ebx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ebx			# h+=Maj(a,b,c)
+	mov	32(%rsp),%r13d
+	mov	20(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	0(%rsp),%r12d
+
+	add	28(%rsp),%r12d
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+	mov	%r10d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r11d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r9d,%r15d			# (f^g)&e
+	mov	%r12d,28(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r11d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%eax,%r12d			# T1+=h
+
+	mov	%ebx,%eax
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+
+	ror	$2,%eax
+	ror	$13,%r13d
+	mov	%ebx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%eax
+	ror	$9,%r13d
+	or	%edx,%r14d			# a|c
+
+	xor	%r13d,%eax			# h=Sigma0(a)
+	and	%edx,%r15d			# a&c
+	add	%r12d,%r8d			# d+=T1
+
+	and	%ecx,%r14d			# (a|c)&b
+	add	%r12d,%eax			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%eax			# h+=Maj(a,b,c)
+	mov	36(%rsp),%r13d
+	mov	24(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	4(%rsp),%r12d
+
+	add	32(%rsp),%r12d
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+	mov	%r9d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r10d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r8d,%r15d			# (f^g)&e
+	mov	%r12d,32(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r10d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11d,%r12d			# T1+=h
+
+	mov	%eax,%r11d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+
+	ror	$2,%r11d
+	ror	$13,%r13d
+	mov	%eax,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r11d
+	ror	$9,%r13d
+	or	%ecx,%r14d			# a|c
+
+	xor	%r13d,%r11d			# h=Sigma0(a)
+	and	%ecx,%r15d			# a&c
+	add	%r12d,%edx			# d+=T1
+
+	and	%ebx,%r14d			# (a|c)&b
+	add	%r12d,%r11d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r11d			# h+=Maj(a,b,c)
+	mov	40(%rsp),%r13d
+	mov	28(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	8(%rsp),%r12d
+
+	add	36(%rsp),%r12d
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+	mov	%r8d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r9d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%edx,%r15d			# (f^g)&e
+	mov	%r12d,36(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r9d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10d,%r12d			# T1+=h
+
+	mov	%r11d,%r10d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+
+	ror	$2,%r10d
+	ror	$13,%r13d
+	mov	%r11d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r10d
+	ror	$9,%r13d
+	or	%ebx,%r14d			# a|c
+
+	xor	%r13d,%r10d			# h=Sigma0(a)
+	and	%ebx,%r15d			# a&c
+	add	%r12d,%ecx			# d+=T1
+
+	and	%eax,%r14d			# (a|c)&b
+	add	%r12d,%r10d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r10d			# h+=Maj(a,b,c)
+	mov	44(%rsp),%r13d
+	mov	32(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	12(%rsp),%r12d
+
+	add	40(%rsp),%r12d
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+	mov	%edx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r8d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ecx,%r15d			# (f^g)&e
+	mov	%r12d,40(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r8d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9d,%r12d			# T1+=h
+
+	mov	%r10d,%r9d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+
+	ror	$2,%r9d
+	ror	$13,%r13d
+	mov	%r10d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r9d
+	ror	$9,%r13d
+	or	%eax,%r14d			# a|c
+
+	xor	%r13d,%r9d			# h=Sigma0(a)
+	and	%eax,%r15d			# a&c
+	add	%r12d,%ebx			# d+=T1
+
+	and	%r11d,%r14d			# (a|c)&b
+	add	%r12d,%r9d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r9d			# h+=Maj(a,b,c)
+	mov	48(%rsp),%r13d
+	mov	36(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	16(%rsp),%r12d
+
+	add	44(%rsp),%r12d
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+	mov	%ecx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%edx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%ebx,%r15d			# (f^g)&e
+	mov	%r12d,44(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%edx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8d,%r12d			# T1+=h
+
+	mov	%r9d,%r8d
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+
+	ror	$2,%r8d
+	ror	$13,%r13d
+	mov	%r9d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%r8d
+	ror	$9,%r13d
+	or	%r11d,%r14d			# a|c
+
+	xor	%r13d,%r8d			# h=Sigma0(a)
+	and	%r11d,%r15d			# a&c
+	add	%r12d,%eax			# d+=T1
+
+	and	%r10d,%r14d			# (a|c)&b
+	add	%r12d,%r8d			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%r8d			# h+=Maj(a,b,c)
+	mov	52(%rsp),%r13d
+	mov	40(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	20(%rsp),%r12d
+
+	add	48(%rsp),%r12d
+	mov	%eax,%r13d
+	mov	%eax,%r14d
+	mov	%ebx,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ecx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%eax,%r15d			# (f^g)&e
+	mov	%r12d,48(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ecx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%edx,%r12d			# T1+=h
+
+	mov	%r8d,%edx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%r8d,%r13d
+	mov	%r8d,%r14d
+
+	ror	$2,%edx
+	ror	$13,%r13d
+	mov	%r8d,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%edx
+	ror	$9,%r13d
+	or	%r10d,%r14d			# a|c
+
+	xor	%r13d,%edx			# h=Sigma0(a)
+	and	%r10d,%r15d			# a&c
+	add	%r12d,%r11d			# d+=T1
+
+	and	%r9d,%r14d			# (a|c)&b
+	add	%r12d,%edx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%edx			# h+=Maj(a,b,c)
+	mov	56(%rsp),%r13d
+	mov	44(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	24(%rsp),%r12d
+
+	add	52(%rsp),%r12d
+	mov	%r11d,%r13d
+	mov	%r11d,%r14d
+	mov	%eax,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%ebx,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r11d,%r15d			# (f^g)&e
+	mov	%r12d,52(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%ebx,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ecx,%r12d			# T1+=h
+
+	mov	%edx,%ecx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%edx,%r13d
+	mov	%edx,%r14d
+
+	ror	$2,%ecx
+	ror	$13,%r13d
+	mov	%edx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ecx
+	ror	$9,%r13d
+	or	%r9d,%r14d			# a|c
+
+	xor	%r13d,%ecx			# h=Sigma0(a)
+	and	%r9d,%r15d			# a&c
+	add	%r12d,%r10d			# d+=T1
+
+	and	%r8d,%r14d			# (a|c)&b
+	add	%r12d,%ecx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ecx			# h+=Maj(a,b,c)
+	mov	60(%rsp),%r13d
+	mov	48(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	28(%rsp),%r12d
+
+	add	56(%rsp),%r12d
+	mov	%r10d,%r13d
+	mov	%r10d,%r14d
+	mov	%r11d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%eax,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r10d,%r15d			# (f^g)&e
+	mov	%r12d,56(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%eax,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%ebx,%r12d			# T1+=h
+
+	mov	%ecx,%ebx
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ecx,%r13d
+	mov	%ecx,%r14d
+
+	ror	$2,%ebx
+	ror	$13,%r13d
+	mov	%ecx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%ebx
+	ror	$9,%r13d
+	or	%r8d,%r14d			# a|c
+
+	xor	%r13d,%ebx			# h=Sigma0(a)
+	and	%r8d,%r15d			# a&c
+	add	%r12d,%r9d			# d+=T1
+
+	and	%edx,%r14d			# (a|c)&b
+	add	%r12d,%ebx			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%ebx			# h+=Maj(a,b,c)
+	mov	0(%rsp),%r13d
+	mov	52(%rsp),%r12d
+
+	mov	%r13d,%r15d
+
+	shr	$3,%r13d
+	ror	$7,%r15d
+
+	xor	%r15d,%r13d
+	ror	$11,%r15d
+
+	xor	%r15d,%r13d			# sigma0(X[(i+1)&0xf])
+	mov	%r12d,%r14d
+
+	shr	$10,%r12d
+	ror	$17,%r14d
+
+	xor	%r14d,%r12d
+	ror	$2,%r14d
+
+	xor	%r14d,%r12d			# sigma1(X[(i+14)&0xf])
+
+	add	%r13d,%r12d
+
+	add	32(%rsp),%r12d
+
+	add	60(%rsp),%r12d
+	mov	%r9d,%r13d
+	mov	%r9d,%r14d
+	mov	%r10d,%r15d
+
+	ror	$6,%r13d
+	ror	$11,%r14d
+	xor	%r11d,%r15d			# f^g
+
+	xor	%r14d,%r13d
+	ror	$14,%r14d
+	and	%r9d,%r15d			# (f^g)&e
+	mov	%r12d,60(%rsp)
+
+	xor	%r14d,%r13d			# Sigma1(e)
+	xor	%r11d,%r15d			# Ch(e,f,g)=((f^g)&e)^g
+	add	%eax,%r12d			# T1+=h
+
+	mov	%ebx,%eax
+	add	%r13d,%r12d			# T1+=Sigma1(e)
+
+	add	%r15d,%r12d			# T1+=Ch(e,f,g)
+	mov	%ebx,%r13d
+	mov	%ebx,%r14d
+
+	ror	$2,%eax
+	ror	$13,%r13d
+	mov	%ebx,%r15d
+	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
+
+	xor	%r13d,%eax
+	ror	$9,%r13d
+	or	%edx,%r14d			# a|c
+
+	xor	%r13d,%eax			# h=Sigma0(a)
+	and	%edx,%r15d			# a&c
+	add	%r12d,%r8d			# d+=T1
+
+	and	%ecx,%r14d			# (a|c)&b
+	add	%r12d,%eax			# h+=T1
+
+	or	%r15d,%r14d			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14d,%eax			# h+=Maj(a,b,c)
+	cmp	$64,%rdi
+	jb	.Lrounds_16_xx
+
+	mov	16*4+0*8(%rsp),%rdi
+	lea	16*4(%rsi),%rsi
+
+	add	4*0(%rdi),%eax
+	add	4*1(%rdi),%ebx
+	add	4*2(%rdi),%ecx
+	add	4*3(%rdi),%edx
+	add	4*4(%rdi),%r8d
+	add	4*5(%rdi),%r9d
+	add	4*6(%rdi),%r10d
+	add	4*7(%rdi),%r11d
+
+	cmp	16*4+2*8(%rsp),%rsi
+
+	mov	%eax,4*0(%rdi)
+	mov	%ebx,4*1(%rdi)
+	mov	%ecx,4*2(%rdi)
+	mov	%edx,4*3(%rdi)
+	mov	%r8d,4*4(%rdi)
+	mov	%r9d,4*5(%rdi)
+	mov	%r10d,4*6(%rdi)
+	mov	%r11d,4*7(%rdi)
+	jb	.Lloop
+
+	mov	16*4+3*8(%rsp),%rsp
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+SET_SIZE(SHA256TransformBlocks)
+
+.data				# NOTE(review): table is only read in the visible rounds; confirm whether a read-only section would do
+.align	64			# start the table on a 64-byte boundary
+.type	K256,@object
+K256:				# 64 32-bit SHA-256 round constants (FIPS 180-4), fetched as "T1+=K[round]" in the rounds
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5	# K256[ 0.. 3]
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5	# K256[ 4.. 7]
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3	# K256[ 8..11]
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174	# K256[12..15]
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc	# K256[16..19]
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da	# K256[20..23]
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7	# K256[24..27]
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967	# K256[28..31]
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13	# K256[32..35]
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85	# K256[36..39]
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3	# K256[40..43]
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070	# K256[44..47]
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5	# K256[48..51]
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3	# K256[52..55]
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208	# K256[56..59]
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	# K256[60..63]
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S
new file mode 100644
index 000000000000..6e37618761b2
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha512_impl.S
@@ -0,0 +1,2088 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 - >40%. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's the very
+ * same instruction sequence used for both SHA-256 and SHA-512. In the
+ * former case the instructions operate on 32-bit operands, while in the
+ * latter - on 64-bit ones. All I had to do is to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to the IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is a very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).  Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{	/* lint-only prototype stub; the real body is the assembly in the #else branch */
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA512TransformBlocks)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	mov	%rsp,%rbp		# copy %rsp
+	shl	$4,%rdx		# num*16
+	sub	$16*8+4*8,%rsp
+	lea	(%rsi,%rdx,8),%rdx	# inp+num*16*8
+	and	$-64,%rsp		# align stack frame
+	add	$8,%rdi		# Skip OpenSolaris field, "algotype"
+	mov	%rdi,16*8+0*8(%rsp)		# save ctx, 1st arg
+	mov	%rsi,16*8+1*8(%rsp)		# save inp, 2nd arg
+	mov	%rdx,16*8+2*8(%rsp)		# save end pointer, "3rd" arg
+	mov	%rbp,16*8+3*8(%rsp)		# save copy of %rsp
+
+	#.picmeup %rbp
+	# The .picmeup pseudo-directive, from perlasm/x86_64-xlate.pl, puts
+	# the address of the "next" instruction into the target register
+	# (%rbp).  This generates these 2 instructions:
+	lea	.Llea(%rip),%rbp
+	#nop	# .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+	lea	K512-.(%rbp),%rbp
+
+	mov	8*0(%rdi),%rax
+	mov	8*1(%rdi),%rbx
+	mov	8*2(%rdi),%rcx
+	mov	8*3(%rdi),%rdx
+	mov	8*4(%rdi),%r8
+	mov	8*5(%rdi),%r9
+	mov	8*6(%rdi),%r10
+	mov	8*7(%rdi),%r11
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	xor	%rdi,%rdi
+	mov	8*0(%rsi),%r12
+	bswap	%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,0(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	8*1(%rsi),%r12
+	bswap	%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,8(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	8*2(%rsi),%r12
+	bswap	%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,16(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	8*3(%rsi),%r12
+	bswap	%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,24(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	8*4(%rsi),%r12
+	bswap	%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,32(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	8*5(%rsi),%r12
+	bswap	%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,40(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	8*6(%rsi),%r12
+	bswap	%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,48(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	8*7(%rsi),%r12
+	bswap	%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,56(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	mov	8*8(%rsi),%r12
+	bswap	%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,64(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	8*9(%rsi),%r12
+	bswap	%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,72(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	8*10(%rsi),%r12
+	bswap	%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,80(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	8*11(%rsi),%r12
+	bswap	%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,88(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	8*12(%rsi),%r12
+	bswap	%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,96(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	8*13(%rsi),%r12
+	bswap	%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,104(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	8*14(%rsi),%r12
+	bswap	%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,112(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	8*15(%rsi),%r12
+	bswap	%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,120(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	mov	8(%rsp),%r13
+	mov	112(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	72(%rsp),%r12
+
+	add	0(%rsp),%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,0(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	16(%rsp),%r13
+	mov	120(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	80(%rsp),%r12
+
+	add	8(%rsp),%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,8(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	24(%rsp),%r13
+	mov	0(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	88(%rsp),%r12
+
+	add	16(%rsp),%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,16(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	32(%rsp),%r13
+	mov	8(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	96(%rsp),%r12
+
+	add	24(%rsp),%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,24(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	40(%rsp),%r13
+	mov	16(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	104(%rsp),%r12
+
+	add	32(%rsp),%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,32(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	48(%rsp),%r13
+	mov	24(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	112(%rsp),%r12
+
+	add	40(%rsp),%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,40(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	56(%rsp),%r13
+	mov	32(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	120(%rsp),%r12
+
+	add	48(%rsp),%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,48(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	64(%rsp),%r13
+	mov	40(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	0(%rsp),%r12
+
+	add	56(%rsp),%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,56(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	mov	72(%rsp),%r13
+	mov	48(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	8(%rsp),%r12
+
+	add	64(%rsp),%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,64(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	80(%rsp),%r13
+	mov	56(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	16(%rsp),%r12
+
+	add	72(%rsp),%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,72(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	88(%rsp),%r13
+	mov	64(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	24(%rsp),%r12
+
+	add	80(%rsp),%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,80(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	96(%rsp),%r13
+	mov	72(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	32(%rsp),%r12
+
+	add	88(%rsp),%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,88(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	104(%rsp),%r13
+	mov	80(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	40(%rsp),%r12
+
+	add	96(%rsp),%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,96(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	112(%rsp),%r13
+	mov	88(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	48(%rsp),%r12
+
+	add	104(%rsp),%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,104(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	120(%rsp),%r13
+	mov	96(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	56(%rsp),%r12
+
+	add	112(%rsp),%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,112(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	0(%rsp),%r13
+	mov	104(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	64(%rsp),%r12
+
+	add	120(%rsp),%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,120(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	cmp	$80,%rdi
+	jb	.Lrounds_16_xx
+
+	mov	16*8+0*8(%rsp),%rdi
+	lea	16*8(%rsi),%rsi
+
+	add	8*0(%rdi),%rax
+	add	8*1(%rdi),%rbx
+	add	8*2(%rdi),%rcx
+	add	8*3(%rdi),%rdx
+	add	8*4(%rdi),%r8
+	add	8*5(%rdi),%r9
+	add	8*6(%rdi),%r10
+	add	8*7(%rdi),%r11
+
+	cmp	16*8+2*8(%rsp),%rsi
+
+	mov	%rax,8*0(%rdi)
+	mov	%rbx,8*1(%rdi)
+	mov	%rcx,8*2(%rdi)
+	mov	%rdx,8*3(%rdi)
+	mov	%r8,8*4(%rdi)
+	mov	%r9,8*5(%rdi)
+	mov	%r10,8*6(%rdi)
+	mov	%r11,8*7(%rdi)
+	jb	.Lloop
+
+	mov	16*8+3*8(%rsp),%rsp
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+SET_SIZE(SHA512TransformBlocks)
+
+.section	.rodata	/* constants only: no visible store goes through the table operand, so keep them out of writable .data */
+.balign	64	/* 64-byte boundary, expressed as a byte count */
+.type	K512,@object
+K512:	/* 80 quadwords, two per row; presumably the table read as K[round] via (%rbp,%rdi,8) -- confirm the %rbp setup in the prologue */
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd	/* K[0], K[1] */
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817	/* K[78], K[79]; the round loop exits once the counter reaches 80 */
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__	/* the section note below is only emitted for ELF targets */
+.section .note.GNU-stack,"",%progbits	/* empty flag string (no "x"): by GNU toolchain convention this marks the object as not needing an executable stack */
+#endif	/* __ELF__ */