Diffstat (limited to 'sys/contrib/openzfs/module/icp')
-rw-r--r--  sys/contrib/openzfs/module/icp/algs/modes/gcm.c                                     |  371
-rw-r--r--  sys/contrib/openzfs/module/icp/algs/modes/modes.c                                   |    2
-rw-r--r--  sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c                              |    3
-rw-r--r--  sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c                              |    3
-rw-r--r--  sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl         |  253
-rw-r--r--  sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip |    1
-rw-r--r--  sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S               | 1323
-rw-r--r--  sys/contrib/openzfs/module/icp/include/modes/modes.h                                |   13
8 files changed, 1888 insertions, 81 deletions
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
index c2a982b5a376..3cfa5b8165ce 100644
--- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
@@ -46,6 +46,9 @@
#define IMPL_CYCLE (UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define IMPL_AVX (UINT32_MAX-2)
+#if CAN_USE_GCM_ASM >= 2
+#define IMPL_AVX2 (UINT32_MAX-3)
+#endif
#endif
#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
@@ -56,17 +59,16 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
* Whether to use the optimized openssl gcm and ghash implementations.
- * Set to true if module parameter icp_gcm_impl == "avx".
*/
-static boolean_t gcm_use_avx = B_FALSE;
-#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
+static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
+#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)
extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
static inline boolean_t gcm_avx_will_work(void);
-static inline void gcm_set_avx(boolean_t);
-static inline boolean_t gcm_toggle_avx(void);
-static inline size_t gcm_simd_get_htab_size(boolean_t);
+static inline boolean_t gcm_avx2_will_work(void);
+static inline void gcm_use_impl(gcm_impl impl);
+static inline gcm_impl gcm_toggle_impl(void);
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
crypto_data_t *, size_t);
@@ -89,7 +91,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
- if (ctx->gcm_use_avx == B_TRUE)
+ if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_mode_encrypt_contiguous_blocks_avx(
ctx, data, length, out, block_size));
#endif
@@ -208,7 +210,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
{
(void) copy_block;
#ifdef CAN_USE_GCM_ASM
- if (ctx->gcm_use_avx == B_TRUE)
+ if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif
@@ -374,7 +376,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
- if (ctx->gcm_use_avx == B_TRUE)
+ if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif
@@ -631,23 +633,23 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
- gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
+ gcm_ctx->impl = GCM_IMPL_USED;
} else {
/*
- * Handle the "cycle" implementation by creating avx and
- * non-avx contexts alternately.
+ * Handle the "cycle" implementation by creating different
+ * contexts, one per implementation.
*/
- gcm_ctx->gcm_use_avx = gcm_toggle_avx();
+ gcm_ctx->impl = gcm_toggle_impl();
- /* The avx impl. doesn't handle byte swapped key schedules. */
- if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
- gcm_ctx->gcm_use_avx = B_FALSE;
+ /* The AVX impl. doesn't handle byte swapped key schedules. */
+ if (needs_bswap == B_TRUE) {
+ gcm_ctx->impl = GCM_IMPL_GENERIC;
}
/*
- * If this is a GCM context, use the MOVBE and the BSWAP
+ * If this is an AVX context, use the MOVBE and the BSWAP
* variants alternately.
*/
- if (gcm_ctx->gcm_use_avx == B_TRUE &&
+ if (gcm_ctx->impl == GCM_IMPL_AVX &&
zfs_movbe_available() == B_TRUE) {
(void) atomic_toggle_boolean_nv(
(volatile boolean_t *)&gcm_avx_can_use_movbe);
@@ -658,12 +660,13 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
* still they could be created by the aes generic implementation.
* Make sure not to use them since we'll corrupt data if we do.
*/
- if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
- gcm_ctx->gcm_use_avx = B_FALSE;
+ if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
+ gcm_ctx->impl = GCM_IMPL_GENERIC;
cmn_err_once(CE_WARN,
"ICP: Can't use the aes generic or cycle implementations "
- "in combination with the gcm avx implementation!");
+ "in combination with the gcm avx or avx2-vaes "
+ "implementation!");
cmn_err_once(CE_WARN,
"ICP: Falling back to a compatible implementation, "
"aes-gcm performance will likely be degraded.");
@@ -672,36 +675,20 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
"restore performance.");
}
- /* Allocate Htab memory as needed. */
- if (gcm_ctx->gcm_use_avx == B_TRUE) {
- size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
-
- if (htab_len == 0) {
- return (CRYPTO_MECHANISM_PARAM_INVALID);
- }
- gcm_ctx->gcm_htab_len = htab_len;
- gcm_ctx->gcm_Htable =
- kmem_alloc(htab_len, KM_SLEEP);
-
- if (gcm_ctx->gcm_Htable == NULL) {
- return (CRYPTO_HOST_MEMORY);
- }
+ /*
+ * AVX implementations use Htable with sizes depending on
+ * implementation.
+ */
+ if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
+ rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
+ block_size);
}
- /* Avx and non avx context initialization differs from here on. */
- if (gcm_ctx->gcm_use_avx == B_FALSE) {
+ else
#endif /* ifdef CAN_USE_GCM_ASM */
- if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
- encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
- rv = CRYPTO_MECHANISM_PARAM_INVALID;
- }
-#ifdef CAN_USE_GCM_ASM
- } else {
- if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
- block_size) != CRYPTO_SUCCESS) {
- rv = CRYPTO_MECHANISM_PARAM_INVALID;
- }
+ if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
+ encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
-#endif /* ifdef CAN_USE_GCM_ASM */
return (rv);
}
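The hunk above replaces the context's boolean gcm_use_avx flag with an implementation tag and routes every non-generic implementation through gcm_init_avx(), which now owns the Htable allocation. A minimal standalone sketch of that tagged-dispatch shape; the sk_* names are hypothetical and not the ICP API:

#include <stdio.h>

typedef enum { SK_IMPL_GENERIC, SK_IMPL_AVX, SK_IMPL_AVX2 } sk_gcm_impl;

typedef struct {
	sk_gcm_impl impl;	/* replaces the old boolean gcm_use_avx flag */
} sk_gcm_ctx;

static int sk_init_generic(sk_gcm_ctx *c) { (void) c; return (0); }
static int sk_init_simd(sk_gcm_ctx *c) { (void) c; return (0); }

static int
sk_init_ctx(sk_gcm_ctx *c, sk_gcm_impl selected, int key_needs_bswap)
{
	c->impl = selected;
	/* Byte-swapped key schedules force the generic path, as above. */
	if (key_needs_bswap)
		c->impl = SK_IMPL_GENERIC;
	/* Every non-generic implementation shares one SIMD init path. */
	return (c->impl != SK_IMPL_GENERIC ?
	    sk_init_simd(c) : sk_init_generic(c));
}

int
main(void)
{
	sk_gcm_ctx c;
	printf("%d\n", sk_init_ctx(&c, SK_IMPL_AVX2, 0));
	return (0);
}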
@@ -767,6 +754,9 @@ gcm_impl_get_ops(void)
break;
#ifdef CAN_USE_GCM_ASM
case IMPL_AVX:
+#if CAN_USE_GCM_ASM >= 2
+ case IMPL_AVX2:
+#endif
/*
* Make sure that we return a valid implementation while
* switching to the avx implementation since there still
@@ -828,6 +818,13 @@ gcm_impl_init(void)
* Use the avx implementation if it's available and the implementation
* hasn't changed from its default value of fastest on module load.
*/
+#if CAN_USE_GCM_ASM >= 2
+ if (gcm_avx2_will_work()) {
+ if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
+ gcm_use_impl(GCM_IMPL_AVX2);
+ }
+ } else
+#endif
if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
if (zfs_movbe_available() == B_TRUE) {
@@ -835,7 +832,7 @@ gcm_impl_init(void)
}
#endif
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
- gcm_set_avx(B_TRUE);
+ gcm_use_impl(GCM_IMPL_AVX);
}
}
#endif
@@ -852,6 +849,7 @@ static const struct {
{ "fastest", IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
{ "avx", IMPL_AVX },
+ { "avx2-vaes", IMPL_AVX2 },
#endif
};
@@ -887,7 +885,13 @@ gcm_impl_set(const char *val)
/* Check mandatory options */
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
+#if CAN_USE_GCM_ASM >= 2
/* Ignore avx implementation if it won't work. */
+ if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
+ !gcm_avx2_will_work()) {
+ continue;
+ }
+#endif
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
continue;
}
@@ -915,11 +919,17 @@ gcm_impl_set(const char *val)
* Use the avx implementation if available and the requested one is
* avx or fastest.
*/
+#if CAN_USE_GCM_ASM >= 2
+ if (gcm_avx2_will_work() == B_TRUE &&
+ (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
+ gcm_use_impl(GCM_IMPL_AVX2);
+ } else
+#endif
if (gcm_avx_will_work() == B_TRUE &&
(impl == IMPL_AVX || impl == IMPL_FASTEST)) {
- gcm_set_avx(B_TRUE);
+ gcm_use_impl(GCM_IMPL_AVX);
} else {
- gcm_set_avx(B_FALSE);
+ gcm_use_impl(GCM_IMPL_GENERIC);
}
#endif
@@ -952,6 +962,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
/* Ignore avx implementation if it won't work. */
+#if CAN_USE_GCM_ASM >= 2
+ if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
+ !gcm_avx2_will_work()) {
+ continue;
+ }
+#endif
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
continue;
}
@@ -993,9 +1009,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
/* Clear the FPU registers since they hold sensitive internal state. */
#define clear_fpu_regs() clear_fpu_regs_avx()
-#define GHASH_AVX(ctx, in, len) \
- gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
- in, len)
#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
@@ -1010,20 +1023,77 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
static uint32_t gcm_avx_chunk_size =
((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
+/*
+ * GCM definitions: uint128_t is copied from include/crypto/modes.h
+ * Avoiding u128 because it is already defined in kernel sources.
+ */
+typedef struct {
+ uint64_t hi, lo;
+} uint128_t;
+
extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
const uint32_t pt[4], uint32_t ct[4]);
extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
+#if CAN_USE_GCM_ASM >= 2
+extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
+ const uint64_t H[2]);
+#endif
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
const uint8_t *in, size_t len);
+#if CAN_USE_GCM_ASM >= 2
+extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
+ const uint64_t *Htable, const uint8_t *in, size_t len);
+#endif
+static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
+{
+ switch (ctx->impl) {
+#if CAN_USE_GCM_ASM >= 2
+ case GCM_IMPL_AVX2:
+ gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash,
+ (const uint64_t *)ctx->gcm_Htable, in, len);
+ break;
+#endif
+
+ case GCM_IMPL_AVX:
+ gcm_ghash_avx(ctx->gcm_ghash,
+ (const uint64_t *)ctx->gcm_Htable, in, len);
+ break;
+
+ default:
+ VERIFY(B_FALSE);
+ }
+}
+typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
+ size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
const void *, uint64_t *, uint64_t *);
+#if CAN_USE_GCM_ASM >= 2
+extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
+ uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
+ const uint128_t Htable[16], uint8_t Xi[16]);
+#endif
+typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
+ size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
const void *, uint64_t *, uint64_t *);
+#if CAN_USE_GCM_ASM >= 2
+extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
+ uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
+ const uint128_t Htable[16], uint8_t Xi[16]);
+#endif
+
+static inline boolean_t
+gcm_avx2_will_work(void)
+{
+ return (kfpu_allowed() &&
+ zfs_avx2_available() && zfs_vaes_available() &&
+ zfs_vpclmulqdq_available());
+}
static inline boolean_t
gcm_avx_will_work(void)
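gcm_avx2_will_work() in the hunk above gates the new path on kfpu_allowed() plus the AVX2, VAES and VPCLMULQDQ feature flags. For reference, a userland sketch of the corresponding CPUID test (leaf 7, subleaf 0: AVX2 is EBX bit 5, VAES is ECX bit 9, VPCLMULQDQ is ECX bit 10). It deliberately omits the kernel-FPU and XSAVE/OS-support checks that kfpu_allowed() and the zfs_*_available() helpers cover, and none of the names below are ICP code:

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

static bool
sk_avx2_vaes_usable(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return (false);
	return ((ebx & (1u << 5)) != 0 &&	/* AVX2 */
	    (ecx & (1u << 9)) != 0 &&		/* VAES */
	    (ecx & (1u << 10)) != 0);		/* VPCLMULQDQ */
}

int
main(void)
{
	printf("avx2-vaes %s\n", sk_avx2_vaes_usable() ? "usable" : "unavailable");
	return (0);
}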
@@ -1035,33 +1105,67 @@ gcm_avx_will_work(void)
}
static inline void
-gcm_set_avx(boolean_t val)
+gcm_use_impl(gcm_impl impl)
{
- if (gcm_avx_will_work() == B_TRUE) {
- atomic_swap_32(&gcm_use_avx, val);
+ switch (impl) {
+#if CAN_USE_GCM_ASM >= 2
+ case GCM_IMPL_AVX2:
+ if (gcm_avx2_will_work() == B_TRUE) {
+ atomic_swap_32(&gcm_impl_used, impl);
+ return;
+ }
+
+ zfs_fallthrough;
+#endif
+
+ case GCM_IMPL_AVX:
+ if (gcm_avx_will_work() == B_TRUE) {
+ atomic_swap_32(&gcm_impl_used, impl);
+ return;
+ }
+
+ zfs_fallthrough;
+
+ default:
+ atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
}
}
static inline boolean_t
-gcm_toggle_avx(void)
+gcm_impl_will_work(gcm_impl impl)
{
- if (gcm_avx_will_work() == B_TRUE) {
- return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
- } else {
- return (B_FALSE);
+ switch (impl) {
+#if CAN_USE_GCM_ASM >= 2
+ case GCM_IMPL_AVX2:
+ return (gcm_avx2_will_work());
+#endif
+
+ case GCM_IMPL_AVX:
+ return (gcm_avx_will_work());
+
+ default:
+ return (B_TRUE);
}
}
-static inline size_t
-gcm_simd_get_htab_size(boolean_t simd_mode)
+static inline gcm_impl
+gcm_toggle_impl(void)
{
- switch (simd_mode) {
- case B_TRUE:
- return (2 * 6 * 2 * sizeof (uint64_t));
+ gcm_impl current_impl, new_impl;
+ do { /* handle races */
+ current_impl = atomic_load_32(&gcm_impl_used);
+ new_impl = current_impl;
+ while (B_TRUE) { /* handle incompatible implementations */
+ new_impl = (new_impl + 1) % GCM_IMPL_MAX;
+ if (gcm_impl_will_work(new_impl)) {
+ break;
+ }
+ }
- default:
- return (0);
- }
+ } while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
+ current_impl);
+
+ return (new_impl);
}
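gcm_toggle_impl() above rotates the published implementation lock-free: load the current value, walk forward modulo GCM_IMPL_MAX until gcm_impl_will_work() accepts one, then publish it with a compare-and-swap, retrying if another thread rotated concurrently. The same pattern in a self-contained sketch using C11 atomics rather than the ICP atomic_* wrappers (all sk_* names are hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { SK_GENERIC, SK_AVX, SK_AVX2, SK_MAX };

static _Atomic unsigned int sk_impl_used = SK_GENERIC;

/* Stand-in for gcm_impl_will_work(); pretend the CPU lacks avx2-vaes. */
static bool
sk_will_work(unsigned int impl)
{
	return (impl != SK_AVX2);
}

static unsigned int
sk_toggle_impl(void)
{
	unsigned int cur, next;

	do {	/* retry if another thread rotated concurrently */
		cur = atomic_load(&sk_impl_used);
		next = cur;
		do {	/* skip implementations that won't work here */
			next = (next + 1) % SK_MAX;
		} while (!sk_will_work(next));
	} while (!atomic_compare_exchange_weak(&sk_impl_used, &cur, next));

	return (next);
}

int
main(void)
{
	unsigned int a = sk_toggle_impl();
	unsigned int b = sk_toggle_impl();
	unsigned int c = sk_toggle_impl();
	printf("%u %u %u\n", a, b, c);	/* 1 0 1: avx, generic, avx (avx2 skipped) */
	return (0);
}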
@@ -1077,6 +1181,50 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}
+static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
+ size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
+ uint64_t *Xip)
+{
+ (void) Htable;
+ return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
+}
+
+#if CAN_USE_GCM_ASM >= 2
+// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
+// bits of a |size_t|.
+// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
+static const size_t kSizeTWithoutLower4Bits = (size_t)-16;
+
+/* The following CRYPTO methods are from boringssl/crypto/internal.h */
+static inline uint32_t CRYPTO_bswap4(uint32_t x) {
+ return (__builtin_bswap32(x));
+}
+
+static inline uint32_t CRYPTO_load_u32_be(const void *in) {
+ uint32_t v;
+ memcpy(&v, in, sizeof (v));
+ return (CRYPTO_bswap4(v));
+}
+
+static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
+ v = CRYPTO_bswap4(v);
+ memcpy(out, &v, sizeof (v));
+}
+
+static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
+ size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
+ uint64_t *Xip)
+{
+ uint8_t *ivec = (uint8_t *)iv;
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
+ (const uint128_t *)Htable, (uint8_t *)Xip);
+ CRYPTO_store_u32_be(&ivec[12],
+ CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return (len);
+}
+#endif /* if CAN_USE_GCM_ASM >= 2 */
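The avx2 wrapper above masks the length down to whole 16-byte blocks (kSizeTWithoutLower4Bits) and then advances the big-endian 32-bit block counter held in bytes 12..15 of the IV by len/16, since the BoringSSL routine does not write the counter back. A small worked example of that bookkeeping, mirroring the unconditional byte swap of this x86-only code; standalone sketch with arbitrary values, not ICP code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint8_t ivec[16] = { 0 };	/* stands in for the caller's counter block */
	size_t len = 1000;		/* arbitrary request length in bytes */
	uint32_t ctr;

	ivec[15] = 2;			/* big-endian counter currently 2 */
	len &= (size_t)-16;		/* keep whole blocks: 1000 -> 992 (62 blocks) */

	memcpy(&ctr, &ivec[12], sizeof (ctr));
	ctr = __builtin_bswap32(ctr) + (uint32_t)(len / 16);
	printf("processed %zu blocks, counter now %u\n", len / 16, ctr);
	ctr = __builtin_bswap32(ctr);
	memcpy(&ivec[12], &ctr, sizeof (ctr));	/* store back big-endian: 64 */
	return (0);
}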
+
/*
* Encrypt multiple blocks of data in GCM mode.
* This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
@@ -1091,8 +1239,15 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
size_t done = 0;
uint8_t *datap = (uint8_t *)data;
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ aesni_gcm_encrypt_impl *encrypt_blocks =
+#if CAN_USE_GCM_ASM >= 2
+ ctx->impl == GCM_IMPL_AVX2 ?
+ aesni_gcm_encrypt_avx2 :
+#endif
+ aesni_gcm_encrypt_avx;
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
uint64_t *ghash = ctx->gcm_ghash;
+ uint64_t *htable = ctx->gcm_Htable;
uint64_t *cb = ctx->gcm_cb;
uint8_t *ct_buf = NULL;
uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
@@ -1156,8 +1311,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
/* Do the bulk encryption in chunk_size blocks. */
for (; bleft >= chunk_size; bleft -= chunk_size) {
kfpu_begin();
- done = aesni_gcm_encrypt(
- datap, ct_buf, chunk_size, key, cb, ghash);
+ done = encrypt_blocks(
+ datap, ct_buf, chunk_size, key, cb, htable, ghash);
clear_fpu_regs();
kfpu_end();
@@ -1180,7 +1335,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
/* Bulk encrypt the remaining data. */
kfpu_begin();
if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
- done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
+ done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
+ ghash);
if (done == 0) {
rv = CRYPTO_FAILED;
goto out;
@@ -1293,6 +1449,29 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
return (CRYPTO_SUCCESS);
}
+static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
+ size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
+ uint64_t *Xip)
+{
+ (void) Htable;
+ return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
+}
+
+#if CAN_USE_GCM_ASM >= 2
+static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
+ size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
+ uint64_t *Xip)
+{
+ uint8_t *ivec = (uint8_t *)iv;
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
+ (const uint128_t *)Htable, (uint8_t *)Xip);
+ CRYPTO_store_u32_be(&ivec[12],
+ CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return (len);
+}
+#endif /* if CAN_USE_GCM_ASM >= 2 */
+
/*
* Finalize decryption: We just have accumulated crypto text, so now we
* decrypt it here inplace.
@@ -1306,10 +1485,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
B_FALSE);
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ aesni_gcm_decrypt_impl *decrypt_blocks =
+#if CAN_USE_GCM_ASM >= 2
+ ctx->impl == GCM_IMPL_AVX2 ?
+ aesni_gcm_decrypt_avx2 :
+#endif
+ aesni_gcm_decrypt_avx;
size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
uint8_t *datap = ctx->gcm_pt_buf;
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
uint32_t *cb = (uint32_t *)ctx->gcm_cb;
+ uint64_t *htable = ctx->gcm_Htable;
uint64_t *ghash = ctx->gcm_ghash;
uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
int rv = CRYPTO_SUCCESS;
@@ -1322,8 +1508,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
*/
for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
kfpu_begin();
- done = aesni_gcm_decrypt(datap, datap, chunk_size,
- (const void *)key, ctx->gcm_cb, ghash);
+ done = decrypt_blocks(datap, datap, chunk_size,
+ (const void *)key, ctx->gcm_cb, htable, ghash);
clear_fpu_regs();
kfpu_end();
if (done != chunk_size) {
@@ -1334,8 +1520,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
/* Decrypt remainder, which is less than chunk size, in one go. */
kfpu_begin();
if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
- done = aesni_gcm_decrypt(datap, datap, bleft,
- (const void *)key, ctx->gcm_cb, ghash);
+ done = decrypt_blocks(datap, datap, bleft,
+ (const void *)key, ctx->gcm_cb, htable, ghash);
if (done == 0) {
clear_fpu_regs();
kfpu_end();
@@ -1424,13 +1610,42 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
B_FALSE);
+ size_t htab_len = 0;
+#if CAN_USE_GCM_ASM >= 2
+ if (ctx->impl == GCM_IMPL_AVX2) {
+ /*
+ * BoringSSL's API specifies uint128_t[16] for htab; but only
+ * uint128_t[12] are used.
+ * See https://github.com/google/boringssl/blob/
+ * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
+ * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
+ */
+ htab_len = (2 * 8 * sizeof (uint128_t));
+ } else
+#endif /* CAN_USE_GCM_ASM >= 2 */
+ {
+ htab_len = (2 * 6 * sizeof (uint128_t));
+ }
+
+ ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
+ if (ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+
/* Init H (encrypt zero block) and create the initial counter block. */
memset(H, 0, sizeof (ctx->gcm_H));
kfpu_begin();
aes_encrypt_intel(keysched, aes_rounds,
(const uint32_t *)H, (uint32_t *)H);
- gcm_init_htab_avx(ctx->gcm_Htable, H);
+#if CAN_USE_GCM_ASM >= 2
+ if (ctx->impl == GCM_IMPL_AVX2) {
+ gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
+ } else
+#endif /* if CAN_USE_GCM_ASM >= 2 */
+ {
+ gcm_init_htab_avx(ctx->gcm_Htable, H);
+ }
if (iv_len == 12) {
memcpy(cb, iv, 12);
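A quick arithmetic check of the two Htable sizes chosen in gcm_init_avx() above: the avx2-vaes routines are laid out for a uint128_t[16] table (2 * 8 * 16 = 256 bytes, of which 12 entries are used), while the original avx path keeps its 192-byte table (2 * 6 * 2 * sizeof (uint64_t), equal to 2 * 6 * sizeof (uint128_t)). Standalone sketch, not ICP code:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t hi, lo; } u128_sketch;	/* mirrors the diff's uint128_t */

int
main(void)
{
	printf("avx2-vaes htab: %zu bytes\n", 2 * 8 * sizeof (u128_sketch));	/* 256 */
	printf("avx       htab: %zu bytes\n", 2 * 6 * sizeof (u128_sketch));	/* 192 */
	return (0);
}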
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
index 343591cd9691..ef3c1806e4b6 100644
--- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c
+++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
@@ -171,7 +171,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder));
explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
#if defined(CAN_USE_GCM_ASM)
- if (ctx->gcm_use_avx == B_TRUE) {
+ if (ctx->impl != GCM_IMPL_GENERIC) {
ASSERT3P(ctx->gcm_Htable, !=, NULL);
explicit_memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
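gcm_clear_ctx() above zeroes the per-context Htable with explicit_memset() before freeing it, because a plain memset() of memory that is freed immediately afterwards may be removed by dead-store elimination. One common portable idiom for such a non-elidable clear is sketched below; it is illustrative only, since the kernel supplies its own explicit_memset():

#include <string.h>

/* The volatile function pointer keeps the compiler from eliding the clear. */
static void *(* volatile sk_memset)(void *, int, size_t) = memset;

static void
sk_secure_bzero(void *p, size_t len)
{
	(void) sk_memset(p, 0, len);
}

int
main(void)
{
	char key[32];
	sk_secure_bzero(key, sizeof (key));	/* clear before the buffer goes away */
	return (0);
}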
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c
index 6d3bcca9f995..dcb0a391dda4 100644
--- a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c
@@ -38,11 +38,14 @@
kfpu_begin(); E(s, d, b); kfpu_end(); \
}
+#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \
+ defined(__PPC64__)
/* some implementation is always okay */
static inline boolean_t sha2_is_supported(void)
{
return (B_TRUE);
}
+#endif
#if defined(__x86_64)
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c
index 2efd9fcf4c99..a85a71a83df4 100644
--- a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c
@@ -38,11 +38,14 @@
kfpu_begin(); E(s, d, b); kfpu_end(); \
}
+#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \
+ defined(__PPC64__)
/* some implementation is always okay */
static inline boolean_t sha2_is_supported(void)
{
return (B_TRUE);
}
+#endif
#if defined(__x86_64)
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl
new file mode 100644
index 000000000000..04c03a37e0cb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl
@@ -0,0 +1,253 @@
+BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL
+licensing. Files that are completely new have a Google copyright and an ISC
+license. This license is reproduced at the bottom of this file.
+
+Contributors to BoringSSL are required to follow the CLA rules for Chromium:
+https://cla.developers.google.com/clas
+
+Files in third_party/ have their own licenses, as described therein. The MIT
+license, for third_party/fiat, which, unlike other third_party directories, is
+compiled into non-test libraries, is included below.
+
+The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the
+OpenSSL License and the original SSLeay license apply to the toolkit. See below
+for the actual license texts. Actually both licenses are BSD-style Open Source
+licenses. In case of any license issues related to OpenSSL please contact
+openssl-core@openssl.org.
+
+The following are Google-internal bug numbers where explicit permission from
+some authors is recorded for use of their work. (This is purely for our own
+record keeping.)
+ 27287199
+ 27287880
+ 27287883
+ 263291445
+
+
+ OpenSSL License
+ ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+
+ISC license used for completely new code in BoringSSL:
+
+/* Copyright 2015 The BoringSSL Authors
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+
+The code in third_party/fiat carries the MIT license:
+
+Copyright (c) 2015-2016 the fiat-crypto authors (see
+https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS).
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+Licenses for support code
+-------------------------
+
+Parts of the TLS test suite are under the Go license. This code is not included
+in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so
+distributing code linked against BoringSSL does not trigger this license:
+
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+BoringSSL uses the Chromium test infrastructure to run a continuous build,
+trybots etc. The scripts which manage this, and the script for generating build
+metadata, are under the Chromium license. Distributing code linked against
+BoringSSL does not trigger this license.
+
+Copyright 2015 The Chromium Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip
new file mode 100644
index 000000000000..f63a67a4d2ae
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES GCM and GHASH FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S
new file mode 100644
index 000000000000..3d1b045127e2
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S
@@ -0,0 +1,1323 @@
+// SPDX-License-Identifier: Apache-2.0
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_VAES) && defined(HAVE_VPCLMULQDQ)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/* Windows userland links with OpenSSL */
+#if !defined (_WIN32) || defined (_KERNEL)
+
+.section .rodata
+.balign 16
+
+
+.Lbswap_mask:
+.quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+.Lgfpoly:
+.quad 1, 0xc200000000000000
+
+
+.Lgfpoly_and_internal_carrybit:
+.quad 1, 0xc200000000000001
+
+.balign 32
+
+.Lctr_pattern:
+.quad 0, 0
+.quad 1, 0
+.Linc_2blocks:
+.quad 2, 0
+.quad 2, 0
+
+ENTRY_ALIGN(gcm_init_vpclmulqdq_avx2, 32)
+.cfi_startproc
+
+ENDBR
+
+
+
+
+
+ vmovdqu (%rsi),%xmm3
+ // KCF/ICP stores H in network byte order with the hi qword first
+ // so we need to swap all bytes, not the 2 qwords.
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vpshufb %xmm4,%xmm3,%xmm3
+
+
+
+
+
+ vpshufd $0xd3,%xmm3,%xmm0
+ vpsrad $31,%xmm0,%xmm0
+ vpaddq %xmm3,%xmm3,%xmm3
+ vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm6
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0
+ vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5
+ vpclmulqdq $0x01,%xmm0,%xmm6,%xmm1
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpxor %xmm0,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpxor %xmm1,%xmm5,%xmm5
+ vpxor %xmm0,%xmm5,%xmm5
+
+
+
+ vinserti128 $1,%xmm3,%ymm5,%ymm3
+ vinserti128 $1,%xmm5,%ymm5,%ymm5
+
+
+ vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+
+
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+
+
+
+ vpunpcklqdq %ymm3,%ymm4,%ymm0
+ vpunpckhqdq %ymm3,%ymm4,%ymm1
+ vpxor %ymm1,%ymm0,%ymm0
+ vmovdqu %ymm0,128+32(%rdi)
+
+
+ vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm3,%ymm3
+ vpxor %ymm0,%ymm3,%ymm3
+
+ vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,0(%rdi)
+
+
+
+ vpunpcklqdq %ymm3,%ymm4,%ymm0
+ vpunpckhqdq %ymm3,%ymm4,%ymm1
+ vpxor %ymm1,%ymm0,%ymm0
+ vmovdqu %ymm0,128(%rdi)
+
+ vzeroupper
+ RET
+
+.cfi_endproc
+SET_SIZE(gcm_init_vpclmulqdq_avx2)
+ENTRY_ALIGN(gcm_gmult_vpclmulqdq_avx2, 32)
+.cfi_startproc
+
+ENDBR
+
+
+
+ vmovdqu (%rdi),%xmm0
+ vmovdqu .Lbswap_mask(%rip),%xmm1
+ vmovdqu 128-16(%rsi),%xmm2
+ vmovdqu .Lgfpoly(%rip),%xmm3
+ vpshufb %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6
+ vpshufd $0x4e,%xmm4,%xmm4
+ vpxor %xmm4,%xmm5,%xmm5
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4
+ vpshufd $0x4e,%xmm5,%xmm5
+ vpxor %xmm5,%xmm0,%xmm0
+ vpxor %xmm4,%xmm0,%xmm0
+
+
+ vpshufb %xmm1,%xmm0,%xmm0
+ vmovdqu %xmm0,(%rdi)
+
+
+ RET
+
+.cfi_endproc
+SET_SIZE(gcm_gmult_vpclmulqdq_avx2)
+ENTRY_ALIGN(gcm_ghash_vpclmulqdq_avx2, 32)
+.cfi_startproc
+
+ENDBR
+
+
+
+
+
+
+ vmovdqu .Lbswap_mask(%rip),%xmm6
+ vmovdqu .Lgfpoly(%rip),%xmm7
+
+
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+
+
+ cmpq $32,%rcx
+ jb .Lghash_lastblock
+
+
+
+ vinserti128 $1,%xmm6,%ymm6,%ymm6
+ vinserti128 $1,%xmm7,%ymm7,%ymm7
+
+ cmpq $127,%rcx
+ jbe .Lghash_loop_1x
+
+
+ vmovdqu 128(%rsi),%ymm8
+ vmovdqu 128+32(%rsi),%ymm9
+.Lghash_loop_4x:
+
+ vmovdqu 0(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 0(%rsi),%ymm2
+ vpxor %ymm5,%ymm1,%ymm1
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4
+
+ vmovdqu 32(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 32(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vmovdqu 64(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 64(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+
+ vmovdqu 96(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 96(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vpxor %ymm3,%ymm4,%ymm4
+ vpxor %ymm5,%ymm4,%ymm4
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0
+ vpshufd $0x4e,%ymm3,%ymm3
+ vpxor %ymm3,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm0,%ymm5,%ymm5
+ vextracti128 $1,%ymm5,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+
+ subq $-128,%rdx
+ addq $-128,%rcx
+ cmpq $127,%rcx
+ ja .Lghash_loop_4x
+
+
+ cmpq $32,%rcx
+ jb .Lghash_loop_1x_done
+.Lghash_loop_1x:
+ vmovdqu (%rdx),%ymm0
+ vpshufb %ymm6,%ymm0,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vmovdqu 128-32(%rsi),%ymm0
+ vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
+ vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm2,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm1,%ymm5,%ymm5
+
+ vextracti128 $1,%ymm5,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ addq $32,%rdx
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae .Lghash_loop_1x
+.Lghash_loop_1x_done:
+
+
+.Lghash_lastblock:
+ testq %rcx,%rcx
+ jz .Lghash_done
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vmovdqu 128-16(%rsi),%xmm0
+ vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2
+ vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3
+ vpxor %xmm3,%xmm2,%xmm2
+ vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpxor %xmm1,%xmm2,%xmm2
+ vpxor %xmm3,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpxor %xmm2,%xmm5,%xmm5
+ vpxor %xmm1,%xmm5,%xmm5
+
+
+.Lghash_done:
+
+ vpshufb %xmm6,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+
+ vzeroupper
+ RET
+
+.cfi_endproc
+SET_SIZE(gcm_ghash_vpclmulqdq_avx2)
+ENTRY_ALIGN(aes_gcm_enc_update_vaes_avx2, 32)
+.cfi_startproc
+
+ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+6(%rip)
+#endif
+ vbroadcasti128 .Lbswap_mask(%rip),%ymm0
+
+
+
+ vmovdqu (%r12),%xmm1
+ vpshufb %xmm0,%xmm1,%xmm1
+ vbroadcasti128 (%r8),%ymm11
+ vpshufb %ymm0,%ymm11,%ymm11
+
+
+
+ movl 504(%rcx),%r10d // ICP has a larger offset for rounds.
+ leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds.
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti128 (%rcx),%ymm9
+ vbroadcasti128 (%r11),%ymm10
+
+
+ vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+ cmpq $127,%rdx
+ jbe .Lcrypt_loop_4x_done__func1
+
+ vmovdqu 128(%r9),%ymm7
+ vmovdqu 128+32(%r9),%ymm8
+
+
+
+ vmovdqu .Linc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_first_4_vecs__func1
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ jbe .Lghash_last_ciphertext_4x__func1
+.balign 16
+.Lcrypt_loop_4x__func1:
+
+
+
+
+ vmovdqu .Linc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ cmpl $24,%r10d
+ jl .Laes128__func1
+ je .Laes192__func1
+
+ vbroadcasti128 -208(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -192(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes192__func1:
+ vbroadcasti128 -176(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -160(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes128__func1:
+ prefetcht0 512(%rdi)
+ prefetcht0 512+64(%rdi)
+
+ vmovdqu 0(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vbroadcasti128 -144(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vbroadcasti128 -128(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 32(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -112(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 64(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+
+ vbroadcasti128 -96(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -80(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+
+ vbroadcasti128 -64(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -48(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -32(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -16(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+
+ subq $-128,%rsi
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ ja .Lcrypt_loop_4x__func1
+.Lghash_last_ciphertext_4x__func1:
+
+ vmovdqu 0(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vmovdqu 32(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vmovdqu 64(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+ subq $-128,%rsi
+.Lcrypt_loop_4x_done__func1:
+
+ testq %rdx,%rdx
+ jz .Ldone__func1
+
+
+
+
+
+ leaq 128(%r9),%r8
+ subq %rdx,%r8
+
+
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+ vpxor %xmm7,%xmm7,%xmm7
+
+ cmpq $64,%rdx
+ jb .Llessthan64bytes__func1
+
+
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_1__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_1__func1
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%ymm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %ymm3,%ymm13,%ymm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+
+
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpshufb %ymm0,%ymm13,%ymm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%ymm3
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7
+ vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ jz .Lreduce__func1
+
+ vpxor %xmm1,%xmm1,%xmm1
+
+
+.Llessthan64bytes__func1:
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_2__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_2__func1
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+
+
+ cmpq $32,%rdx
+ jb .Lxor_one_block__func1
+ je .Lxor_two_blocks__func1
+
+.Lxor_three_blocks__func1:
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%xmm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %xmm3,%xmm13,%xmm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %xmm13,32(%rsi)
+
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%xmm3
+ vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm7,%ymm7
+ jmp .Lghash_mul_one_vec_unreduced__func1
+
+.Lxor_two_blocks__func1:
+ vmovdqu (%rdi),%ymm2
+ vpxor %ymm2,%ymm12,%ymm12
+ vmovdqu %ymm12,(%rsi)
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ jmp .Lghash_mul_one_vec_unreduced__func1
+
+.Lxor_one_block__func1:
+ vmovdqu (%rdi),%xmm2
+ vpxor %xmm2,%xmm12,%xmm12
+ vmovdqu %xmm12,(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm1,%xmm12,%xmm12
+ vmovdqu (%r8),%xmm2
+
+.Lghash_mul_one_vec_unreduced__func1:
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+.Lreduce__func1:
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm7,%ymm7
+ vpxor %ymm3,%ymm7,%ymm7
+ vextracti128 $1,%ymm7,%xmm1
+ vpxor %xmm7,%xmm1,%xmm1
+
+.Ldone__func1:
+
+ vpshufb %xmm0,%xmm1,%xmm1
+ vmovdqu %xmm1,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ RET
+
+.cfi_endproc
+SET_SIZE(aes_gcm_enc_update_vaes_avx2)
+ENTRY_ALIGN(aes_gcm_dec_update_vaes_avx2, 32)
+.cfi_startproc
+
+ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+ vbroadcasti128 .Lbswap_mask(%rip),%ymm0
+
+
+
+ vmovdqu (%r12),%xmm1
+ vpshufb %xmm0,%xmm1,%xmm1
+ vbroadcasti128 (%r8),%ymm11
+ vpshufb %ymm0,%ymm11,%ymm11
+
+
+
+ movl 504(%rcx),%r10d // ICP has a larger offset for rounds.
+ leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds.
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti128 (%rcx),%ymm9
+ vbroadcasti128 (%r11),%ymm10
+
+
+ vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+ cmpq $127,%rdx
+ jbe .Lcrypt_loop_4x_done__func2
+
+ vmovdqu 128(%r9),%ymm7
+ vmovdqu 128+32(%r9),%ymm8
+.balign 16
+.Lcrypt_loop_4x__func2:
+
+
+
+
+ vmovdqu .Linc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ cmpl $24,%r10d
+ jl .Laes128__func2
+ je .Laes192__func2
+
+ vbroadcasti128 -208(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -192(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes192__func2:
+ vbroadcasti128 -176(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -160(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes128__func2:
+ prefetcht0 512(%rdi)
+ prefetcht0 512+64(%rdi)
+
+ vmovdqu 0(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vbroadcasti128 -144(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vbroadcasti128 -128(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 32(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -112(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 64(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+
+ vbroadcasti128 -96(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -80(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+
+ vbroadcasti128 -64(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -48(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -32(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -16(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+
+
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ ja .Lcrypt_loop_4x__func2
+.Lcrypt_loop_4x_done__func2:
+
+ testq %rdx,%rdx
+ jz .Ldone__func2
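+	// 1 to 127 bytes remain.  Point %r8 at the Htable entries lining up
+	// with the tail length so the tail GHASH multiplies against the
+	// matching powers of H, then handle at most one 64-byte chunk
+	// followed by a final chunk of less than 64 bytes.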
+
+
+
+
+
+ leaq 128(%r9),%r8
+ subq %rdx,%r8
+
+
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+ vpxor %xmm7,%xmm7,%xmm7
+
+ cmpq $64,%rdx
+ jb .Llessthan64bytes__func2
+
+
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_1__func2:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_1__func2
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%ymm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %ymm3,%ymm13,%ymm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+
+
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpshufb %ymm0,%ymm3,%ymm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%ymm3
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7
+ vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ jz .Lreduce__func2
+
+ vpxor %xmm1,%xmm1,%xmm1
+
+
+.Llessthan64bytes__func2:
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_2__func2:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_2__func2
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+
+
+ cmpq $32,%rdx
+ jb .Lxor_one_block__func2
+ je .Lxor_two_blocks__func2
+
+.Lxor_three_blocks__func2:
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%xmm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %xmm3,%xmm13,%xmm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %xmm13,32(%rsi)
+
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpshufb %xmm0,%xmm3,%xmm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%xmm3
+ vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm7,%ymm7
+ jmp .Lghash_mul_one_vec_unreduced__func2
+
+.Lxor_two_blocks__func2:
+ vmovdqu (%rdi),%ymm2
+ vpxor %ymm2,%ymm12,%ymm12
+ vmovdqu %ymm12,(%rsi)
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ jmp .Lghash_mul_one_vec_unreduced__func2
+
+.Lxor_one_block__func2:
+ vmovdqu (%rdi),%xmm2
+ vpxor %xmm2,%xmm12,%xmm12
+ vmovdqu %xmm12,(%rsi)
+ vpshufb %xmm0,%xmm2,%xmm12
+ vpxor %xmm1,%xmm12,%xmm12
+ vmovdqu (%r8),%xmm2
+
+.Lghash_mul_one_vec_unreduced__func2:
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+.Lreduce__func2:
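+	// Reduce the unreduced GHASH accumulator (low/middle/high parts in
+	// %ymm5/%ymm6/%ymm7) modulo the GCM polynomial and fold the two
+	// 128-bit lanes together into the final state in %xmm1.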
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm7,%ymm7
+ vpxor %ymm3,%ymm7,%ymm7
+ vextracti128 $1,%ymm7,%xmm1
+ vpxor %xmm7,%xmm1,%xmm1
+
+.Ldone__func2:
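+	// Byte-swap the updated GHASH state back to its in-memory byte
+	// order and store it through %r12.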
+
+ vpshufb %xmm0,%xmm1,%xmm1
+ vmovdqu %xmm1,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ RET
+
+.cfi_endproc
+SET_SIZE(aes_gcm_dec_update_vaes_avx2)
+
+#endif /* !_WIN32 || _KERNEL */
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h
index ca734cf4f045..de11d9eafafb 100644
--- a/sys/contrib/openzfs/module/icp/include/modes/modes.h
+++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h
@@ -42,7 +42,7 @@ extern "C" {
*/
#if defined(__x86_64__) && defined(HAVE_AVX) && \
defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
-#define CAN_USE_GCM_ASM
+#define CAN_USE_GCM_ASM (HAVE_VAES && HAVE_VPCLMULQDQ ? 2 : 1)
extern boolean_t gcm_avx_can_use_movbe;
#endif
@@ -129,6 +129,15 @@ typedef struct ccm_ctx {
#define ccm_copy_to ccm_common.cc_copy_to
#define ccm_flags ccm_common.cc_flags
+#ifdef CAN_USE_GCM_ASM
+typedef enum gcm_impl {
+ GCM_IMPL_GENERIC = 0,
+ GCM_IMPL_AVX,
+ GCM_IMPL_AVX2,
+ GCM_IMPL_MAX,
+} gcm_impl;
+#endif
+
/*
* gcm_tag_len: Length of authentication tag.
*
@@ -174,7 +183,7 @@ typedef struct gcm_ctx {
uint64_t gcm_len_a_len_c[2];
uint8_t *gcm_pt_buf;
#ifdef CAN_USE_GCM_ASM
- boolean_t gcm_use_avx;
+ enum gcm_impl impl;
#endif
} gcm_ctx_t;