Diffstat (limited to 'test')
-rw-r--r--  test/Analysis/BasicAA/args-rets-allocas-loads.ll | 7
-rw-r--r--  test/Analysis/BasicAA/call-attrs.ll | 6
-rw-r--r--  test/Analysis/BasicAA/cs-cs-arm.ll | 6
-rw-r--r--  test/Analysis/BasicAA/cs-cs.ll | 62
-rw-r--r--  test/Analysis/MemorySSA/volatile-clobber.ll | 13
-rw-r--r--  test/Analysis/ValueTracking/memory-dereferenceable.ll | 22
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll | 2
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir | 44
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/translate-gep.ll | 109
-rw-r--r--  test/CodeGen/AArch64/arm64-jumptable.ll | 16
-rw-r--r--  test/CodeGen/AArch64/arm64-memset-to-bzero.ll | 49
-rw-r--r--  test/CodeGen/AArch64/arm64-neon-2velem.ll | 685
-rw-r--r--  test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll | 9
-rw-r--r--  test/CodeGen/AArch64/chkstk.ll | 25
-rw-r--r--  test/CodeGen/AArch64/ldst-paired-aliasing.ll | 5
-rw-r--r--  test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 3
-rw-r--r--  test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll | 32
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir | 100
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 52
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 66
-rw-r--r--  test/CodeGen/ARM/avoid-cpsr-rmw.ll | 4
-rw-r--r--  test/CodeGen/ARM/su-addsub-overflow.ll | 135
-rw-r--r--  test/CodeGen/ARM/usat.ll | 214
-rw-r--r--  test/CodeGen/BPF/objdump_imm_hex.ll | 65
-rw-r--r--  test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll | 19
-rw-r--r--  test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll | 18
-rw-r--r--  test/CodeGen/Hexagon/autohvx/isel-select-const.ll | 32
-rw-r--r--  test/CodeGen/Hexagon/expand-vstorerw-undef.ll | 2
-rw-r--r--  test/CodeGen/Hexagon/v60-cur.ll | 5
-rw-r--r--  test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll | 14
-rw-r--r--  test/CodeGen/Hexagon/vect/vect-infloop.ll | 10
-rw-r--r--  test/CodeGen/Mips/llvm-ir/extractelement.ll | 2
-rw-r--r--  test/CodeGen/Mips/long-call-mcount.ll | 19
-rw-r--r--  test/CodeGen/Mips/sll-micromips-r6-encoding.mir | 46
-rw-r--r--  test/CodeGen/PowerPC/cmp_elimination.ll | 32
-rw-r--r--  test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll | 15
-rw-r--r--  test/CodeGen/PowerPC/variable_elem_vec_extracts.ll | 6
-rw-r--r--  test/CodeGen/Thumb2/t2sizereduction.mir | 83
-rw-r--r--  test/CodeGen/X86/avg-mask.ll | 24
-rw-r--r--  test/CodeGen/X86/avg.ll | 2949
-rw-r--r--  test/CodeGen/X86/avx512-calling-conv.ll | 88
-rw-r--r--  test/CodeGen/X86/avx512-ext.ll | 21
-rw-r--r--  test/CodeGen/X86/avx512-extract-subvector-load-store.ll | 136
-rw-r--r--  test/CodeGen/X86/avx512-insert-extract.ll | 163
-rw-r--r--  test/CodeGen/X86/avx512-insert-extract_i1.ll | 3
-rw-r--r--  test/CodeGen/X86/avx512-mask-op.ll | 22
-rwxr-xr-x  test/CodeGen/X86/avx512-schedule.ll | 12
-rw-r--r--  test/CodeGen/X86/avx512-shuffles/partial_permute.ll | 299
-rw-r--r--  test/CodeGen/X86/avx512-skx-insert-subvec.ll | 7
-rw-r--r--  test/CodeGen/X86/avx512-vec-cmp.ll | 5
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-masked-cmp.ll | 186
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-128.ll | 24
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-128.ll | 12
-rw-r--r--  test/CodeGen/X86/combine-and.ll | 10
-rw-r--r--  test/CodeGen/X86/combine-or.ll | 20
-rw-r--r--  test/CodeGen/X86/darwin-bzero.ll | 5
-rw-r--r--  test/CodeGen/X86/extractelement-index.ll | 12
-rw-r--r--  test/CodeGen/X86/fma-fneg-combine.ll | 166
-rw-r--r--  test/CodeGen/X86/fmsubadd-combine.ll | 96
-rw-r--r--  test/CodeGen/X86/fold-vector-sext-crash.ll | 6
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-smax.ll | 284
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-smin.ll | 284
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-umax.ll | 284
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-umin.ll | 170
-rw-r--r--  test/CodeGen/X86/known-bits-vector.ll | 14
-rw-r--r--  test/CodeGen/X86/machinesink-merge-debuginfo.ll | 60
-rw-r--r--  test/CodeGen/X86/machinesink-null-debuginfo.ll | 10
-rw-r--r--  test/CodeGen/X86/masked_gather_scatter.ll | 121
-rw-r--r--  test/CodeGen/X86/popcnt.ll | 2
-rw-r--r--  test/CodeGen/X86/prefetch.ll | 102
-rw-r--r--  test/CodeGen/X86/shuffle-strided-with-offset-128.ll | 81
-rw-r--r--  test/CodeGen/X86/shuffle-strided-with-offset-256.ll | 149
-rw-r--r--  test/CodeGen/X86/shuffle-strided-with-offset-512.ll | 48
-rw-r--r--  test/CodeGen/X86/shuffle-vs-trunc-128.ll | 55
-rw-r--r--  test/CodeGen/X86/shuffle-vs-trunc-256.ll | 184
-rw-r--r--  test/CodeGen/X86/shuffle-vs-trunc-512.ll | 5
-rw-r--r--  test/CodeGen/X86/var-permute-256.ll | 270
-rw-r--r--  test/CodeGen/X86/var-permute-512.ll | 208
-rw-r--r--  test/CodeGen/X86/vector-compare-results.ll | 56
-rw-r--r--  test/CodeGen/X86/vector-half-conversions.ll | 511
-rw-r--r--  test/CodeGen/X86/vector-rotate-128.ll | 76
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-128.ll | 154
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-128.ll | 154
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-128.ll | 154
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v16.ll | 113
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v8.ll | 397
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v16.ll | 849
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v32.ll | 946
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v4.ll | 237
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v8.ll | 264
-rw-r--r--  test/CodeGen/X86/vector-shuffle-512-v32.ll | 18
-rw-r--r--  test/CodeGen/X86/vector-shuffle-v1.ll | 185
-rw-r--r--  test/CodeGen/X86/vector-shuffle-variable-128.ll | 596
-rw-r--r--  test/CodeGen/X86/vector-trunc.ll | 62
-rw-r--r--  test/CodeGen/X86/vector-zext.ll | 30
-rw-r--r--  test/CodeGen/X86/vselect.ll | 6
-rw-r--r--  test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s | 173
-rw-r--r--  test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s | 4
-rw-r--r--  test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s | 5
-rw-r--r--  test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s | 94
-rw-r--r--  test/DebugInfo/X86/dwarfdump-str-offsets-macho.s | 34
-rw-r--r--  test/DebugInfo/X86/dwarfdump-str-offsets.s | 66
-rw-r--r--  test/Instrumentation/HWAddressSanitizer/basic.ll | 72
-rw-r--r--  test/Instrumentation/HWAddressSanitizer/with-calls.ll | 39
-rw-r--r--  test/MC/AArch64/arm64-system-encoding.s | 3
-rw-r--r--  test/MC/AArch64/basic-a64-diagnostics.s | 4
-rw-r--r--  test/MC/AArch64/dot-req.s | 10
-rw-r--r--  test/MC/AMDGPU/ds.s | 4
-rw-r--r--  test/MC/AMDGPU/expressions.s | 8
-rw-r--r--  test/MC/AMDGPU/invalid-instructions-spellcheck.s | 48
-rw-r--r--  test/MC/AMDGPU/trap.s | 42
-rw-r--r--  test/MC/AMDGPU/vop1-gfx9-err.s | 12
-rw-r--r--  test/MC/AMDGPU/vop3p-err.s | 6
-rw-r--r--  test/MC/ARM/dfb-neg.s | 10
-rw-r--r--  test/MC/ARM/dfb.s | 6
-rw-r--r--  test/MC/COFF/align-nops.s | 2
-rw-r--r--  test/MC/Disassembler/AArch64/basic-a64-instructions.txt | 3
-rw-r--r--  test/MC/Disassembler/AMDGPU/ds_vi.txt | 3
-rw-r--r--  test/MC/Disassembler/AMDGPU/trap_gfx9.txt | 32
-rw-r--r--  test/MC/Disassembler/AMDGPU/trap_vi.txt | 16
-rw-r--r--  test/MC/Disassembler/ARM/dfb-arm.txt | 6
-rw-r--r--  test/MC/Disassembler/ARM/dfb-thumb.txt | 6
-rw-r--r--  test/MC/Disassembler/X86/x86-32.txt | 3
-rw-r--r--  test/MC/ELF/align-nops.s | 2
-rw-r--r--  test/MC/MachO/x86_32-optimal_nop.s | 2
-rw-r--r--  test/MC/Mips/eva/invalid.s | 47
-rw-r--r--  test/MC/WebAssembly/weak-alias.ll | 169
-rw-r--r--  test/MC/X86/3DNow.s | 2
-rw-r--r--  test/MC/X86/AlignedBundling/different-sections.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/long-nop-pad.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/misaligned-bundle-group.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/misaligned-bundle.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/pad-bundle-groups.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/relax-in-bundle-group.s | 4
-rw-r--r--  test/MC/X86/AlignedBundling/single-inst-bundling.s | 4
-rw-r--r--  test/MC/X86/CLFLUSHOPT-32.s | 26
-rw-r--r--  test/MC/X86/CLFLUSHOPT-64.s | 26
-rw-r--r--  test/MC/X86/CLFSH-32.s | 26
-rw-r--r--  test/MC/X86/CLFSH-64.s | 26
-rw-r--r--  test/MC/X86/x86_long_nop.s | 8
-rw-r--r--  test/TableGen/GlobalISelEmitter.td | 53
-rw-r--r--  test/TableGen/intrinsic-long-name.td | 2
-rw-r--r--  test/TableGen/intrinsic-struct.td | 2
-rw-r--r--  test/TableGen/intrinsic-varargs.td | 2
-rw-r--r--  test/ThinLTO/X86/cache.ll | 22
-rw-r--r--  test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll | 139
-rw-r--r--  test/Transforms/CallSiteSplitting/callsite-no-splitting.ll | 18
-rw-r--r--  test/Transforms/CodeGenPrepare/section.ll | 47
-rw-r--r--  test/Transforms/GVN/tbaa.ll | 61
-rw-r--r--  test/Transforms/Inline/AArch64/binop.ll | 291
-rw-r--r--  test/Transforms/Inline/ARM/inline-fp.ll | 113
-rw-r--r--  test/Transforms/Inline/inline-fp.ll | 137
-rw-r--r--  test/Transforms/Inline/redundant-loads.ll | 18
-rw-r--r--  test/Transforms/InstCombine/2011-09-03-Trampoline.ll | 23
-rw-r--r--  test/Transforms/JumpThreading/guards.ll | 103
-rw-r--r--  test/Transforms/LoopVectorize/legal_preheader_check.ll | 27
-rw-r--r--  test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll | 48
-rw-r--r--  test/Transforms/MemCpyOpt/merge-into-memset.ll | 45
-rw-r--r--  test/Transforms/MemCpyOpt/mixed-sizes.ll | 36
-rw-r--r--  test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll | 114
-rw-r--r--  test/Transforms/NewGVN/tbaa.ll | 61
-rw-r--r--  test/Transforms/PGOProfile/icp_covariant_call_return.ll | 3
-rw-r--r--  test/Transforms/PGOProfile/icp_covariant_invoke_return.ll | 9
-rw-r--r--  test/Transforms/PGOProfile/icp_invoke.ll | 17
-rw-r--r--  test/Transforms/PGOProfile/icp_invoke_nouse.ll | 3
-rw-r--r--  test/Transforms/PGOProfile/icp_vararg.ll | 3
-rw-r--r--  test/Transforms/PGOProfile/indirect_call_promotion.ll | 3
-rw-r--r--  test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll | 24
-rw-r--r--  test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll | 125
-rw-r--r--  test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll | 225
-rw-r--r--  test/Transforms/SLPVectorizer/X86/jumbled-load.ll | 37
-rw-r--r--  test/Transforms/SLPVectorizer/X86/store-jumbled.ll | 25
-rw-r--r--  test/Transforms/SampleProfile/entry_counts.ll | 4
-rw-r--r--  test/Transforms/SimplifyCFG/X86/if-conversion.ll | 231
-rw-r--r--  test/tools/llvm-cvtres/machine.test | 56
-rw-r--r--  test/tools/llvm-cvtres/symbols.test | 12
-rw-r--r--  test/tools/llvm-dwarfdump/X86/lookup.s | 7
-rw-r--r--  test/tools/llvm-objcopy/add-section-remove.test | 36
-rw-r--r--  test/tools/llvm-objcopy/add-section.test | 37
-rw-r--r--  test/tools/llvm-readobj/mips-got.test | 136
-rw-r--r--  test/tools/llvm-readobj/mips-plt.test | 32
182 files changed, 9291 insertions, 7612 deletions
diff --git a/test/Analysis/BasicAA/args-rets-allocas-loads.ll b/test/Analysis/BasicAA/args-rets-allocas-loads.ll
index 05b56a07e44b..b31fb26f1c9b 100644
--- a/test/Analysis/BasicAA/args-rets-allocas-loads.ll
+++ b/test/Analysis/BasicAA/args-rets-allocas-loads.ll
@@ -308,4 +308,9 @@ define void @caller_a(double* %arg_a0,
; CHECK-NEXT: 0 mod responses (0.0%)
; CHECK-NEXT: 0 ref responses (0.0%)
; CHECK-NEXT: 140 mod & ref responses (76.0%)
-; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76%
+; CHECK-NEXT: 0 must responses (0.0%)
+; CHECK-NEXT: 0 must mod responses (0.0%)
+; CHECK-NEXT: 0 must ref responses (0.0%)
+; CHECK-NEXT: 0 must mod & ref responses (0.0%)
+; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76%/0%/0%/0%/0%
+
diff --git a/test/Analysis/BasicAA/call-attrs.ll b/test/Analysis/BasicAA/call-attrs.ll
index 9cd17e486799..8538e8b4771d 100644
--- a/test/Analysis/BasicAA/call-attrs.ll
+++ b/test/Analysis/BasicAA/call-attrs.ll
@@ -31,12 +31,12 @@ entry:
ret void
}
-; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_attr(i8* %p)
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @readonly_attr(i8* %p)
; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_func(i8* %p)
-; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p)
+; CHECK: Just Mod (MustAlias): Ptr: i8* %p <-> call void @writeonly_attr(i8* %p)
; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_func(i8* %p)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_attr(i8* %p)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_func(i8* %p)
; CHECK: Both ModRef: Ptr: i8* %p <-> call void @read_write(i8* %p, i8* %p, i8* %p)
-; CHECK: Just Ref: Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ]
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ]
; CHECK: Both ModRef: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) [ "deopt"(i8* %p) ]
diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll
index 1580af9ea826..e4367bb6d61b 100644
--- a/test/Analysis/BasicAA/cs-cs-arm.ll
+++ b/test/Analysis/BasicAA/cs-cs-arm.ll
@@ -19,11 +19,11 @@ entry:
; CHECK-LABEL: Function: test1:
; CHECK: NoAlias: i8* %p, i8* %q
-; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll
index 3695275649b2..30b7e0720f0b 100644
--- a/test/Analysis/BasicAA/cs-cs.ll
+++ b/test/Analysis/BasicAA/cs-cs.ll
@@ -164,7 +164,7 @@ define void @test4(i8* %P, i8* noalias %Q) #3 {
; CHECK-LABEL: Function: test4:
; CHECK: NoAlias: i8* %P, i8* %Q
-; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
+; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
@@ -192,6 +192,26 @@ define void @test5(i8* %P, i8* %Q, i8* %R) #3 {
; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
}
+define void @test5a(i8* noalias %P, i8* noalias %Q, i8* noalias %R) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test5a:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: NoAlias: i8* %P, i8* %R
+; CHECK: NoAlias: i8* %Q, i8* %R
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
define void @test6(i8* %P) #3 {
call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
call void @a_readonly_func(i8* %P)
@@ -199,7 +219,7 @@ define void @test6(i8* %P) #3 {
; CHECK-LABEL: Function: test6:
-; CHECK: Just Mod: Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
+; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
; CHECK: Just Ref: Ptr: i8* %P <-> call void @a_readonly_func(i8* %P)
; CHECK: Just Mod: call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) <-> call void @a_readonly_func(i8* %P)
; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
@@ -237,9 +257,9 @@ entry:
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func()
; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func()
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q)
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessiblememonly_func()
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_argmemonly_func(i8* %q)
@@ -254,12 +274,34 @@ entry:
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p)
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q)
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func()
-; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q)
; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p)
; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q)
; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func()
-; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
+}
+
+;; test that MustAlias is set for calls when no MayAlias is found.
+declare void @another_argmemonly_func(i8*, i8*) #0
+define void @test8a(i8* noalias %p, i8* noalias %q) {
+entry:
+ call void @another_argmemonly_func(i8* %p, i8* %q)
+ ret void
+
+; CHECK-LABEL: Function: test8a
+; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q)
}
+define void @test8b(i8* %p, i8* %q) {
+entry:
+ call void @another_argmemonly_func(i8* %p, i8* %q)
+ ret void
+
+; CHECK-LABEL: Function: test8b
+; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q)
+}
+
;; test that unknown operand bundle has unknown effect to the heap
define void @test9(i8* %p) {
@@ -310,9 +352,9 @@ entry:
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
@@ -321,10 +363,10 @@ entry:
; CHECK: NoModRef: call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ]
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
-; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ]
; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
-; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
}
attributes #0 = { argmemonly nounwind }
diff --git a/test/Analysis/MemorySSA/volatile-clobber.ll b/test/Analysis/MemorySSA/volatile-clobber.ll
index d6f960f3e382..53df7de499bd 100644
--- a/test/Analysis/MemorySSA/volatile-clobber.ll
+++ b/test/Analysis/MemorySSA/volatile-clobber.ll
@@ -22,8 +22,7 @@ define i32 @foo() {
ret i32 %4
}
-; Ensuring that we don't automatically hoist nonvolatile loads around volatile
-; loads
+; Ensuring we allow hoisting nonvolatile loads around volatile loads.
; CHECK-LABEL define void @volatile_only
define void @volatile_only(i32* %arg1, i32* %arg2) {
; Trivially NoAlias/MustAlias
@@ -36,7 +35,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) {
; CHECK: MemoryUse(liveOnEntry)
; CHECK-NEXT: load i32, i32* %b
load i32, i32* %b
-; CHECK: MemoryUse(1)
+; CHECK: MemoryUse(liveOnEntry)
; CHECK-NEXT: load i32, i32* %a
load i32, i32* %a
@@ -44,7 +43,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) {
; CHECK: 2 = MemoryDef(1)
; CHECK-NEXT: load volatile i32, i32* %arg1
load volatile i32, i32* %arg1
-; CHECK: MemoryUse(2)
+; CHECK: MemoryUse(liveOnEntry)
; CHECK-NEXT: load i32, i32* %arg2
load i32, i32* %arg2
@@ -75,10 +74,10 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) {
; CHECK: MemoryUse(1)
; CHECK-NEXT: load atomic i32, i32* %b unordered, align 4
load atomic i32, i32* %b unordered, align 4
-; CHECK: MemoryUse(2)
+; CHECK: MemoryUse(1)
; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4
load atomic i32, i32* %a unordered, align 4
-; CHECK: MemoryUse(2)
+; CHECK: MemoryUse(1)
; CHECK-NEXT: load i32, i32* %a
load i32, i32* %a
@@ -86,7 +85,7 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) {
; CHECK: 3 = MemoryDef(2)
; CHECK-NEXT: load atomic volatile i32, i32* %arg1 monotonic, align 4
load atomic volatile i32, i32* %arg1 monotonic, align 4
-; CHECK: MemoryUse(3)
+; CHECK: MemoryUse(1)
; CHECK-NEXT: load i32, i32* %arg2
load i32, i32* %arg2
diff --git a/test/Analysis/ValueTracking/memory-dereferenceable.ll b/test/Analysis/ValueTracking/memory-dereferenceable.ll
index ca16a266123c..2e9453f670ce 100644
--- a/test/Analysis/ValueTracking/memory-dereferenceable.ll
+++ b/test/Analysis/ValueTracking/memory-dereferenceable.ll
@@ -20,7 +20,8 @@ declare i32* @foo()
@globalptr.align16 = external global i8, align 16
; CHECK-LABEL: 'test'
-define void @test(i32 addrspace(1)* dereferenceable(8) %dparam,
+define void @test(%struct.A* sret %result,
+ i32 addrspace(1)* dereferenceable(8) %dparam,
i8 addrspace(1)* dereferenceable(32) align 1 %dparam.align1,
i8 addrspace(1)* dereferenceable(32) align 16 %dparam.align16,
i8* byval %i8_byval,
@@ -41,17 +42,14 @@ entry:
%empty_alloca = alloca i8, i64 0
%empty_load = load i8, i8* %empty_alloca
- ; Load from too small array alloca
-; CHECK-NOT: %small_array_alloca
- %small_array_alloca = alloca i8, i64 2
- %saa_cast = bitcast i8* %small_array_alloca to i32*
- %saa_load = load i32, i32* %saa_cast
-
- ; Load from array alloca
-; CHECK: %big_array_alloca{{.*}}(unaligned)
- %big_array_alloca = alloca i8, i64 4
- %baa_cast = bitcast i8* %big_array_alloca to i32*
- %baa_load = load i32, i32* %baa_cast
+ ; Loads from sret arguments
+; CHECK: %sret_gep{{.*}}(aligned)
+ %sret_gep = getelementptr inbounds %struct.A, %struct.A* %result, i64 0, i32 1, i64 2
+ load i8, i8* %sret_gep
+
+; CHECK-NOT: %sret_gep_outside
+ %sret_gep_outside = getelementptr %struct.A, %struct.A* %result, i64 0, i32 1, i64 7
+ load i8, i8* %sret_gep_outside
; CHECK: %dparam{{.*}}(aligned)
%load3 = load i32, i32 addrspace(1)* %dparam
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 86ac5507a407..111aaf88b160 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -135,7 +135,7 @@ continue:
}
; Check that we fallback on invoke translation failures.
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT quad 2
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT fp128 0xL00000000000000004000000000000000
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump
; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump:
define fp128 @test_quad_dump() {
diff --git a/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir
new file mode 100644
index 000000000000..47fda8f998d7
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir
@@ -0,0 +1,44 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel -global-isel-abort=0 %s -o - | FileCheck %s
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64"
+
+ define fp128 @x(fp128 %a) {
+ entry:
+ %a.addr = alloca fp128, align 16
+ store fp128 %a, fp128* %a.addr, align 16
+ %0 = load fp128, fp128* %a.addr, align 16
+ %sub = fsub fp128 0xL00000000000000008000000000000000, %0
+ ret fp128 %sub
+ }
+
+...
+---
+name: x
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+fixedStack:
+stack:
+ - { id: 0, name: a.addr, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+body: |
+ bb.1.entry:
+ liveins: %q0
+
+ ; This test just checks we don't crash on G_FNEG of FP128 types. Expect to fall
+ ; back until support is added for fp128.
+ ; CHECK: ret
+ %0:_(s128) = COPY %q0
+ %1:_(p0) = G_FRAME_INDEX %stack.0.a.addr
+ G_STORE %0(s128), %1(p0) :: (store 16 into %ir.a.addr)
+ %2:_(s128) = G_LOAD %1(p0) :: (load 16 from %ir.a.addr)
+ %3:_(s128) = G_FNEG %2
+ %q0 = COPY %3(s128)
+ RET_ReallyLR implicit %q0
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
index 865315bbe0a3..4b69575079a3 100644
--- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
+++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
@@ -1,85 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
%type = type [4 x {i8, i32}]
define %type* @first_offset_const(%type* %addr) {
-; CHECK-LABEL: name: first_offset_const
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[RES:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: first_offset_const
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+ ; CHECK: %x0 = COPY [[GEP]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i32 1
ret %type* %res
}
define %type* @first_offset_trivial(%type* %addr) {
-; CHECK-LABEL: name: first_offset_trivial
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[TRIVIAL:%[0-9]+]]:_(p0) = COPY [[BASE]](p0)
-; CHECK: %x0 = COPY [[TRIVIAL]](p0)
+ ; CHECK-LABEL: name: first_offset_trivial
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0)
+ ; CHECK: %x0 = COPY [[COPY1]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i32 0
ret %type* %res
}
define %type* @first_offset_variable(%type* %addr, i64 %idx) {
-; CHECK-LABEL: name: first_offset_variable
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: first_offset_variable
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0, %x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]]
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0)
+ ; CHECK: %x0 = COPY [[COPY2]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i64 %idx
ret %type* %res
}
define %type* @first_offset_ext(%type* %addr, i32 %idx) {
-; CHECK-LABEL: name: first_offset_ext
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX32:%[0-9]+]]:_(s32) = COPY %w1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[IDX64:%[0-9]+]]:_(s64) = G_SEXT [[IDX32]](s32)
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX64]]
-; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: first_offset_ext
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %w1, %x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %w1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[SEXT]]
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0)
+ ; CHECK: %x0 = COPY [[COPY2]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i32 %idx
ret %type* %res
}
%type1 = type [4 x [4 x i32]]
define i32* @const_then_var(%type1* %addr, i64 %idx) {
-; CHECK-LABEL: name: const_then_var
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 272
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[BASE2]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: const_then_var
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0, %x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 272
+ ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[COPY1]]
+ ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[MUL]](s64)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP1]](p0)
+ ; CHECK: %x0 = COPY [[COPY2]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type1, %type1* %addr, i32 4, i32 1, i64 %idx
ret i32* %res
}
define i32* @var_then_const(%type1* %addr, i64 %idx) {
-; CHECK-LABEL: name: var_then_const
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
-; CHECK: %x0 = COPY [[BASE2]](p0)
+ ; CHECK-LABEL: name: var_then_const
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0, %x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]]
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+ ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[C1]](s64)
+ ; CHECK: %x0 = COPY [[GEP1]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type1, %type1* %addr, i64 %idx, i32 2, i32 2
ret i32* %res
}
diff --git a/test/CodeGen/AArch64/arm64-jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll
index f5c2ee6da0bf..fac3e5704d15 100644
--- a/test/CodeGen/AArch64/arm64-jumptable.ll
+++ b/test/CodeGen/AArch64/arm64-jumptable.ll
@@ -6,22 +6,20 @@ define void @sum(i32 %a, i32* %to, i32 %c) {
entry:
switch i32 %a, label %exit [
i32 1, label %bb1
- i32 2, label %bb2
+ i32 2, label %exit.sink.split
i32 3, label %bb3
i32 4, label %bb4
]
bb1:
%b = add i32 %c, 1
- store i32 %b, i32* %to
- br label %exit
-bb2:
- store i32 2, i32* %to
- br label %exit
+ br label %exit.sink.split
bb3:
- store i32 3, i32* %to
- br label %exit
+ br label %exit.sink.split
bb4:
- store i32 5, i32* %to
+ br label %exit.sink.split
+exit.sink.split:
+ %.sink = phi i32 [ 5, %bb4 ], [ %b, %bb1 ], [ 3, %bb3 ], [ %a, %entry ]
+ store i32 %.sink, i32* %to
br label %exit
exit:
ret void
diff --git a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
index 29036caabf3a..3466e1bace56 100644
--- a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
+++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -4,9 +4,10 @@
; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s
; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
-; CHECK: @fct1
+; CHECK-LABEL: fct1:
; For small size (<= 256), we do not change memset to bzero.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct1(i8* nocapture %ptr) {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false)
@@ -15,20 +16,20 @@ entry:
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-; CHECK: @fct2
+; CHECK-LABEL: fct2:
; When the size is bigger than 256, change into bzero.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct2(i8* nocapture %ptr) {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
ret void
}
-; CHECK: @fct3
+; CHECK-LABEL: fct3:
; For unknown size, change to bzero.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct3(i8* nocapture %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
@@ -36,9 +37,10 @@ entry:
ret void
}
-; CHECK: @fct4
+; CHECK-LABEL: fct4:
; Size <= 256, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct4(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -50,10 +52,10 @@ declare i8* @__memset_chk(i8*, i32, i64, i64)
declare i64 @llvm.objectsize.i64(i8*, i1)
-; CHECK: @fct5
+; CHECK-LABEL: fct5:
; Size > 256, change.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct5(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -61,10 +63,10 @@ entry:
ret void
}
-; CHECK: @fct6
+; CHECK-LABEL: fct6:
; Size = unknown, change.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct6(i8* %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
@@ -76,9 +78,10 @@ entry:
; Next functions check that memset is not turned into bzero
; when the set constant is non-zero, whatever the given size.
-; CHECK: @fct7
+; CHECK-LABEL: fct7:
; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct7(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -86,9 +89,10 @@ entry:
ret void
}
-; CHECK: @fct8
+; CHECK-LABEL: fct8:
; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct8(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -96,9 +100,10 @@ entry:
ret void
}
-; CHECK: @fct9
+; CHECK-LABEL: fct9:
; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct9(i8* %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll
index d22bfc76d1d5..b3a2bcd5d669 100644
--- a/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,GENERIC
; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
@@ -47,7 +47,6 @@ declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmla_lane_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %b
@@ -58,7 +57,6 @@ entry:
define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlaq_lane_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %b
@@ -69,7 +67,6 @@ entry:
define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmla_lane_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %b
@@ -80,7 +77,6 @@ entry:
define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlaq_lane_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %b
@@ -91,7 +87,6 @@ entry:
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmla_laneq_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -102,7 +97,6 @@ entry:
define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %b
@@ -113,7 +107,6 @@ entry:
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmla_laneq_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -124,7 +117,6 @@ entry:
define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %b
@@ -135,7 +127,6 @@ entry:
define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmls_lane_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %b
@@ -146,7 +137,6 @@ entry:
define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsq_lane_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %b
@@ -157,7 +147,6 @@ entry:
define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmls_lane_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %b
@@ -168,7 +157,6 @@ entry:
define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsq_lane_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %b
@@ -179,7 +167,6 @@ entry:
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmls_laneq_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -190,7 +177,6 @@ entry:
define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %b
@@ -201,7 +187,6 @@ entry:
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmls_laneq_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -212,7 +197,6 @@ entry:
define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %b
@@ -223,7 +207,6 @@ entry:
define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %a
@@ -233,7 +216,6 @@ entry:
define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %a
@@ -243,7 +225,6 @@ entry:
define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %a
@@ -253,7 +234,6 @@ entry:
define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %a
@@ -263,7 +243,6 @@ entry:
define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %a
@@ -273,7 +252,6 @@ entry:
define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %a
@@ -283,7 +261,6 @@ entry:
define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %a
@@ -293,7 +270,6 @@ entry:
define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %a
@@ -303,7 +279,6 @@ entry:
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -313,7 +288,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %a
@@ -323,7 +297,6 @@ entry:
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -333,7 +306,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %a
@@ -343,7 +315,6 @@ entry:
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -353,7 +324,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %a
@@ -363,7 +333,6 @@ entry:
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -373,7 +342,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %a
@@ -382,12 +350,9 @@ entry:
define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfma_lane_f32:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -398,12 +363,9 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmaq_lane_f32:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -414,12 +376,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfma_laneq_f32:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -428,12 +387,9 @@ entry:
define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f32:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -442,12 +398,9 @@ entry:
define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfms_lane_f32:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -457,12 +410,9 @@ entry:
define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmsq_lane_f32:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -472,12 +422,9 @@ entry:
define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfms_laneq_f32:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -487,12 +434,9 @@ entry:
define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f32:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -502,12 +446,9 @@ entry:
define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmaq_lane_f64:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -518,12 +459,9 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f64:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -532,12 +470,9 @@ entry:
define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmsq_lane_f64:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%sub = fsub <1 x double> <double -0.000000e+00>, %v
%lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -547,12 +482,9 @@ entry:
define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f64:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -563,10 +495,6 @@ entry:
define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmas_laneq_f32
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXNOS-LABEL: test_vfmas_laneq_f32
-; EXNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; EXNOS-NEXT: ret
entry:
%extract = extractelement <4 x float> %v, i32 3
%0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -578,7 +506,6 @@ declare float @llvm.fma.f32(float, float, float)
define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmsd_lane_f64
; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
entry:
%extract.rhs = extractelement <1 x double> %v, i32 0
%extract = fsub double -0.000000e+00, %extract.rhs
@@ -591,10 +518,6 @@ declare double @llvm.fma.f64(double, double, double)
define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmss_lane_f32
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmss_lane_f32
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; EXYNOS-NEXT: ret
entry:
%extract.rhs = extractelement <2 x float> %v, i32 1
%extract = fsub float -0.000000e+00, %extract.rhs
@@ -605,7 +528,6 @@ entry:
define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmss_laneq_f32
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%extract.rhs = extractelement <4 x float> %v, i32 3
%extract = fsub float -0.000000e+00, %extract.rhs
@@ -616,10 +538,6 @@ entry:
define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsd_laneq_f64
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsd_laneq_f64
-; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; EXYNOS-NEXT: ret
entry:
%extract.rhs = extractelement <2 x double> %v, i32 1
%extract = fsub double -0.000000e+00, %extract.rhs
@@ -641,10 +559,6 @@ entry:
define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmss_lane_f32_0
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmss_lane_f32_0
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; EXYNOS-NEXT: ret
entry:
%tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -655,7 +569,6 @@ entry:
define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmss_laneq_f32_0
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%tmp1 = extractelement <4 x float> %tmp0, i32 3
@@ -666,7 +579,6 @@ entry:
define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsd_laneq_f64_0
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
entry:
%tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
%tmp1 = extractelement <2 x double> %tmp0, i32 1
@@ -677,7 +589,6 @@ entry:
define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -688,7 +599,6 @@ entry:
define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -699,7 +609,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -710,7 +619,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -721,7 +629,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -733,7 +640,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -745,7 +651,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -757,7 +662,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -769,7 +673,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -780,7 +683,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -791,7 +693,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -802,7 +703,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -813,7 +713,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -825,7 +724,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -837,7 +735,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -849,7 +746,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -861,7 +757,6 @@ entry:
define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -872,7 +767,6 @@ entry:
define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -883,7 +777,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -894,7 +787,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -905,7 +797,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -917,7 +808,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -929,7 +819,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -941,7 +830,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -953,7 +841,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -964,7 +851,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -975,7 +861,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -986,7 +871,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -997,7 +881,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1009,7 +892,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1021,7 +903,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1033,7 +914,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1045,7 +925,6 @@ entry:
define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1055,7 +934,6 @@ entry:
define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1065,7 +943,6 @@ entry:
define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1075,7 +952,6 @@ entry:
define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1085,7 +961,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1096,7 +971,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1107,7 +981,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1118,7 +991,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1129,7 +1001,6 @@ entry:
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1139,7 +1010,6 @@ entry:
define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1149,7 +1019,6 @@ entry:
define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1159,7 +1028,6 @@ entry:
define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1169,7 +1037,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1180,7 +1047,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1191,7 +1057,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1202,7 +1067,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1213,7 +1077,6 @@ entry:
define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s16:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1224,7 +1087,6 @@ entry:
define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s32:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1235,7 +1097,6 @@ entry:
define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s16:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1247,7 +1108,6 @@ entry:
define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s32:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1259,7 +1119,6 @@ entry:
define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s16:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1270,7 +1129,6 @@ entry:
define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s32:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1281,7 +1139,6 @@ entry:
define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1293,7 +1150,6 @@ entry:
define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1305,7 +1161,6 @@ entry:
define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_lane_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1315,7 +1170,6 @@ entry:
define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_lane_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1325,7 +1179,6 @@ entry:
define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1335,7 +1188,6 @@ entry:
define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1345,7 +1197,6 @@ entry:
define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1356,7 +1207,6 @@ entry:
define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1367,7 +1217,6 @@ entry:
define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1378,7 +1227,6 @@ entry:
define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1389,7 +1237,6 @@ entry:
define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1399,7 +1246,6 @@ entry:
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1409,7 +1255,6 @@ entry:
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1419,7 +1264,6 @@ entry:
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1429,7 +1273,6 @@ entry:
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1439,7 +1282,6 @@ entry:
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1449,7 +1291,6 @@ entry:
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1459,7 +1300,6 @@ entry:
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1468,12 +1308,9 @@ entry:
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x float> %shuffle, %a
@@ -1483,10 +1320,6 @@ entry:
define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmul_lane_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1498,12 +1331,9 @@ entry:
define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulq_lane_f32:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = fmul <4 x float> %shuffle, %a
@@ -1512,12 +1342,9 @@ entry:
define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulq_lane_f64:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -1526,12 +1353,9 @@ entry:
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmul_laneq_f32:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%mul = fmul <2 x float> %shuffle, %a
@@ -1541,10 +1365,6 @@ entry:
define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1556,12 +1376,9 @@ entry:
define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulq_laneq_f32:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = fmul <4 x float> %shuffle, %a
@@ -1570,12 +1387,9 @@ entry:
define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulq_laneq_f64:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x double> %shuffle, %a
@@ -1584,12 +1398,9 @@ entry:
define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulx_lane_f32:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1598,12 +1409,9 @@ entry:
define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulxq_lane_f32:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; Exynos-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1612,12 +1420,9 @@ entry:
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulxq_lane_f64:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1626,12 +1431,9 @@ entry:
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulx_laneq_f32:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1640,12 +1442,9 @@ entry:
define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f32:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1654,12 +1453,9 @@ entry:
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1669,7 +1465,6 @@ entry:
define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmla_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1680,7 +1475,6 @@ entry:
define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlaq_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1691,7 +1485,6 @@ entry:
define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmla_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1702,7 +1495,6 @@ entry:
define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlaq_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1713,7 +1505,6 @@ entry:
define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmla_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1724,7 +1515,6 @@ entry:
define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1735,7 +1525,6 @@ entry:
define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmla_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1746,7 +1535,6 @@ entry:
define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1757,7 +1545,6 @@ entry:
define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmls_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1768,7 +1555,6 @@ entry:
define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsq_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1779,7 +1565,6 @@ entry:
define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmls_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1790,7 +1575,6 @@ entry:
define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsq_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1801,7 +1585,6 @@ entry:
define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmls_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1812,7 +1595,6 @@ entry:
define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1823,7 +1605,6 @@ entry:
define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmls_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1834,7 +1615,6 @@ entry:
define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1845,7 +1625,6 @@ entry:
define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1855,7 +1634,6 @@ entry:
define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1865,7 +1643,6 @@ entry:
define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1875,7 +1652,6 @@ entry:
define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1885,7 +1661,6 @@ entry:
define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1895,7 +1670,6 @@ entry:
define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1905,7 +1679,6 @@ entry:
define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1915,7 +1688,6 @@ entry:
define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1925,7 +1697,6 @@ entry:
define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1935,7 +1706,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1945,7 +1715,6 @@ entry:
define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1955,7 +1724,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1965,7 +1733,6 @@ entry:
define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1975,7 +1742,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1985,7 +1751,6 @@ entry:
define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1995,7 +1760,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -2004,12 +1768,9 @@ entry:
define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfma_lane_f32_0:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2018,12 +1779,9 @@ entry:
define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmaq_lane_f32_0:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -2032,12 +1790,9 @@ entry:
define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfma_laneq_f32_0:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2046,12 +1801,9 @@ entry:
define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f32_0:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -2060,12 +1812,9 @@ entry:
define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfms_lane_f32_0:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -2075,12 +1824,9 @@ entry:
define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmsq_lane_f32_0:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -2090,12 +1836,9 @@ entry:
define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfms_laneq_f32_0:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -2105,12 +1848,9 @@ entry:
define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f32_0:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -2120,12 +1860,9 @@ entry:
define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f64_0:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -2134,12 +1871,9 @@ entry:
define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f64_0:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2150,7 +1884,6 @@ entry:
define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2161,7 +1894,6 @@ entry:
define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2172,7 +1904,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2183,7 +1914,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2194,7 +1924,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2206,7 +1935,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2218,7 +1946,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2230,7 +1957,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2242,7 +1968,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2253,7 +1978,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2264,7 +1988,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2275,7 +1998,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2286,7 +2008,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2298,7 +2019,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2310,7 +2030,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2322,7 +2041,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2334,7 +2052,6 @@ entry:
define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2345,7 +2062,6 @@ entry:
define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2356,7 +2072,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2367,7 +2082,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2378,7 +2092,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2390,7 +2103,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2402,7 +2114,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2414,7 +2125,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2426,7 +2136,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2437,7 +2146,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2448,7 +2156,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2459,7 +2166,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2470,7 +2176,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2482,7 +2187,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2494,7 +2198,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2506,7 +2209,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2518,7 +2220,6 @@ entry:
define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2528,7 +2229,6 @@ entry:
define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2538,7 +2238,6 @@ entry:
define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2548,7 +2247,6 @@ entry:
define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2558,7 +2256,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2569,7 +2266,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2580,7 +2276,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2591,7 +2286,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2602,7 +2296,6 @@ entry:
define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2612,7 +2305,6 @@ entry:
define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2622,7 +2314,6 @@ entry:
define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2632,7 +2323,6 @@ entry:
define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2642,7 +2332,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2653,7 +2342,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2664,7 +2352,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2675,7 +2362,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2686,7 +2372,6 @@ entry:
define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s16_0:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2697,7 +2382,6 @@ entry:
define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s32_0:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2708,7 +2392,6 @@ entry:
define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2720,7 +2403,6 @@ entry:
define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2732,7 +2414,6 @@ entry:
define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2743,7 +2424,6 @@ entry:
define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2754,7 +2434,6 @@ entry:
define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2766,7 +2445,6 @@ entry:
define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2778,7 +2456,6 @@ entry:
define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_lane_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2788,7 +2465,6 @@ entry:
define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_lane_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2798,7 +2474,6 @@ entry:
define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2808,7 +2483,6 @@ entry:
define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2818,7 +2492,6 @@ entry:
define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2829,7 +2502,6 @@ entry:
define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2840,7 +2512,6 @@ entry:
define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2851,7 +2522,6 @@ entry:
define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2862,7 +2532,6 @@ entry:
define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2872,7 +2541,6 @@ entry:
define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2882,7 +2550,6 @@ entry:
define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2892,7 +2559,6 @@ entry:
define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2902,7 +2568,6 @@ entry:
define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2912,7 +2577,6 @@ entry:
define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2922,7 +2586,6 @@ entry:
define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2932,7 +2595,6 @@ entry:
define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2941,12 +2603,9 @@ entry:
define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32_0:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2955,12 +2614,9 @@ entry:
define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulq_lane_f32_0:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -2969,12 +2625,9 @@ entry:
define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmul_laneq_f32_0:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2984,10 +2637,6 @@ entry:
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f64_0:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
-; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -2999,12 +2648,9 @@ entry:
define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulq_laneq_f32_0:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -3013,12 +2659,9 @@ entry:
define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulq_laneq_f64_0:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -3027,12 +2670,9 @@ entry:
define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulx_lane_f32_0:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -3041,12 +2681,9 @@ entry:
define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulxq_lane_f32_0:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -3055,12 +2692,9 @@ entry:
define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulxq_lane_f64_0:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -3069,12 +2703,9 @@ entry:
define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulx_laneq_f32_0:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -3083,12 +2714,9 @@ entry:
define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f32_0:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -3097,12 +2725,9 @@ entry:
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64_0:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -3111,14 +2736,11 @@ entry:
define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
; CHECK-LABEL: optimize_dup:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: optimize_dup:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
@@ -3130,15 +2752,12 @@ entry:
define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
; CHECK-LABEL: no_optimize_dup:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: no_optimize_dup:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
+; EXYNOSM1: dup [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s
entry:
%lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
@@ -3150,8 +2769,7 @@ entry:
define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -3160,9 +2778,8 @@ entry:
define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {
; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
-; CHECK: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; CHECK-NEXT: ret
+; GENERIC: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 453334dce601..2fb9d3b2d030 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -87,4 +87,13 @@ for.end:
ret double %v0
}
+define <2 x i64> @t6() {
+; ALL-LABEL: t6:
+; CYCLONE: movi.16b v0, #0
+; KRYO: movi v0.2d, #0000000000000000
+; FALKOR: movi v0.2d, #0000000000000000
+ ret <2 x i64> zeroinitializer
+}
+
+
declare double @sin(double)
diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll
new file mode 100644
index 000000000000..1c2e5528f10c
--- /dev/null
+++ b/test/CodeGen/AArch64/chkstk.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \
+; RUN: | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
+
+; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \
+; RUN: | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
+
+define void @check_watermark() {
+entry:
+ %buffer = alloca [4096 x i8], align 1
+ ret void
+}
+
+; CHECK-DEFAULT-CODE-MODEL: check_watermark:
+; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp
+; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-DEFAULT-CODE-MODEL: bl __chkstk
+; CHECK-DEFAULT-CODE-MODEL: sub sp, sp, x15, lsl #4
+
+; CHECK-LARGE-CODE-MODEL: check_watermark:
+; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp
+; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk
+; CHECK-LARGE-CODE-MODEL: blr x16
+; CHECK-LARGE-CODE-MODEL: sub sp, sp, x15, lsl #4
diff --git a/test/CodeGen/AArch64/ldst-paired-aliasing.ll b/test/CodeGen/AArch64/ldst-paired-aliasing.ll
index 9c698b5fdcc6..9b0b51d369a3 100644
--- a/test/CodeGen/AArch64/ldst-paired-aliasing.ll
+++ b/test/CodeGen/AArch64/ldst-paired-aliasing.ll
@@ -10,11 +10,10 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
-; CHECK: stp xzr, xzr, [sp, #72]
+; CHECK: str xzr, [sp, #80]
; CHECK: str w9, [sp, #80]
-; CHECK: str q0, [sp, #48]
+; CHECK: stp q0, q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
-; CHECK: str q0, [sp, #64]
for.body.lr.ph.i.i.i.i.i.i63:
%b1 = alloca [10 x i32], align 16
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 228d3c7d4306..71c4c83c28f9 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -251,7 +251,8 @@ entry:
; R600: MOVA_INT
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
new file mode 100644
index 000000000000..f97785beab6f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn--amdhsa-amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Effectively, check that the compile finishes; in the case
+; of an infinite loop, llc toggles between merging 2 ST4s
+; ( MergeConsecutiveStores() ) and breaking the resulting ST8
+; apart ( LegalizeStoreOps() ).
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+
+; GCN-LABEL: {{^}}_Z6brokenPd:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}
+define amdgpu_kernel void @_Z6brokenPd(double* %arg) {
+bb:
+ %tmp = alloca double, align 8, addrspace(5)
+ %tmp1 = alloca double, align 8, addrspace(5)
+ %tmp2 = load double, double* %arg, align 8
+ br i1 1, label %bb6, label %bb4
+
+bb3: ; No predecessors!
+ br label %bb4
+
+bb4: ; preds = %bb3, %bb
+ %tmp5 = phi double addrspace(5)* [ %tmp1, %bb3 ], [ %tmp, %bb ]
+ store double %tmp2, double addrspace(5)* %tmp5, align 8
+ br label %bb6
+
+bb6: ; preds = %bb4, %bb
+ %tmp7 = phi double [ 0x7FF8123000000000, %bb4 ], [ 0x7FF8000000000000, %bb ]
+ store double %tmp7, double* %arg, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 7c2666e3680f..8b9b83f6d0e7 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -6,6 +6,7 @@
define void @test_trunc_and_zext_s16() { ret void }
define void @test_trunc_and_anyext_s8() { ret void }
define void @test_trunc_and_anyext_s16() { ret void }
+ define void @test_trunc_s64() #0 { ret void }
define void @test_add_s32() { ret void }
define void @test_add_fold_imm_s32() { ret void }
@@ -46,6 +47,10 @@
define void @test_gep() { ret void }
define void @test_constant_imm() { ret void }
define void @test_constant_cimm() { ret void }
+ define void @test_pointer_constant() { ret void }
+
+ define void @test_inttoptr_s32() { ret void }
+ define void @test_ptrtoint_s32() { ret void }
define void @test_select_s32() { ret void }
define void @test_select_ptr() { ret void }
@@ -241,6 +246,36 @@ body: |
; CHECK: BX_RET 14, %noreg, implicit %r0
...
---
+name: test_trunc_s64
+# CHECK-LABEL: name: test_trunc_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: gprb }
+ - { id: 2, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0, %d0
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREG:%[0-9]+]]:dpr = COPY %d0
+
+ %2(p0) = COPY %r0
+ ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0
+
+ %1(s32) = G_TRUNC %0(s64)
+ ; CHECK: [[VREGTRUNC:%[0-9]+]]:gpr, [[UNINTERESTING:%[0-9]+]]:gpr = VMOVRRD [[VREG]]
+
+ G_STORE %1(s32), %2 :: (store 4)
+ ; CHECK: STRi12 [[VREGTRUNC]], [[PTR]], 0, 14, %noreg
+
+ BX_RET 14, %noreg
+ ; CHECK: BX_RET 14, %noreg
+...
+---
name: test_add_s32
# CHECK-LABEL: name: test_add_s32
legalized: true
@@ -1075,6 +1110,71 @@ body: |
BX_RET 14, %noreg, implicit %r0
...
---
+name: test_pointer_constant
+# CHECK-LABEL: name: test_pointer_constant
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+body: |
+ bb.0:
+ %0(p0) = G_CONSTANT i32 0
+ ; CHECK: %[[C:[0-9]+]]:gpr = MOVi 0, 14, %noreg, %noreg
+
+ %r0 = COPY %0(p0)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s32) = COPY %r0
+ %1(p0) = G_INTTOPTR %0(s32)
+ ; CHECK: [[INT:%[0-9]+]]:gpr = COPY %r0
+ ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY [[INT]]
+
+ %r0 = COPY %1(p0)
+ ; CHECK: %r0 = COPY [[PTR]]
+
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(p0) = COPY %r0
+ %1(s32) = G_PTRTOINT %0(p0)
+ ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0
+ ; CHECK: [[INT:%[0-9]+]]:gpr = COPY [[PTR]]
+
+ %r0 = COPY %1(s32)
+ ; CHECK: %r0 = COPY [[INT]]
+
+ BX_RET 14, %noreg, implicit %r0
+...
+---
name: test_select_s32
# CHECK-LABEL: name: test_select_s32
legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index e3e206cf76e9..204434e981b4 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -3,6 +3,9 @@
define void @test_sext_s8() { ret void }
define void @test_zext_s16() { ret void }
+ define void @test_inttoptr_s32() { ret void }
+ define void @test_ptrtoint_s32() { ret void }
+
define void @test_add_s8() { ret void }
define void @test_add_s16() { ret void }
define void @test_add_s32() { ret void }
@@ -101,6 +104,50 @@ body: |
BX_RET 14, %noreg, implicit %r0
...
---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s32) = COPY %r0
+ %1(p0) = G_INTTOPTR %0(s32)
+ ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output
+ ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}}
+ %r0 = COPY %1(p0)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(p0) = COPY %r0
+ %1(s32) = G_PTRTOINT %0(p0)
+ ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output
+ ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}}
+ %r0 = COPY %1(s32)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
name: test_add_s8
# CHECK-LABEL: name: test_add_s8
legalized: false
@@ -826,6 +873,7 @@ registers:
- { id: 2, class: _ }
- { id: 3, class: _ }
- { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0
@@ -856,6 +904,10 @@ body: |
; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32)
; CHECK-NOT: G_CONSTANT i1
+ %5(p0) = G_CONSTANT 0
+ G_STORE %5(p0), %4(p0) :: (store 4)
+ ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0
+
%r0 = COPY %0(s32)
BX_RET 14, %noreg, implicit %r0
...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index 044740e33a2d..175333626f97 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -24,6 +24,9 @@
define void @test_constants() { ret void }
+ define void @test_inttoptr_s32() { ret void }
+ define void @test_ptrtoint_s32() { ret void }
+
@a_global = global float 1.0
define void @test_globals() { ret void }
@@ -31,6 +34,7 @@
define void @test_anyext_s16_32() { ret void }
define void @test_trunc_s32_16() { ret void }
+ define void @test_trunc_s64_32() #0 { ret void }
define void @test_icmp_eq_s32() { ret void }
define void @test_fcmp_one_s32() #0 { ret void }
@@ -496,6 +500,44 @@ body: |
BX_RET 14, %noreg, implicit %r0
...
---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ %0(s32) = COPY %r0
+ %1(p0) = G_INTTOPTR %0(s32)
+ %r0 = COPY %1(p0)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ %0(p0) = COPY %r0
+ %1(s32) = G_PTRTOINT %0(p0)
+ %r0 = COPY %1(s32)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
name: test_globals
# CHECK-LABEL: name: test_globals
legalized: true
@@ -584,6 +626,30 @@ body: |
BX_RET 14, %noreg
...
---
+name: test_trunc_s64_32
+# CHECK-LABEL: name: test_trunc_s64_32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %d0
+
+ %0(s64) = COPY %d0
+ %2(p0) = COPY %r0
+ %1(s32) = G_TRUNC %0(s64)
+ G_STORE %1(s32), %2 :: (store 4)
+ BX_RET 14, %noreg
+...
+---
name: test_icmp_eq_s32
# CHECK-LABEL: name: test_icmp_eq_s32
legalized: true
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 78d3ebf371a4..9373c5d44210 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
; Avoid 's' 16-bit instructions which partially update CPSR (and add a false
; dependency) when they aren't dependent on the last CPSR-defining instruction.
; rdar://8928208
diff --git a/test/CodeGen/ARM/su-addsub-overflow.ll b/test/CodeGen/ARM/su-addsub-overflow.ll
new file mode 100644
index 000000000000..eef531282033
--- /dev/null
+++ b/test/CodeGen/ARM/su-addsub-overflow.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=generic | FileCheck %s
+
+define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: sadd:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: add r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R1]], r[[R0]]
+; CHECK-NEXT: movvc pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: uadd:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: adds r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R1]], r[[R0]]
+; CHECK-NEXT: movhs pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: ssub:
+; CHECK: cmp r0, r1
+; CHECK-NEXT: subvc r0, r0, r1
+; CHECK-NEXT: movvc pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: usub:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: subs r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R0]], r1
+; CHECK-NEXT: movhs pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define void @sum(i32* %a, i32* %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: sum:
+; CHECK: ldr [[R0:r[0-9]+]],
+; CHECK-NEXT: ldr [[R1:r[0-9]+|lr]],
+; CHECK-NEXT: add [[R2:r[0-9]+]], [[R1]], [[R0]]
+; CHECK-NEXT: cmp [[R2]], [[R1]]
+; CHECK-NEXT: strvc [[R2]],
+; CHECK-NEXT: addvc
+; CHECK-NEXT: cmpvc
+; CHECK-NEXT: bvs
+entry:
+ %cmp7 = icmp eq i32 %n, 0
+ br i1 %cmp7, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.08 = phi i32 [ %7, %cont2 ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.08
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.08
+ %1 = load i32, i32* %arrayidx1, align 4
+ %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %1, i32 %0)
+ %3 = extractvalue { i32, i1 } %2, 1
+ br i1 %3, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %4 = extractvalue { i32, i1 } %2, 0
+ store i32 %4, i32* %arrayidx1, align 4
+ %5 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.08, i32 1)
+ %6 = extractvalue { i32, i1 } %5, 1
+ br i1 %6, label %trap, label %cont2
+
+cont2:
+ %7 = extractvalue { i32, i1 } %5, 0
+ %cmp = icmp eq i32 %7, %n
+ br i1 %cmp, label %for.cond.cleanup, label %for.body
+
+}
+
+declare void @llvm.trap() #2
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
diff --git a/test/CodeGen/ARM/usat.ll b/test/CodeGen/ARM/usat.ll
new file mode 100644
index 000000000000..8f19d11ef7bb
--- /dev/null
+++ b/test/CodeGen/ARM/usat.ll
@@ -0,0 +1,214 @@
+; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T
+; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6
+; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2
+
+; Check for several conditions that should result in USAT.
+; For example, the base test is equivalent to
+; x < 0 ? 0 : (x > k ? k : x) in C. All patterns that bound x
+; to the interval [0, k] where k + 1 is a power of 2 can be
+; transformed into USAT. At the end there are some tests
+; checking that conditionals are not transformed if they don't
+; match the right pattern.
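+;
+; As a minimal illustrative sketch (the function and names below are
+; hypothetical and not part of this test), the 32-bit base pattern
+; corresponds to C along the lines of:
+;
+;   int clamp23(int x) {
+;     const int k = 8388607;               /* k + 1 = 8388608 = 2^23 */
+;     return x < 0 ? 0 : (x > k ? k : x);  /* expected to become: usat r0, #23, r0 */
+;   }
+;
+; Since k + 1 is 2^23, the saturation width is 23 bits, which is why the
+; V6/V6T2 checks below expect "usat r0, #23, r0".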
+
+;
+; Base tests with different bit widths
+;
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 32-bit base test
+define i32 @unsigned_sat_base_32bit(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_32bit:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i32 %x, 0
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp
+ ret i32 %saturateLow
+}
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 16-bit base test
+define i16 @unsigned_sat_base_16bit(i16 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_16bit:
+; V6: usat r0, #11, r0
+; V6T2: usat r0, #11, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i16 %x, 0
+ %cmpUp = icmp sgt i16 %x, 2047
+ %saturateUp = select i1 %cmpUp, i16 2047, i16 %x
+ %saturateLow = select i1 %cmpLow, i16 0, i16 %saturateUp
+ ret i16 %saturateLow
+}
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 8-bit base test
+define i8 @unsigned_sat_base_8bit(i8 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_8bit:
+; V6: usat r0, #5, r0
+; V6T2: usat r0, #5, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i8 %x, 0
+ %cmpUp = icmp sgt i8 %x, 31
+ %saturateUp = select i1 %cmpUp, i8 31, i8 %x
+ %saturateLow = select i1 %cmpLow, i8 0, i8 %saturateUp
+ ret i8 %saturateLow
+}
+
+;
+; Tests where the conditionals that check for upper and lower bounds,
+; or the < and > operators, are arranged in different ways. Only some
+; of the possible combinations that lead to USAT are tested.
+;
+; x < 0 ? 0 : (x < k ? x : k)
+define i32 @unsigned_sat_lower_upper_1(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_lower_upper_1:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i32 %x, 0
+ %cmpUp = icmp slt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp
+ ret i32 %saturateLow
+}
+
+; x > 0 ? (x > k ? k : x) : 0
+define i32 @unsigned_sat_lower_upper_2(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_lower_upper_2:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp sgt i32 %x, 0
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+ %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 0
+ ret i32 %saturateLow
+}
+
+; x < k ? (x < 0 ? 0 : x) : k
+define i32 @unsigned_sat_upper_lower_1(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_1:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpUp = icmp slt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607
+ ret i32 %saturateUp
+}
+
+; x > k ? k : (x < 0 ? 0 : x)
+define i32 @unsigned_sat_upper_lower_2(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_2:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; k < x ? k : (x > 0 ? x : 0)
+define i32 @unsigned_sat_upper_lower_3(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_3:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpUp = icmp slt i32 8388607, %x
+ %cmpLow = icmp sgt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 %x, i32 0
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+;
+; The following tests check for patterns that should not transform
+; into USAT but are similar enough that they could confuse the selector.
+;
+; x > k ? k : (x > 0 ? 0 : x)
+; First condition upper-saturates, second doesn't lower-saturate.
+define i32 @no_unsigned_sat_missing_lower(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_missing_lower:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp sgt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; x < k ? k : (x < 0 ? 0 : x)
+; Second condition lower-saturates, first doesn't upper-saturate.
+define i32 @no_unsigned_sat_missing_upper(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_missing_upper:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp slt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; Lower constant is different in the select and in the compare
+define i32 @no_unsigned_sat_incorrect_constant(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_constant:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 -1, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; The interval is not [0, k]
+define i32 @no_unsigned_sat_incorrect_interval(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_interval:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -4
+ %saturateLow = select i1 %cmpLow, i32 -4, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; The returned value (y) is not the same as the tested value (x).
+define i32 @no_unsigned_sat_incorrect_return(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_return:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %y
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; One of the values in a compare (y) is not the same as the rest
+; of the compare and select values (x).
+define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_compare:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %y, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
diff --git a/test/CodeGen/BPF/objdump_imm_hex.ll b/test/CodeGen/BPF/objdump_imm_hex.ll
new file mode 100644
index 000000000000..a245a6c791f2
--- /dev/null
+++ b/test/CodeGen/BPF/objdump_imm_hex.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d - | FileCheck --check-prefix=CHECK-DEC %s
+; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d -print-imm-hex - | FileCheck --check-prefix=CHECK-HEX %s
+
+; Source Code:
+; int gbl;
+; int test(unsigned long long a, unsigned long long b) {
+; int ret = 0;
+; if (a == 0xABCDABCDabcdabcdULL) {
+; gbl = gbl * gbl * 2;
+; ret = 1;
+; goto out;
+; }
+; if (b == 0xABCDabcdabcdULL) {
+; gbl = gbl * 4;
+; ret = 2;
+; }
+; out:
+; return ret;
+; }
+
+@gbl = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind
+define i32 @test(i64, i64) local_unnamed_addr #0 {
+; CHECK-LABEL: test
+ %3 = icmp eq i64 %0, -6067004223159161907
+ br i1 %3, label %4, label %8
+; CHECK-DEC: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -6067004223159161907 ll
+; CHECK-DEC: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +7
+; CHECK-HEX: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -0x5432543254325433 ll
+; CHECK-HEX: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +0x7
+
+; <label>:4: ; preds = %2
+ %5 = load i32, i32* @gbl, align 4
+ %6 = shl i32 %5, 1
+; CHECK-DEC: 67 01 00 00 01 00 00 00 r1 <<= 1
+; CHECK-HEX: 67 01 00 00 01 00 00 00 r1 <<= 0x1
+ %7 = mul i32 %6, %5
+ br label %13
+
+; <label>:8: ; preds = %2
+ %9 = icmp eq i64 %1, 188899839028173
+; CHECK-DEC: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00 r1 = 188899839028173 ll
+; CHECK-HEX: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00 r1 = 0xabcdabcdabcd ll
+ br i1 %9, label %10, label %16
+
+; <label>:10: ; preds = %8
+ %11 = load i32, i32* @gbl, align 4
+ %12 = shl nsw i32 %11, 2
+ br label %13
+
+; <label>:13: ; preds = %4, %10
+ %14 = phi i32 [ %12, %10 ], [ %7, %4 ]
+ %15 = phi i32 [ 2, %10 ], [ 1, %4 ]
+ store i32 %14, i32* @gbl, align 4
+; CHECK-DEC: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0) = r1
+; CHECK-HEX: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0x0) = r1
+ br label %16
+
+; <label>:16: ; preds = %13, %8
+ %17 = phi i32 [ 0, %8 ], [ %15, %13 ]
+ ret i32 %17
+}
+
+attributes #0 = { norecurse nounwind }
diff --git a/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
new file mode 100644
index 000000000000..f96dbf2af496
--- /dev/null
+++ b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this doesn't crash.
+; CHECK: sfcmp
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define void @fred() #0 {
+b0:
+ %v1 = load <16 x float>, <16 x float>* null, align 8
+ %v2 = fcmp olt <16 x float> undef, %v1
+ %v3 = select <16 x i1> %v2, <16 x i16> undef, <16 x i16> zeroinitializer
+ %v4 = sext <16 x i16> %v3 to <16 x i32>
+ store <16 x i32> %v4, <16 x i32>* undef, align 64
+ unreachable
+}
+
+attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b,+hvxv60" }
diff --git a/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll
new file mode 100644
index 000000000000..4cbd00837fc6
--- /dev/null
+++ b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this testcase doesn't crash.
+; CHECK: sfcmp
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define void @fred() #0 {
+b0:
+ %v1 = fcmp olt <16 x float> zeroinitializer, undef
+ %v2 = select <16 x i1> %v1, <16 x i16> undef, <16 x i16> zeroinitializer
+ %v3 = sext <16 x i16> %v2 to <16 x i32>
+ store <16 x i32> %v3, <16 x i32>* undef, align 128
+ unreachable
+}
+
+attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b" }
diff --git a/test/CodeGen/Hexagon/autohvx/isel-select-const.ll b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll
new file mode 100644
index 000000000000..c251292c9da4
--- /dev/null
+++ b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this doesn't crash.
+; CHECK: vlut32
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown--elf"
+
+define void @fred() #0 {
+b0:
+ %v1 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> undef, i32 3)
+ %v2 = bitcast <16 x i32> %v1 to <64 x i8>
+ %v3 = shufflevector <64 x i8> %v2, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v4 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %v3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %v5 = bitcast <64 x i8> %v4 to <16 x i32>
+ %v6 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v5)
+ store <16 x i32> %v6, <16 x i32>* undef, align 1
+ %v7 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> zeroinitializer, i32 3)
+ %v8 = bitcast <16 x i32> %v7 to <64 x i8>
+ %v9 = shufflevector <64 x i8> %v8, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v10 = shufflevector <32 x i8> %v9, <32 x i8> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %v11 = bitcast <64 x i8> %v10 to <16 x i32>
+ %v12 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v11)
+ store <16 x i32> %v12, <16 x i32>* undef, align 1
+ unreachable
+}
+
+declare <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 88eaec938fd3..5ac0f59bd2d1 100644
--- a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -12,7 +12,7 @@
; CHECK-LABEL: fred:
; CHECK: v[[REG:[0-9]+]] = vsplat
-; CHECK: vmem(r29+#6) = v[[REG]]
+; CHECK: vmem(r29+#{{[0-9]+}}) = v[[REG]]
target triple = "hexagon"
diff --git a/test/CodeGen/Hexagon/v60-cur.ll b/test/CodeGen/Hexagon/v60-cur.ll
index 26d40c9a6975..d0ffe1d8fdd8 100644
--- a/test/CodeGen/Hexagon/v60-cur.ll
+++ b/test/CodeGen/Hexagon/v60-cur.ll
@@ -1,9 +1,8 @@
-; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
; Test that we generate a .cur
-; CHECK: v{{[0-9]*}}.cur{{ *}}
-; CHECK: v{{[0-9]*}}.cur{{ *}}
+; CHECK: v{{[0-9]*}}.cur
define void @conv3x3_i(i8* noalias nocapture readonly %iptr0, i32 %shift, i32 %width) #0 {
entry:
diff --git a/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll
new file mode 100644
index 000000000000..af2a55ea47d5
--- /dev/null
+++ b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon -debug-only=isel < %s 2>/dev/null
+; REQUIRES: asserts
+
+; Make sure that this doesn't crash. The debug option used to trigger a failing
+; assertion about a type mismatch in the formal arguments.
+; CHECK: vaddub
+
+define i1 @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind {
+entry:
+ %0 = add <4 x i8> %a, %b
+ %1 = bitcast <4 x i8> %0 to <32 x i1>
+ %2 = extractelement <32 x i1> %1, i32 0
+ ret i1 %2
+}
diff --git a/test/CodeGen/Hexagon/vect/vect-infloop.ll b/test/CodeGen/Hexagon/vect/vect-infloop.ll
index 4de390159fdd..9ee0b0ab3aa6 100644
--- a/test/CodeGen/Hexagon/vect/vect-infloop.ll
+++ b/test/CodeGen/Hexagon/vect/vect-infloop.ll
@@ -1,10 +1,10 @@
; Extracted from test/CodeGen/Generic/vector-casts.ll: used to loop indefinitely.
; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: combine
+; CHECK: convert_df2w
define void @a(<2 x double>* %p, <2 x i8>* %q) {
- %t = load <2 x double>, <2 x double>* %p
- %r = fptosi <2 x double> %t to <2 x i8>
- store <2 x i8> %r, <2 x i8>* %q
- ret void
+ %t = load <2 x double>, <2 x double>* %p
+ %r = fptosi <2 x double> %t to <2 x i8>
+ store <2 x i8> %r, <2 x i8>* %q
+ ret void
}
diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll
index f7b8ea5f9e15..4f926cbee0b2 100644
--- a/test/CodeGen/Mips/llvm-ir/extractelement.ll
+++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll
@@ -15,5 +15,5 @@ define i1 @via_stack_bug(i8 signext %idx) {
; ALL-DAG: sh [[ONE]], 6($sp)
; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1
; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6
-; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]]
+; ALL-DAG: or [[EPTR:\$[0-9]+]], [[VPTR]], [[MASKED_IDX]]
; ALL: lbu $2, 0([[EPTR]])
diff --git a/test/CodeGen/Mips/long-call-mcount.ll b/test/CodeGen/Mips/long-call-mcount.ll
new file mode 100644
index 000000000000..70a4410d060b
--- /dev/null
+++ b/test/CodeGen/Mips/long-call-mcount.ll
@@ -0,0 +1,19 @@
+; Check the call to _mcount when the long-calls feature is enabled or disabled.
+; RUN: llc -march=mips -target-abi o32 --mattr=+long-calls,+noabicalls < %s \
+; RUN: | FileCheck -check-prefixes=CHECK,LONG %s
+; RUN: llc -march=mips -target-abi o32 --mattr=-long-calls,+noabicalls < %s \
+; RUN: | FileCheck -check-prefixes=CHECK,SHORT %s
+
+; Function Attrs: noinline nounwind optnone
+define void @foo() #0 {
+entry:
+ ret void
+
+; CHECK-LABEL: foo
+; LONG: lui $1, %hi(_mcount)
+; LONG-NEXT: addiu $25, $1, %lo(_mcount)
+; LONG-NEXT: jalr $25
+; SHORT: jal _mcount
+}
+
+attributes #0 = { "instrument-function-entry-inlined"="_mcount" }
diff --git a/test/CodeGen/Mips/sll-micromips-r6-encoding.mir b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir
new file mode 100644
index 000000000000..85ce251ac315
--- /dev/null
+++ b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir
@@ -0,0 +1,46 @@
+# RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips %s -start-after=xray-instrumentation -o - -show-mc-encoding | FileCheck %s
+
+# Test that 'sll $zero, $zero, 0' (the canonical nop) is correctly recognized
+# as a real instruction rather than as an unimplemented opcode when the
+# instruction is encoded.
+
+# CHECK-LABEL: a:
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jrc $ra # encoding: [0x45,0xbf]
+---
+name: a
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+registers:
+liveins:
+ - { reg: '%a0', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0.entry:
+ renamable %zero = SLL_MMR6 killed renamable %zero, 0
+ JRC16_MM undef %ra, implicit %v0
+
+...
diff --git a/test/CodeGen/PowerPC/cmp_elimination.ll b/test/CodeGen/PowerPC/cmp_elimination.ll
index 3251ae2881b9..6bc8b8a041c2 100644
--- a/test/CodeGen/PowerPC/cmp_elimination.ll
+++ b/test/CodeGen/PowerPC/cmp_elimination.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
@@ -748,6 +747,37 @@ do.end:
ret void
}
+define void @func29(i32 signext %a) {
+; We cannot merge the two compares because their sign-extension behaviors differ.
+; Equivalent C code example:
+; int a = .. ;
+; if (a == -1) dummy1();
+; if (a == (uint16_t)-1) dummy2();
+
+; CHECK-LABEL: @func29
+; CHECK: cmp
+; CHECK: cmp
+; CHECK: blr
+entry:
+ %cmp = icmp eq i32 %a, -1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ tail call void @dummy1()
+ br label %if.end3
+
+if.else:
+ %cmp1 = icmp eq i32 %a, 65535
+ br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:
+ tail call void @dummy2()
+ br label %if.end3
+
+if.end3:
+ ret void
+}
+
declare void @dummy1()
declare void @dummy2()
declare void @dummy3()
diff --git a/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll
new file mode 100644
index 000000000000..ad8dd90ea920
--- /dev/null
+++ b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; Ensure we don't crash by trying to convert directly from a subword load
+; to a ppc_fp128 as we do for conversions to f32/f64.
+define ppc_fp128 @test(i16* nocapture readonly %Ptr) {
+entry:
+ %0 = load i16, i16* %Ptr, align 2
+ %conv = uitofp i16 %0 to ppc_fp128
+ ret ppc_fp128 %conv
+; CHECK: lhz [[LD:[0-9]+]], 0(3)
+; CHECK: mtvsrwa [[MV:[0-9]+]], [[LD]]
+; CHECK: xscvsxddp [[CONV:[0-9]+]], [[MV]]
+; CHECK: bl __gcc_qadd
+}
diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index 82c6c318abdc..247961e85b12 100644
--- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -25,7 +25,7 @@ entry:
; CHECK: extsw 3, [[RSHREG]]
; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
-; CHECK-P7: lwax 3, [[ELEMOFFREG]],
+; CHECK-P7: lwax 3, 3, [[ELEMOFFREG]]
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2
; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 2
; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
@@ -54,7 +54,7 @@ entry:
; CHECK: mfvsrd 3,
; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28
; CHECK-P7-DAG: stxvd2x 34,
-; CHECK-P7: ldx 3, [[ELEMOFFREG]],
+; CHECK-P7: ldx 3, 3, [[ELEMOFFREG]]
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 1
; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3
; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
@@ -77,7 +77,7 @@ entry:
; CHECK: xscvspdpn 1,
; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
-; CHECK-P7: lfsx 1, [[ELEMOFFREG]],
+; CHECK-P7: lfsx 1, 3, [[ELEMOFFREG]]
; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2
; CHECK-BE: lvsl [[SHMSKREG:[0-9]+]], 0, [[ELNOREG]]
; CHECK-BE: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]]
diff --git a/test/CodeGen/Thumb2/t2sizereduction.mir b/test/CodeGen/Thumb2/t2sizereduction.mir
new file mode 100644
index 000000000000..377c0ccc7b0a
--- /dev/null
+++ b/test/CodeGen/Thumb2/t2sizereduction.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv8m.main-arm-none-eabi"
+
+ ; Function Attrs: norecurse nounwind readnone
+ define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
+ entry:
+ %cmp6 = icmp sgt i32 %y, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
+ ret i32 %sum.0.lcssa
+
+ for.body: ; preds = %for.body, %for.body.preheader
+ %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
+ %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
+ %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+ %mul = mul nsw i32 %lsr.iv1, %sum.07
+ %lsr.iv.next = add i32 %lsr.iv, -1
+ %lsr.iv.next2 = add i32 %lsr.iv1, 1
+ %exitcond = icmp eq i32 %lsr.iv.next, 0
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+ }
+
+ attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+fp-only-sp,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name: test
+tracksRegLiveness: true
+liveins:
+ - { reg: '%r0', virtual-reg: '' }
+ - { reg: '%r1', virtual-reg: '' }
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: %r0, %r1
+ ; CHECK: %r2 = tMOVr %r0, 14, %noreg
+ ; CHECK: %r0, dead %cpsr = tMOVi8 1, 14, %noreg
+ ; CHECK: tCMPi8 %r1, 1, 14, %noreg, implicit-def %cpsr
+ ; CHECK: t2Bcc %bb.2, 11, killed %cpsr
+ ; CHECK: bb.1.for.body:
+ ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK: liveins: %r0, %r1, %r2
+ ; CHECK: %r0, dead %cpsr = tMUL %r2, killed %r0, 14, %noreg
+ ; CHECK: %r2, dead %cpsr = tADDi8 killed %r2, 1, 14, %noreg
+ ; CHECK: %r1, %cpsr = tSUBi8 killed %r1, 1, 14, %noreg
+ ; CHECK: t2Bcc %bb.1, 1, killed %cpsr
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: liveins: %r0
+ ; CHECK: tBX_RET 14, %noreg, implicit %r0
+ bb.0.entry:
+ successors: %bb.1.for.body, %bb.2.for.cond.cleanup
+ liveins: %r0, %r1
+
+ %r2 = tMOVr %r0, 14, _
+ %r0 = t2MOVi 1, 14, _, _
+ t2CMPri %r1, 1, 14, _, implicit-def %cpsr
+ t2Bcc %bb.2.for.cond.cleanup, 11, killed %cpsr
+
+ bb.1.for.body:
+ successors: %bb.2.for.cond.cleanup, %bb.1.for.body
+ liveins: %r0, %r1, %r2
+
+ %r0 = t2MUL %r2, killed %r0, 14, _
+ %r2 = t2ADDri killed %r2, 1, 14, _, _
+ %r1 = t2SUBri killed %r1, 1, 14, _, def %cpsr
+ t2Bcc %bb.1.for.body, 1, killed %cpsr
+
+ bb.2.for.cond.cleanup:
+ liveins: %r0
+
+ tBX_RET 14, _, implicit %r0
+
+...
diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll
index 578d7aa75287..d32b0e70791a 100644
--- a/test/CodeGen/X86/avg-mask.ll
+++ b/test/CodeGen/X86/avg-mask.ll
@@ -143,16 +143,8 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
@@ -201,16 +193,8 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index d1e26b787f48..dd11f6ca2935 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -90,150 +90,23 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm6, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm11, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm10, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm9
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm5
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm6
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm7
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm9, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: pavgb (%rdi), %xmm1
+; SSE2-NEXT: pavgb 16(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -265,444 +138,467 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
ret void
}
-define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
-; SSE2-LABEL: avg_v64i8:
+define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
+; SSE2-LABEL: avg_v48i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm6
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: movdqa 32(%rdi), %xmm1
-; SSE2-NEXT: movdqa 48(%rdi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm6
+; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rsi), %xmm12
; SSE2-NEXT: movdqa 16(%rsi), %xmm13
-; SSE2-NEXT: movdqa 32(%rsi), %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm14, %xmm12
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm15, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm13, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm8, %xmm15
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm13
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm7, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm2, %xmm14
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa 48(%rsi), %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa 32(%rsi), %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: movdqa %xmm12, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm1, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: psubd %xmm0, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm12
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm15
-; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm14
-; SSE2-NEXT: psubd %xmm0, %xmm11
-; SSE2-NEXT: psubd %xmm0, %xmm8
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: packuswb %xmm1, %xmm10
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm12, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm10, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm13, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm15, %xmm10
+; SSE2-NEXT: movdqa %xmm2, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm14, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm6, %xmm13
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm15, %xmm14
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm2, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm11, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: psubd %xmm5, %xmm8
+; SSE2-NEXT: psubd %xmm5, %xmm3
+; SSE2-NEXT: psubd %xmm5, %xmm9
+; SSE2-NEXT: psubd %xmm5, %xmm12
+; SSE2-NEXT: psubd %xmm5, %xmm10
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: psubd %xmm5, %xmm1
+; SSE2-NEXT: psubd %xmm5, %xmm13
+; SSE2-NEXT: psubd %xmm5, %xmm14
+; SSE2-NEXT: psubd %xmm5, %xmm6
+; SSE2-NEXT: psubd %xmm5, %xmm2
+; SSE2-NEXT: psubd %xmm5, %xmm0
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm12, %xmm4
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: pand %xmm7, %xmm9
+; SSE2-NEXT: pand %xmm7, %xmm12
+; SSE2-NEXT: packuswb %xmm9, %xmm12
+; SSE2-NEXT: packuswb %xmm3, %xmm12
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm10
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: packuswb %xmm10, %xmm4
; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: psrld $1, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: packuswb %xmm15, %xmm13
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm13
+; SSE2-NEXT: packuswb %xmm1, %xmm13
; SSE2-NEXT: packuswb %xmm4, %xmm13
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm9, %xmm6
-; SSE2-NEXT: psrld $1, %xmm11
; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: packuswb %xmm14, %xmm11
-; SSE2-NEXT: packuswb %xmm6, %xmm11
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: packuswb %xmm8, %xmm3
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: packuswb %xmm3, %xmm7
-; SSE2-NEXT: movdqu %xmm7, (%rax)
-; SSE2-NEXT: movdqu %xmm11, (%rax)
+; SSE2-NEXT: pand %xmm7, %xmm14
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: packuswb %xmm14, %xmm6
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm13, (%rax)
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm12, (%rax)
; SSE2-NEXT: retq
;
-; AVX1-LABEL: avg_v64i8:
+; AVX1-LABEL: avg_v48i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $24, %rsp
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm13
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm12
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm11
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm10
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm8
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm9
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm14
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm15
-; AVX1-NEXT: vmovdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm12
-; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm11
-; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10
-; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm9
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa (%rdi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm14, %xmm14
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm5, %xmm14, %xmm14
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm14, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm6
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm13, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm12, %xmm7
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vmovdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm15
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm14
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm10
+; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm9
+; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm11
+; AVX1-NEXT: vpsubd %xmm7, %xmm12, %xmm12
+; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm7, %xmm13, %xmm4
+; AVX1-NEXT: vpsubd %xmm7, %xmm15, %xmm5
+; AVX1-NEXT: vpsubd %xmm7, %xmm14, %xmm1
+; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm14
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm15
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm13
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm12, %xmm12
+; AVX1-NEXT: vpsrld $1, %xmm11, %xmm11
+; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm4
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3
; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovups %ymm1, (%rax)
-; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: avg_v64i8:
+; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
-; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9
-; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10
-; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1
-; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: vmovdqa (%rsi), %ymm3
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm9, %ymm5, %ymm5
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm9, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm7, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
+; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
+; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7
-; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: avg_v48i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512F-NEXT: vpavgb %xmm5, %xmm4, %xmm4
+; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v48i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512BW-NEXT: vpaddd %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512BW-NEXT: vpaddd %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrld $1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpsrld $1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %1 = load <48 x i8>, <48 x i8>* %a
+ %2 = load <48 x i8>, <48 x i8>* %b
+ %3 = zext <48 x i8> %1 to <48 x i32>
+ %4 = zext <48 x i8> %2 to <48 x i32>
+ %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <48 x i32> %5, %4
+ %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <48 x i32> %7 to <48 x i8>
+ store <48 x i8> %8, <48 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
+; SSE2-LABEL: avg_v64i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: movdqa 16(%rsi), %xmm2
+; SSE2-NEXT: movdqa 48(%rsi), %xmm3
+; SSE2-NEXT: pavgb (%rdi), %xmm1
+; SSE2-NEXT: pavgb 16(%rdi), %xmm2
+; SSE2-NEXT: pavgb 32(%rsi), %xmm0
+; SSE2-NEXT: pavgb 48(%rdi), %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: avg_v64i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v64i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm1
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -782,81 +678,23 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm3, %xmm0
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: pavgw (%rdi), %xmm1
+; SSE2-NEXT: pavgw 16(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -891,207 +729,60 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
-; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm9
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: movdqa 32(%rsi), %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: pslld $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm9
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: packssdw %xmm7, %xmm9
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm4, %xmm3
+; SSE2-NEXT: pavgw (%rdi), %xmm1
+; SSE2-NEXT: pavgw 16(%rdi), %xmm2
+; SSE2-NEXT: pavgw 32(%rsi), %xmm0
+; SSE2-NEXT: pavgw 48(%rdi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm9, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1199,150 +890,23 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm6, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm11, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm10, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm9
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm5
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm6
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm7
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm9, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1377,249 +941,31 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rsi), %xmm14
-; SSE2-NEXT: movdqa 16(%rsi), %xmm12
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
-; SSE2-NEXT: movdqa 48(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm1, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm4, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm3, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm2
-; SSE2-NEXT: paddd %xmm10, %xmm10
-; SSE2-NEXT: paddd %xmm5, %xmm5
-; SSE2-NEXT: paddd %xmm11, %xmm11
-; SSE2-NEXT: paddd %xmm12, %xmm12
-; SSE2-NEXT: paddd %xmm9, %xmm9
-; SSE2-NEXT: paddd %xmm6, %xmm6
-; SSE2-NEXT: paddd %xmm13, %xmm13
-; SSE2-NEXT: paddd %xmm14, %xmm14
-; SSE2-NEXT: paddd %xmm8, %xmm8
-; SSE2-NEXT: paddd %xmm7, %xmm7
-; SSE2-NEXT: paddd %xmm15, %xmm15
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm15
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psubd %xmm0, %xmm8
-; SSE2-NEXT: psubd %xmm0, %xmm14
-; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm12
-; SSE2-NEXT: psubd %xmm0, %xmm11
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm10
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm15
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm15, %xmm7
-; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: packuswb %xmm8, %xmm14
-; SSE2-NEXT: packuswb %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm13, %xmm6
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm9, %xmm12
-; SSE2-NEXT: packuswb %xmm6, %xmm12
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: packuswb %xmm11, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm10, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm5, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqa 48(%rsi), %xmm3
+; SSE2-NEXT: pavgb %xmm0, %xmm0
+; SSE2-NEXT: pavgb %xmm1, %xmm1
+; SSE2-NEXT: pavgb %xmm2, %xmm2
+; SSE2-NEXT: pavgb %xmm3, %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm12, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpaddd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm6
-; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
-; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm2, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vpaddd %xmm15, %xmm15, %xmm15
-; AVX1-NEXT: vpaddd %xmm14, %xmm14, %xmm14
-; AVX1-NEXT: vpaddd %xmm13, %xmm13, %xmm13
-; AVX1-NEXT: vpaddd %xmm12, %xmm12, %xmm12
-; AVX1-NEXT: vpaddd %xmm11, %xmm11, %xmm11
-; AVX1-NEXT: vpaddd %xmm10, %xmm10, %xmm10
-; AVX1-NEXT: vpaddd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10
-; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm9
-; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm11
-; AVX1-NEXT: vpsubd %xmm0, %xmm14, %xmm13
-; AVX1-NEXT: vpsubd %xmm0, %xmm15, %xmm12
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm15
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm14
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm5
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm6
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm8, %xmm6, %xmm8
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm10, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm12, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm13, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vpsrld $1, %xmm15, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
@@ -1628,71 +974,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
-; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9
-; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10
-; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1
-; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12
-; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7
-; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
@@ -1700,29 +985,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
;
; AVX512F-LABEL: avg_v64i8_2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm1
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
@@ -1805,81 +1071,23 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm3, %xmm0
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
+; SSE2-NEXT: pavgw (%rsi), %xmm0
+; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1914,187 +1122,47 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
-; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm9
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: movdqa 32(%rsi), %xmm2
-; SSE2-NEXT: movdqa 48(%rsi), %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: pslld $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm9
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: packssdw %xmm7, %xmm9
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm4, %xmm3
-; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 48(%rdi), %xmm2
+; SSE2-NEXT: movdqa 32(%rsi), %xmm3
+; SSE2-NEXT: pavgw (%rsi), %xmm0
+; SSE2-NEXT: pavgw 16(%rsi), %xmm1
+; SSE2-NEXT: pavgw 32(%rdi), %xmm3
+; SSE2-NEXT: pavgw 48(%rsi), %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm9, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
@@ -2102,19 +1170,12 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
;
; AVX512F-LABEL: avg_v32i16_2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2209,89 +1270,21 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa 16(%rdi), %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm9, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm4, %xmm8
-; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: pavgb 16(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
-; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -2324,211 +1317,33 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v64i8_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm6
-; SSE2-NEXT: movdqa 32(%rdi), %xmm15
-; SSE2-NEXT: movdqa 48(%rdi), %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm12
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm15
-; SSE2-NEXT: paddd %xmm0, %xmm14
-; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: paddd %xmm0, %xmm13
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm6, %xmm4
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: packuswb %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm4, %xmm8
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: psrld $1, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm15, %xmm12
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: packuswb %xmm14, %xmm13
-; SSE2-NEXT: packuswb %xmm12, %xmm13
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm11, %xmm2
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: packuswb %xmm3, %xmm10
-; SSE2-NEXT: packuswb %xmm2, %xmm10
-; SSE2-NEXT: movdqu %xmm10, (%rax)
-; SSE2-NEXT: movdqu %xmm13, (%rax)
-; SSE2-NEXT: movdqu %xmm8, (%rax)
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: pavgb %xmm0, %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm3
+; SSE2-NEXT: pavgb %xmm0, %xmm3
+; SSE2-NEXT: pavgb 48(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm15
-; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm7
-; AVX1-NEXT: vpaddd %xmm0, %xmm11, %xmm11
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm9
-; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm2, %xmm12, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm10, %xmm10
-; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm8
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm6
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm12 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm14
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm10, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm15, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm13, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm7, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm11, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm12, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -2536,82 +1351,21 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
;
; AVX2-LABEL: avg_v64i8_const:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,2,3,4,5,6,7,8]
-; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3
-; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_const:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
-; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
+; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
-; AVX512F-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2680,57 +1434,21 @@ define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v16i16_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: pavgw 16(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -2763,148 +1481,57 @@ define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v32i16_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm7
-; SSE2-NEXT: movdqa 16(%rdi), %xmm6
-; SSE2-NEXT: movdqa 32(%rdi), %xmm4
-; SSE2-NEXT: movdqa 48(%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm8, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: paddd %xmm9, %xmm3
-; SSE2-NEXT: paddd %xmm8, %xmm4
-; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: pslld $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: packssdw %xmm7, %xmm5
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm6, %xmm3
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm4, %xmm2
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: pavgw %xmm0, %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm3
+; SSE2-NEXT: pavgw %xmm0, %xmm3
+; SSE2-NEXT: pavgw 48(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm3, (%rax)
-; SSE2-NEXT: movdqu %xmm5, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm4
-; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
-; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_const:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,3,4,5,6,7,8]
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vmovdqu %ymm2, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_const:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
-; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
+; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2946,77 +1573,16 @@ define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
; SSE2-LABEL: avg_v32i8_3:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; SSE2-NEXT: paddw %xmm6, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT: paddw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
-; SSE2-NEXT: paddw %xmm7, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddw %xmm3, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: psubw %xmm3, %xmm4
-; SSE2-NEXT: psubw %xmm3, %xmm0
-; SSE2-NEXT: psubw %xmm3, %xmm2
-; SSE2-NEXT: psubw %xmm3, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: pavgb %xmm2, %xmm0
+; SSE2-NEXT: pavgb %xmm3, %xmm1
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpaddw %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsubw %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3041,203 +1607,36 @@ define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
; SSE2-LABEL: avg_v64i8_3:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm12
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm10, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm11, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm12, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm13, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm7, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2-NEXT: psubw %xmm7, %xmm8
-; SSE2-NEXT: psubw %xmm7, %xmm0
-; SSE2-NEXT: psubw %xmm7, %xmm4
-; SSE2-NEXT: psubw %xmm7, %xmm1
-; SSE2-NEXT: psubw %xmm7, %xmm5
-; SSE2-NEXT: psubw %xmm7, %xmm2
-; SSE2-NEXT: psubw %xmm7, %xmm6
-; SSE2-NEXT: psubw %xmm7, %xmm3
-; SSE2-NEXT: psrlw $1, %xmm3
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm7, %xmm8
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm7, %xmm5
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pavgb %xmm4, %xmm0
+; SSE2-NEXT: pavgb %xmm5, %xmm1
+; SSE2-NEXT: pavgb %xmm6, %xmm2
+; SSE2-NEXT: pavgb %xmm7, %xmm3
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm13
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm4, %xmm6, %xmm14
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm15
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm11, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpaddw %xmm7, %xmm9, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpaddw %xmm3, %xmm10, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpsubw %xmm5, %xmm12, %xmm8
-; AVX1-NEXT: vpsubw %xmm5, %xmm13, %xmm4
-; AVX1-NEXT: vpsubw %xmm5, %xmm14, %xmm0
-; AVX1-NEXT: vpsubw %xmm5, %xmm15, %xmm1
-; AVX1-NEXT: vpsubw %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm9
-; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm5
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm1
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_3:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
-; AVX2-NEXT: vpaddw %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT: vpaddw %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpsubw %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsubw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_3:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_3:
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 6e6d61f37d2e..248462d0de51 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -17,78 +17,40 @@ define <16 x i1> @test1() {
}
define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
-; KNL-LABEL: test2:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test2:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
-; SKX-NEXT: vpmovb2m %xmm1, %k0
-; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
-; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: kandw %k0, %k1, %k0
-; SKX-NEXT: vpmovm2b %k0, %xmm0
-; SKX-NEXT: retq
+; ALL_X64-LABEL: test2:
+; ALL_X64: ## %bb.0:
+; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL_X64-NEXT: vpsllw $7, %xmm0, %xmm0
+; ALL_X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; ALL_X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL_X64-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test2:
; KNL_X32: ## %bb.0:
-; KNL_X32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_X32-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL_X32-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
+; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsllw $7, %xmm0, %xmm0
+; KNL_X32-NEXT: vpand LCPI1_0, %xmm0, %xmm0
+; KNL_X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL_X32-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
ret <16 x i1> %c
}
define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
-; KNL-LABEL: test3:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
-; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
-; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test3:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
-; SKX-NEXT: vpmovw2m %xmm1, %k0
-; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
-; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: kandb %k0, %k1, %k0
-; SKX-NEXT: vpmovm2w %k0, %xmm0
-; SKX-NEXT: retq
+; ALL_X64-LABEL: test3:
+; ALL_X64: ## %bb.0:
+; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL_X64-NEXT: vpsllw $15, %xmm0, %xmm0
+; ALL_X64-NEXT: vpsraw $15, %xmm0, %xmm0
+; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test3:
; KNL_X32: ## %bb.0:
-; KNL_X32-NEXT: vpmovsxwq %xmm1, %zmm1
-; KNL_X32-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
-; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
ret <8 x i1> %c
@@ -102,11 +64,9 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
;
; SKX-LABEL: test4:
; SKX: ## %bb.0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k0 {%k1}
-; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: vpsrad $31, %xmm0, %xmm0
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test4:
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index 97beff63811a..8c7941591217 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -1366,21 +1366,12 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
}
define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
-; KNL-LABEL: trunc_4i32_to_4i1:
-; KNL: # %bb.0:
-; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_4i32_to_4i1:
-; SKX: # %bb.0:
-; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
-; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1}
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: trunc_4i32_to_4i1:
+; ALL: # %bb.0:
+; ALL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL-NEXT: vpslld $31, %xmm0, %xmm0
+; ALL-NEXT: vpsrad $31, %xmm0, %xmm0
+; ALL-NEXT: retq
%mask_a = trunc <4 x i32>%a to <4 x i1>
%mask_b = trunc <4 x i32>%b to <4 x i1>
%a_and_b = and <4 x i1>%mask_a, %mask_b
diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 34ea468aebee..e1ed8ea98a1c 100644
--- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -249,9 +249,9 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -261,10 +261,11 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -340,10 +341,10 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -353,11 +354,12 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -433,9 +435,9 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -445,10 +447,11 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -555,10 +558,10 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -568,11 +571,12 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1054,9 +1058,9 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1065,10 +1069,11 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1159,10 +1164,10 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1171,11 +1176,12 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1266,9 +1272,9 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1277,10 +1283,11 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1399,10 +1406,10 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1411,11 +1418,12 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 7e0b981b2c6a..7477e05f0c7f 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -793,11 +793,10 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k1
-; KNL-NEXT: movl {{.*}}(%rip), %ecx
-; KNL-NEXT: vpbroadcastd %ecx, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k1
-; KNL-NEXT: vpbroadcastd %ecx, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -1432,8 +1431,7 @@ define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $15, %edi
-; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: movb -24(%rsp,%rdi), %al
; CHECK-NEXT: retq
%t2 = extractelement <16 x i8> %t1, i32 %index
ret i8 %t2
@@ -1452,8 +1450,7 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $31, %edi
-; CHECK-NEXT: movq %rsp, %rax
-; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: movb (%rsp,%rdi), %al
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
@@ -1477,8 +1474,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $63, %edi
-; KNL-NEXT: movq %rsp, %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
+; KNL-NEXT: movb (%rsp,%rdi), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@@ -1496,8 +1492,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
+; SKX-NEXT: movb (%rsp,%rdi), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
@@ -1522,8 +1517,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: movzbl %dil, %eax
; KNL-NEXT: andl $63, %eax
-; KNL-NEXT: movq %rsp, %rcx
-; KNL-NEXT: movb (%rax,%rcx), %al
+; KNL-NEXT: movb (%rsp,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@@ -1542,8 +1536,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: movzbl %dil, %eax
; SKX-NEXT: andl $63, %eax
-; SKX-NEXT: movq %rsp, %rcx
-; SKX-NEXT: movb (%rax,%rcx), %al
+; SKX-NEXT: movb (%rsp,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
@@ -1617,45 +1610,28 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v8i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax
+; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v8i1:
; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; SKX-NEXT: vpmovm2q %k0, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <8 x i32> %a, %b
@@ -1667,43 +1643,26 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v16i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa32 %zmm0, (%rsp)
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax
+; KNL-NEXT: movzbl -24(%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v16i1:
; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; SKX-NEXT: vpmovm2d %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <16 x i32> %a, %b
@@ -1729,8 +1688,7 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
-; KNL-NEXT: movq %rsp, %rax
-; KNL-NEXT: movzbl (%rdi,%rax), %eax
+; KNL-NEXT: movzbl (%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
@@ -1744,14 +1702,14 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
-; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2b %k0, %ymm0
+; SKX-NEXT: vmovdqa %ymm0, (%rsp)
; SKX-NEXT: andl $31, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
@@ -1792,8 +1750,7 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: setne 32(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -1817,20 +1774,18 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
-; SKX-NEXT: xorl %eax, %eax
-; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: setne %al
-; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: andl $31, %esi
-; SKX-NEXT: movw %ax, (%rsp,%rsi,2)
-; SKX-NEXT: vpsllw $15, (%rsp), %zmm0
-; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vpmovm2b %k0, %ymm0
+; SKX-NEXT: vmovdqa %ymm0, (%rsp)
+; SKX-NEXT: setne (%rsp,%rsi)
+; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
@@ -1863,8 +1818,7 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: setne 64(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1905,13 +1859,12 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
+; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: andl $63, %esi
; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, %rax
@@ -2050,8 +2003,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; KNL-NEXT: setne (%rax,%rcx)
+; KNL-NEXT: setne 128(%rsp,%rax)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
@@ -2215,18 +2167,16 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
; SKX-NEXT: movl 744(%rbp), %eax
; SKX-NEXT: andl $127, %eax
; SKX-NEXT: cmpb $0, 736(%rbp)
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
-; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovm2b %k1, %zmm0
+; SKX-NEXT: vmovdqa32 %zmm0, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
-; SKX-NEXT: movq %rsp, %rcx
-; SKX-NEXT: setne (%rax,%rcx)
+; SKX-NEXT: setne (%rsp,%rax)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
@@ -2270,8 +2220,7 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: setne 128(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
@@ -2336,17 +2285,15 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; SKX-NEXT: subq $256, %rsp ## imm = 0x100
; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
; SKX-NEXT: andl $127, %esi
; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
-; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovm2b %k1, %zmm0
+; SKX-NEXT: vmovdqa32 %zmm0, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll
index e28e384ae996..9283fd32d746 100644
--- a/test/CodeGen/X86/avx512-insert-extract_i1.ll
+++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll
@@ -18,8 +18,7 @@ define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b,
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: movzbl (%rdi,%rax), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index dfe42d53483f..4877157d911d 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1783,20 +1783,19 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: vmovups (%rdi), %zmm2
; KNL-NEXT: vmovups 64(%rdi), %zmm3
; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; KNL-NEXT: movl {{.*}}(%rip), %eax
-; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k2
-; KNL-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm3, %xmm3
; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm5, %xmm5
; KNL-NEXT: vpor %xmm5, %xmm2, %xmm2
; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k1
-; KNL-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm4, %xmm4
; KNL-NEXT: vpor %xmm4, %xmm3, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
@@ -1886,20 +1885,19 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512DQ-NEXT: vmovups (%rdi), %zmm2
; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
+; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k2
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z}
+; AVX512DQ-NEXT: vpmovm2d %k2, %zmm3
; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3
; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
-; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5
; AVX512DQ-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4
; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 1b450b98a6d5..78111874b58a 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -4376,20 +4376,16 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
; GENERIC-LABEL: trunc_4i32_to_4i1:
; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: trunc_4i32_to_4i1:
; SKX: # %bb.0:
+; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask_a = trunc <4 x i32>%a to <4 x i1>
%mask_b = trunc <4 x i32>%b to <4 x i1>
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index df88f0fca456..0601c011e290 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1488,12 +1488,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x
define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1503,13 +1501,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1522,13 +1518,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1863,14 +1857,12 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,1,2,3,7,5,6,7]
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1884,14 +1876,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp,
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7]
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
+; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2298,13 +2288,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[3,1,2,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2315,13 +2304,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2332,12 +2320,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2348,13 +2335,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2405,13 +2391,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,0,3,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2422,13 +2407,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2585,12 +2569,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,1]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,1,1,5]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2602,14 +2585,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,1,1,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2669,12 +2651,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2686,14 +2667,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2739,11 +2719,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4
define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,2]
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
+; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2754,12 +2733,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,3]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,3,2]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2771,14 +2749,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -3307,14 +3284,13 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %v
define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = <0,4,u,u,6,1,4,4>
-; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0]
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,10,11,6,1,4,4]
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3325,14 +3301,13 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec
define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4>
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0]
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,10,11,6,1,4,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3775,12 +3750,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>*
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[2,3],ymm3[4,6],ymm2[6,7]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3795,12 +3769,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3815,12 +3788,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>*
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
-; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,3],ymm2[6,4],ymm3[6,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3835,12 +3807,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 1182bbf94ec5..6bee0de181ab 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -30,10 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpmovm2q %k0, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; CHECK-NEXT: vpmovq2m %zmm0, %k0
+; CHECK-NEXT: vpmovm2d %k0, %ymm0
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: vpmovd2m %ymm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index fc684e54b063..826a4538f3f1 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -192,15 +192,14 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
-; KNL-NEXT: movl {{.*}}(%rip), %eax
-; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 9863a0a7d283..5f4b050b863d 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -101,7 +101,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -147,7 +147,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -282,7 +282,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -333,7 +333,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1354,7 +1354,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1401,7 +1401,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1543,7 +1543,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1595,7 +1595,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4799,7 +4799,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4839,7 +4839,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4882,7 +4882,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4926,7 +4926,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4970,7 +4970,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5014,7 +5014,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5061,7 +5061,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5106,7 +5106,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5154,7 +5154,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5203,7 +5203,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5252,7 +5252,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5301,7 +5301,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9211,7 +9211,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9257,7 +9257,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9392,7 +9392,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9443,7 +9443,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10464,7 +10464,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10511,7 +10511,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10653,7 +10653,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10705,7 +10705,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13909,7 +13909,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13949,7 +13949,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13992,7 +13992,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14036,7 +14036,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14080,7 +14080,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14124,7 +14124,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14171,7 +14171,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14216,7 +14216,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14264,7 +14264,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14313,7 +14313,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14362,7 +14362,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14411,7 +14411,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18328,7 +18328,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18377,7 +18377,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18519,7 +18519,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18573,7 +18573,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19641,7 +19641,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19691,7 +19691,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19840,7 +19840,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19895,7 +19895,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23147,7 +23147,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23187,7 +23187,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23230,7 +23230,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23274,7 +23274,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23318,7 +23318,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23362,7 +23362,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23409,7 +23409,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23454,7 +23454,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23502,7 +23502,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23551,7 +23551,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23600,7 +23600,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23649,7 +23649,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27664,7 +27664,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27713,7 +27713,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27857,7 +27857,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27911,7 +27911,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -28989,7 +28989,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -29039,7 +29039,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -29190,7 +29190,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -29245,7 +29245,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32541,7 +32541,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32581,7 +32581,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32624,7 +32624,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32668,7 +32668,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32712,7 +32712,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32756,7 +32756,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32803,7 +32803,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32848,7 +32848,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32896,7 +32896,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32945,7 +32945,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32994,7 +32994,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -33043,7 +33043,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39363,7 +39363,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39403,7 +39403,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39444,7 +39444,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39488,7 +39488,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39532,7 +39532,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39577,7 +39577,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39666,7 +39666,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39711,7 +39711,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39757,7 +39757,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39806,7 +39806,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39855,7 +39855,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39905,7 +39905,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -43827,7 +43827,7 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 2276e5634537..78c44e4dca3b 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -30,13 +30,13 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
; AVX512F-LABEL: v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -943,12 +943,12 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512F-NEXT: vpmovsxwd %xmm2, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 7d0381837b70..8fdacb7b79d6 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -26,9 +26,9 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX512F-LABEL: v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -640,9 +640,9 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
index e92237f524f5..c2da74a1646c 100644
--- a/test/CodeGen/X86/combine-and.ll
+++ b/test/CodeGen/X86/combine-and.ll
@@ -220,6 +220,16 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
ret <4 x i32> %2
}
+define <8 x i16> @and_or_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: and_or_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [15,7,3,1,14,10,2,32767]
+; CHECK-NEXT: retq
+ %1 = or <8 x i16> %a0, <i16 255, i16 127, i16 63, i16 31, i16 15, i16 31, i16 63, i16 -1>
+ %2 = and <8 x i16> %1, <i16 15, i16 7, i16 3, i16 1, i16 14, i16 10, i16 2, i16 32767>
+ ret <8 x i16> %2
+}
+
;
; known bits folding
;
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index 1601c67dce25..65184c4d278e 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -430,25 +430,35 @@ define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
ret <4 x i32> %or
}
-; TODO: Why would we do this?
-; (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
; CHECK-LABEL: or_and_v2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%1 = and <2 x i64> %a0, <i64 7, i64 7>
%2 = or <2 x i64> %1, <i64 3, i64 3>
ret <2 x i64> %2
}
-; If all masked bits are going to be set, that's a constant fold.
-
define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
; CHECK-LABEL: or_and_v4i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+ %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
+ %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
+ ret <4 x i32> %2
+}
+
+; If all masked bits are going to be set, that's a constant fold.
+
+define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
+; CHECK-LABEL: or_and_v4i32_fold:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
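
Editorial sketch, not part of the diff: the rewritten comment above describes the reassociation these CHECK lines now expect (or first, then and). The identity itself holds for any constants, so the "iff (c1 & c2) != 0" clause reads as a condition on when the combine is worthwhile rather than on correctness. A brute-force check over 16-bit values, with constant pairs taken from the tests:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* (c1 = AND mask, c2 = OR mask) from or_and_v2i64 and the per-lane
           values of or_and_v4i32. */
        const uint16_t pairs[][2] = {{7, 3}, {1, 3}, {3, 2}, {5, 15}, {7, 2}};
        for (unsigned p = 0; p < sizeof(pairs) / sizeof(pairs[0]); p++) {
            uint32_t c1 = pairs[p][0], c2 = pairs[p][1];
            for (uint32_t x = 0; x <= 0xFFFF; x++)
                assert(((x & c1) | c2) == ((x | c2) & (c1 | c2)));
        }
        /* or_and_v4i32_fold: when c2 already covers every bit of c1, the whole
           expression is the constant c2, hence the movaps of [3,3,3,3]. */
        for (uint32_t x = 0; x <= 0xFFFF; x++)
            assert(((x & 1) | 3) == 3);
        return 0;
    }
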
diff --git a/test/CodeGen/X86/darwin-bzero.ll b/test/CodeGen/X86/darwin-bzero.ll
index 3099526028ab..410d67ff0ec1 100644
--- a/test/CodeGen/X86/darwin-bzero.ll
+++ b/test/CodeGen/X86/darwin-bzero.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep __bzero
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+; CHECK-LABEL: foo:
+; CHECK: {{calll|callq}} ___bzero
define void @foo(i8* %p, i32 %len) {
call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 4d24a15fe2e1..66bdfb8475f1 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -403,16 +403,14 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: movb -24(%rsp,%rdi), %al
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i8_var:
; AVX: # %bb.0:
; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: movb -24(%rsp,%rdi), %al
; AVX-NEXT: retq
%b = extractelement <16 x i8> %a, i256 %i
ret i8 %b
@@ -428,8 +426,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-NEXT: andl $31, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movq %rsp, %rax
-; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: movb (%rsp,%rdi), %al
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
@@ -442,8 +439,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: andl $31, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
-; AVX-NEXT: movq %rsp, %rax
-; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: movb (%rsp,%rdi), %al
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: vzeroupper
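
Editorial sketch, not part of the diff: these extractelement tests use a variable lane index, which the backend lowers by spilling the vector to the stack, masking the index, and doing one indexed byte load; the change folds the former leaq into that load's addressing mode. A rough scalar model, with illustrative names:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Model of "extractelement <16 x i8> %a, i256 %i" with a variable index:
       store to a stack slot, mask the index (andl $15, %edi), then one byte
       load -- now emitted as a single movb -24(%rsp,%rdi), %al.
       (At the IR level an out-of-range index is poison; the mask just mirrors
       what the generated code does.) */
    static uint8_t extract_v16i8(const uint8_t v[16], uint64_t idx) {
        uint8_t slot[16];
        memcpy(slot, v, sizeof(slot));   /* movaps %xmm0, -24(%rsp)        */
        return slot[idx & 15];           /* andl $15 ; movb -24(%rsp,%rdi) */
    }

    int main(void) {
        uint8_t v[16];
        for (int i = 0; i < 16; i++)
            v[i] = (uint8_t)(i * 3);
        assert(extract_v16i8(v, 5) == 15);
        return 0;
    }
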
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 8dacf2dcf971..a0e919d128df 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -140,21 +140,23 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x doubl
define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
; SKX-LABEL: test11:
; SKX: # %bb.0: # %entry
-; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0
+; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm2
; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; KNL-LABEL: test11:
; KNL: # %bb.0: # %entry
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0,-0,-0,-0]
-; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; KNL-NEXT: vxorps %xmm3, %xmm2, %xmm2
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vmovaps %xmm2, %xmm0
; KNL-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
- %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
ret <4 x float> %0
}
@@ -164,19 +166,17 @@ define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 z
; SKX-LABEL: test11b:
; SKX: # %bb.0: # %entry
; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test11b:
; KNL: # %bb.0: # %entry
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
-; KNL-NEXT: vmovaps %xmm1, %xmm0
+; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
- %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
ret <4 x float> %0
}
@@ -305,3 +305,147 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test18:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test18:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test19:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test19:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test20:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vmovaps %xmm2, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test20:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vmovaps %xmm2, %xmm0
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test21:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test21:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test22:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test22:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test23(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test23:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vmovaps %xmm2, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test23:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vmovaps %xmm2, %xmm0
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test24(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test24:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test24:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <16 x float> @test25(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
+; CHECK-LABEL: test25:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %sub.i.2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %sub.i, <16 x float> %sub.i.2, i16 -1, i32 8) #2
+ ret <16 x float> %0
+}
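
Editorial sketch, not part of the diff: the new test18-test25 cases check that an fneg feeding the masked FMA intrinsics is absorbed into the matching instruction (vfnmadd, vfnmsub, vfmsub) instead of being emitted as a separate sign flip. The scalar identities being relied on can be spot-checked with the C fma family; the values below are illustrative:

    #include <assert.h>
    #include <math.h>

    /* fma() rounds once, and rounding is symmetric under negation, so:
         fma(a, -b,  c) == fma(-a, b, c)     (vfnmadd: -(a*b) + c)
         fma(a, -b, -c) == -fma(a, b,  c)    (vfnmsub: -(a*b) - c)
         fma(a,  b, -c) == -fma(a, -b, c)    (vfmsub:    a*b  - c)
       Link with -lm. */
    int main(void) {
        const double a = 1.5, b = -2.25, c = 0.75;
        assert(fma(a, -b, c) == fma(-a, b, c));
        assert(fma(a, -b, -c) == -fma(a, b, c));
        assert(fma(a, b, -c) == -fma(a, -b, c));
        return 0;
    }
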
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
index 814d61e22382..ca2c61a88507 100644
--- a/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -8,26 +8,17 @@
define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd128:
; FMA3_256: # %bb.0: # %entry
-; FMA3_256-NEXT: vmulpd %xmm1, %xmm0, %xmm0
-; FMA3_256-NEXT: vsubpd %xmm2, %xmm0, %xmm1
-; FMA3_256-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; FMA3_256-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_256-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd128:
; FMA3_512: # %bb.0: # %entry
-; FMA3_512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
-; FMA3_512-NEXT: vsubpd %xmm2, %xmm0, %xmm1
-; FMA3_512-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; FMA3_512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_512-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd128:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0
-; FMA4-NEXT: vsubpd %xmm2, %xmm0, %xmm1
-; FMA4-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; FMA4-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%AB = fmul <2 x double> %A, %B
@@ -40,18 +31,12 @@ entry:
define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps128:
; FMA3: # %bb.0: # %entry
-; FMA3-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; FMA3-NEXT: vsubps %xmm2, %xmm0, %xmm1
-; FMA3-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; FMA3-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA3-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps128:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT: vsubps %xmm2, %xmm0, %xmm1
-; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; FMA4-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%AB = fmul <4 x float> %A, %B
@@ -64,18 +49,12 @@ entry:
define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_subadd_pd256:
; FMA3: # %bb.0: # %entry
-; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0
-; FMA3-NEXT: vsubpd %ymm2, %ymm0, %ymm1
-; FMA3-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; FMA3-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA3-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd256:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0
-; FMA4-NEXT: vsubpd %ymm2, %ymm0, %ymm1
-; FMA4-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
entry:
%AB = fmul <4 x double> %A, %B
@@ -88,18 +67,12 @@ entry:
define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps256:
; FMA3: # %bb.0: # %entry
-; FMA3-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; FMA3-NEXT: vsubps %ymm2, %ymm0, %ymm1
-; FMA3-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; FMA3-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA3-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps256:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; FMA4-NEXT: vsubps %ymm2, %ymm0, %ymm1
-; FMA4-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
entry:
%AB = fmul <8 x float> %A, %B
@@ -112,34 +85,19 @@ entry:
define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd512:
; FMA3_256: # %bb.0: # %entry
-; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1
-; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
-; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3
-; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1
-; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
-; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0
-; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA3_256-NEXT: vfmsubadd213pd %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmsubadd213pd %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
; FMA3_512: # %bb.0: # %entry
-; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
-; FMA3_512-NEXT: vsubpd %zmm2, %zmm0, %zmm1
-; FMA3_512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
-; FMA3_512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
+; FMA3_512-NEXT: vfmsubadd213pd %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd512:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
-; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
-; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3
-; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1
-; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
-; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA4-NEXT: vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
entry:
%AB = fmul <8 x double> %A, %B
@@ -152,35 +110,19 @@ entry:
define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_subadd_ps512:
; FMA3_256: # %bb.0: # %entry
-; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
-; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3
-; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1
-; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA3_256-NEXT: vfmsubadd213ps %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmsubadd213ps %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
; FMA3_512: # %bb.0: # %entry
-; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm1
-; FMA3_512-NEXT: vaddps %zmm2, %zmm1, %zmm0
-; FMA3_512-NEXT: movw $-21846, %ax # imm = 0xAAAA
-; FMA3_512-NEXT: kmovw %eax, %k1
-; FMA3_512-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1}
+; FMA3_512-NEXT: vfmsubadd213ps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps512:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
-; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3
-; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1
-; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA4-NEXT: vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
entry:
%AB = fmul <16 x float> %A, %B
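
Editorial sketch, not part of the diff: the pattern matched here multiplies two vectors and then adds the third operand in even lanes while subtracting it in odd lanes, exactly the vaddpd/vsubpd/blend sequence the old CHECK lines spelled out; the new output is a single vfmsubadd. A scalar model (the values are exact, so the missing fusion does not change the result):

    #include <assert.h>
    #include <stddef.h>

    /* out[i] = a[i]*b[i] + c[i] for even i, a[i]*b[i] - c[i] for odd i,
       which is what vfmsubadd computes (with the multiply fused). */
    static void mul_subadd(const double *a, const double *b, const double *c,
                           double *out, size_t n) {
        for (size_t i = 0; i < n; i++)
            out[i] = (i & 1) ? a[i] * b[i] - c[i]   /* odd lane:  subtract */
                             : a[i] * b[i] + c[i];  /* even lane: add      */
    }

    int main(void) {
        const double a[4] = {1.0, 2.0, 3.0, 4.0};
        const double b[4] = {5.0, 6.0, 7.0, 8.0};
        const double c[4] = {0.5, 0.5, 0.5, 0.5};
        double out[4];
        mul_subadd(a, b, c, out, 4);
        assert(out[0] == 5.5 && out[1] == 11.5);
        assert(out[2] == 21.5 && out[3] == 31.5);
        return 0;
    }
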
diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll
index 481f55e9e10d..db73195698e3 100644
--- a/test/CodeGen/X86/fold-vector-sext-crash.ll
+++ b/test/CodeGen/X86/fold-vector-sext-crash.ll
@@ -9,9 +9,9 @@
define <4 x i64> @foo(<4 x i64> %A) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa %xmm1, %xmm1
-; CHECK-NEXT: vandps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retl
%1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0>
ret <4 x i64> %1
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
index a54e01d9af67..fa92158ae92d 100644
--- a/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -309,30 +309,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -371,30 +366,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -906,16 +896,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -924,14 +911,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -940,15 +925,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -994,16 +977,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1012,14 +992,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1028,15 +1006,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1045,15 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1743,16 +1717,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1764,14 +1735,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1781,15 +1750,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1847,16 +1814,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1868,14 +1832,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1885,15 +1847,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1902,17 +1862,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
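
Editorial sketch, not part of the diff: there is no horizontal signed byte max instruction, so the new sequences bias the bytes with pxor 127, reduce with an unsigned min (psrlw/pminub fold each word's high byte into its low byte, then phminposuw finishes the reduction), and un-bias the result. This works because x ^ 127 equals 127 - x modulo 256, which turns the signed order into the reversed unsigned order. The smin file below uses the order-preserving bias 128, and the umax file xors with all-ones. A brute-force check of both byte-sized identities:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        for (int x = -128; x <= 127; x++) {
            for (int y = -128; y <= 127; y++) {
                int smax = x > y ? x : y;
                int smin = x < y ? x : y;

                /* smax via unsigned min of (v ^ 127), then xor back */
                uint8_t bx = (uint8_t)x ^ 127, by = (uint8_t)y ^ 127;
                uint8_t umin = bx < by ? bx : by;
                assert((uint8_t)(umin ^ 127) == (uint8_t)smax);

                /* smin via unsigned min of (v ^ 128), then xor back */
                uint8_t cx = (uint8_t)x ^ 128, cy = (uint8_t)y ^ 128;
                uint8_t umin2 = cx < cy ? cx : cy;
                assert((uint8_t)(umin2 ^ 128) == (uint8_t)smin);
            }
        }
        return 0;
    }
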
diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll
index f03e745598e6..fa5828a45700 100644
--- a/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -311,30 +311,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -373,30 +368,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -910,16 +900,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -928,14 +915,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -944,15 +929,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -998,16 +981,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1016,14 +996,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1032,15 +1010,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1049,15 +1025,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1745,16 +1719,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1766,14 +1737,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1783,15 +1752,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1849,16 +1816,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1870,14 +1834,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1887,15 +1849,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1904,17 +1864,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
index 52e623b82718..204479976e90 100644
--- a/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -362,30 +362,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -408,30 +403,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -1031,16 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1049,14 +1036,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1065,15 +1050,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1099,16 +1082,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1117,14 +1097,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1133,15 +1111,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1150,15 +1126,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1992,16 +1966,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -2013,14 +1984,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -2030,15 +1999,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -2068,16 +2035,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -2089,14 +2053,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -2106,15 +2068,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -2123,17 +2083,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
index 505663656a3a..2a37d17365be 100644
--- a/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -352,30 +352,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -398,30 +387,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -1004,16 +982,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1022,14 +994,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1038,15 +1005,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1072,16 +1034,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1090,14 +1046,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1106,15 +1057,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1123,15 +1069,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1942,16 +1883,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pminub %xmm3, %xmm1
; X86-SSE42-NEXT: pminub %xmm2, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1963,14 +1898,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1980,15 +1910,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -2018,16 +1943,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pminub %xmm3, %xmm1
; X64-SSE42-NEXT: pminub %xmm2, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -2039,14 +1958,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -2056,15 +1970,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -2073,17 +1982,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll
index 283d1f93dfb6..46a888f3b9b6 100644
--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@@ -160,17 +160,19 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
; X32: # %bb.0:
-; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vorps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
diff --git a/test/CodeGen/X86/machinesink-merge-debuginfo.ll b/test/CodeGen/X86/machinesink-merge-debuginfo.ll
index d8fcea1872e8..f5023bbeb5f9 100644
--- a/test/CodeGen/X86/machinesink-merge-debuginfo.ll
+++ b/test/CodeGen/X86/machinesink-merge-debuginfo.ll
@@ -5,6 +5,21 @@ source_filename = "test-sink-debug.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+; double foo(double x, double y, bool c) {
+; double a = x / 3.0;
+; double b = y / 5.0;
+; double ret;
+;
+; if (c)
+; ret = a + 1.0;
+; else
+; ret = b + 1.0;
+;
+; ret = ret + 1.0;
+;
+; return ret;
+; }
+
; Function Attrs: nounwind readnone uwtable
define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !7 {
tail call void @llvm.dbg.value(metadata double %x, metadata !13, metadata !DIExpression()), !dbg !16
@@ -17,9 +32,10 @@ first:
%e = fadd double %a, 1.000000e+00
br label %final
second:
- %f = fadd double %b, 1.000000e+00, !dbg !17
-; CHECK: debug-location !17
-; CHECK: debug-location !17
+; CHECK-NOT: debug-location !17
+; CHECK: debug-location !18
+; CHECK-NOT: debug-location !17
+ %f = fadd double %b, 1.000000e+00, !dbg !18
br label %final
final:
%cond = phi double [%e, %first], [%f, %second]
@@ -27,15 +43,39 @@ final:
ret double %d
}
-; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-attributes #1 = { nounwind readnone speculatable }
+
+; Function Attrs: nounwind readnone uwtable
+define double @_Z4foo1ddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !19 {
+ tail call void @llvm.dbg.value(metadata double %x, metadata !21, metadata !DIExpression()), !dbg !24
+ tail call void @llvm.dbg.value(metadata double %y, metadata !22, metadata !DIExpression()), !dbg !24
+ tail call void @llvm.dbg.value(metadata i1 %c, metadata !23, metadata !DIExpression()), !dbg !24
+ %a = fdiv double %x, 3.000000e+00
+ %b = fdiv double %y, 5.000000e+00, !dbg !25
+ br i1 %c, label %first, label %second
+first:
+ %e = fadd double %a, 1.000000e+00
+ br label %final
+second:
+ %f = fadd double %b, 1.000000e+00, !dbg !25
+; CHECK: debug-location !25
+; CHECK-NEXT: debug-location !25
+ br label %final
+final:
+ %cond = phi double [%e, %first], [%f, %second]
+ %d = fadd double %cond, 1.000000e+00
+ ret double %d
+}
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
+attributes #1 = { nounwind readnone speculatable }
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 313291)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test-sink-debug.cpp", directory: "/tmp")
!2 = !{}
@@ -54,3 +94,11 @@ attributes #1 = { nounwind readnone speculatable }
!15 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 1, type: !11)
!16 = !DILocation(line: 1, column: 19, scope: !7)
!17 = !DILocation(line: 2, column: 26, scope: !7)
+!18 = !DILocation(line: 3, column: 20, scope: !7)
+!19 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ddb", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!20 = !{!21, !22, !23}
+!21 = !DILocalVariable(name: "x", arg: 1, scope: !19, file: !1, line: 1, type: !10)
+!22 = !DILocalVariable(name: "y", arg: 2, scope: !19, file: !1, line: 1, type: !10)
+!23 = !DILocalVariable(name: "c", arg: 3, scope: !19, file: !1, line: 1, type: !11)
+!24 = !DILocation(line: 1, column: 19, scope: !19)
+!25 = !DILocation(line: 2, column: 26, scope: !19)
diff --git a/test/CodeGen/X86/machinesink-null-debuginfo.ll b/test/CodeGen/X86/machinesink-null-debuginfo.ll
index 454e0cd704ff..c0399b3cfa81 100644
--- a/test/CodeGen/X86/machinesink-null-debuginfo.ll
+++ b/test/CodeGen/X86/machinesink-null-debuginfo.ll
@@ -11,10 +11,10 @@ define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr
tail call void @llvm.dbg.value(metadata double %y, metadata !14, metadata !DIExpression()), !dbg !17
tail call void @llvm.dbg.value(metadata i1 %c, metadata !15, metadata !DIExpression()), !dbg !18
%a = fdiv double %x, 3.000000e+00
- %b = fdiv double %y, 5.000000e+00, !dbg !21
+ %b = fdiv double %y, 5.000000e+00, !dbg !19
%cond = select i1 %c, double %a, double %b
-; CHECK-NOT: debug-location !21
- ret double %cond, !dbg !22
+; CHECK-NOT: debug-location !19
+ ret double %cond, !dbg !20
}
; Function Attrs: nounwind readnone speculatable
@@ -45,5 +45,5 @@ attributes #1 = { nounwind readnone speculatable }
!16 = !DILocation(line: 1, column: 19, scope: !7)
!17 = !DILocation(line: 1, column: 29, scope: !7)
!18 = !DILocation(line: 1, column: 37, scope: !7)
-!21 = !DILocation(line: 2, column: 26, scope: !7)
-!22 = !DILocation(line: 2, column: 3, scope: !7)
+!19 = !DILocation(line: 2, column: 26, scope: !7)
+!20 = !DILocation(line: 2, column: 3, scope: !7)
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 1eb2631e26ef..d318dde34434 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1500,7 +1500,8 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
+; KNL_32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
@@ -1596,7 +1597,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
+; KNL_32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -2606,56 +2608,32 @@ define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <
define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64-LABEL: sext_i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm1
-; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: kxnorw %k0, %k0, %k2
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm1
-; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: kxnorw %k0, %k0, %k2
-; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbw %xmm0, %ymm0
-; SKX-NEXT: vpmovsxwq %xmm0, %zmm1
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpmovsxbw %xmm0, %ymm0
-; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm1
-; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: kxnorw %k0, %k0, %k2
-; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <16 x i8> %ind to <16 x i64>
@@ -2669,40 +2647,42 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: sext_v8i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_64-NEXT: vpsllq $56, %zmm0, %zmm0
-; KNL_64-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0
+; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm0
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_v8i8_index:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpsllq $56, %zmm0, %zmm0
-; KNL_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0
+; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm0
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_v8i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX-NEXT: vpsllq $56, %zmm0, %zmm0
-; SKX-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT: vpslld $24, %ymm0, %ymm0
+; SKX-NEXT: vpsrad $24, %ymm0, %ymm1
+; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_v8i8_index:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpsllq $56, %zmm0, %zmm0
-; SKX_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
+; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <8 x i8> %ind to <8 x i64>
@@ -2764,3 +2744,48 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32>
}
declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
+define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: zext_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: zext_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
+; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: zext_index:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: zext_index:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
+; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: zext_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; SKX_32-NEXT: retl
+ %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+ %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
+ %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index d7622c8d0cab..f7b9ea9b8b2e 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -78,7 +78,7 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shll $8, %eax
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl %ah, %eax
+; X32-NEXT: movzbl %ah, %eax # NOREX
; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index 17a9ac994a79..839948174a43 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -1,27 +1,101 @@
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s
-; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=SLM
-; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=3DNOW
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=PRFCHW3DNOW
+
+; Rules:
+; 3dnow by itself gets you just the single prefetch instruction with no hints.
+; sse provides prefetcht0/1/2/nta.
+; supporting prefetchw, but not 3dnow, implicitly provides prefetcht0/1/2/nta regardless of the sse setting, as we need something to fall back to for the non-write hints.
+; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings. This allows levels for the non-write hints and gives us an instruction for write+T0.
+; The 3dnow prefetch instruction will only be used if no other prefetch instructions are enabled.
; rdar://10538297
define void @t(i8* %ptr) nounwind {
+; SSE-LABEL: t:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: prefetcht2 (%eax)
+; SSE-NEXT: prefetcht1 (%eax)
+; SSE-NEXT: prefetcht0 (%eax)
+; SSE-NEXT: prefetchnta (%eax)
+; SSE-NEXT: prefetcht2 (%eax)
+; SSE-NEXT: prefetcht1 (%eax)
+; SSE-NEXT: prefetcht0 (%eax)
+; SSE-NEXT: prefetchnta (%eax)
+; SSE-NEXT: retl
+;
+; PRFCHWSSE-LABEL: t:
+; PRFCHWSSE: # %bb.0: # %entry
+; PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; PRFCHWSSE-NEXT: prefetcht2 (%eax)
+; PRFCHWSSE-NEXT: prefetcht1 (%eax)
+; PRFCHWSSE-NEXT: prefetcht0 (%eax)
+; PRFCHWSSE-NEXT: prefetchnta (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: retl
+;
+; PREFETCHWT1-LABEL: t:
+; PREFETCHWT1: # %bb.0: # %entry
+; PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; PREFETCHWT1-NEXT: prefetcht2 (%eax)
+; PREFETCHWT1-NEXT: prefetcht1 (%eax)
+; PREFETCHWT1-NEXT: prefetcht0 (%eax)
+; PREFETCHWT1-NEXT: prefetchnta (%eax)
+; PREFETCHWT1-NEXT: prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT: prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT: prefetchw (%eax)
+; PREFETCHWT1-NEXT: prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT: retl
+;
+; 3DNOW-LABEL: t:
+; 3DNOW: # %bb.0: # %entry
+; 3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: retl
+;
+; PRFCHW3DNOW-LABEL: t:
+; PRFCHW3DNOW: # %bb.0: # %entry
+; PRFCHW3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: retl
entry:
-; CHECK: prefetcht2
-; CHECK: prefetcht1
-; CHECK: prefetcht0
-; CHECK: prefetchnta
-; PRFCHW: prefetchw
-; NOPRFCHW-NOT: prefetchw
-; SLM: prefetchw
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0, i32 1 )
+ tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 1, i32 1 )
+ tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 2, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3, i32 1 )
+ tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 0, i32 1 )
ret void
}
-declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
index 0f1f818e250d..2df115e2f9e4 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -363,12 +364,26 @@ define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F: # %bb.0:
@@ -409,12 +424,26 @@ define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F: # %bb.0:
@@ -455,12 +484,26 @@ define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16_3:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F: # %bb.0:
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 7cef269ebc2b..081c962ab94a 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -362,18 +363,30 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # %bb.0:
@@ -446,18 +459,30 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16_2:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # %bb.0:
@@ -522,18 +547,30 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16_3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # %bb.0:
@@ -633,11 +670,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -659,11 +694,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -795,11 +828,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -821,11 +852,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -949,14 +978,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -978,14 +1002,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index 7f3431fabedc..a5d0a7fa401b 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -99,10 +99,9 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
@@ -673,17 +672,14 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
@@ -831,17 +827,14 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
@@ -989,24 +982,14 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
@@ -1154,17 +1137,14 @@ define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 1bfe37b1497e..f2cd81d3d937 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -436,12 +437,26 @@ define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
; AVX512F: # %bb.0:
@@ -482,12 +497,26 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: trunc_v2i64_to_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v2i64_to_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
; AVX512F: # %bb.0:
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 59a8aa47246c..5b46c73a3f65 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -139,59 +140,17 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -290,13 +249,21 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F: # %bb.0:
@@ -399,12 +366,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -490,18 +454,30 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # %bb.0:
@@ -563,14 +539,23 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F: # %bb.0:
@@ -693,14 +678,23 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 3fa148405f6b..4577cd8f8b44 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -148,10 +148,9 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index 82a790298f23..3baab2476d40 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -142,14 +142,13 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; AVX1-NEXT: andl $7, %r10d
; AVX1-NEXT: andl $28, %edi
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: movq %rsp, %rax
-; AVX1-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
-; AVX1-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
@@ -505,118 +504,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-NEXT: vpextrb $0, %xmm2, %eax
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movq %rsp, %rcx
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpextrb $1, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $2, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $3, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $4, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $5, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $6, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $7, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $8, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $9, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $10, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $11, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $12, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $13, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $14, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $15, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpextrb $1, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $2, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $3, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $4, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $5, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $6, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $7, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $8, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $9, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $10, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $11, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $12, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $13, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $14, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $15, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
@@ -633,118 +631,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX2-NEXT: vpextrb $0, %xmm2, %eax
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movq %rsp, %rcx
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $2, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $3, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $4, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $5, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $6, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $7, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $8, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $9, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $10, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $11, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $12, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $13, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $14, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $15, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpextrb $1, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $2, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $3, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $4, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $5, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $6, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $7, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $8, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $9, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $10, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $11, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $12, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $13, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $14, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $15, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
@@ -761,118 +758,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movq %rsp, %rcx
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
@@ -889,118 +885,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movq %rsp, %rcx
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: movq %rbp, %rsp
@@ -1240,7 +1235,6 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
; AVX1-NEXT: andl $7, %r10d
; AVX1-NEXT: andl $28, %edi
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: movq %rsp, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll
index a5aa73cdf1a2..3f9f96b008c5 100644
--- a/test/CodeGen/X86/var-permute-512.ll
+++ b/test/CodeGen/X86/var-permute-512.ll
@@ -511,265 +511,201 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, (%rsp)
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm4, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 3008(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm0
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $1, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $2, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $3, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $4, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $5, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $6, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $7, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $8, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $9, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $10, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $11, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $12, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $13, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $14, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $15, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $0, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm2, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 4032(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm1
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $1, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $2, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $3, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $4, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $5, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $6, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $7, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $8, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $9, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $10, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $11, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $12, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $13, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $14, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $15, %xmm2, %eax
; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $0, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm2, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 960(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $1, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $2, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $3, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $4, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $5, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $6, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $7, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $8, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $9, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $10, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $11, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $12, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $13, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $14, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $15, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: movq %rsp, %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm2
+; NOBW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2
; NOBW-NEXT: vpextrb $0, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm3, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 1984(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $1, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $2, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $3, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $3, 1792(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $4, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $5, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $6, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $7, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $8, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $9, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $10, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $11, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $12, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $13, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $14, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $15, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm3
+; NOBW-NEXT: vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3
; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
; NOBW-NEXT: movq %rbp, %rsp
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index 12530adf15cb..2178eb70cdec 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -2384,11 +2384,10 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; AVX512F-LABEL: test_cmp_v32f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -2399,12 +2398,11 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
;
; AVX512DQ-LABEL: test_cmp_v32f32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -2893,11 +2891,10 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; AVX512F-LABEL: test_cmp_v32i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -2908,12 +2905,11 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
;
; AVX512DQ-LABEL: test_cmp_v32i32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -5965,13 +5961,12 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vcmpltpd %zmm2, %zmm6, %k0
; AVX512F-NEXT: vcmpltpd %zmm3, %zmm7, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -5984,14 +5979,13 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm5, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vcmpltpd %zmm2, %zmm6, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm3, %zmm7, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -6588,13 +6582,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
; AVX512F-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -6607,14 +6600,13 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 44fe38fa86b9..4da23e539f6a 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -2195,8 +2195,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; AVX512VL-NEXT: shlq $32, %rdx
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -2205,108 +2204,30 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
}
define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_4f32_to_8i16_zero:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_4f32_to_8i16_zero:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: movzwl %dx, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_4f32_to_8i16_zero:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2715,8 +2636,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
; AVX512VL-NEXT: shlq $32, %rdx
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512VL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -2727,112 +2647,31 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
}
define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: movzwl %dx, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: vmovdqa %xmm0, (%rdi)
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -3389,8 +3228,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: shlq $32, %rax
; AVX512VL-NEXT: orq %r14, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: addq $40, %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
@@ -3478,84 +3316,43 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX2-NEXT: popq %r14
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $40, %rsp
-; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %r14d
-; AVX512F-NEXT: orl %ebx, %r14d
-; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: orl %ebx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %r14, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: addq $40, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $40, %rsp
-; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %r14d
-; AVX512VL-NEXT: orl %ebx, %r14d
-; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %ebx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %r14, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: addq $40, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_4f64_to_8i16_zero:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -4095,8 +3892,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512VL-NEXT: shlq $32, %rax
; AVX512VL-NEXT: orq %rbx, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
; AVX512VL-NEXT: addq $32, %rsp
; AVX512VL-NEXT: popq %rbx
@@ -4195,92 +3991,47 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movq %rdi, %r14
-; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebp
-; AVX512F-NEXT: shll $16, %ebp
-; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %ebx
-; AVX512F-NEXT: orl %ebp, %ebx
-; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebp
-; AVX512F-NEXT: shll $16, %ebp
-; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: orl %ebp, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %rbx, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
-; AVX512F-NEXT: addq $32, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %rbp
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $32, %rsp
-; AVX512VL-NEXT: movq %rdi, %r14
-; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebp
-; AVX512VL-NEXT: shll $16, %ebp
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %ebx
-; AVX512VL-NEXT: orl %ebp, %ebx
-; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebp
-; AVX512VL-NEXT: shll $16, %ebp
-; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %ebp, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %rbx, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
-; AVX512VL-NEXT: addq $32, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %rbp
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $32, %rsp
+; AVX512-NEXT: movq %rdi, %r14
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %ebx
+; AVX512-NEXT: orl %ebp, %ebx
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebp, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %xmm0, (%r14)
+; AVX512-NEXT: addq $32, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index b40c9eddd46b..8af96c168be6 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -699,20 +699,35 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: var_rotate_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VL-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VL-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
; XOP: # %bb.0:
@@ -1249,16 +1264,29 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: constant_rotate_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_rotate_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: constant_rotate_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VL-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
;
; XOP-LABEL: constant_rotate_v16i8:
; XOP: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index ea33f22cc07a..ca670f40ab3f 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -531,23 +531,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -948,25 +967,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -1441,21 +1481,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 307cf287219d..890cedf97c9d 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -451,23 +451,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -753,25 +772,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -1148,21 +1188,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index b518ad5fcffd..9481e46c0c52 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -401,23 +401,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -695,25 +714,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -998,21 +1038,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 2fcbd89b857e..2f5a2b116115 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -3,8 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -58,17 +60,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %shuffle
}
@@ -93,17 +88,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
}
@@ -115,11 +103,27 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
ret <16 x i8> %shuffle
}
@@ -131,11 +135,27 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
ret <16 x i8> %shuffle
}
@@ -1203,12 +1223,25 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX1OR2: # %bb.0: # %entry
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX512VLBW: # %bb.0: # %entry
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX512VLVBMI: # %bb.0: # %entry
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
entry:
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 60bc36948d23..072d71fae570 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3,8 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_01012323:
@@ -85,11 +87,33 @@ define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_00004444:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_00004444:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_00004444:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_00004444:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_00004444:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_00004444:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
@@ -126,11 +150,33 @@ define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_31206745:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_31206745:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_31206745:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_31206745:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_31206745:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_31206745:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
@@ -179,11 +225,33 @@ define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_23026745:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_23026745:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_23026745:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_23026745:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_23026745:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_23026745:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
@@ -194,11 +262,33 @@ define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_23016747:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_23016747:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_23016747:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_23016747:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_23016747:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_23016747:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
ret <8 x i16> %shuffle
}
@@ -597,11 +687,33 @@ define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_04404567:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_04404567:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_04404567:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_04404567:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_04404567:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_04404567:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
@@ -700,17 +812,10 @@ define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_0127XXXX:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_0127XXXX:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -733,17 +838,10 @@ define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_XXXX4563:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_XXXX4563:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
@@ -766,17 +864,10 @@ define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_4563XXXX:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_4563XXXX:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -799,17 +890,10 @@ define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_01274563:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_01274563:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
@@ -832,17 +916,10 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_45630127:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_45630127:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
ret <8 x i16> %shuffle
}
@@ -980,12 +1057,38 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_109832ba:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_109832ba:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_109832ba:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_109832ba:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
@@ -1028,13 +1131,43 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_0213cedf:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_0213cedf:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
@@ -1064,12 +1197,38 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_443aXXXX:
-; AVX: # %bb.0:
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_443aXXXX:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_443aXXXX:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_443aXXXX:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1336,13 +1495,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i16_XXX1X579:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
+; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 11f25a2d687d..cbd1b83a4eb2 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -156,15 +158,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
@@ -214,12 +225,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
@@ -241,12 +259,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
@@ -373,11 +398,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -393,11 +434,27 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
}
@@ -413,11 +470,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -433,11 +506,27 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
}
@@ -453,11 +542,27 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
ret <16 x i16> %shuffle
}
@@ -473,11 +578,27 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
ret <16 x i16> %shuffle
}
@@ -789,13 +910,20 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX512VL: # %bb.0:
@@ -850,12 +978,18 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15,12,13,10,11,8,9,22,23,20,21,18,19,16,17,30,31,28,29,26,27,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX512VL: # %bb.0:
@@ -880,13 +1014,20 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX512VL: # %bb.0:
@@ -1349,12 +1490,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
; AVX512VL: # %bb.0:
@@ -1376,12 +1523,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
; AVX512VL: # %bb.0:
@@ -1404,12 +1557,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
; AVX512VL: # %bb.0:
@@ -1431,12 +1590,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
; AVX512VL: # %bb.0:
@@ -1877,15 +2042,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX512VL: # %bb.0:
@@ -1909,15 +2083,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX512VL: # %bb.0:
@@ -1995,14 +2178,22 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX512VL: # %bb.0:
@@ -2081,14 +2272,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
; AVX512VL: # %bb.0:
@@ -2110,14 +2309,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX512VL: # %bb.0:
@@ -2396,12 +2603,19 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX512VL: # %bb.0:
@@ -2540,14 +2754,22 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX512VL: # %bb.0:
@@ -2655,17 +2877,10 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_u
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -2705,17 +2920,10 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -2803,12 +3011,19 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX512VL: # %bb.0:
@@ -2957,16 +3172,25 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX512VL: # %bb.0:
@@ -2992,16 +3216,26 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX512VL: # %bb.0:
@@ -3123,15 +3357,24 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX512VL: # %bb.0:
@@ -3156,12 +3399,18 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15,24,25,24,25,22,23,20,21,24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
; AVX512VL: # %bb.0:
@@ -3363,14 +3612,22 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX512VL: # %bb.0:
@@ -3783,12 +4040,31 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX512VL-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %shuffle
}
@@ -3821,11 +4097,33 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
}
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
-; ALL: # %bb.0:
-; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -3838,12 +4136,31 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -3906,9 +4223,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
;
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
-; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25]
+; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
%2 = bitcast <16 x i16> %1 to <4 x i64>
@@ -4006,21 +4322,36 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: PR24935:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: PR24935:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR24935:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,2,3,6,7,10,11,10,11,12,13,14,15,16,17,18,19,18,19,22,23,26,27,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: PR24935:
; AVX512VL: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 01c7fc466eb8..51ef3a18438f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1,7 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-FAST
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -312,17 +316,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX512VLBW-NEXT: movl $32767, %eax # imm = 0x7FFF
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -346,14 +356,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: movl $1, %eax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: movl $1, %eax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -377,14 +393,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: movw $1, %ax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: movw $1, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -408,14 +430,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: movw $1, %ax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: movw $1, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -431,12 +459,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -452,12 +507,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -473,12 +555,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -494,12 +603,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -522,12 +658,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -550,12 +698,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -578,12 +738,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -606,12 +778,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -634,12 +818,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -662,12 +858,24 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -690,12 +898,24 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -722,14 +942,29 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: movl $15, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: movl $15, %eax
+; AVX512VLBW-SLOW-NEXT: vmovd %eax, %xmm1
+; AVX512VLBW-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: movl $15, %eax
+; AVX512VLBW-FAST-NEXT: vmovd %eax, %xmm1
+; AVX512VLBW-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: movl $31, %eax
+; AVX512VLVBMI-NEXT: vmovd %eax, %xmm1
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -1092,25 +1327,49 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX512VLBW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-FAST-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512VLBW-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,0,32,0,32,0,32,0,32,0,32,0,32,0,32,16,48,16,48,16,48,16,48,16,48,16,48,16,48,16,48]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
ret <32 x i8> %shuffle
}
@@ -1129,12 +1388,26 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,40,41,42,43,44,45,46,47,16,16,16,16,16,16,16,16,56,57,58,59,60,61,62,63]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -1155,11 +1428,24 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,47,46,45,44,43,42,41,40,23,22,21,20,19,18,17,16,63,62,61,60,59,58,57,56]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
ret <32 x i8> %shuffle
}
@@ -1177,12 +1463,26 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,39,38,37,36,35,34,33,32,23,22,21,20,19,18,17,16,55,54,53,52,51,50,49,48]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
ret <32 x i8> %shuffle
}
@@ -1350,13 +1650,19 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
-; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
+; AVX512VLBW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle
}
@@ -1379,13 +1685,19 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
-; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
+; AVX512VLBW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47,16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle
}
@@ -1634,22 +1946,29 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
-; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
-; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX512VLBW-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
+; AVX512VLBW-NEXT: movl $134948620, %eax # imm = 0x80B270C
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [10,13,44,45,3,3,28,8,49,54,61,12,1,44,16,19,52,51,20,51,17,22,5,0,16,10,27,39,4,2,4,7]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
ret <32 x i8> %shuffle
}
@@ -1663,11 +1982,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
@@ -1682,11 +2013,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
@@ -1702,11 +2045,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
ret <32 x i8> %shuffle
}
@@ -1721,11 +2076,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
ret <32 x i8> %shuffle
}
@@ -1738,12 +2105,25 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
ret <32 x i8> %shuffle
}
@@ -1961,18 +2341,26 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: movl $286331153, %eax # imm = 0x11111111
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VLBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: movl $286331153, %eax # imm = 0x11111111
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [56,1,2,3,57,5,6,7,58,9,10,11,59,13,14,15,60,17,18,19,61,21,22,23,62,25,26,27,63,29,30,31]
+; AVX512VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI-NEXT: vpermt2b %ymm0, %ymm2, %ymm1
+; AVX512VLVBMI-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 56, i32 1, i32 2, i32 3, i32 57, i32 5, i32 6, i32 7, i32 58, i32 9, i32 10, i32 11, i32 59, i32 13, i32 14, i32 15, i32 60, i32 17, i32 18, i32 19, i32 61, i32 21, i32 22, i32 23, i32 62, i32 25, i32 26, i32 27, i32 63, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -2159,19 +2547,11 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
ret <32 x i8> %shuffle
}
@@ -2203,37 +2583,21 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
-; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL: # %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
@@ -2245,19 +2609,11 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
@@ -2276,13 +2632,27 @@ define <32 x i8> @shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2OR512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%3 = bitcast <16 x i16> %1 to <32 x i8>
@@ -2310,11 +2680,29 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: PR28136:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: PR28136:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: PR28136:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-SLOW-LABEL: PR28136:
+; AVX512VLVBMI-SLOW: # %bb.0:
+; AVX512VLVBMI-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLVBMI-SLOW-NEXT: retq
+;
+; AVX512VLVBMI-FAST-LABEL: PR28136:
+; AVX512VLVBMI-FAST: # %bb.0:
+; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55]
+; AVX512VLVBMI-FAST-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-FAST-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
%2 = bitcast <32 x i8> %1 to <4 x i64>
%3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 41dcb5032ee2..ebef762787d9 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST

define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0000:
@@ -546,19 +548,29 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4f64_0z3z:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0z3z:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4f64_0z3z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
ret <4 x double> %shuffle
}
@@ -574,19 +586,29 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4f64_1z2z:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_1z2z:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4f64_1z2z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
%1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
ret <4 x double> %1
}
@@ -812,11 +834,17 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_0124:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_0124:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i64> %shuffle
}
@@ -863,12 +891,19 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_0412:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_0412:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i64> %shuffle
}
@@ -889,11 +924,17 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_4012:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_4012:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i64> %shuffle
}
@@ -924,9 +965,8 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_0451:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,5,1]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
ret <4 x i64> %shuffle
@@ -958,9 +998,8 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_4015:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,5]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i64> %shuffle
@@ -980,11 +1019,17 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_2u35:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
ret <4 x i64> %shuffle
}
@@ -1008,9 +1053,8 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_1251:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,5,1]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
ret <4 x i64> %shuffle
@@ -1121,9 +1165,8 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_0415:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,5]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x i64> %shuffle
@@ -1564,19 +1607,29 @@ define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4i64_z0z3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_z0z3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i64_z0z3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
+; AVX512VL-FAST-NEXT: retq
%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
ret <4 x i64> %1
}
@@ -1592,19 +1645,29 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4i64_1z2z:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_1z2z:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i64_1z2z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
ret <4 x i64> %1
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 44d0217f5295..1b6df7dd2d25 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000000:
@@ -342,12 +344,26 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_09ab1def:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8f32_09ab1def:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_09ab1def:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_09ab1def:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x float> %shuffle
}
@@ -651,14 +667,23 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8f32_c348cda0:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_c348cda0:
; AVX512VL: # %bb.0:
@@ -681,14 +706,23 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8f32_f511235a:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_f511235a:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,2,3,7,6,3,2]
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [5,5,1,1,2,3,5,5]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_f511235a:
; AVX512VL: # %bb.0:
@@ -722,11 +756,29 @@ define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_76547654:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_76547654:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_76547654:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
@@ -738,11 +790,29 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_76543210:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_76543210:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_76543210:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -798,11 +868,23 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: PR21138:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: PR21138:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: PR21138:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: PR21138:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x float> %shuffle
}
@@ -1264,12 +1346,26 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_09ab1def:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_09ab1def:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_09ab1def:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_09ab1def:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i32> %shuffle
}
@@ -1696,13 +1792,21 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_6caa87e5:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [4,4,2,2,0,0,6,6]
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_6caa87e5:
; AVX512VL: # %bb.0:
@@ -1737,11 +1841,29 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76547654:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_76547654:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_76547654:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@@ -1753,11 +1875,29 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76543210:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_76543210:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_76543210:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 3e49957bf85e..d4fb0fd52a79 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -186,8 +186,7 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; SKX: ## %bb.0:
-; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
-; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,2,3,0,1,0,1,10,11,10,11,8,9,8,9,18,19,18,19,16,17,16,17,26,27,26,27,24,25,24,25,34,35,34,35,32,33,32,33,42,43,42,43,40,41,40,41,50,51,50,51,48,49,48,49,58,59,58,59,56,57,56,57]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
ret <32 x i16> %c
@@ -354,3 +353,18 @@ define <8 x i16> @pr32967(<32 x i16> %v) {
%shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
ret <8 x i16> %shuffle
}
+
+define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) {
+; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
+; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[14,15],zero,zero,zmm0[10,11],zero,zero,zmm0[6,7],zero,zero,zmm0[2,3],zero,zero,zmm0[30,31],zero,zero,zmm0[26,27],zero,zero,zmm0[22,23],zero,zero,zmm0[18,19],zero,zero,zmm0[46,47],zero,zero,zmm0[42,43],zero,zero,zmm0[38,39],zero,zero,zmm0[34,35],zero,zero,zmm0[62,63],zero,zero,zmm0[58,59],zero,zero,zmm0[54,55],zero,zero,zmm0[52,53],zero,zero
+; SKX-NEXT: retq
+ %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 39, i32 0, i32 37, i32 0, i32 35, i32 0, i32 33, i32 0, i32 47, i32 0, i32 45, i32 0, i32 43, i32 0, i32 41, i32 0, i32 55, i32 0, i32 53, i32 0, i32 51, i32 0, i32 49, i32 0, i32 63, i32 0, i32 61, i32 0, i32 59, i32 0, i32 58, i32 0>
+ ret <32 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 9c92ca756ebd..f3433ce834cd 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -48,9 +48,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: movb $1, %al
-; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1} {z}
+; AVX512VL-NEXT: movq $-1, %rax
+; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
@@ -61,9 +60,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
-; VL_BW_DQ-NEXT: movb $1, %al
-; VL_BW_DQ-NEXT: kmovd %eax, %k1
-; VL_BW_DQ-NEXT: vpmovm2q %k1, %xmm0
+; VL_BW_DQ-NEXT: movq $-1, %rax
+; VL_BW_DQ-NEXT: vmovq %rax, %xmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
@@ -123,12 +121,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
-; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -137,10 +135,10 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -250,12 +248,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -264,10 +262,10 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -294,12 +292,14 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -308,11 +308,12 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -339,10 +340,11 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -351,9 +353,9 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -382,12 +384,13 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -396,11 +399,11 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -429,12 +432,13 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -443,11 +447,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; VL_BW_DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -462,12 +466,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: movb $51, %al
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
@@ -478,14 +480,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: movb $51, %al
-; AVX512VL-NEXT: kmovw %eax, %k2
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512VL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -494,11 +494,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
-; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -528,15 +527,15 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -546,11 +545,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 0367737dda60..4de24d5fec4d 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -414,63 +414,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm15
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm10
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm7
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm11
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax
; SSE2-NEXT: movd %eax, %xmm12
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: andl $15, %edx
+; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax
; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: andl $15, %r9d
-; SSE2-NEXT: movzbl (%r9,%r10), %eax
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm13
-; SSE2-NEXT: andl $15, %r8d
-; SSE2-NEXT: movzbl (%r8,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: andl $15, %ecx
-; SSE2-NEXT: movzbl (%rcx,%r10), %eax
+; SSE2-NEXT: andl $15, %edi
+; SSE2-NEXT: movzbl -24(%rsp,%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: andl $15, %r9d
+; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax
; SSE2-NEXT: movd %eax, %xmm14
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%r10), %eax
+; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: andl $15, %esi
-; SSE2-NEXT: movzbl (%rsi,%r10), %eax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: andl $15, %edi
-; SSE2-NEXT: movzbl (%rdi,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
@@ -479,12 +478,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE2-NEXT: retq
;
@@ -499,63 +498,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm8
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm15
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm9
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm11
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm6
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax
; SSSE3-NEXT: movd %eax, %xmm12
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: andl $15, %edx
+; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax
; SSSE3-NEXT: movd %eax, %xmm5
-; SSSE3-NEXT: andl $15, %r9d
-; SSSE3-NEXT: movzbl (%r9,%r10), %eax
+; SSSE3-NEXT: andl $15, %esi
+; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm13
-; SSSE3-NEXT: andl $15, %r8d
-; SSSE3-NEXT: movzbl (%r8,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%r10), %eax
+; SSSE3-NEXT: andl $15, %edi
+; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: andl $15, %r9d
+; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%r10), %eax
+; SSSE3-NEXT: andl $15, %r8d
+; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: andl $15, %esi
-; SSSE3-NEXT: movzbl (%rsi,%r10), %eax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: andl $15, %edi
-; SSSE3-NEXT: movzbl (%rdi,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
@@ -564,12 +562,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: retq
;
@@ -583,49 +581,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $15, %edi
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: movzbl (%rdi,%rax), %edi
-; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: movzbl -24(%rsp,%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: andl $15, %esi
-; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0
+; SSE41-NEXT: pinsrb $1, -24(%rsp,%rsi), %xmm0
; SSE41-NEXT: andl $15, %edx
-; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0
+; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0
; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0
+; SSE41-NEXT: pinsrb $3, -24(%rsp,%rcx), %xmm0
; SSE41-NEXT: andl $15, %r8d
-; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0
+; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0
; SSE41-NEXT: andl $15, %r9d
-; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $6, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $7, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $8, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $9, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $10, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $11, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $12, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $13, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $14, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $15, (%rcx,%rax), %xmm0
+; SSE41-NEXT: pinsrb $5, -24(%rsp,%r9), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $6, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $7, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $8, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $9, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $10, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $11, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $12, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $13, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -638,49 +635,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movzbl (%rdi,%rax), %edi
-; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: movzbl -24(%rsp,%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: andl $15, %esi
-; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0
; AVX-NEXT: andl $15, %edx
-; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r8d
-; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r9d
-; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $6, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $7, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $8, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $9, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $10, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $11, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $12, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $13, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $14, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $15, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <16 x i8> %x, i8 %i0
%x1 = extractelement <16 x i8> %x, i8 %i1
@@ -819,69 +815,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movzbl 15(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm8
-; SSE2-NEXT: movzbl 14(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm15
-; SSE2-NEXT: movzbl 13(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm9
-; SSE2-NEXT: movzbl 12(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm3
-; SSE2-NEXT: movzbl 11(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm10
-; SSE2-NEXT: movzbl 10(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm7
-; SSE2-NEXT: movzbl 9(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm11
-; SSE2-NEXT: movzbl 8(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm6
-; SSE2-NEXT: movzbl 7(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm12
-; SSE2-NEXT: movzbl 6(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm5
-; SSE2-NEXT: movzbl 5(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm13
-; SSE2-NEXT: movzbl 4(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm4
-; SSE2-NEXT: movzbl 3(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm14
-; SSE2-NEXT: movzbl 2(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: movzbl 1(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: movzbl 15(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm8
+; SSE2-NEXT: movzbl 14(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm15
+; SSE2-NEXT: movzbl 13(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm9
+; SSE2-NEXT: movzbl 12(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: movzbl 11(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm10
+; SSE2-NEXT: movzbl 10(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm7
+; SSE2-NEXT: movzbl 9(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm11
+; SSE2-NEXT: movzbl 8(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm6
+; SSE2-NEXT: movzbl 7(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm12
+; SSE2-NEXT: movzbl 6(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm5
+; SSE2-NEXT: movzbl 5(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm13
+; SSE2-NEXT: movzbl 4(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm4
+; SSE2-NEXT: movzbl 3(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm14
+; SSE2-NEXT: movzbl 2(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movzbl 1(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -904,69 +899,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSSE3: # %bb.0:
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl 15(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm8
-; SSSE3-NEXT: movzbl 14(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm15
-; SSSE3-NEXT: movzbl 13(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm9
-; SSSE3-NEXT: movzbl 12(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: movzbl 11(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm10
-; SSSE3-NEXT: movzbl 10(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm7
-; SSSE3-NEXT: movzbl 9(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm11
-; SSSE3-NEXT: movzbl 8(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm6
-; SSSE3-NEXT: movzbl 7(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm12
-; SSSE3-NEXT: movzbl 6(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm5
-; SSSE3-NEXT: movzbl 5(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm13
-; SSSE3-NEXT: movzbl 4(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm4
-; SSSE3-NEXT: movzbl 3(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm14
-; SSSE3-NEXT: movzbl 2(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: movzbl 1(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm2
+; SSSE3-NEXT: movzbl 15(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm8
+; SSSE3-NEXT: movzbl 14(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm15
+; SSSE3-NEXT: movzbl 13(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm9
+; SSSE3-NEXT: movzbl 12(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: movzbl 11(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm10
+; SSSE3-NEXT: movzbl 10(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm7
+; SSSE3-NEXT: movzbl 9(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm11
+; SSSE3-NEXT: movzbl 8(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm6
+; SSSE3-NEXT: movzbl 7(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm12
+; SSSE3-NEXT: movzbl 6(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm5
+; SSSE3-NEXT: movzbl 5(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm13
+; SSSE3-NEXT: movzbl 4(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm4
+; SSSE3-NEXT: movzbl 3(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm14
+; SSSE3-NEXT: movzbl 2(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movzbl 1(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -990,54 +984,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: andl $15, %eax
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movzbl (%rax,%rcx), %eax
+; SSE41-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: movzbl 1(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $1, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $1, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 2(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $2, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $2, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 3(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $3, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $3, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 4(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $4, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $4, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 5(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $5, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $5, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 6(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $6, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $6, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 7(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $7, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $7, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 8(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $8, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $8, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 9(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $9, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $9, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 10(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $10, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $10, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 11(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $11, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $11, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 12(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $12, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $12, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 13(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $13, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $13, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 14(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $14, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 15(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $15, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -1045,54 +1038,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: andl $15, %eax
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movzbl (%rax,%rcx), %eax
+; AVX-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: movzbl 1(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 2(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 3(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 4(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 5(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 6(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 7(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 8(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 9(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 10(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 11(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 12(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 13(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 14(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 15(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: retq
%p0 = getelementptr inbounds i8, i8* %i, i64 0
%p1 = getelementptr inbounds i8, i8* %i, i64 1
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index d25117ca715c..fd4c30fb327b 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -1504,12 +1504,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1531,12 +1528,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -1647,43 +1641,13 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512F-LABEL: trunc2x4i32_8i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x4i32_8i16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x4i32_8i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x4i32_8i16:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc2x4i32_8i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 94eadd8c1aaf..3ea65573fc70 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -3,7 +3,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -1911,11 +1912,28 @@ define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: retq
+; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
%Z = bitcast <8 x i16> %B to <4 x i32>
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 985f6a861b93..3d4355e5f39c 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -182,7 +182,7 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test11:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
@@ -190,12 +190,12 @@ define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
;
; SSE41-LABEL: test11:
; SSE41: # %bb.0:
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: test11:
; AVX: # %bb.0:
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s b/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s
index 9158c60fbeb5..fb095e3ebc25 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s
@@ -2,9 +2,10 @@
# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
# Test object to verify that dwarfdump handles dwp files with DWARF v5 string
-# offset tables. We have 2 CUs and 2 TUs, where it is assumed that
+# offset tables. We have 3 CUs and 2 TUs, where it is assumed that
# CU1 and TU1 came from one object file, CU2 and TU2 from a second object
-# file.
+# file, and CU3 from a third object file that was compiled with
+# -gdwarf-4.
#
.section .debug_str.dwo,"MS",@progbits,1
str_producer:
@@ -25,56 +26,50 @@ str_TU2:
.asciz "Type_Unit_2"
str_TU2_type:
.asciz "MyStruct_2"
+str_CU3:
+ .asciz "Compile_Unit_3"
+str_CU3_dir:
+ .asciz "/home/test/CU3"
.section .debug_str_offsets.dwo,"",@progbits
# Object files 1's portion of the .debug_str_offsets.dwo section.
-.debug_str_offsets_object_file1:
-
-# CU1's contribution (from object file 1)
-.debug_str_offsets_start_CU1:
- .long .debug_str_offsets_end_CU1-.debug_str_offsets_base_CU1
+# CU1 and TU1 share a contribution to the string offsets table.
+.debug_str_offsets_object_file1_start:
+ .long .debug_str_offsets_object_file1_end-.debug_str_offsets_base_1
.short 5 # DWARF version
.short 0 # Padding
-.debug_str_offsets_base_CU1:
+.debug_str_offsets_base_1:
.long str_producer-.debug_str.dwo
.long str_CU1-.debug_str.dwo
.long str_CU1_dir-.debug_str.dwo
-.debug_str_offsets_end_CU1:
-
-# TU1's contribution (from object file 1)
-.debug_str_offsets_start_TU1:
- .long .debug_str_offsets_end_TU1-.debug_str_offsets_base_TU1
- .short 5 # DWARF version
- .short 0 # Padding
-.debug_str_offsets_base_TU1:
.long str_TU1-.debug_str.dwo
.long str_TU1_type-.debug_str.dwo
-.debug_str_offsets_end_TU1:
+.debug_str_offsets_object_file1_end:
# Object files 2's portion of the .debug_str_offsets.dwo section.
-.debug_str_offsets_object_file2:
-
-# CU2's contribution (from object file 2)
-.debug_str_offsets_start_CU2:
- .long .debug_str_offsets_end_CU2-.debug_str_offsets_base_CU2
+# CU2 and TU2 share a contribution to the string offsets table.
+.debug_str_offsets_object_file2_start:
+ .long .debug_str_offsets_object_file2_end-.debug_str_offsets_base_2
.short 5 # DWARF version
.short 0 # Padding
-.debug_str_offsets_base_CU2:
+.debug_str_offsets_base_2:
.long str_producer-.debug_str.dwo
.long str_CU2-.debug_str.dwo
.long str_CU2_dir-.debug_str.dwo
-.debug_str_offsets_end_CU2:
-
-# TU2's contribution (from object file 2)
-.debug_str_offsets_start_TU2:
- .long .debug_str_offsets_end_TU2-.debug_str_offsets_base_TU2
- .short 5 # DWARF version
- .short 0 # Padding
-.debug_str_offsets_base_TU2:
.long str_TU2-.debug_str.dwo
.long str_TU2_type-.debug_str.dwo
-.debug_str_offsets_end_TU2:
+.debug_str_offsets_object_file2_end:
+# Object files 3's portion of the .debug_str_offsets.dwo section.
+# This file is assumed to have been compiled with -gdwarf-4 and
+# therefore contains a version 4 CU and a GNU format contribution
+# to the .debug_str_offsets section.
+.debug_str_offsets_object_file3_start:
+.debug_str_offsets_base_3:
+ .long str_producer-.debug_str.dwo
+ .long str_CU3-.debug_str.dwo
+ .long str_CU3_dir-.debug_str.dwo
+.debug_str_offsets_object_file3_end:
# Abbrevs are shared for all compile and type units
.section .debug_abbrev.dwo,"",@progbits
@@ -85,8 +80,6 @@ str_TU2_type:
.byte 0x1a # DW_FORM_strx
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
.byte 0x00 # EOM(1)
@@ -96,8 +89,6 @@ str_TU2_type:
.byte 0x01 # DW_CHILDREN_yes
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x03 # Abbrev code
@@ -107,6 +98,17 @@ str_TU2_type:
.byte 0x1a # DW_FORM_strx
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
+ .byte 0x04 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x25 # DW_AT_producer
+ .short 0x3e82 # DW_FORM_GNU_str_index
+ .byte 0x03 # DW_AT_name
+ .short 0x3e82 # DW_FORM_GNU_str_index
+ .byte 0x03 # DW_AT_name
+ .short 0x3e82 # DW_FORM_GNU_str_index
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
.byte 0x00 # EOM(3)
abbrev_end:
@@ -120,15 +122,11 @@ CU1_5_version:
.byte 1 # DWARF Unit Type
.byte 8 # Address Size (in bytes)
.long .debug_abbrev.dwo # Offset Into Abbrev. Section
-# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name,
-# DW_AT_str_offsets and DW_AT_compdir.
+# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name
+# and DW_AT_comp_dir.
.byte 1 # Abbreviation code
.byte 0 # The index of the producer string
.byte 1 # The index of the CU name string
-# The DW_AT_str_offsets_base attribute for CU1 contains the offset of CU1's
-# contribution relative to the start of object file 1's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_CU1-.debug_str_offsets_object_file1
.byte 2 # The index of the comp dir string
.byte 0 # NULL
CU1_5_end:
@@ -140,19 +138,30 @@ CU2_5_version:
.byte 1 # DWARF Unit Type
.byte 8 # Address Size (in bytes)
.long .debug_abbrev.dwo # Offset Into Abbrev. Section
-# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name,
-# DW_AT_str_offsets and DW_AT_compdir.
+# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name
+# and DW_AT_comp_dir.
.byte 1 # Abbreviation code
.byte 0 # The index of the producer string
.byte 1 # The index of the CU name string
-# The DW_AT_str_offsets_base attribute for CU2 contains the offset of CU2's
-# contribution relative to the start of object file 2's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_CU2-.debug_str_offsets_object_file2
.byte 2 # The index of the comp dir string
.byte 0 # NULL
CU2_5_end:
+CU3_4_start:
+ .long CU3_4_end-CU3_4_version # Length of Unit
+CU3_4_version:
+ .short 4 # DWARF version number
+ .long .debug_abbrev.dwo # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name
+# and DW_AT_comp_dir.
+ .byte 4 # Abbreviation code
+ .byte 0 # The index of the producer string
+ .byte 1 # The index of the CU name string
+ .byte 2 # The index of the comp dir string
+ .byte 0 # NULL
+CU3_4_end:
+
.section .debug_types.dwo,"",@progbits
# DWARF v5 Type unit header.
TU1_5_start:
@@ -166,15 +175,11 @@ TU1_5_version:
.long TU1_5_type-TU1_5_start # Type offset
# The type-unit DIE, which has a name.
.byte 2 # Abbreviation code
- .byte 0 # Index of the unit type name string
-# The DW_AT_str_offsets_base attribute for TU1 contains the offset of TU1's
-# contribution relative to the start of object file 1's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_TU1-.debug_str_offsets_object_file1
+ .byte 3 # Index of the unit type name string
# The type DIE, which has a name.
TU1_5_type:
.byte 3 # Abbreviation code
- .byte 1 # Index of the type name string
+ .byte 4 # Index of the type name string
.byte 0 # NULL
.byte 0 # NULL
TU1_5_end:
@@ -190,15 +195,11 @@ TU2_5_version:
.long TU2_5_type-TU2_5_start # Type offset
# The type-unit DIE, which has a name.
.byte 2 # Abbreviation code
- .byte 0 # Index of the unit type name string
-# The DW_AT_str_offsets_base attribute for TU2 contains the offset of TU2's
-# contribution relative to the start of object file 2's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_TU2-.debug_str_offsets_object_file2
+ .byte 3 # Index of the unit type name string
# The type DIE, which has a name.
TU2_5_type:
.byte 3 # Abbreviation code
- .byte 1 # Index of the type name string
+ .byte 4 # Index of the type name string
.byte 0 # NULL
.byte 0 # NULL
TU2_5_end:
@@ -207,37 +208,45 @@ TU2_5_end:
# The index header
.long 2 # Version
.long 3 # Columns of contribution matrix
- .long 2 # number of units
- .long 2 # number of hash buckets in table
+ .long 3 # number of units
+ .long 3 # number of hash buckets in table
- # The signatures for both CUs.
+ # The signatures for all CUs.
.quad 0xddeeaaddbbaabbee # signature 1
.quad 0xff00ffeeffaaff00 # signature 2
+	.quad 0xf00df00df00df00d # signature 3
# The indexes for both CUs.
.long 1 # index 1
.long 2 # index 2
- # The sections to which both CUs contribute.
+ .long 3 # index 3
+ # The sections to which all CUs contribute.
.long 1 # DW_SECT_INFO
.long 3 # DW_SECT_ABBREV
.long 6 # DW_SECT_STR_OFFSETS
- # The starting offsets of both CU's contributions to info,
+	# The starting offsets of all CUs' contributions to info,
# abbrev and string offsets table.
.long CU1_5_start-.debug_info.dwo
.long 0
- .long .debug_str_offsets_object_file1-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file1_start-.debug_str_offsets.dwo
.long CU2_5_start-.debug_info.dwo
.long 0
- .long .debug_str_offsets_object_file2-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file2_start-.debug_str_offsets.dwo
+ .long CU3_4_start-.debug_info.dwo
+ .long 0
+ .long .debug_str_offsets_object_file3_start-.debug_str_offsets.dwo
- # The lengths of both CU's contributions to info, abbrev and
+	# The lengths of all CUs' contributions to info, abbrev and
# string offsets table.
.long CU1_5_end-CU1_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_CU1-.debug_str_offsets_start_CU1
+ .long .debug_str_offsets_object_file1_end-.debug_str_offsets_object_file1_start
.long CU2_5_end-CU2_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_CU2-.debug_str_offsets_start_CU2
+ .long .debug_str_offsets_object_file2_end-.debug_str_offsets_object_file2_start
+ .long CU3_4_end-CU3_4_start
+ .long abbrev_end-.debug_abbrev.dwo
+ .long .debug_str_offsets_object_file3_end-.debug_str_offsets_object_file3_start
.section .debug_tu_index,"",@progbits
# The index header
@@ -261,19 +270,19 @@ TU2_5_end:
# abbrev and string offsets table.
.long TU1_5_start-.debug_types.dwo
.long 0
- .long .debug_str_offsets_object_file1-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file1_start-.debug_str_offsets.dwo
.long TU2_5_start-.debug_types.dwo
.long 0
- .long .debug_str_offsets_object_file2-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file2_start-.debug_str_offsets.dwo
# The lengths of both TU's contributions to info, abbrev and
# string offsets table.
.long TU1_5_end-TU1_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_TU1-.debug_str_offsets_start_TU1
+ .long .debug_str_offsets_object_file1_end-.debug_str_offsets_object_file1_start
.long TU2_5_end-TU2_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_TU2-.debug_str_offsets_start_TU2
+ .long .debug_str_offsets_object_file2_end-.debug_str_offsets_object_file2_start
# Verify that the correct strings from each unit are displayed and that the
@@ -284,7 +293,6 @@ TU2_5_end:
# CHECK: DW_TAG_compile_unit
# CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1")
# CHECK-NOT: NULL
@@ -293,26 +301,23 @@ TU2_5_end:
# CHECK: DW_TAG_compile_unit
# CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
#
# CHECK: Type Unit
# CHECK-NOT: NULL
# CHECK: DW_TAG_type_unit
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit_1")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "Type_Unit_1")
# CHECK-NOT: NULL
# CHECK: DW_TAG_structure_type
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct_1")
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "MyStruct_1")
#
# CHECK: Type Unit
# CHECK-NOT: NULL
# CHECK: DW_TAG_type_unit
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit_2")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "Type_Unit_2")
# CHECK-NOT: NULL
# CHECK: DW_TAG_structure_type
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct_2")
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "MyStruct_2")
# Verify the correct offsets of the compile and type units' contributions in the
# index tables.
@@ -322,11 +327,11 @@ TU2_5_end:
# CHECK: 1 0xddeeaaddbbaabbee [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
# CHECK-SAME: [0x00000000
# CHECK-NEXT: 2 0xff00ffeeffaaff00 [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
-# CHECK-SAME: [0x00000024
+# CHECK-SAME: [0x0000001c
# CHECK: .debug_tu_index contents:
# CHECK-NOT: contents:
# CHECK: 1 0xeeaaddbbaabbeedd [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
# CHECK-SAME: [0x00000000
# CHECK-NEXT: 2 0x00ffeeffaaff00ff [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
-# CHECK: [0x00000024
+# CHECK: [0x0000001c
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s
index ebde22213728..0ff6fdbb0aa7 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s
@@ -37,6 +37,8 @@ dwo_str_TU_5_type:
.byte 0x01 # Abbrev code
.byte 0x11 # DW_TAG_compile_unit
.byte 0x00 # DW_CHILDREN_no
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x00 # EOM(3)
@@ -54,13 +56,13 @@ CU1_5_version:
.long .debug_abbrev # Offset Into Abbrev. Section
# A compile-unit DIE, which has no attributes.
.byte 1 # Abbreviation code
+ .long .debug_str_offsets_base0
CU1_5_end:
.section .debug_str_offsets,"",@progbits
# CU1's contribution
# Invalid length
.long 0xfffffffe
- .long .debug_str_offsets_segment0_end-.debug_str_offsets_base0
.short 5 # DWARF version
.short 0 # Padding
.debug_str_offsets_base0:
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s
index 9c6094ebade1..36ac7124a9f0 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s
@@ -15,6 +15,8 @@ str_CU1:
.byte 0x01 # Abbrev code
.byte 0x11 # DW_TAG_compile_unit
.byte 0x00 # DW_CHILDREN_no
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x00 # EOM(3)
@@ -32,6 +34,7 @@ CU1_5_version:
.long .debug_abbrev # Offset Into Abbrev. Section
# A compile-unit DIE, which has no attributes.
.byte 1 # Abbreviation code
+ .long .debug_str_offsets_base0
CU1_5_end:
# Every unit contributes to the string_offsets table.
@@ -50,4 +53,4 @@ CU1_5_end:
# INVALIDLENGTH: .debug_str_offsets contents:
# INVALIDLENGTH-NOT: contents:
-# INVALIDLENGTH: error: contribution to string offsets table in section .debug_str_offsets has invalid length.
+# INVALIDLENGTH: error: invalid contribution to string offsets table in section .debug_str_offsets.
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s
new file mode 100644
index 000000000000..28c4a418d125
--- /dev/null
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s
@@ -0,0 +1,94 @@
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck --check-prefix=OVERLAP %s
+#
+# Test object to verify that llvm-dwarfdump handles an invalid string offsets
+# table with overlapping contributions.
+
+ .section .debug_str,"MS",@progbits,1
+str_producer:
+ .asciz "Handmade DWARF producer"
+str_CU1:
+ .asciz "Compile_Unit_1"
+str_CU1_dir:
+ .asciz "/home/test/CU1"
+str_CU2:
+ .asciz "Compile_Unit_2"
+str_CU2_dir:
+ .asciz "/home/test/CU2"
+str_TU:
+ .asciz "Type_Unit"
+str_TU_type:
+ .asciz "MyStruct"
+
+ .section .debug_str.dwo,"MS",@progbits,1
+dwo_str_CU_5_producer:
+ .asciz "Handmade split DWARF producer"
+dwo_str_CU_5_name:
+ .asciz "V5_split_compile_unit"
+dwo_str_CU_5_comp_dir:
+ .asciz "/home/test/splitCU"
+dwo_str_TU_5:
+ .asciz "V5_split_type_unit"
+dwo_str_TU_5_type:
+ .asciz "V5_split_Mystruct"
+
+# A rudimentary abbrev section.
+ .section .debug_abbrev,"",@progbits
+ .byte 0x01 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x00 # EOM(3)
+
+ .section .debug_info,"",@progbits
+# DWARF v5 CU header.
+ .long CU1_5_end-CU1_5_version # Length of Unit
+CU1_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+# A compile-unit DIE, which has no attributes.
+ .byte 1 # Abbreviation code
+ .long .debug_str_offsets_base0
+CU1_5_end:
+
+# DWARF v5 CU header.
+ .long CU2_5_end-CU2_5_version # Length of Unit
+CU2_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+# A compile-unit DIE, which has no attributes.
+ .byte 1 # Abbreviation code
+ .long .debug_str_offsets_base1
+CU2_5_end:
+
+ .section .debug_str_offsets,"",@progbits
+# CU1's contribution
+ .long .debug_str_offsets_segment1_end-.debug_str_offsets_base0
+ .short 5 # DWARF version
+ .short 0 # Padding
+.debug_str_offsets_base0:
+ .long str_producer
+ .long str_CU1
+ .long str_CU1_dir
+.debug_str_offsets_segment0_end:
+# CU2's contribution
+# Overlapping with CU1's contribution
+ .long .debug_str_offsets_segment1_end-.debug_str_offsets_base1
+ .short 5 # DWARF version
+ .short 0 # Padding
+.debug_str_offsets_base1:
+ .long str_producer
+ .long str_CU2
+ .long str_CU2_dir
+.debug_str_offsets_segment1_end:
+
+# OVERLAP: .debug_str_offsets contents:
+# OVERLAP-NOT: contents:
+# OVERLAP: error: overlapping contributions to string offsets table in section .debug_str_offsets.
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s b/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s
index 34f151d37d95..f8f48ea13882 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s
@@ -43,14 +43,17 @@ Ldebug_str_offsets_base0:
.long str_Variable2
.long str_Variable3
Ldebug_str_offsets_segment0_end:
-# CU2's contribution
- .long Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1
+# A 4-byte gap.
+ .long 0
+# CU2's contribution (DWARF64 format)
+ .long 0xffffffff
+ .quad Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1
.short 5 # DWARF version
.short 0 # Padding
Ldebug_str_offsets_base1:
- .long str_producer
- .long str_CU2
- .long str_CU2_dir
+ .quad str_producer
+ .quad str_CU2
+ .quad str_CU2_dir
Ldebug_str_offsets_segment1_end:
# The TU's contribution
.long Ldebug_str_offsets_segment2_end-Ldebug_str_offsets_base2
@@ -234,20 +237,20 @@ TU_5_end:
# COMMON: DW_TAG_compile_unit
# COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c)
+# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000038)
# COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
#
# The type unit
# COMMON: .debug_types contents:
# COMMON: DW_TAG_type_unit
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040)
+# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000058)
# COMMON: DW_TAG_structure_type
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct")
#
# The .debug_str_offsets section
# COMMON: .debug_str_offsets contents:
-# COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5
+# COMMON-NEXT: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5
# COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer"
# COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1"
# COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1"
@@ -255,10 +258,11 @@ TU_5_end:
# COMMON-NEXT: 0x00000018: 0000006e "MyVar1"
# COMMON-NEXT: 0x0000001c: 00000075 "MyVar2"
# COMMON-NEXT: 0x00000020: 0000007c "MyVar3"
-# COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5
-# COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer"
-# COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2"
-# COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2"
-# COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5
-# COMMON-NEXT: 0x00000040: 00000054 "Type_Unit"
-# COMMON-NEXT: 0x00000044: 0000005e "MyStruct"
+# COMMON-NEXT: 0x00000024: Gap, length = 4
+# COMMON-NEXT: 0x00000028: Contribution size = 24, Format = DWARF64, Version = 5
+# COMMON-NEXT: 0x00000038: 00000000 "Handmade DWARF producer"
+# COMMON-NEXT: 0x00000040: 00000036 "Compile_Unit_2"
+# COMMON-NEXT: 0x00000048: 00000045 "/home/test/CU2"
+# COMMON-NEXT: 0x00000050: Contribution size = 8, Format = DWARF32, Version = 5
+# COMMON-NEXT: 0x00000058: 00000054 "Type_Unit"
+# COMMON-NEXT: 0x0000005c: 0000005e "MyStruct"
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets.s b/test/DebugInfo/X86/dwarfdump-str-offsets.s
index 5fbef186af60..363775a59f50 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets.s
@@ -44,14 +44,17 @@ str_Variable3:
.long str_Variable2
.long str_Variable3
.debug_str_offsets_segment0_end:
-# CU2's contribution
- .long .debug_str_offsets_segment1_end-.debug_str_offsets_base1
+# A 4-byte gap.
+ .long 0
+# CU2's contribution in DWARF64 format
+ .long 0xffffffff
+ .quad .debug_str_offsets_segment1_end-.debug_str_offsets_base1
.short 5 # DWARF version
.short 0 # Padding
.debug_str_offsets_base1:
- .long str_producer
- .long str_CU2
- .long str_CU2_dir
+ .quad str_producer
+ .quad str_CU2
+ .quad str_CU2_dir
.debug_str_offsets_segment1_end:
# The TU's contribution
.long .debug_str_offsets_segment2_end-.debug_str_offsets_base2
@@ -75,7 +78,7 @@ dwo_str_TU_5_type:
.asciz "V5_split_Mystruct"
.section .debug_str_offsets.dwo,"",@progbits
-# The split CU's contribution
+# One contribution only in a .dwo file
.long .debug_dwo_str_offsets_segment0_end-.debug_dwo_str_offsets_base0
.short 5 # DWARF version
.short 0 # Padding
@@ -83,15 +86,9 @@ dwo_str_TU_5_type:
.long dwo_str_CU_5_producer-.debug_str.dwo
.long dwo_str_CU_5_name-.debug_str.dwo
.long dwo_str_CU_5_comp_dir-.debug_str.dwo
-.debug_dwo_str_offsets_segment0_end:
-# The split TU's contribution
- .long .debug_dwo_str_offsets_segment1_end-.debug_dwo_str_offsets_base1
- .short 5 # DWARF version
- .short 0 # Padding
-.debug_dwo_str_offsets_base1:
.long dwo_str_TU_5-.debug_str.dwo
.long dwo_str_TU_5_type-.debug_str.dwo
-.debug_dwo_str_offsets_segment1_end:
+.debug_dwo_str_offsets_segment0_end:
# All CUs/TUs use the same abbrev section for simplicity.
.section .debug_abbrev,"",@progbits
@@ -163,8 +160,6 @@ dwo_str_TU_5_type:
.byte 0x1a # DW_FORM_strx
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x1b # DW_AT_comp_dir
.byte 0x1a # DW_FORM_strx
.byte 0x00 # EOM(1)
@@ -174,8 +169,6 @@ dwo_str_TU_5_type:
.byte 0x01 # DW_CHILDREN_yes
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x03 # Abbrev code
@@ -275,7 +268,6 @@ CU_split_5_version:
.byte 1 # Abbreviation code
.byte 0 # The index of the producer string
.byte 1 # The index of the CU name string
- .long .debug_dwo_str_offsets_base0-.debug_str_offsets.dwo
.byte 2 # The index of the comp dir string
.byte 0 # NULL
CU_split_5_end:
@@ -294,12 +286,11 @@ TU_split_5_version:
.long TU_split_5_type-TU_split_5_start # Type offset
# The type-unit DIE, which has a name.
.byte 2 # Abbreviation code
- .byte 0 # The index of the type unit name string
- .long .debug_dwo_str_offsets_base1-.debug_str_offsets.dwo
+ .byte 3 # The index of the type unit name string
# The type DIE, which has a name.
TU_split_5_type:
.byte 3 # Abbreviation code
- .byte 1 # The index of the type name string
+ .byte 4 # The index of the type name string
.byte 0 # NULL
.byte 0 # NULL
TU_split_5_end:
@@ -340,7 +331,7 @@ TU_split_5_end:
# COMMON: DW_TAG_compile_unit
# COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c)
+# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000038)
# COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
#
# The split CU
@@ -349,28 +340,25 @@ TU_split_5_end:
# SPLIT: DW_TAG_compile_unit
# SPLIT-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer")
# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit")
-# SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
# SPLIT-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU")
#
# The type unit
# COMMON: .debug_types contents:
# COMMON: DW_TAG_type_unit
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040)
# COMMON: DW_TAG_structure_type
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct")
#
# The split type unit
# SPLIT: .debug_types.dwo contents:
# SPLIT: DW_TAG_type_unit
-# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit")
-# SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "V5_split_type_unit")
# SPLIT: DW_TAG_structure_type
-# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct")
+# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "V5_split_Mystruct")
#
# The .debug_str_offsets section
# COMMON: .debug_str_offsets contents:
-# COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5
+# COMMON-NEXT: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5
# COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer"
# COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1"
# COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1"
@@ -378,19 +366,19 @@ TU_split_5_end:
# COMMON-NEXT: 0x00000018: 0000006e "MyVar1"
# COMMON-NEXT: 0x0000001c: 00000075 "MyVar2"
# COMMON-NEXT: 0x00000020: 0000007c "MyVar3"
-# COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5
-# COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer"
-# COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2"
-# COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2"
-# COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5
-# COMMON-NEXT: 0x00000040: 00000054 "Type_Unit"
-# COMMON-NEXT: 0x00000044: 0000005e "MyStruct"
+# COMMON-NEXT: Gap, length = 4
+# COMMON-NEXT: 0x00000028: Contribution size = 24, Format = DWARF64, Version = 5
+# COMMON-NEXT: 0x00000038: 00000000 "Handmade DWARF producer"
+# COMMON-NEXT: 0x00000040: 00000036 "Compile_Unit_2"
+# COMMON-NEXT: 0x00000048: 00000045 "/home/test/CU2"
+# COMMON-NEXT: 0x00000050: Contribution size = 8, Format = DWARF32, Version = 5
+# COMMON-NEXT: 0x00000058: 00000054 "Type_Unit"
+# COMMON-NEXT: 0x0000005c: 0000005e "MyStruct"
#
# SPLIT: .debug_str_offsets.dwo contents:
-# SPLIT-NEXT: 0x00000000: Contribution size = 12, Version = 5
+# SPLIT-NEXT: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 5
# SPLIT-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer"
# SPLIT-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit"
# SPLIT-NEXT: 0x00000010: 00000034 "/home/test/splitCU"
-# SPLIT-NEXT: 0x00000014: Contribution size = 8, Version = 5
-# SPLIT-NEXT: 0x0000001c: 00000047 "V5_split_type_unit"
-# SPLIT-NEXT: 0x00000020: 0000005a "V5_split_Mystruct"
+# SPLIT-NEXT: 0x00000014: 00000047 "V5_split_type_unit"
+# SPLIT-NEXT: 0x00000018: 0000005a "V5_split_Mystruct"
diff --git a/test/Instrumentation/HWAddressSanitizer/basic.ll b/test/Instrumentation/HWAddressSanitizer/basic.ll
index db1f52fc56e9..a5e49062669e 100644
--- a/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ b/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -1,6 +1,7 @@
; Test basic address sanitizer instrumentation.
;
-; RUN: opt < %s -hwasan -S | FileCheck %s
+; RUN: opt < %s -hwasan -hwasan-recover=0 -S | FileCheck %s --check-prefixes=CHECK,ABORT
+; RUN: opt < %s -hwasan -hwasan-recover=1 -S | FileCheck %s --check-prefixes=CHECK,RECOVER
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android"
@@ -17,8 +18,10 @@ define i8 @test_load8(i8* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #256", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #256", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #288", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i8, i8* %a, align 4
; CHECK: ret i8 %[[G]]
@@ -40,8 +43,10 @@ define i16 @test_load16(i16* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #257", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #257", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #289", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i16, i16* %a, align 4
; CHECK: ret i16 %[[G]]
@@ -63,8 +68,10 @@ define i32 @test_load32(i32* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #258", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #258", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #290", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i32, i32* %a, align 4
; CHECK: ret i32 %[[G]]
@@ -86,8 +93,10 @@ define i64 @test_load64(i64* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #259", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #259", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #291", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i64, i64* %a, align 8
; CHECK: ret i64 %[[G]]
@@ -109,8 +118,10 @@ define i128 @test_load128(i128* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #260", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #260", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #292", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i128, i128* %a, align 16
; CHECK: ret i128 %[[G]]
@@ -123,7 +134,8 @@ entry:
define i40 @test_load40(i40* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_load(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_load(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_load_noabort(i64 %[[A]], i64 5)
; CHECK: %[[B:[^ ]*]] = load i40, i40* %a
; CHECK: ret i40 %[[B]]
@@ -144,8 +156,10 @@ define void @test_store8(i8* %a, i8 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #272", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #272", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #304", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i8 %b, i8* %a, align 4
; CHECK: ret void
@@ -167,8 +181,10 @@ define void @test_store16(i16* %a, i16 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #273", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #273", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #305", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i16 %b, i16* %a, align 4
; CHECK: ret void
@@ -190,8 +206,10 @@ define void @test_store32(i32* %a, i32 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #274", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #274", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #306", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i32 %b, i32* %a, align 4
; CHECK: ret void
@@ -213,8 +231,10 @@ define void @test_store64(i64* %a, i64 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #275", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #275", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #307", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i64 %b, i64* %a, align 8
; CHECK: ret void
@@ -236,8 +256,10 @@ define void @test_store128(i128* %a, i128 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #276", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #276", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #308", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i128 %b, i128* %a, align 16
; CHECK: ret void
@@ -250,7 +272,8 @@ entry:
define void @test_store40(i40* %a, i40 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_store(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_store(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 5)
; CHECK: store i40 %b, i40* %a
; CHECK: ret void
@@ -262,7 +285,8 @@ entry:
define void @test_store_unaligned(i64* %a, i64 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store_unaligned(
; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64
-; CHECK: call void @__hwasan_store(i64 %[[A]], i64 8)
+; ABORT: call void @__hwasan_store(i64 %[[A]], i64 8)
+; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 8)
; CHECK: store i64 %b, i64* %a, align 4
; CHECK: ret void
diff --git a/test/Instrumentation/HWAddressSanitizer/with-calls.ll b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
index be3b33d4f8d6..0a1713b83912 100644
--- a/test/Instrumentation/HWAddressSanitizer/with-calls.ll
+++ b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
@@ -1,6 +1,7 @@
; Test basic address sanitizer instrumentation.
;
-; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -S | FileCheck %s
+; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -S | FileCheck %s --check-prefixes=CHECK,ABORT
+; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -hwasan-recover=1 -S | FileCheck %s --check-prefixes=CHECK,RECOVER
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android"
@@ -8,7 +9,8 @@ target triple = "aarch64--linux-android"
define i8 @test_load8(i8* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load8(
; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %a to i64
-; CHECK: call void @__hwasan_load1(i64 %[[A]])
+; ABORT: call void @__hwasan_load1(i64 %[[A]])
+; RECOVER: call void @__hwasan_load1_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i8, i8* %a
; CHECK: ret i8 %[[B]]
@@ -20,7 +22,8 @@ entry:
define i16 @test_load16(i16* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load16(
; CHECK: %[[A:[^ ]*]] = ptrtoint i16* %a to i64
-; CHECK: call void @__hwasan_load2(i64 %[[A]])
+; ABORT: call void @__hwasan_load2(i64 %[[A]])
+; RECOVER: call void @__hwasan_load2_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i16, i16* %a
; CHECK: ret i16 %[[B]]
@@ -32,7 +35,8 @@ entry:
define i32 @test_load32(i32* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load32(
; CHECK: %[[A:[^ ]*]] = ptrtoint i32* %a to i64
-; CHECK: call void @__hwasan_load4(i64 %[[A]])
+; ABORT: call void @__hwasan_load4(i64 %[[A]])
+; RECOVER: call void @__hwasan_load4_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i32, i32* %a
; CHECK: ret i32 %[[B]]
@@ -44,7 +48,8 @@ entry:
define i64 @test_load64(i64* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load64(
; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64
-; CHECK: call void @__hwasan_load8(i64 %[[A]])
+; ABORT: call void @__hwasan_load8(i64 %[[A]])
+; RECOVER: call void @__hwasan_load8_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i64, i64* %a
; CHECK: ret i64 %[[B]]
@@ -56,7 +61,8 @@ entry:
define i128 @test_load128(i128* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load128(
; CHECK: %[[A:[^ ]*]] = ptrtoint i128* %a to i64
-; CHECK: call void @__hwasan_load16(i64 %[[A]])
+; ABORT: call void @__hwasan_load16(i64 %[[A]])
+; RECOVER: call void @__hwasan_load16_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i128, i128* %a
; CHECK: ret i128 %[[B]]
@@ -68,7 +74,8 @@ entry:
define i40 @test_load40(i40* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_load(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_load(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_load_noabort(i64 %[[A]], i64 5)
; CHECK: %[[B:[^ ]*]] = load i40, i40* %a
; CHECK: ret i40 %[[B]]
@@ -80,7 +87,8 @@ entry:
define void @test_store8(i8* %a, i8 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store8(
; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %a to i64
-; CHECK: call void @__hwasan_store1(i64 %[[A]])
+; ABORT: call void @__hwasan_store1(i64 %[[A]])
+; RECOVER: call void @__hwasan_store1_noabort(i64 %[[A]])
; CHECK: store i8 %b, i8* %a
; CHECK: ret void
@@ -92,7 +100,8 @@ entry:
define void @test_store16(i16* %a, i16 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store16(
; CHECK: %[[A:[^ ]*]] = ptrtoint i16* %a to i64
-; CHECK: call void @__hwasan_store2(i64 %[[A]])
+; ABORT: call void @__hwasan_store2(i64 %[[A]])
+; RECOVER: call void @__hwasan_store2_noabort(i64 %[[A]])
; CHECK: store i16 %b, i16* %a
; CHECK: ret void
@@ -104,7 +113,8 @@ entry:
define void @test_store32(i32* %a, i32 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store32(
; CHECK: %[[A:[^ ]*]] = ptrtoint i32* %a to i64
-; CHECK: call void @__hwasan_store4(i64 %[[A]])
+; ABORT: call void @__hwasan_store4(i64 %[[A]])
+; RECOVER: call void @__hwasan_store4_noabort(i64 %[[A]])
; CHECK: store i32 %b, i32* %a
; CHECK: ret void
@@ -116,7 +126,8 @@ entry:
define void @test_store64(i64* %a, i64 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store64(
; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64
-; CHECK: call void @__hwasan_store8(i64 %[[A]])
+; ABORT: call void @__hwasan_store8(i64 %[[A]])
+; RECOVER: call void @__hwasan_store8_noabort(i64 %[[A]])
; CHECK: store i64 %b, i64* %a
; CHECK: ret void
@@ -128,7 +139,8 @@ entry:
define void @test_store128(i128* %a, i128 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store128(
; CHECK: %[[A:[^ ]*]] = ptrtoint i128* %a to i64
-; CHECK: call void @__hwasan_store16(i64 %[[A]])
+; ABORT: call void @__hwasan_store16(i64 %[[A]])
+; RECOVER: call void @__hwasan_store16_noabort(i64 %[[A]])
; CHECK: store i128 %b, i128* %a
; CHECK: ret void
@@ -140,7 +152,8 @@ entry:
define void @test_store40(i40* %a, i40 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_store(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_store(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 5)
; CHECK: store i40 %b, i40* %a
; CHECK: ret void
diff --git a/test/MC/AArch64/arm64-system-encoding.s b/test/MC/AArch64/arm64-system-encoding.s
index ef4037b7bf3f..19ed248db3a8 100644
--- a/test/MC/AArch64/arm64-system-encoding.s
+++ b/test/MC/AArch64/arm64-system-encoding.s
@@ -1,4 +1,5 @@
; RUN: not llvm-mc -triple arm64-apple-darwin -show-encoding < %s 2> %t | FileCheck %s
+; RUN: not llvm-mc -triple arm64-apple-darwin -mattr=+v8.3a -show-encoding < %s 2> %t | FileCheck %s --check-prefix=CHECK-V83
; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
foo:
@@ -233,6 +234,7 @@ foo:
mrs x3, AMAIR_EL3
mrs x3, CCSIDR_EL1
mrs x3, CLIDR_EL1
+ mrs x3, CCSIDR2_EL1
mrs x3, CNTFRQ_EL0
mrs x3, CNTHCTL_EL2
mrs x3, CNTHP_CTL_EL2
@@ -418,6 +420,7 @@ foo:
; CHECK: mrs x3, AMAIR_EL3 ; encoding: [0x03,0xa3,0x3e,0xd5]
; CHECK: mrs x3, CCSIDR_EL1 ; encoding: [0x03,0x00,0x39,0xd5]
; CHECK: mrs x3, CLIDR_EL1 ; encoding: [0x23,0x00,0x39,0xd5]
+; CHECK-V83: mrs x3, CCSIDR2_EL1 ; encoding: [0x43,0x00,0x39,0xd5]
; CHECK: mrs x3, CNTFRQ_EL0 ; encoding: [0x03,0xe0,0x3b,0xd5]
; CHECK: mrs x3, CNTHCTL_EL2 ; encoding: [0x03,0xe1,0x3c,0xd5]
; CHECK: mrs x3, CNTHP_CTL_EL2 ; encoding: [0x23,0xe2,0x3c,0xd5]
diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s
index 597adfb3a25b..3b791bef0b6f 100644
--- a/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/test/MC/AArch64/basic-a64-diagnostics.s
@@ -3504,6 +3504,7 @@
msr MIDR_EL1, x12
msr CCSIDR_EL1, x12
msr CLIDR_EL1, x12
+ msr CCSIDR2_EL1, x12
msr CTR_EL0, x12
msr MPIDR_EL1, x12
msr REVIDR_EL1, x12
@@ -3572,6 +3573,9 @@
// CHECK-ERROR-NEXT: msr CLIDR_EL1, x12
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: expected writable system register or pstate
+// CHECK-ERROR-NEXT: msr CCSIDR2_EL1, x12
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected writable system register or pstate
// CHECK-ERROR-NEXT: msr CTR_EL0, x12
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: expected writable system register or pstate
diff --git a/test/MC/AArch64/dot-req.s b/test/MC/AArch64/dot-req.s
index a557f0c67589..582674b1f8d9 100644
--- a/test/MC/AArch64/dot-req.s
+++ b/test/MC/AArch64/dot-req.s
@@ -37,3 +37,13 @@ bar:
// CHECK: fmov d2, d3 // encoding: [0x62,0x40,0x60,0x1e]
// CHECK: ldr q2, [sp] // encoding: [0xe2,0x03,0xc0,0x3d]
// CHECK: mov v0.8b, v1.8b // encoding: [0x20,0x1c,0xa1,0x0e]
+
+ peter .req x6
+ add peter, x0, x0
+ .unreq peter
+// CHECK: add x6, x0, x0
+
+ zoe .req x6
+ add zoe, x0, x0
+ .unreq zoe
+// CHECK: add x6, x0, x0
diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s
index ef36a98f746a..b06101a4051b 100644
--- a/test/MC/AMDGPU/ds.s
+++ b/test/MC/AMDGPU/ds.s
@@ -511,6 +511,10 @@ ds_swizzle_b32 v8, v2
// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+ds_swizzle_b32 v8, v2 gds
+// SICI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0xd6,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x02,0x00,0x00,0x08]
+
ds_swizzle_b32 v8, v2 offset:0xFFFF
// SICI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x02,0x00,0x00,0x08]
// VI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x02,0x00,0x00,0x08]
diff --git a/test/MC/AMDGPU/expressions.s b/test/MC/AMDGPU/expressions.s
index 7b0e90378a06..dd4957c8baec 100644
--- a/test/MC/AMDGPU/expressions.s
+++ b/test/MC/AMDGPU/expressions.s
@@ -46,3 +46,11 @@ BB2:
s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1
// VI: s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1 ; encoding: [0x6a,0xff,0xea,0x80,A,A,A,A]
// VI-NEXT: ; fixup A - offset: 4, value: (BB2+4)-BB1, kind: FK_Data_4
+
+t=1
+s_sub_u32 s0, s0, -t
+// VI: s_sub_u32 s0, s0, -1 ; encoding: [0x00,0xc1,0x80,0x80]
+
+t=-1
+s_sub_u32 s0, s0, -t
+// VI: s_sub_u32 s0, s0, 1 ; encoding: [0x00,0x81,0x80,0x80]
diff --git a/test/MC/AMDGPU/invalid-instructions-spellcheck.s b/test/MC/AMDGPU/invalid-instructions-spellcheck.s
new file mode 100644
index 000000000000..f4198f10f4b8
--- /dev/null
+++ b/test/MC/AMDGPU/invalid-instructions-spellcheck.s
@@ -0,0 +1,48 @@
+# RUN: not llvm-mc -triple amdgcn < %s 2>&1 | FileCheck %s
+
+# This tests the mnemonic spell checker.
+
+# First check what happens when an instruction is omitted:
+
+v2, v4, v6
+
+# CHECK: unknown token in expression
+# CHECK-NEXT: v2, v4, v6
+# CHECK-NEXT: ^
+
+# CHECK: error: not a valid operand.
+# CHECK-NEXT: v2, v4, v6
+# CHECK-NEXT: ^
+
+# We don't want to see a suggestion here; the edit distance is too large to
+# give sensible suggestions:
+
+aaaaaaaaaaaaaaa v1, v2, v3
+
+# CHECK: error: invalid instruction
+# CHECK-NEXT: aaaaaaaaaaaaaaa v1, v2, v3
+# CHECK-NEXT: ^
+
+# Check that we get one suggestion: 'dsc_write_src2_b64' is 1 edit away, i.e. a deletion.
+
+dsc_write_src2_b64 v1, v2, v3
+
+# CHECK: error: invalid instruction, did you mean: ds_write_src2_b64?
+# CHECK-NEXT: dsc_write_src2_b64 v1, v2, v3
+# CHECK-NEXT: ^
+
+# Check edit distance 1 and 2, just insertions:
+
+s_mov_b v1, v2
+
+# CHECK: error: invalid instruction, did you mean: s_mov_b32, s_mov_b64?
+# CHECK-NEXT: s_mov_b v1, v2
+# CHECK-NEXT: ^
+
+# Check an instruction that is 2 edits away, and also has a lot of candidates:
+
+s_load_dwordx v1, v2, v3
+
+# CHECK: error: invalid instruction, did you mean: s_load_dword, s_load_dwordx16, s_load_dwordx2, s_load_dwordx4, s_load_dwordx8?
+# CHECK-NEXT: s_load_dwordx v1, v2, v3
+# CHECK-NEXT: ^
diff --git a/test/MC/AMDGPU/trap.s b/test/MC/AMDGPU/trap.s
index de551ca00b52..7b527ba3072e 100644
--- a/test/MC/AMDGPU/trap.s
+++ b/test/MC/AMDGPU/trap.s
@@ -190,6 +190,48 @@ s_mov_b64 ttmp[14:15], exec
// GFX9: s_mov_b64 ttmp[14:15], exec ; encoding: [0x7e,0x01,0xfa,0xbe]
//===----------------------------------------------------------------------===//
+// Trap Handler related - 8-dword registers
+// NB: the gfx7 doc states that SMRD does not support trap registers for dst
+//===----------------------------------------------------------------------===//
+
+s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0
+// VI: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00]
+
+s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0
+// VI: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+
+s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 ttmp[0:7], s[0:1], s0
+// VI: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 ttmp[4:11], s[0:1], s0
+// VI: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 ttmp[8:15], s[0:1], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+
+//===----------------------------------------------------------------------===//
+// Trap Handler related - 16-dword registers
+// NB: the gfx7 doc states that SMRD does not support trap registers for dst
+//===----------------------------------------------------------------------===//
+
+s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx16 ttmp[0:15], s[0:1], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00]
+
+//===----------------------------------------------------------------------===//
// Trap Handler related - Some specific instructions
//===----------------------------------------------------------------------===//
diff --git a/test/MC/AMDGPU/vop1-gfx9-err.s b/test/MC/AMDGPU/vop1-gfx9-err.s
index 0036b79c71b5..61bf5f661759 100644
--- a/test/MC/AMDGPU/vop1-gfx9-err.s
+++ b/test/MC/AMDGPU/vop1-gfx9-err.s
@@ -1,6 +1,6 @@
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,VI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,CI %s
v_swap_b32 v1, 1
// GCN: :16: error: invalid operand for instruction
@@ -10,7 +10,9 @@ v_swap_b32 v1, s0
// FIXME: Better error for it requiring VOP1 encoding
v_swap_b32_e64 v1, v2
-// GCN: :1: error: unrecognized instruction mnemonic
+// GFX9: :1: error: invalid instruction, did you mean: v_swap_b32?
+// CI: :1: error: invalid instruction
+// VI: :1: error: invalid instruction
v_swap_b32 v1, v2, v1
// GCN: :20: error: invalid operand for instruction
@@ -22,4 +24,4 @@ v_swap_b32 v1, v2, v2, v2
// GCN: :20: error: invalid operand for instruction
v_swap_codegen_pseudo_b32 v1, v2
-// GCN: :1: error: unrecognized instruction mnemonic
+// GCN: :1: error: invalid instruction
diff --git a/test/MC/AMDGPU/vop3p-err.s b/test/MC/AMDGPU/vop3p-err.s
index 7dfd951b6ad0..fe3fee97b5e7 100644
--- a/test/MC/AMDGPU/vop3p-err.s
+++ b/test/MC/AMDGPU/vop3p-err.s
@@ -59,16 +59,16 @@ v_pk_add_f16 v1, v2, |v3|
// GFX9: :22: error: invalid operand for instruction
v_pk_add_f16 v1, v2, abs(v3)
-// GFX9: :19: error: invalid operand for instruction
+// GFX9: :18: error: invalid operand for instruction
v_pk_add_f16 v1, -v2, v3
-// GFX9: :23: error: invalid operand for instruction
+// GFX9: :22: error: invalid operand for instruction
v_pk_add_f16 v1, v2, -v3
// GFX9: :18: error: invalid operand for instruction
v_pk_add_u16 v1, abs(v2), v3
-// GFX9: :19: error: invalid operand for instruction
+// GFX9: :18: error: invalid operand for instruction
v_pk_add_u16 v1, -v2, v3
//
diff --git a/test/MC/ARM/dfb-neg.s b/test/MC/ARM/dfb-neg.s
new file mode 100644
index 000000000000..15c44877fa6f
--- /dev/null
+++ b/test/MC/ARM/dfb-neg.s
@@ -0,0 +1,10 @@
+@ RUN: not llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple thumbv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s 2>&1 | FileCheck %s
+
+ dfb
+@ CHECK: error: instruction requires: full-data-barrier
+
+ dfb sy
+ dfb #0
+@ CHECK: error: invalid instruction
+@ CHECK: error: invalid instruction
diff --git a/test/MC/ARM/dfb.s b/test/MC/ARM/dfb.s
new file mode 100644
index 000000000000..58477749807f
--- /dev/null
+++ b/test/MC/ARM/dfb.s
@@ -0,0 +1,6 @@
+@ RUN: llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-ARM
+@ RUN: llvm-mc -triple thumbv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-THUMB
+
+ dfb
+@ CHECK-ARM: dfb @ encoding: [0x4c,0xf0,0x7f,0xf5]
+@ CHECK-THUMB: dfb @ encoding: [0xbf,0xf3,0x4c,0x8f]
diff --git a/test/MC/COFF/align-nops.s b/test/MC/COFF/align-nops.s
index 02b488475e90..6d23721ed779 100644
--- a/test/MC/COFF/align-nops.s
+++ b/test/MC/COFF/align-nops.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | llvm-readobj -s -sd | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 -mcpu=pentiumpro %s | llvm-readobj -s -sd | FileCheck %s
// Test that we get optimal nops in text
.text
diff --git a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
index a2f9d24091ef..33b8d6b4de44 100644
--- a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
+++ b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
@@ -1,6 +1,7 @@
# RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8,+fullfp16 -disassemble < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FP16
+# RUN: llvm-mc -triple=arm64 -mattr=+v8.3a -disassemble < %s | FileCheck %s --check-prefix=CHECK-V83
#------------------------------------------------------------------------------
# Add/sub (immediate)
@@ -3493,6 +3494,7 @@
# CHECK: mrs x9, {{midr_el1|MIDR_EL1}}
# CHECK: mrs x9, {{ccsidr_el1|CCSIDR_EL1}}
# CHECK: mrs x9, {{csselr_el1|CSSELR_EL1}}
+# CHECK-V83: mrs x9, {{ccsidr2_el1|CCSIDR2_EL1}}
# CHECK: mrs x9, {{vpidr_el2|VPIDR_EL2}}
# CHECK: mrs x9, {{clidr_el1|CLIDR_EL1}}
# CHECK: mrs x9, {{ctr_el0|CTR_EL0}}
@@ -4048,6 +4050,7 @@
0x9 0x0 0x38 0xd5
0x9 0x0 0x39 0xd5
0x9 0x0 0x3a 0xd5
+0x49 0x0 0x39 0xd5
0x9 0x0 0x3c 0xd5
0x29 0x0 0x39 0xd5
0x29 0x0 0x3b 0xd5
diff --git a/test/MC/Disassembler/AMDGPU/ds_vi.txt b/test/MC/Disassembler/AMDGPU/ds_vi.txt
index 6d910ea5bb58..c12e7a157e82 100644
--- a/test/MC/Disassembler/AMDGPU/ds_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/ds_vi.txt
@@ -171,6 +171,9 @@
# VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
0x00 0x00 0x7a 0xd8 0x02 0x00 0x00 0x08
+# VI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x02,0x00,0x00,0x08]
+0x00 0x00 0x7b 0xd8 0x02 0x00 0x00 0x08
+
# VI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0x6c,0xd8,0x02,0x00,0x00,0x08]
0x00 0x00 0x6c 0xd8 0x02 0x00 0x00 0x08
diff --git a/test/MC/Disassembler/AMDGPU/trap_gfx9.txt b/test/MC/Disassembler/AMDGPU/trap_gfx9.txt
index 70a570eec5ab..0b140c42168a 100644
--- a/test/MC/Disassembler/AMDGPU/trap_gfx9.txt
+++ b/test/MC/Disassembler/AMDGPU/trap_gfx9.txt
@@ -107,3 +107,35 @@
# GFX9: buffer_atomic_inc v1, off, ttmp[12:15], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8]
0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8
+
+#===----------------------------------------------------------------------===#
+# Trap Handler related - 8-dword registers
+#===----------------------------------------------------------------------===#
+
+# GFX9: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0 ; encoding: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx8 ttmp[8:15], s[0:1], s0 ; encoding: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+#===----------------------------------------------------------------------===#
+# Trap Handler related - 16-dword registers
+#===----------------------------------------------------------------------===#
+
+# GFX9: s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0 ; encoding: [0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx16 ttmp[0:15], s[0:1], s0 ; encoding: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00
diff --git a/test/MC/Disassembler/AMDGPU/trap_vi.txt b/test/MC/Disassembler/AMDGPU/trap_vi.txt
index 8b131512050c..eb254134cc53 100644
--- a/test/MC/Disassembler/AMDGPU/trap_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/trap_vi.txt
@@ -107,3 +107,19 @@
# VI: buffer_atomic_inc v1, off, ttmp[8:11], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8]
0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8
+
+#===----------------------------------------------------------------------===#
+# Trap Handler related - 8-dword registers
+#===----------------------------------------------------------------------===#
+
+# VI: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# VI: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# VI: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+# VI: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00
diff --git a/test/MC/Disassembler/ARM/dfb-arm.txt b/test/MC/Disassembler/ARM/dfb-arm.txt
new file mode 100644
index 000000000000..26f81621274c
--- /dev/null
+++ b/test/MC/Disassembler/ARM/dfb-arm.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -disassemble -triple armv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DFB
+# RUN: llvm-mc -disassemble -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s | FileCheck %s --check-prefix=CHECK-NODFB
+
+# CHECK-DFB: dfb @ encoding: [0x4c,0xf0,0x7f,0xf5]
+# CHECK-NODFB: dsb #0xc @ encoding: [0x4c,0xf0,0x7f,0xf5]
+[0x4c,0xf0,0x7f,0xf5]
diff --git a/test/MC/Disassembler/ARM/dfb-thumb.txt b/test/MC/Disassembler/ARM/dfb-thumb.txt
new file mode 100644
index 000000000000..aa8adc83c1f4
--- /dev/null
+++ b/test/MC/Disassembler/ARM/dfb-thumb.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -disassemble -triple thumbv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DFB
+# RUN: llvm-mc -disassemble -triple thumbv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s | FileCheck %s --check-prefix=CHECK-NODFB
+
+# CHECK-DFB: dfb @ encoding: [0xbf,0xf3,0x4c,0x8f]
+# CHECK-NODFB: dsb #0xc @ encoding: [0xbf,0xf3,0x4c,0x8f]
+[0xbf,0xf3,0x4c,0x8f]
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 4211721ec48b..cc05dfb6f896 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -667,6 +667,9 @@
# CHECK: prefetchw (%eax)
0x0f 0x0d 0x08
+# CHECK: prefetchwt1 (%eax)
+0x0f 0x0d 0x10
+
# CHECK: adcxl %eax, %eax
0x66 0x0f 0x38 0xf6 0xc0
diff --git a/test/MC/ELF/align-nops.s b/test/MC/ELF/align-nops.s
index 5e3386823f26..32da3dbd8e82 100644
--- a/test/MC/ELF/align-nops.s
+++ b/test/MC/ELF/align-nops.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -sd | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - | llvm-readobj -s -sd | FileCheck %s
// Test that we get optimal nops in text
.text
diff --git a/test/MC/MachO/x86_32-optimal_nop.s b/test/MC/MachO/x86_32-optimal_nop.s
index 01d8a1f6eb2a..1bc9eff337c4 100644
--- a/test/MC/MachO/x86_32-optimal_nop.s
+++ b/test/MC/MachO/x86_32-optimal_nop.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin9 -mcpu=pentiumpro %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s
# 1 byte nop test
.align 4, 0 # start with 16 byte alignment filled with zeros
diff --git a/test/MC/Mips/eva/invalid.s b/test/MC/Mips/eva/invalid.s
index 5c4cc3ddf631..1f7c2e6ac8c6 100644
--- a/test/MC/Mips/eva/invalid.s
+++ b/test/MC/Mips/eva/invalid.s
@@ -1,35 +1,36 @@
# Instructions that are invalid
#
# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips32r2 \
-# RUN: -mattr==eva 2>%t1
+# RUN: -mattr=+eva 2>%t1
# RUN: FileCheck %s < %t1
.set noat
- cachee -1, 255($7) # CHECK: :[[@LINE]]:12: error: invalid operand for instruction
- cachee 32, 255($7) # CHECK: :[[@LINE]]:12: error: invalid operand for instruction
- prefe -1, 255($7) # CHECK: :[[@LINE]]:11: error: invalid operand for instruction
- prefe 32, 255($7) # CHECK: :[[@LINE]]:11: error: invalid operand for instruction
+ cachee -1, 255($7) # CHECK: :[[@LINE]]:12: error: expected 5-bit unsigned immediate
+ cachee 32, 255($7) # CHECK: :[[@LINE]]:12: error: expected 5-bit unsigned immediate
+ prefe -1, 255($7) # CHECK: :[[@LINE]]:11: error: expected 5-bit unsigned immediate
+ prefe 32, 255($7) # CHECK: :[[@LINE]]:11: error: expected 5-bit unsigned immediate
lle $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- lle $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lle $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lle $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ lle $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lle $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lle $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
lwe $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- lwe $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lwe $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lwe $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ lwe $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lwe $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lwe $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
sbe $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- sbe $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sbe $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sbe $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ sbe $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sbe $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sbe $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
sce $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- sce $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sce $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sce $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ sce $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sce $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sce $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
she $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- she $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- she $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- she $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ she $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ she $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ she $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
swe $33, 8($4) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- swe $5, 8($34) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- swe $5, 512($4) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- swe $5, -513($4) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ swe $5, 8($34) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ swe $5, 512($4) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ swe $5, -513($4) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+
diff --git a/test/MC/WebAssembly/weak-alias.ll b/test/MC/WebAssembly/weak-alias.ll
index 5f5d9b447c71..29eeac02da66 100644
--- a/test/MC/WebAssembly/weak-alias.ll
+++ b/test/MC/WebAssembly/weak-alias.ll
@@ -8,21 +8,41 @@
@bar = global i32 7, align 8
@bar_alias = weak hidden alias i32, i32* @bar
-@bar_alias_address = global i32* @bar_alias, align 8
-
@foo_alias = weak hidden alias i32 (), i32 ()* @foo
+@direct_address = global i32()* @foo, align 8
+@alias_address = global i32()* @foo_alias, align 8
+
+define hidden i32 @foo() #0 {
+entry:
+ ret i32 0
+}
+
+define hidden i32 @call_direct() #0 {
+entry:
+ %call = call i32 @foo()
+ ret i32 %call
+}
+
define hidden i32 @call_alias() #0 {
entry:
%call = call i32 @foo_alias()
ret i32 %call
}
-define hidden i32 @foo() #0 {
+define hidden i32 @call_direct_ptr() #0 {
entry:
- ret i32 0
+ %0 = load i32 ()*, i32 ()** @direct_address, align 8
+ %call = call i32 %0()
+ ret i32 %call
}
+define hidden i32 @call_alias_ptr() #0 {
+entry:
+ %0 = load i32 ()*, i32 ()** @alias_address, align 8
+ %call = call i32 %0()
+ ret i32 %call
+}
; CHECK: - Type: TYPE
; CHECK-NEXT: Signatures:
@@ -42,47 +62,112 @@ entry:
; CHECK-NEXT: Table:
; CHECK-NEXT: ElemType: ANYFUNC
; CHECK-NEXT: Limits:
-; CHECK-NEXT: Initial: 0x00000000
+; CHECK-NEXT: Initial: 0x00000002
+; CHECK-NEXT: - Module: env
+; CHECK-NEXT: Field: foo_alias
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: SigIndex: 0
+; CHECK-NEXT: - Module: env
+; CHECK-NEXT: Field: bar_alias
+; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: GlobalType: I32
+; CHECK-NEXT: GlobalMutable: false
; CHECK-NEXT: - Type: FUNCTION
-; CHECK-NEXT: FunctionTypes: [ 0, 0 ]
+; CHECK-NEXT: FunctionTypes: [ 0, 0, 0, 0, 0 ]
; CHECK-NEXT: - Type: GLOBAL
; CHECK-NEXT: Globals:
; CHECK-NEXT: - Type: I32
; CHECK-NEXT: Mutable: false
; CHECK-NEXT: InitExpr:
; CHECK-NEXT: Opcode: I32_CONST
-; CHECK-NEXT: Value: 0
+; CHECK-NEXT: Value: 8
; CHECK-NEXT: - Type: I32
; CHECK-NEXT: Mutable: false
; CHECK-NEXT: InitExpr:
; CHECK-NEXT: Opcode: I32_CONST
-; CHECK-NEXT: Value: 8
+; CHECK-NEXT: Value: 16
+; CHECK-NEXT: - Type: I32
+; CHECK-NEXT: Mutable: false
+; CHECK-NEXT: InitExpr:
+; CHECK-NEXT: Opcode: I32_CONST
+; CHECK-NEXT: Value: 0
; CHECK-NEXT: - Type: EXPORT
; CHECK-NEXT: Exports:
+; CHECK-NEXT: - Name: foo
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: - Name: call_direct
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 2
; CHECK-NEXT: - Name: call_alias
; CHECK-NEXT: Kind: FUNCTION
-; CHECK-NEXT: Index: 0
-; CHECK-NEXT: - Name: foo
+; CHECK-NEXT: Index: 3
+; CHECK-NEXT: - Name: call_direct_ptr
; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 4
+; CHECK-NEXT: - Name: direct_address
+; CHECK-NEXT: Kind: GLOBAL
; CHECK-NEXT: Index: 1
-; CHECK-NEXT: - Name: bar
+; CHECK-NEXT: - Name: call_alias_ptr
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 5
+; CHECK-NEXT: - Name: alias_address
; CHECK-NEXT: Kind: GLOBAL
-; CHECK-NEXT: Index: 0
-; CHECK-NEXT: - Name: bar_alias_address
+; CHECK-NEXT: Index: 2
+; CHECK-NEXT: - Name: bar
; CHECK-NEXT: Kind: GLOBAL
-; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Index: 3
; CHECK-NEXT: - Name: foo_alias
; CHECK-NEXT: Kind: FUNCTION
; CHECK-NEXT: Index: 1
; CHECK-NEXT: - Name: bar_alias
; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: Index: 3
+; CHECK-NEXT: - Type: ELEM
+; CHECK-NEXT: Segments:
+; CHECK-NEXT: - Offset:
+; CHECK-NEXT: Opcode: I32_CONST
+; CHECK-NEXT: Value: 0
+; CHECK-NEXT: Functions: [ 1, 0 ]
+; CHECK-NEXT: - Type: CODE
+; CHECK-NEXT: Relocations:
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Offset: 0x00000009
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB
; CHECK-NEXT: Index: 0
-
-; CHECK: - Type: DATA
+; CHECK-NEXT: Offset: 0x00000012
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_LEB
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Offset: 0x0000001E
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TYPE_INDEX_LEB
+; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Offset: 0x00000024
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_LEB
+; CHECK-NEXT: Index: 2
+; CHECK-NEXT: Offset: 0x00000031
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TYPE_INDEX_LEB
+; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Offset: 0x00000037
+; CHECK-NEXT: Functions:
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 41000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 1081808080000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 1080808080000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 410028028880808000118080808000000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 410028029080808000118080808000000B
+; CHECK-NEXT: - Type: DATA
; CHECK-NEXT: Relocations:
-; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_I32
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32
; CHECK-NEXT: Index: 0
; CHECK-NEXT: Offset: 0x0000000F
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Offset: 0x00000018
; CHECK-NEXT: Segments:
; CHECK-NEXT: - SectionOffset: 6
; CHECK-NEXT: MemoryIndex: 0
@@ -101,38 +186,64 @@ entry:
; CHECK-NEXT: Name: name
; CHECK-NEXT: FunctionNames:
; CHECK-NEXT: - Index: 0
-; CHECK-NEXT: Name: call_alias
+; CHECK-NEXT: Name: foo_alias
; CHECK-NEXT: - Index: 1
; CHECK-NEXT: Name: foo
+; CHECK-NEXT: - Index: 2
+; CHECK-NEXT: Name: call_direct
+; CHECK-NEXT: - Index: 3
+; CHECK-NEXT: Name: call_alias
+; CHECK-NEXT: - Index: 4
+; CHECK-NEXT: Name: call_direct_ptr
+; CHECK-NEXT: - Index: 5
+; CHECK-NEXT: Name: call_alias_ptr
; CHECK-NEXT: - Type: CUSTOM
; CHECK-NEXT: Name: linking
-; CHECK-NEXT: DataSize: 12
+; CHECK-NEXT: DataSize: 20
; CHECK-NEXT: SymbolInfo:
-; CHECK-NEXT: - Name: call_alias
-; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
-; CHECK-NEXT: - Name: foo
-; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
; CHECK-NEXT: - Name: foo_alias
; CHECK-NEXT: Flags: [ BINDING_WEAK, VISIBILITY_HIDDEN ]
; CHECK-NEXT: - Name: bar_alias
; CHECK-NEXT: Flags: [ BINDING_WEAK, VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: foo
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_direct
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_alias
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_direct_ptr
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_alias_ptr
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
; CHECK-NEXT: SegmentInfo:
; CHECK-NEXT: - Index: 0
; CHECK-NEXT: Name: .data.bar
; CHECK-NEXT: Alignment: 8
; CHECK-NEXT: Flags: [ ]
; CHECK-NEXT: - Index: 1
-; CHECK-NEXT: Name: .data.bar_alias_address
+; CHECK-NEXT: Name: .data.direct_address
+; CHECK-NEXT: Alignment: 8
+; CHECK-NEXT: Flags: [ ]
+; CHECK-NEXT: - Index: 2
+; CHECK-NEXT: Name: .data.alias_address
; CHECK-NEXT: Alignment: 8
; CHECK-NEXT: Flags: [ ]
; CHECK-NEXT: ...
; CHECK-SYMS: SYMBOL TABLE:
-; CHECK-SYMS-NEXT: 00000000 g F name call_alias
+; CHECK-SYMS-NEXT: 00000000 g F name foo_alias
; CHECK-SYMS-NEXT: 00000001 g F name foo
-; CHECK-SYMS-NEXT: 00000000 g F EXPORT .hidden call_alias
-; CHECK-SYMS-NEXT: 00000001 g F EXPORT .hidden foo
-; CHECK-SYMS-NEXT: 00000000 g EXPORT bar
-; CHECK-SYMS-NEXT: 00000008 g EXPORT bar_alias_address
+; CHECK-SYMS-NEXT: 00000002 g F name call_direct
+; CHECK-SYMS-NEXT: 00000003 g F name call_alias
+; CHECK-SYMS-NEXT: 00000004 g F name call_direct_ptr
+; CHECK-SYMS-NEXT: 00000005 g F name call_alias_ptr
; CHECK-SYMS-NEXT: 00000001 gw F EXPORT .hidden foo_alias
; CHECK-SYMS-NEXT: 00000000 gw EXPORT .hidden bar_alias
+; CHECK-SYMS-NEXT: 00000001 g F EXPORT .hidden foo
+; CHECK-SYMS-NEXT: 00000002 g F EXPORT .hidden call_direct
+; CHECK-SYMS-NEXT: 00000003 g F EXPORT .hidden call_alias
+; CHECK-SYMS-NEXT: 00000004 g F EXPORT .hidden call_direct_ptr
+; CHECK-SYMS-NEXT: 00000008 g EXPORT direct_address
+; CHECK-SYMS-NEXT: 00000005 g F EXPORT .hidden call_alias_ptr
+; CHECK-SYMS-NEXT: 00000010 g EXPORT alias_address
+; CHECK-SYMS-NEXT: 00000000 g EXPORT bar
diff --git a/test/MC/X86/3DNow.s b/test/MC/X86/3DNow.s
index 871857b155d0..e66e39b547ae 100644
--- a/test/MC/X86/3DNow.s
+++ b/test/MC/X86/3DNow.s
@@ -72,8 +72,10 @@ femms
// CHECK: prefetch (%rax) # encoding: [0x0f,0x0d,0x00]
// CHECK: prefetchw (%rax) # encoding: [0x0f,0x0d,0x08]
+// CHECK: prefetchwt1 (%rax) # encoding: [0x0f,0x0d,0x10]
prefetch (%rax)
prefetchw (%rax)
+prefetchwt1 (%rax)
// CHECK: pf2iw %mm2, %mm1 # encoding: [0x0f,0x0f,0xca,0x1c]
diff --git a/test/MC/X86/AlignedBundling/different-sections.s b/test/MC/X86/AlignedBundling/different-sections.s
index e12153210042..0af4a8f133e6 100644
--- a/test/MC/X86/AlignedBundling/different-sections.s
+++ b/test/MC/X86/AlignedBundling/different-sections.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test two different executable sections with bundling.
diff --git a/test/MC/X86/AlignedBundling/long-nop-pad.s b/test/MC/X86/AlignedBundling/long-nop-pad.s
index 36e4f4b553fb..bcc319456d35 100644
--- a/test/MC/X86/AlignedBundling/long-nop-pad.s
+++ b/test/MC/X86/AlignedBundling/long-nop-pad.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test that long nops are generated for padding where possible.
diff --git a/test/MC/X86/AlignedBundling/misaligned-bundle-group.s b/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
index 04b3374716bb..edd933ed720d 100644
--- a/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
+++ b/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
diff --git a/test/MC/X86/AlignedBundling/misaligned-bundle.s b/test/MC/X86/AlignedBundling/misaligned-bundle.s
index 08d616109909..676b667cada5 100644
--- a/test/MC/X86/AlignedBundling/misaligned-bundle.s
+++ b/test/MC/X86/AlignedBundling/misaligned-bundle.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
diff --git a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
index 158cde8cd450..8605b7c0137d 100644
--- a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
+++ b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test some variations of padding to the end of a bundle.
diff --git a/test/MC/X86/AlignedBundling/pad-bundle-groups.s b/test/MC/X86/AlignedBundling/pad-bundle-groups.s
index 7a9e30c053ec..5993d73dd6dd 100644
--- a/test/MC/X86/AlignedBundling/pad-bundle-groups.s
+++ b/test/MC/X86/AlignedBundling/pad-bundle-groups.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test some variations of padding for bundle-locked groups.
diff --git a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
index d07619063f00..036249b906b4 100644
--- a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
+++ b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble - | FileCheck %s
# Test that instructions inside bundle-locked groups are relaxed even if their
diff --git a/test/MC/X86/AlignedBundling/single-inst-bundling.s b/test/MC/X86/AlignedBundling/single-inst-bundling.s
index a7df2c96a8eb..cb0ad8adba39 100644
--- a/test/MC/X86/AlignedBundling/single-inst-bundling.s
+++ b/test/MC/X86/AlignedBundling/single-inst-bundling.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
# Test simple NOP insertion for single instructions.
diff --git a/test/MC/X86/CLFLUSHOPT-32.s b/test/MC/X86/CLFLUSHOPT-32.s
new file mode 100644
index 000000000000..e3df46ced7b0
--- /dev/null
+++ b/test/MC/X86/CLFLUSHOPT-32.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflushopt -485498096(%edx,%eax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0xbc,0x82,0x10,0xe3,0x0f,0xe3]
+clflushopt -485498096(%edx,%eax,4)
+
+// CHECK: clflushopt 485498096(%edx,%eax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0xbc,0x82,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096(%edx,%eax,4)
+
+// CHECK: clflushopt 485498096(%edx)
+// CHECK: encoding: [0x66,0x0f,0xae,0xba,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096(%edx)
+
+// CHECK: clflushopt 485498096
+// CHECK: encoding: [0x66,0x0f,0xae,0x3d,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096
+
+// CHECK: clflushopt 64(%edx,%eax)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x02,0x40]
+clflushopt 64(%edx,%eax)
+
+// CHECK: clflushopt (%edx)
+// CHECK: encoding: [0x66,0x0f,0xae,0x3a]
+clflushopt (%edx)
+
diff --git a/test/MC/X86/CLFLUSHOPT-64.s b/test/MC/X86/CLFLUSHOPT-64.s
new file mode 100644
index 000000000000..cdecced6fa2e
--- /dev/null
+++ b/test/MC/X86/CLFLUSHOPT-64.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflushopt 485498096
+// CHECK: encoding: [0x66,0x0f,0xae,0x3c,0x25,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096
+
+// CHECK: clflushopt 64(%rdx)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7a,0x40]
+clflushopt 64(%rdx)
+
+// CHECK: clflushopt 64(%rdx,%rax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x82,0x40]
+clflushopt 64(%rdx,%rax,4)
+
+// CHECK: clflushopt -64(%rdx,%rax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x82,0xc0]
+clflushopt -64(%rdx,%rax,4)
+
+// CHECK: clflushopt 64(%rdx,%rax)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x02,0x40]
+clflushopt 64(%rdx,%rax)
+
+// CHECK: clflushopt (%rdx)
+// CHECK: encoding: [0x66,0x0f,0xae,0x3a]
+clflushopt (%rdx)
+
diff --git a/test/MC/X86/CLFSH-32.s b/test/MC/X86/CLFSH-32.s
new file mode 100644
index 000000000000..898569ec2df6
--- /dev/null
+++ b/test/MC/X86/CLFSH-32.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflush -485498096(%edx,%eax,4)
+// CHECK: encoding: [0x0f,0xae,0xbc,0x82,0x10,0xe3,0x0f,0xe3]
+clflush -485498096(%edx,%eax,4)
+
+// CHECK: clflush 485498096(%edx,%eax,4)
+// CHECK: encoding: [0x0f,0xae,0xbc,0x82,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096(%edx,%eax,4)
+
+// CHECK: clflush 485498096(%edx)
+// CHECK: encoding: [0x0f,0xae,0xba,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096(%edx)
+
+// CHECK: clflush 485498096
+// CHECK: encoding: [0x0f,0xae,0x3d,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096
+
+// CHECK: clflush 64(%edx,%eax)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x02,0x40]
+clflush 64(%edx,%eax)
+
+// CHECK: clflush (%edx)
+// CHECK: encoding: [0x0f,0xae,0x3a]
+clflush (%edx)
+
diff --git a/test/MC/X86/CLFSH-64.s b/test/MC/X86/CLFSH-64.s
new file mode 100644
index 000000000000..f2c2ae51c81b
--- /dev/null
+++ b/test/MC/X86/CLFSH-64.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflush 485498096
+// CHECK: encoding: [0x0f,0xae,0x3c,0x25,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096
+
+// CHECK: clflush 64(%rdx)
+// CHECK: encoding: [0x0f,0xae,0x7a,0x40]
+clflush 64(%rdx)
+
+// CHECK: clflush 64(%rdx,%rax,4)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x82,0x40]
+clflush 64(%rdx,%rax,4)
+
+// CHECK: clflush -64(%rdx,%rax,4)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x82,0xc0]
+clflush -64(%rdx,%rax,4)
+
+// CHECK: clflush 64(%rdx,%rax)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x02,0x40]
+clflush 64(%rdx,%rax)
+
+// CHECK: clflush (%rdx)
+// CHECK: encoding: [0x0f,0xae,0x3a]
+clflush (%rdx)
+
diff --git a/test/MC/X86/x86_long_nop.s b/test/MC/X86/x86_long_nop.s
index fa8525b44c22..7fa553692ea0 100644
--- a/test/MC/X86/x86_long_nop.s
+++ b/test/MC/X86/x86_long_nop.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=silvermont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=NOP1 %s
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index f9fe51f84380..bc0b509622c4 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -63,11 +63,14 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-NEXT: typedef ComplexRendererFns(MyTargetInstructionSelector::*ComplexMatcherMemFn)(MachineOperand &) const;
// CHECK-NEXT: const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> MatcherInfo;
// CHECK-NEXT: static MyTargetInstructionSelector::ComplexMatcherMemFn ComplexPredicateFns[];
+// CHECK-NEXT: bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const override;
+// CHECK-NEXT: bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) const override;
+// CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const override;
// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL
// CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT
// CHECK-NEXT: , State(2),
-// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, I64ImmPredicateFns, APIntImmPredicateFns, APFloatImmPredicateFns, ComplexPredicateFns})
+// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, ComplexPredicateFns})
// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT
// CHECK-LABEL: enum SubtargetFeatureBits : uint8_t {
@@ -127,31 +130,49 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-NEXT: enum {
// CHECK-NEXT: GIPFP_I64_Predicate_simm8 = GIPFP_I64_Invalid + 1,
// CHECK-NEXT: };
-// CHECK-NEXT: static bool Predicate_simm8(int64_t Imm) { return isInt<8>(Imm); }
-// CHECK-NEXT: static InstructionSelector::I64ImmediatePredicateFn I64ImmPredicateFns[] = {
-// CHECK-NEXT: nullptr,
-// CHECK-NEXT: Predicate_simm8,
-// CHECK-NEXT: };
+// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const {
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GIPFP_I64_Predicate_simm8: {
+// CHECK-NEXT: return isInt<8>(Imm);
+// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
// CHECK-LABEL: // PatFrag predicates.
// CHECK-NEXT: enum {
// CHECK-NEXT: GIPFP_APFloat_Predicate_fpimmz = GIPFP_APFloat_Invalid + 1,
// CHECK-NEXT: };
-// CHECK-NEXT: static bool Predicate_fpimmz(const APFloat & Imm) { return Imm->isExactlyValue(0.0); }
-// CHECK-NEXT: static InstructionSelector::APFloatImmediatePredicateFn APFloatImmPredicateFns[] = {
-// CHECK-NEXT: nullptr,
-// CHECK-NEXT: Predicate_fpimmz,
-// CHECK-NEXT: };
+// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_APFloat(unsigned PredicateID, const APFloat & Imm) const {
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GIPFP_APFloat_Predicate_fpimmz: {
+// CHECK-NEXT: return Imm->isExactlyValue(0.0);
+// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
// CHECK-LABEL: // PatFrag predicates.
// CHECK-NEXT: enum {
// CHECK-NEXT: GIPFP_APInt_Predicate_simm9 = GIPFP_APInt_Invalid + 1,
// CHECK-NEXT: };
-// CHECK-NEXT: static bool Predicate_simm9(const APInt & Imm) { return isInt<9>(Imm->getSExtValue()); }
-// CHECK-NEXT: static InstructionSelector::APIntImmediatePredicateFn APIntImmPredicateFns[] = {
-// CHECK-NEXT: nullptr,
-// CHECK-NEXT: Predicate_simm9,
-// CHECK-NEXT: };
+// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_APInt(unsigned PredicateID, const APInt & Imm) const {
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GIPFP_APInt_Predicate_simm9: {
+// CHECK-NEXT: return isInt<9>(Imm->getSExtValue());
+// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
// CHECK-LABEL: MyTargetInstructionSelector::ComplexMatcherMemFn
// CHECK-NEXT: MyTargetInstructionSelector::ComplexPredicateFns[] = {
diff --git a/test/TableGen/intrinsic-long-name.td b/test/TableGen/intrinsic-long-name.td
index 24ed89ac4acf..9bbfe9829a65 100644
--- a/test/TableGen/intrinsic-long-name.td
+++ b/test/TableGen/intrinsic-long-name.td
@@ -2,6 +2,7 @@
// XFAIL: vg_leak
class IntrinsicProperty;
+class SDNodeProperty;
class ValueType<int size, int value> {
string Namespace = "MVT";
@@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> {
list<LLVMType> RetTypes = [];
list<LLVMType> ParamTypes = param_types;
list<IntrinsicProperty> IntrProperties = [];
+ list<SDNodeProperty> Properties = [];
}
def iAny : ValueType<0, 253>;
diff --git a/test/TableGen/intrinsic-struct.td b/test/TableGen/intrinsic-struct.td
index 93737b14db2a..1f1a8c2c8220 100644
--- a/test/TableGen/intrinsic-struct.td
+++ b/test/TableGen/intrinsic-struct.td
@@ -2,6 +2,7 @@
// XFAIL: vg_leak
class IntrinsicProperty;
+class SDNodeProperty;
class ValueType<int size, int value> {
string Namespace = "MVT";
@@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> ret_types = []> {
list<LLVMType> RetTypes = ret_types;
list<LLVMType> ParamTypes = [];
list<IntrinsicProperty> IntrProperties = [];
+ list<SDNodeProperty> Properties = [];
}
def iAny : ValueType<0, 253>;
diff --git a/test/TableGen/intrinsic-varargs.td b/test/TableGen/intrinsic-varargs.td
index 4b2cc5ae02f7..484364779012 100644
--- a/test/TableGen/intrinsic-varargs.td
+++ b/test/TableGen/intrinsic-varargs.td
@@ -2,6 +2,7 @@
// XFAIL: vg_leak
class IntrinsicProperty;
+class SDNodeProperty;
class ValueType<int size, int value> {
string Namespace = "MVT";
@@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> {
list<LLVMType> RetTypes = [];
list<LLVMType> ParamTypes = param_types;
list<IntrinsicProperty> IntrProperties = [];
+ list<SDNodeProperty> Properties = [];
}
// isVoid needs to match the definition in ValueTypes.td
diff --git a/test/ThinLTO/X86/cache.ll b/test/ThinLTO/X86/cache.ll
index ea5c2f98d876..75466442d786 100644
--- a/test/ThinLTO/X86/cache.ll
+++ b/test/ThinLTO/X86/cache.ll
@@ -5,7 +5,7 @@
; Verify that enabling caching is ignoring module without hash
; RUN: rm -Rf %t.cache && mkdir %t.cache
-; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
; RUN: ls %t.cache/llvmcache.timestamp
; RUN: ls %t.cache | count 1
@@ -27,7 +27,7 @@
; files matching the pattern "llvmcache-*".
; RUN: rm -Rf %t.cache && mkdir %t.cache
; RUN: touch -t 197001011200 %t.cache/llvmcache-foo %t.cache/foo
-; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
; RUN: ls %t.cache | count 4
; RUN: ls %t.cache/llvmcache.timestamp
; RUN: ls %t.cache/foo
@@ -36,13 +36,29 @@
; Verify that enabling caching is working with llvm-lto2
; RUN: rm -Rf %t.cache
-; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \
+; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \
; RUN: -r=%t2.bc,_main,plx \
; RUN: -r=%t2.bc,_globalfunc,lx \
; RUN: -r=%t.bc,_globalfunc,plx
; RUN: ls %t.cache | count 2
; RUN: ls %t.cache/llvmcache-* | count 2
+; Verify that cache entries with a timestamp older than the pruning interval
+; are pruned.
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: touch -t 197001011200 %t.cache/llvmcache-foo
+; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
+; RUN: not ls %t.cache/llvmcache-foo
+
+; Verify that specifying a negative number for the pruning interval
+; effectively disables the pruning
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: touch -t 197001011200 %t.cache/llvmcache-foo
+; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-pruning-interval -1
+; RUN: ls %t.cache/llvmcache-foo
+
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
new file mode 100644
index 000000000000..e10b04a850af
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
@@ -0,0 +1,139 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
+
+; CHECK-LABEL: @test_simple
+; CHECK-LABEL: Header:
+; CHECK-NEXT: br i1 undef, label %Tail.predBB1.split
+; CHECK-LABEL: TBB:
+; CHECK: br i1 %cmp, label %Tail.predBB2.split
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_simple(i32* %a, i32 %v, i32 %p) {
+Header:
+ br i1 undef, label %Tail, label %End
+
+TBB:
+ %cmp = icmp eq i32* %a, null
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+; CHECK-LABEL: @test_eq_eq_eq_untaken2
+; CHECK-LABEL: Header:
+; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB1, label %Tail
+
+TBB1:
+ %cmp1 = icmp eq i32 %v, 1
+ br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+ %cmp2 = icmp eq i32 %p, 99
+ br i1 %cmp2, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+; CHECK-LABEL: @test_eq_ne_eq_untaken
+; CHECK-LABEL: Header:
+; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB1, label %Tail
+
+TBB1:
+ %cmp1 = icmp ne i32 %v, 1
+ br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+ %cmp2 = icmp eq i32 %p, 99
+ br i1 %cmp2, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+; CHECK-LABEL: @test_header_header2_tbb
+; CHECK: Header2:
+; CHECK: br i1 %tobool2, label %Tail.predBB1.split, label %TBB1
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
+; CHECK-LABEL: Tail.predBB2.split:
+; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently
+; only supports recording conditions along a single predecessor path.
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB1, label %Header2
+
+Header2:
+ %tobool2 = icmp eq i32 %p, 10
+ br i1 %tobool2, label %Tail, label %TBB1
+
+TBB1:
+ %cmp1 = icmp eq i32 %v, 1
+ br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+ %cmp2 = icmp eq i32 %p, 99
+ br i1 %cmp2, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+ ret i32 10
+}
diff --git a/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
new file mode 100644
index 000000000000..ca41bd6fc5e1
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
+
+define i32 @callee(i32*, i32, i32) {
+ ret i32 10
+}
+
+; CHECK-LABEL: @test_preds_equal
+; CHECK-NOT: split
+; CHECK: br i1 %cmp, label %Tail, label %Tail
+define i32 @test_preds_equal(i32* %a, i32 %v, i32 %p) {
+TBB:
+ %cmp = icmp eq i32* %a, null
+ br i1 %cmp, label %Tail, label %Tail
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+}
diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll
index 4f3144e7fc73..30598ba7afbe 100644
--- a/test/Transforms/CodeGenPrepare/section.ll
+++ b/test/Transforms/CodeGenPrepare/section.ll
@@ -4,33 +4,59 @@ target triple = "x86_64-pc-linux-gnu"
; This tests that hot/cold functions get correct section prefix assigned
-; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; CHECK: hot_func1{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
; The entry is hot
-define void @hot_func() !prof !15 {
+define void @hot_func1() !prof !15 {
ret void
}
-; For instrumentation based PGO, we should only look at entry counts,
+; CHECK: hot_func2{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; Entry is cold but inner block is hot
+define void @hot_func2(i32 %n) !prof !16 {
+entry:
+ %n.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %n, i32* %n.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32, i32* %i, align 4
+ %1 = load i32, i32* %n.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end, !prof !19
+
+for.body:
+ %2 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %2, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+; For instrumentation based PGO, we should only look at block counts,
; not call site VP metadata (which can exist on value profiled memcpy,
; or possibly left behind after static analysis based devirtualization).
; CHECK: cold_func1{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
define void @cold_func1() !prof !16 {
- call void @hot_func(), !prof !17
- call void @hot_func(), !prof !17
+ call void @hot_func1(), !prof !17
+ call void @hot_func1(), !prof !17
ret void
}
-; CHECK: cold_func2{{.*}}!section_prefix
+; CHECK: cold_func2{{.*}}!section_prefix ![[COLD_ID]]
define void @cold_func2() !prof !16 {
- call void @hot_func(), !prof !17
- call void @hot_func(), !prof !18
- call void @hot_func(), !prof !18
+ call void @hot_func1(), !prof !17
+ call void @hot_func1(), !prof !18
+ call void @hot_func1(), !prof !18
ret void
}
; CHECK: cold_func3{{.*}}!section_prefix ![[COLD_ID]]
define void @cold_func3() !prof !16 {
- call void @hot_func(), !prof !18
+ call void @hot_func1(), !prof !18
ret void
}
@@ -55,3 +81,4 @@ define void @cold_func3() !prof !16 {
!16 = !{!"function_entry_count", i64 1}
!17 = !{!"branch_weights", i32 80}
!18 = !{!"branch_weights", i32 1}
+!19 = !{!"branch_weights", i32 1000, i32 1}
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
index 7c05fda6cb8f..5cb4e0359970 100644
--- a/test/Transforms/GVN/tbaa.ll
+++ b/test/Transforms/GVN/tbaa.ll
@@ -1,7 +1,7 @@
; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
define i32 @test1(i8* %p, i8* %q) {
-; CHECK: @test1(i8* %p, i8* %q)
+; CHECK-LABEL: @test1(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) {
}
define i32 @test2(i8* %p, i8* %q) {
-; CHECK: @test2(i8* %p, i8* %q)
+; CHECK-LABEL: @test2(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
@@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) {
}
define i32 @test3(i8* %p, i8* %q) {
-; CHECK: @test3(i8* %p, i8* %q)
+; CHECK-LABEL: @test3(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !3
@@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) {
}
define i32 @test4(i8* %p, i8* %q) {
-; CHECK: @test4(i8* %p, i8* %q)
+; CHECK-LABEL: @test4(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !1
@@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) {
}
define i32 @test5(i8* %p, i8* %q) {
-; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !1
@@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) {
}
define i32 @test6(i8* %p, i8* %q) {
-; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !3
@@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) {
}
define i32 @test7(i8* %p, i8* %q) {
-; CHECK: @test7(i8* %p, i8* %q)
+; CHECK-LABEL: @test7(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) {
ret i32 %c
}
-
-
define i32 @test8(i32* %p, i32* %q) {
-; CHECK-LABEL: test8
+; CHECK-LABEL: @test8
; CHECK-NEXT: store i32 15, i32* %p
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) {
%c = sub i32 %a, %b
ret i32 %c
}
+
define i32 @test9(i32* %p, i32* %q) {
-; CHECK-LABEL: test9
+; CHECK-LABEL: @test9
; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) {
ret i32 %c
}
+define i32 @test10(i8* %p, i8* %q) {
+; If one access encloses the other, then the merged access is the enclosed one
+; and not just the common final access type.
+; CHECK-LABEL: @test10
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]]
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !15 ; TAG_X_i
+ %b = call i32 @foo(i8* %p), !tbaa !19 ; TAG_Y_x_i
+ %c = add i32 %a, %b
+ ret i32 %c
+}
declare void @clobber()
declare i32 @foo(i8*) readonly
-; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
!0 = !{!5, !5, i64 0}
!1 = !{!6, !6, i64 0}
!2 = !{!"tbaa root"}
@@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly
!8 = !{!"another root"}
!11 = !{!"scalar type", !8}
+; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
+; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
+; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
+!15 = !{!16, !17, i64 0} ; TAG_X_i
+!16 = !{!"struct X", !17, i64 0} ; struct X { int i; };
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"char", !2, i64 0}
-;; A TBAA structure who's only point is to have a constant location
+!19 = !{!20, !17, i64 0} ; TAG_Y_x_i
+!20 = !{!"struct Y", !16, i64 0} ; struct Y { struct X x; };
+
+; A TBAA structure whose only point is to have a constant location.
!9 = !{!"yet another root"}
!10 = !{!"node", !9, i64 1}
-
diff --git a/test/Transforms/Inline/AArch64/binop.ll b/test/Transforms/Inline/AArch64/binop.ll
new file mode 100644
index 000000000000..051528991e46
--- /dev/null
+++ b/test/Transforms/Inline/AArch64/binop.ll
@@ -0,0 +1,291 @@
+; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -o - < %s -inline-threshold=0 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+declare void @pad()
+@glbl = external global i32
+
+define i32 @outer_add1(i32 %a) {
+; CHECK-LABEL: @outer_add1(
+; CHECK-NOT: call i32 @add
+ %C = call i32 @add(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @outer_add2(i32 %a) {
+; CHECK-LABEL: @outer_add2(
+; CHECK-NOT: call i32 @add
+ %C = call i32 @add(i32 0, i32 %a)
+ ret i32 %C
+}
+
+define i32 @add(i32 %a, i32 %b) {
+ %add = add i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %add
+}
+
+
+
+define i32 @outer_sub1(i32 %a) {
+; CHECK-LABEL: @outer_sub1(
+; CHECK-NOT: call i32 @sub1
+ %C = call i32 @sub1(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @sub1(i32 %a, i32 %b) {
+ %sub = sub i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %sub
+}
+
+
+define i32 @outer_sub2(i32 %a) {
+; CHECK-LABEL: @outer_sub2(
+; CHECK-NOT: call i32 @sub2
+ %C = call i32 @sub2(i32 %a)
+ ret i32 %C
+}
+
+define i32 @sub2(i32 %a) {
+ %sub = sub i32 %a, %a
+ call void @pad()
+ ret i32 %sub
+}
+
+
+
+define i32 @outer_mul1(i32 %a) {
+; CHECK-LABEL: @outer_mul1(
+; CHECK-NOT: call i32 @mul
+ %C = call i32 @mul(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @outer_mul2(i32 %a) {
+; CHECK-LABEL: @outer_mul2(
+; CHECK-NOT: call i32 @mul
+ %C = call i32 @mul(i32 %a, i32 1)
+ ret i32 %C
+}
+
+define i32 @mul(i32 %a, i32 %b) {
+ %mul = mul i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %mul
+}
+
+
+
+define i32 @outer_div1(i32 %a) {
+; CHECK-LABEL: @outer_div1(
+; CHECK-NOT: call i32 @div1
+ %C = call i32 @div1(i32 0, i32 %a)
+ ret i32 %C
+}
+
+define i32 @outer_div2(i32 %a) {
+; CHECK-LABEL: @outer_div2(
+; CHECK-NOT: call i32 @div1
+ %C = call i32 @div1(i32 %a, i32 1)
+ ret i32 %C
+}
+
+define i32 @div1(i32 %a, i32 %b) {
+ %div = sdiv i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %div
+}
+
+
+define i32 @outer_div3(i32 %a) {
+; CHECK-LABEL: @outer_div3(
+; CHECK-NOT: call i32 @div
+ %C = call i32 @div2(i32 %a)
+ ret i32 %C
+}
+
+define i32 @div2(i32 %a) {
+ %div = sdiv i32 %a, %a
+ call void @pad()
+ ret i32 %div
+}
+
+
+
+define i32 @outer_rem1(i32 %a) {
+; CHECK-LABEL: @outer_rem1(
+; CHECK-NOT: call i32 @rem
+ %C = call i32 @rem1(i32 0, i32 %a)
+ ret i32 %C
+}
+
+define i32 @outer_rem2(i32 %a) {
+; CHECK-LABEL: @outer_rem2(
+; CHECK-NOT: call i32 @rem
+ %C = call i32 @rem1(i32 %a, i32 1)
+ ret i32 %C
+}
+
+define i32 @rem1(i32 %a, i32 %b) {
+ %rem = urem i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %rem
+}
+
+
+define i32 @outer_rem3(i32 %a) {
+; CHECK-LABEL: @outer_rem3(
+; CHECK-NOT: call i32 @rem
+ %C = call i32 @rem2(i32 %a)
+ ret i32 %C
+}
+
+define i32 @rem2(i32 %a) {
+ %rem = urem i32 %a, %a
+ call void @pad()
+ ret i32 %rem
+}
+
+
+
+define i32 @outer_shl1(i32 %a) {
+; CHECK-LABEL: @outer_shl1(
+; CHECK-NOT: call i32 @shl
+ %C = call i32 @shl(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @shl(i32 %a, i32 %b) {
+ %shl = shl i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %shl
+}
+
+
+
+define i32 @outer_shr1(i32 %a) {
+; CHECK-LABEL: @outer_shr1(
+; CHECK-NOT: call i32 @shr
+ %C = call i32 @shr(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @shr(i32 %a, i32 %b) {
+ %shr = ashr i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %shr
+}
+
+
+
+define i1 @outer_and1(i1 %a) {
+; check-label: @outer_and1(
+; check-not: call i1 @and1
+ %c = call i1 @and1(i1 %a, i1 false)
+ ret i1 %c
+}
+
+define i1 @outer_and2(i1 %a) {
+; check-label: @outer_and2(
+; check-not: call i1 @and1
+ %c = call i1 @and1(i1 %a, i1 true)
+ ret i1 %c
+}
+
+define i1 @and1(i1 %a, i1 %b) {
+ %and = and i1 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i1 %and
+}
+
+
+define i1 @outer_and3(i1 %a) {
+; check-label: @outer_and3(
+; check-not: call i1 @and2
+ %c = call i1 @and2(i1 %a)
+ ret i1 %c
+}
+
+define i1 @and2(i1 %a) {
+ %and = and i1 %a, %a
+ call void @pad()
+ ret i1 %and
+}
+
+
+
+define i1 @outer_or1(i1 %a) {
+; check-label: @outer_or1(
+; check-not: call i1 @or1
+ %c = call i1 @or1(i1 %a, i1 false)
+ ret i1 %c
+}
+
+define i1 @outer_or2(i1 %a) {
+; check-label: @outer_or2(
+; check-not: call i1 @or1
+ %c = call i1 @or1(i1 %a, i1 true)
+ ret i1 %c
+}
+
+define i1 @or1(i1 %a, i1 %b) {
+ %or = or i1 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i1 %or
+}
+
+
+define i1 @outer_or3(i1 %a) {
+; check-label: @outer_or3(
+; check-not: call i1 @or2
+ %c = call i1 @or2(i1 %a)
+ ret i1 %c
+}
+
+define i1 @or2(i1 %a) {
+ %or = or i1 %a, %a
+ call void @pad()
+ ret i1 %or
+}
+
+
+
+define i1 @outer_xor1(i1 %a) {
+; check-label: @outer_xor1(
+; check-not: call i1 @xor
+ %c = call i1 @xor1(i1 %a, i1 false)
+ ret i1 %c
+}
+
+define i1 @xor1(i1 %a, i1 %b) {
+ %xor = xor i1 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i1 %xor
+}
+
+
+define i1 @outer_xor3(i1 %a) {
+; check-label: @outer_xor3(
+; check-not: call i1 @xor
+ %c = call i1 @xor2(i1 %a)
+ ret i1 %c
+}
+
+define i1 @xor2(i1 %a) {
+ %xor = xor i1 %a, %a
+ call void @pad()
+ ret i1 %xor
+}
diff --git a/test/Transforms/Inline/ARM/inline-fp.ll b/test/Transforms/Inline/ARM/inline-fp.ll
new file mode 100644
index 000000000000..b4e76dfc7d2d
--- /dev/null
+++ b/test/Transforms/Inline/ARM/inline-fp.ll
@@ -0,0 +1,113 @@
+; RUN: opt -S -inline -mtriple=arm-eabi -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=NOFP
+; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2 -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=FULLFP
+; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2,+fp-only-sp -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=SINGLEFP
+; Make sure that soft-float implementations are modeled as more expensive
+; by the inliner.
+
+; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+; FULLFP-DAG: single inlined into test_single with cost=0 (threshold=75)
+; FULLFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; FULLFP-DAG: double inlined into test_double with cost=0 (threshold=75)
+; FULLFP-DAG: double inlined into test_double with cost=-15000 (threshold=75)
+; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+; SINGLEFP-DAG: single inlined into test_single with cost=0 (threshold=75)
+; SINGLEFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+define i32 @test_single(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+ %call = call float @single(i32 %a, i8 zeroext %b)
+ %call2 = call float @single(i32 %c, i8 zeroext %d)
+ ret i32 0
+}
+
+define i32 @test_single_cheap(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+ %call = call float @single_cheap(i32 %a, i8 zeroext %b)
+ %call2 = call float @single_cheap(i32 %c, i8 zeroext %d)
+ ret i32 0
+}
+
+define i32 @test_double(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+ %call = call double @double(i32 %a, i8 zeroext %b)
+ %call2 = call double @double(i32 %c, i8 zeroext %d)
+ ret i32 0
+}
+
+define i32 @test_single_force_soft(i32 %a, i8 %b, i32 %c, i8 %d) #1 {
+ %call = call float @single_force_soft(i32 %a, i8 zeroext %b) #1
+ %call2 = call float @single_force_soft(i32 %c, i8 zeroext %d) #1
+ ret i32 0
+}
+
+define internal float @single(i32 %response, i8 zeroext %value1) #0 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = sitofp i32 %sub to float
+ %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+ %mul = fmul float %0, 2.620000e+03
+ %conv2 = sitofp i32 %response to float
+ %sub3 = fsub float %conv2, %mul
+ %div = fdiv float %sub3, %mul
+ ret float %div
+}
+
+define internal float @single_cheap(i32 %response, i8 zeroext %value1) #0 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = bitcast i32 %sub to float
+ %conv2 = bitcast i32 %response to float
+ %0 = tail call float @llvm.pow.f32(float %conv2, float %conv1)
+ %1 = tail call float @llvm.pow.f32(float %0, float %0)
+ %2 = tail call float @llvm.pow.f32(float %1, float %1)
+ ret float %2
+}
+
+define internal double @double(i32 %response, i8 zeroext %value1) #0 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = sitofp i32 %sub to double
+ %0 = tail call double @llvm.pow.f64(double 0x3FF028F5C0000000, double %conv1)
+ %mul = fmul double %0, 2.620000e+03
+ %conv2 = sitofp i32 %response to double
+ %sub3 = fsub double %conv2, %mul
+ %div = fdiv double %sub3, %mul
+ ret double %div
+}
+
+define internal float @single_force_soft(i32 %response, i8 zeroext %value1) #1 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = sitofp i32 %sub to float
+ %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+ %mul = fmul float %0, 2.620000e+03
+ %conv2 = sitofp i32 %response to float
+ %sub3 = fsub float %conv2, %mul
+ %div = fdiv float %sub3, %mul
+ ret float %div
+}
+
+declare float @llvm.pow.f32(float, float) optsize minsize
+declare double @llvm.pow.f64(double, double) optsize minsize
+
+attributes #0 = { optsize }
+attributes #1 = { optsize "use-soft-float"="true" "target-features"="+soft-float" }
diff --git a/test/Transforms/Inline/inline-fp.ll b/test/Transforms/Inline/inline-fp.ll
deleted file mode 100644
index dd5972fe1b8a..000000000000
--- a/test/Transforms/Inline/inline-fp.ll
+++ /dev/null
@@ -1,137 +0,0 @@
-; RUN: opt -S -inline < %s | FileCheck %s
-; RUN: opt -S -passes='cgscc(inline)' < %s | FileCheck %s
-; Make sure that soft float implementations are calculated as being more expensive
-; to the inliner.
-
-define i32 @test_nofp() #0 {
-; f_nofp() has the "use-soft-float" attribute, so it should never get inlined.
-; CHECK-LABEL: test_nofp
-; CHECK: call float @f_nofp
-entry:
- %responseX = alloca i32, align 4
- %responseY = alloca i32, align 4
- %responseZ = alloca i32, align 4
- %valueX = alloca i8, align 1
- %valueY = alloca i8, align 1
- %valueZ = alloca i8, align 1
-
- call void @getX(i32* %responseX, i8* %valueX)
- call void @getY(i32* %responseY, i8* %valueY)
- call void @getZ(i32* %responseZ, i8* %valueZ)
-
- %0 = load i32, i32* %responseX
- %1 = load i8, i8* %valueX
- %call = call float @f_nofp(i32 %0, i8 zeroext %1)
- %2 = load i32, i32* %responseZ
- %3 = load i8, i8* %valueZ
- %call2 = call float @f_nofp(i32 %2, i8 zeroext %3)
- %call3 = call float @fabsf(float %call)
- %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
- br i1 %cmp, label %if.end12, label %if.else
-
-if.else: ; preds = %entry
- %4 = load i32, i32* %responseY
- %5 = load i8, i8* %valueY
- %call1 = call float @f_nofp(i32 %4, i8 zeroext %5)
- %call4 = call float @fabsf(float %call1)
- %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
- br i1 %cmp5, label %if.end12, label %if.else7
-
-if.else7: ; preds = %if.else
- %call8 = call float @fabsf(float %call2)
- %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
- br i1 %cmp9, label %if.then10, label %if.end12
-
-if.then10: ; preds = %if.else7
- br label %if.end12
-
-if.end12: ; preds = %if.else, %entry, %if.then10, %if.else7
- %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
- ret i32 %success.0
-}
-
-define i32 @test_hasfp() #0 {
-; f_hasfp() does not have the "use-soft-float" attribute, so it should get inlined.
-; CHECK-LABEL: test_hasfp
-; CHECK-NOT: call float @f_hasfp
-entry:
- %responseX = alloca i32, align 4
- %responseY = alloca i32, align 4
- %responseZ = alloca i32, align 4
- %valueX = alloca i8, align 1
- %valueY = alloca i8, align 1
- %valueZ = alloca i8, align 1
-
- call void @getX(i32* %responseX, i8* %valueX)
- call void @getY(i32* %responseY, i8* %valueY)
- call void @getZ(i32* %responseZ, i8* %valueZ)
-
- %0 = load i32, i32* %responseX
- %1 = load i8, i8* %valueX
- %call = call float @f_hasfp(i32 %0, i8 zeroext %1)
- %2 = load i32, i32* %responseZ
- %3 = load i8, i8* %valueZ
- %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3)
- %call3 = call float @fabsf(float %call)
- %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
- br i1 %cmp, label %if.end12, label %if.else
-
-if.else: ; preds = %entry
- %4 = load i32, i32* %responseY
- %5 = load i8, i8* %valueY
- %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5)
- %call4 = call float @fabsf(float %call1)
- %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
- br i1 %cmp5, label %if.end12, label %if.else7
-
-if.else7: ; preds = %if.else
- %call8 = call float @fabsf(float %call2)
- %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
- br i1 %cmp9, label %if.then10, label %if.end12
-
-if.then10: ; preds = %if.else7
- br label %if.end12
-
-if.end12: ; preds = %if.else, %entry, %if.then10, %if.else7
- %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
- ret i32 %success.0
-}
-
-declare void @getX(i32*, i8*) #0
-
-declare void @getY(i32*, i8*) #0
-
-declare void @getZ(i32*, i8*) #0
-
-define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 {
-entry:
- %conv = zext i8 %value1 to i32
- %sub = add nsw i32 %conv, -1
- %conv1 = sitofp i32 %sub to float
- %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
- %mul = fmul float %0, 2.620000e+03
- %conv2 = sitofp i32 %response to float
- %sub3 = fsub float %conv2, %mul
- %div = fdiv float %sub3, %mul
- ret float %div
-}
-
-define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 {
-entry:
- %conv = zext i8 %value1 to i32
- %sub = add nsw i32 %conv, -1
- %conv1 = sitofp i32 %sub to float
- %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
- %mul = fmul float %0, 2.620000e+03
- %conv2 = sitofp i32 %response to float
- %sub3 = fsub float %conv2, %mul
- %div = fdiv float %sub3, %mul
- ret float %div
-}
-
-declare float @fabsf(float) optsize minsize
-
-declare float @llvm.pow.f32(float, float) optsize minsize
-
-attributes #0 = { optsize }
-attributes #1 = { optsize "use-soft-float"="true" }
diff --git a/test/Transforms/Inline/redundant-loads.ll b/test/Transforms/Inline/redundant-loads.ll
index 6b89f1db484b..176f605fc73b 100644
--- a/test/Transforms/Inline/redundant-loads.ll
+++ b/test/Transforms/Inline/redundant-loads.ll
@@ -184,3 +184,21 @@ define void @inner9(i32* %a, void ()* %f) {
call void @pad()
ret void
}
+
+
+define void @outer10(i32* %a) {
+; CHECK-LABEL: @outer10(
+; CHECK: call void @inner10
+ %b = alloca i32
+ call void @inner10(i32* %a, i32* %b)
+ ret void
+}
+
+define void @inner10(i32* %a, i32* %b) {
+ %1 = load i32, i32* %a
+  store i32 %1, i32* %b
+ %2 = load volatile i32, i32* %a ; volatile load should be kept.
+ call void @pad()
+ %3 = load volatile i32, i32* %a ; Same as the above.
+ ret void
+}
diff --git a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
index 1833558cbceb..7a315094a04e 100644
--- a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
+++ b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
@@ -5,18 +5,18 @@ declare i8* @llvm.adjust.trampoline(i8*)
declare i32 @f(i8 * nest, i32)
; Most common case
-define i32 @test0(i32 %n) {
+define i32 @test0(i32 %n) !dbg !4 {
%alloca = alloca [10 x i8], align 16
%gep = getelementptr [10 x i8], [10 x i8]* %alloca, i32 0, i32 0
call void @llvm.init.trampoline(i8* %gep, i8* bitcast (i32 (i8*, i32)* @f to i8*),
i8* null)
%tramp = call i8* @llvm.adjust.trampoline(i8* %gep)
%function = bitcast i8* %tramp to i32(i32)*
- %ret = call i32 %function(i32 %n)
+ %ret = call i32 %function(i32 %n), !dbg !10
ret i32 %ret
-; CHECK: define i32 @test0(i32 %n) {
-; CHECK: %ret = call i32 @f(i8* nest null, i32 %n)
+; CHECK: define i32 @test0(i32 %n) !dbg !4 {
+; CHECK: %ret = call i32 @f(i8* nest null, i32 %n), !dbg !10
}
define i32 @test1(i32 %n, i8* %trampmem) {
@@ -85,3 +85,18 @@ define i32 @test4(i32 %n) {
; CHECK: %ret1 = call i32 @f(i8* nest null, i32 %n)
; CHECK: %ret2 = call i32 @f(i8* nest null, i32 %n)
}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.0 (trunk 127710)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
+!1 = !DIFile(filename: "string.h", directory: "Game")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "passthru", scope: !1, file: !1, line: 79, type: !5, isLocal: true, isDefinition: true, scopeLine: 79, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: null, size: 64, align: 64)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 78, type: !7)
+!10 = !DILocation(line: 78, column: 28, scope: !4)
diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll
index 53175a7b7253..c760283f9e52 100644
--- a/test/Transforms/JumpThreading/guards.ll
+++ b/test/Transforms/JumpThreading/guards.ll
@@ -278,3 +278,106 @@ L2:
L3:
ret void
}
+
+; Make sure that we don't PRE a non-speculable load across a guard.
+define void @unsafe_pre_across_guard(i8* %p, i1 %load.is.valid) {
+
+; CHECK-LABEL: @unsafe_pre_across_guard(
+; CHECK-NOT: loaded.pr
+; CHECK: entry:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+; CHECK-NEXT: %loaded = load i8, i8* %p
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+; Make sure that we can safely PRE a speculable load across a guard.
+define void @safe_pre_across_guard(i8* noalias nocapture readonly dereferenceable(8) %p, i1 %load.is.valid) {
+
+; CHECK-LABEL: @safe_pre_across_guard(
+; CHECK: entry:
+; CHECK-NEXT: %loaded.pr = load i8, i8* %p
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+; Make sure that we don't PRE a non-speculable load across a call which may
+; alias with the load.
+define void @unsafe_pre_across_call(i8* %p) {
+
+; CHECK-LABEL: @unsafe_pre_across_call(
+; CHECK-NOT: loaded.pr
+; CHECK: entry:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: call i32 @f1()
+; CHECK-NEXT: %loaded = load i8, i8* %p
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call i32 @f1()
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+; Make sure that we can safely PRE a speculable load across a call.
+define void @safe_pre_across_call(i8* noalias nocapture readonly dereferenceable(8) %p) {
+
+; CHECK-LABEL: @safe_pre_across_call(
+; CHECK: entry:
+; CHECK-NEXT: %loaded.pr = load i8, i8* %p
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ]
+; CHECK-NEXT: call i32 @f1()
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call i32 @f1()
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
diff --git a/test/Transforms/LoopVectorize/legal_preheader_check.ll b/test/Transforms/LoopVectorize/legal_preheader_check.ll
new file mode 100644
index 000000000000..32aa796394d6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/legal_preheader_check.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -debug -S -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; D40973
+; Make sure LV legal bails out when the loop doesn't have a legal pre-header.
+
+; CHECK: LV: Loop doesn't have a legal pre-header.
+
+define void @inc(i32 %n, i8* %P) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %BB1, label %BB2
+
+BB1:
+ indirectbr i8* %P, [label %.lr.ph]
+
+BB2:
+ br label %.lr.ph
+
+.lr.ph:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %BB1 ], [ 0, %BB2 ]
+ %indvars.iv.next = add i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:
+ ret void
+}
diff --git a/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
new file mode 100644
index 000000000000..e3d1f6dd2b17
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Test memcpy-memcpy dependencies across invoke edges.
+
+; Test that memcpyopt works across the non-unwind edge of an invoke.
+
+define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %temp = alloca i8, i32 64
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+ invoke void @invoke_me()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ landingpad { i8*, i32 }
+ catch i8* null
+ ret void
+
+try.cont:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+ ret void
+}
+
+; Test that memcpyopt works across the unwind edge of an invoke.
+
+define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %temp = alloca i8, i32 64
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+ invoke void @invoke_me()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ landingpad { i8*, i32 }
+ catch i8* null
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+ ret void
+
+try.cont:
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare i32 @__gxx_personality_v0(...)
+declare void @invoke_me() readnone
diff --git a/test/Transforms/MemCpyOpt/merge-into-memset.ll b/test/Transforms/MemCpyOpt/merge-into-memset.ll
new file mode 100644
index 000000000000..fc31038a4e6d
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/merge-into-memset.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Update cached non-local dependence information when merging stores into memset.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Don't delete the memcpy in %if.then, even though it depends on an instruction
+; which will be deleted.
+
+; CHECK-LABEL: @foo
+define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) {
+entry:
+ %tmp = alloca [50 x i8], align 8
+ %tmp4 = bitcast [50 x i8]* %tmp to i8*
+ %tmp1 = getelementptr inbounds i8, i8* %tmp4, i64 1
+ call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
+ store i8 0, i8* %tmp4, align 8, !dbg !5
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %tmp1, i8* nonnull %d, i64 10, i32 1, i1 false)
+ br i1 %c, label %if.then, label %exit
+
+if.then:
+; CHECK: if.then:
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
+ br label %exit
+
+exit:
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.rs", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !DILocation(line: 8, column: 5, scope: !6)
+!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !7, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
diff --git a/test/Transforms/MemCpyOpt/mixed-sizes.ll b/test/Transforms/MemCpyOpt/mixed-sizes.ll
new file mode 100644
index 000000000000..9091fe7f56c0
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Handle memcpy-memcpy dependencies of differing sizes correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Don't delete the second memcpy, even though there's an earlier
+; memcpy with a larger size from the same address.
+
+; CHECK-LABEL: @foo
+define i32 @foo(i1 %z) {
+entry:
+ %a = alloca [10 x i32]
+ %s = alloca [10 x i32]
+ %0 = bitcast [10 x i32]* %a to i8*
+ %1 = bitcast [10 x i32]* %s to i8*
+ call void @llvm.memset.p0i8.i64(i8* nonnull %1, i8 0, i64 40, i32 16, i1 false)
+ %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 0
+ store i32 1, i32* %arrayidx
+ %scevgep = getelementptr [10 x i32], [10 x i32]* %s, i64 0, i64 1
+ %scevgep7 = bitcast i32* %scevgep to i8*
+ br i1 %z, label %for.body3.lr.ph, label %for.inc7.1
+
+for.body3.lr.ph: ; preds = %entry
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 17179869180, i32 4, i1 false)
+ br label %for.inc7.1
+
+for.inc7.1:
+; CHECK: for.inc7.1:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
+ %2 = load i32, i32* %arrayidx
+ ret i32 %2
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
diff --git a/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
new file mode 100644
index 000000000000..5b0510211d9f
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Make sure memcpy-memcpy dependence is optimized across
+; basic blocks (conditional branches and invokes).
+
+%struct.s = type { i32, i32 }
+
+@s_foo = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
+@s_baz = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
+@i = external constant i8*
+
+declare void @qux()
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare i8* @__cxa_begin_catch(i8*)
+
+; A simple partial redundancy. Test that the second memcpy is optimized
+; to copy directly from the original source rather than from the temporary.
+
+; CHECK-LABEL: @wobble
+define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) {
+bb:
+ %temp = alloca i8, i32 64
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+ br i1 %some_condition, label %more, label %out
+
+out:
+ call void @qux()
+ unreachable
+
+more:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+ ret void
+}
+
+; A CFG triangle with a partial redundancy targeting an alloca. Test that the
+; memcpy inside the triangle is optimized to copy directly from the original
+; source rather than from the temporary.
+
+; CHECK-LABEL: @foo
+define i32 @foo(i1 %t3) {
+bb:
+ %s = alloca %struct.s, align 4
+ %t = alloca %struct.s, align 4
+ %s1 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+ br i1 %t3, label %bb4, label %bb7
+
+bb4: ; preds = %bb
+ %t5 = bitcast %struct.s* %t to i8*
+ %s6 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* %s6, i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+ br label %bb7
+
+bb7: ; preds = %bb4, %bb
+ %t8 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 0
+ %t9 = load i32, i32* %t8, align 4
+ %t10 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 1
+ %t11 = load i32, i32* %t10, align 4
+ %t12 = add i32 %t9, %t11
+ ret i32 %t12
+}
+
+; A CFG diamond with an invoke on one side, and a partially redundant memcpy
+; into an alloca on the other. Test that the memcpy inside the diamond is
+; optimized to copy directly from the original source rather than from the
+; temporary. This more complex test represents a relatively common usage
+; pattern.
+
+; CHECK-LABEL: @baz
+define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+bb:
+ %s = alloca %struct.s, align 4
+ %t = alloca %struct.s, align 4
+ %s3 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+ br i1 %t5, label %bb6, label %bb22
+
+bb6: ; preds = %bb
+ invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
+ to label %bb25 unwind label %bb9
+
+bb9: ; preds = %bb6
+ %t10 = landingpad { i8*, i32 }
+ catch i8* null
+ br label %bb13
+
+bb13: ; preds = %bb9
+ %t15 = call i8* @__cxa_begin_catch(i8* null)
+ br label %bb23
+
+bb22: ; preds = %bb
+ %t23 = bitcast %struct.s* %t to i8*
+ %s24 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* %s24, i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+ br label %bb23
+
+bb23: ; preds = %bb22, %bb13
+ %t17 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 0
+ %t18 = load i32, i32* %t17, align 4
+ %t19 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 1
+ %t20 = load i32, i32* %t19, align 4
+ %t21 = add nsw i32 %t18, %t20
+ ret i32 %t21
+
+bb25: ; preds = %bb6
+ unreachable
+}
diff --git a/test/Transforms/NewGVN/tbaa.ll b/test/Transforms/NewGVN/tbaa.ll
index 3dcc4f8acc14..d48ededac03a 100644
--- a/test/Transforms/NewGVN/tbaa.ll
+++ b/test/Transforms/NewGVN/tbaa.ll
@@ -1,7 +1,7 @@
; RUN: opt -tbaa -basicaa -newgvn -S < %s | FileCheck %s
define i32 @test1(i8* %p, i8* %q) {
-; CHECK: @test1(i8* %p, i8* %q)
+; CHECK-LABEL: @test1(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) {
}
define i32 @test2(i8* %p, i8* %q) {
-; CHECK: @test2(i8* %p, i8* %q)
+; CHECK-LABEL: @test2(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
@@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) {
}
define i32 @test3(i8* %p, i8* %q) {
-; CHECK: @test3(i8* %p, i8* %q)
+; CHECK-LABEL: @test3(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !3
@@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) {
}
define i32 @test4(i8* %p, i8* %q) {
-; CHECK: @test4(i8* %p, i8* %q)
+; CHECK-LABEL: @test4(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !1
@@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) {
}
define i32 @test5(i8* %p, i8* %q) {
-; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !1
@@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) {
}
define i32 @test6(i8* %p, i8* %q) {
-; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !3
@@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) {
}
define i32 @test7(i8* %p, i8* %q) {
-; CHECK: @test7(i8* %p, i8* %q)
+; CHECK-LABEL: @test7(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) {
ret i32 %c
}
-
-
define i32 @test8(i32* %p, i32* %q) {
-; CHECK-LABEL: test8
+; CHECK-LABEL: @test8
; CHECK-NEXT: store i32 15, i32* %p
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) {
%c = sub i32 %a, %b
ret i32 %c
}
+
define i32 @test9(i32* %p, i32* %q) {
-; CHECK-LABEL: test9
+; CHECK-LABEL: @test9
; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) {
ret i32 %c
}
+define i32 @test10(i8* %p, i8* %q) {
+; If one access encloses the other, then the merged access is the enclosed one
+; and not just the common final access type.
+; CHECK-LABEL: @test10
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]]
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !15 ; TAG_X_i
+ %b = call i32 @foo(i8* %p), !tbaa !19 ; TAG_Y_x_i
+ %c = add i32 %a, %b
+ ret i32 %c
+}
declare void @clobber()
declare i32 @foo(i8*) readonly
-; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
!0 = !{!5, !5, i64 0}
!1 = !{!6, !6, i64 0}
!2 = !{!"tbaa root"}
@@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly
!8 = !{!"another root"}
!11 = !{!"scalar type", !8}
+; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
+; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
+; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
+!15 = !{!16, !17, i64 0} ; TAG_X_i
+!16 = !{!"struct X", !17, i64 0} ; struct X { int i; };
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"char", !2, i64 0}
-;; A TBAA structure who's only point is to have a constant location
+!19 = !{!20, !17, i64 0} ; TAG_Y_x_i
+!20 = !{!"struct Y", !16, i64 0} ; struct Y { struct X x; };
+
+; A TBAA structure whose only point is to have a constant location.
!9 = !{!"yet another root"}
!10 = !{!"node", !9, i64 1}
-
diff --git a/test/Transforms/PGOProfile/icp_covariant_call_return.ll b/test/Transforms/PGOProfile/icp_covariant_call_return.ll
index fc5054e3a574..aba075461deb 100644
--- a/test/Transforms/PGOProfile/icp_covariant_call_return.ll
+++ b/test/Transforms/PGOProfile/icp_covariant_call_return.ll
@@ -22,8 +22,7 @@ entry:
%vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8
%vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0
%tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*)
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM:if.true.direct_targ:
; ICALL-PROM: [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D*
diff --git a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
index d2ff47dda0e6..0a4444783eb0 100644
--- a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
+++ b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
@@ -32,18 +32,19 @@ invoke.cont:
%vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8
%vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0
%tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*)
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM:if.true.direct_targ:
; ICALL-PROM: [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D*
; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = invoke %struct.Derived* @_ZN1D4funcEv(%struct.D* [[ARG_BITCAST]])
-; ICALL-PROM: to label %if.end.icp unwind label %lpad
+; ICALL-PROM: to label %if.true.direct_targ.if.end.icp_crit_edge unwind label %lpad
+; ICALL-PROM:if.true.direct_targ.if.end.icp_crit_edge:
+; ICALL-PROM: [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base*
+; ICALL-PROM: br label %if.end.icp
; ICALL-PROM:if.false.orig_indirect:
; ICAll-PROM: %call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1)
; ICAll-PROM: to label %invoke.cont1 unwind label %lpad
; ICALL-PROM:if.end.icp:
-; ICALL-PROM: [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base*
; ICALL-PROM: br label %invoke.cont1
%call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1)
to label %invoke.cont1 unwind label %lpad, !prof !1
diff --git a/test/Transforms/PGOProfile/icp_invoke.ll b/test/Transforms/PGOProfile/icp_invoke.ll
index 2ec564627aa1..1cacc1bc1aca 100644
--- a/test/Transforms/PGOProfile/icp_invoke.ll
+++ b/test/Transforms/PGOProfile/icp_invoke.ll
@@ -20,8 +20,7 @@ entry:
define i32 @_Z3goov() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
entry:
%tmp = load void ()*, void ()** @foo1, align 8
-; ICP: [[BITCAST_IC1:%[0-9]+]] = bitcast void ()* %tmp to i8*
-; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (void ()* @_ZL4bar1v to i8*)
+; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq void ()* %tmp, @_ZL4bar1v
; ICP: br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICP:[[TRUE_LABEL_IC1]]:
; ICP: invoke void @_ZL4bar1v()
@@ -49,17 +48,19 @@ catch:
try.cont:
%tmp6 = load i32 ()*, i32 ()** @foo2, align 8
-; ICP: [[BITCAST_IC2:%[0-9]+]] = bitcast i32 ()* %tmp6 to i8*
-; ICP: [[CMP_IC2:%[0-9]+]] = icmp eq i8* [[BITCAST_IC2]], bitcast (i32 ()* @_ZL4bar2v to i8*)
+; ICP: [[CMP_IC2:%[0-9]+]] = icmp eq i32 ()* %tmp6, @_ZL4bar2v
; ICP: br i1 [[CMP_IC2]], label %[[TRUE_LABEL_IC2:.*]], label %[[FALSE_LABEL_IC2:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICP:[[TRUE_LABEL_IC2]]:
-; ICP: [[RESULT_IC2:%[0-9]+]] = invoke i32 @_ZL4bar2v()
-; ICP: to label %[[DCALL_NORMAL_DEST_IC2:.*]] unwind label %lpad1
+; ICP: [[RESULT_IC2_0:%[0-9]+]] = invoke i32 @_ZL4bar2v()
+; ICP: to label %[[MERGE_BB:.*]] unwind label %lpad1
; ICP:[[FALSE_LABEL_IC2]]:
+; ICP: [[RESULT_IC2_1:%.+]] = invoke i32 %tmp6()
+; ICP: to label %[[MERGE_BB]] unwind label %lpad1
%call = invoke i32 %tmp6()
to label %try.cont8 unwind label %lpad1, !prof !3
-; ICP:[[DCALL_NORMAL_DEST_IC2]]:
+; ICP:[[MERGE_BB]]:
+; ICP: [[MERGE_PHI:%.+]] = phi i32 [ [[RESULT_IC2_1]], %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2_0]], %[[TRUE_LABEL_IC2]] ]
; ICP: br label %try.cont8
lpad1:
%tmp7 = landingpad { i8*, i32 }
@@ -77,7 +78,7 @@ catch6:
try.cont8:
%i.0 = phi i32 [ undef, %catch6 ], [ %call, %try.cont ]
-; ICP: %i.0 = phi i32 [ undef, %catch6 ], [ %call, %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2]], %[[DCALL_NORMAL_DEST_IC2]] ]
+; ICP: %i.0 = phi i32 [ undef, %catch6 ], [ [[MERGE_PHI]], %[[MERGE_BB]] ]
ret i32 %i.0
eh.resume:
diff --git a/test/Transforms/PGOProfile/icp_invoke_nouse.ll b/test/Transforms/PGOProfile/icp_invoke_nouse.ll
index 5a1e6358cb61..096d2e0f222e 100644
--- a/test/Transforms/PGOProfile/icp_invoke_nouse.ll
+++ b/test/Transforms/PGOProfile/icp_invoke_nouse.ll
@@ -18,8 +18,7 @@ entry:
if.end: ; preds = %entry
%fptr = load i32 ()*, i32 ()** @pfptr, align 8
-; ICP: [[BITCAST_IC1:%[0-9]+]] = bitcast i32 ()* %fptr to i8*
-; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (i32 ()* @_ZL4bar1v to i8*)
+; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i32 ()* %fptr, @_ZL4bar1v
; ICP: br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICP:[[TRUE_LABEL_IC1]]:
; ICP: invoke i32 @_ZL4bar1v()
diff --git a/test/Transforms/PGOProfile/icp_vararg.ll b/test/Transforms/PGOProfile/icp_vararg.ll
index 400aab3aead7..ec243470290a 100644
--- a/test/Transforms/PGOProfile/icp_vararg.ll
+++ b/test/Transforms/PGOProfile/icp_vararg.ll
@@ -13,8 +13,7 @@ entry:
define i32 @bar() #1 {
entry:
%tmp = load i32 (i32, ...)*, i32 (i32, ...)** @foo, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast i32 (i32, ...)* %tmp to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 (i32, ...)* @va_func to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i32 (i32, ...)* %tmp, @va_func
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM:if.true.direct_targ:
; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 (i32, ...) @va_func(i32 3, i32 12, i32 22, i32 4)
diff --git a/test/Transforms/PGOProfile/indirect_call_promotion.ll b/test/Transforms/PGOProfile/indirect_call_promotion.ll
index 6832fecfaed3..85df5260f199 100644
--- a/test/Transforms/PGOProfile/indirect_call_promotion.ll
+++ b/test/Transforms/PGOProfile/indirect_call_promotion.ll
@@ -43,8 +43,7 @@ entry:
define i32 @bar() {
entry:
%tmp = load i32 ()*, i32 ()** @foo, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast i32 ()* %tmp to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 ()* @func4 to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i32 ()* %tmp, @func4
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM: if.true.direct_targ:
; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 @func4()
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
index 4def8ce561c0..557a83a75626 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -11,16 +11,20 @@
define i32 @fn1() {
; CHECK-LABEL: @fn1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 8, i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP7]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
; CHECK-NEXT: ret i32 0
;
entry:
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
deleted file mode 100644
index 5fc0298b6cef..000000000000
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
-
-
-;void jumble (int * restrict A, int * restrict B) {
- ; int tmp0 = A[10]*A[0];
- ; int tmp1 = A[11]*A[1];
- ; int tmp2 = A[12]*A[3];
- ; int tmp3 = A[13]*A[2];
- ; B[0] = tmp0;
- ; B[1] = tmp1;
- ; B[2] = tmp2;
- ; B[3] = tmp3;
- ;}
-
-
- ; Function Attrs: norecurse nounwind uwtable
- define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
-; CHECK-LABEL: @jumble1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
- %0 = load i32, i32* %arrayidx, align 4
- %1 = load i32, i32* %A, align 4
- %mul = mul nsw i32 %0, %1
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
- %2 = load i32, i32* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
- %3 = load i32, i32* %arrayidx3, align 4
- %mul4 = mul nsw i32 %2, %3
- %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
- %4 = load i32, i32* %arrayidx5, align 4
- %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
- %5 = load i32, i32* %arrayidx6, align 4
- %mul7 = mul nsw i32 %4, %5
- %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
- %6 = load i32, i32* %arrayidx8, align 4
- %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
- %7 = load i32, i32* %arrayidx9, align 4
- %mul10 = mul nsw i32 %6, %7
- store i32 %mul, i32* %B, align 4
- %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
- store i32 %mul4, i32* %arrayidx12, align 4
- %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
- store i32 %mul7, i32* %arrayidx13, align 4
- %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
- store i32 %mul10, i32* %arrayidx14, align 4
- ret void
- }
-
-;Reversing the operand of MUL
- ; Function Attrs: norecurse nounwind uwtable
- define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
-; CHECK-LABEL: @jumble2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
- %0 = load i32, i32* %arrayidx, align 4
- %1 = load i32, i32* %A, align 4
- %mul = mul nsw i32 %1, %0
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
- %2 = load i32, i32* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
- %3 = load i32, i32* %arrayidx3, align 4
- %mul4 = mul nsw i32 %3, %2
- %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
- %4 = load i32, i32* %arrayidx5, align 4
- %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
- %5 = load i32, i32* %arrayidx6, align 4
- %mul7 = mul nsw i32 %5, %4
- %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
- %6 = load i32, i32* %arrayidx8, align 4
- %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
- %7 = load i32, i32* %arrayidx9, align 4
- %mul10 = mul nsw i32 %7, %6
- store i32 %mul, i32* %B, align 4
- %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
- store i32 %mul4, i32* %arrayidx12, align 4
- %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
- store i32 %mul7, i32* %arrayidx13, align 4
- %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
- store i32 %mul10, i32* %arrayidx14, align 4
- ret void
- }
-
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
deleted file mode 100644
index 568fd9f3ac79..000000000000
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
+++ /dev/null
@@ -1,225 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
-
-;void phiUsingLoads(int *restrict A, int *restrict B) {
-; int tmp0, tmp1, tmp2, tmp3;
-; for (int i = 0; i < 100; i++) {
-; if (A[0] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 2];
-; tmp3 = A[i + 3];
-; } else if (A[25] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 2];
-; tmp3 = A[i + 3];
-; } else if (A[50] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 2];
-; tmp3 = A[i + 3];
-; } else if (A[75] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 3];
-; tmp3 = A[i + 2];
-; }
-; }
-; B[0] = tmp0;
-; B[1] = tmp1;
-; B[2] = tmp2;
-; B[3] = tmp3;
-;}
-
-
-; Function Attrs: norecurse nounwind uwtable
-define void @phiUsingLoads(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) local_unnamed_addr #0 {
-; CHECK-LABEL: @phiUsingLoads(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 25
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 50
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 75
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
-; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else:
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[CMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT: br i1 [[CMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE27:%.*]]
-; CHECK: if.then14:
-; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX17]] to <4 x i32>*
-; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else27:
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[CMP29:%.*]] = icmp eq i32 [[TMP14]], 0
-; CHECK-NEXT: br i1 [[CMP29]], label [[IF_THEN30:%.*]], label [[IF_ELSE43:%.*]]
-; CHECK: if.then30:
-; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX33]] to <4 x i32>*
-; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else43:
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[CMP45:%.*]] = icmp eq i32 [[TMP20]], 0
-; CHECK-NEXT: br i1 [[CMP45]], label [[IF_THEN46:%.*]], label [[FOR_INC]]
-; CHECK: if.then46:
-; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
-; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-;
-entry:
- %0 = load i32, i32* %A, align 4
- %cmp1 = icmp eq i32 %0, 0
- %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 25
- %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 50
- %arrayidx44 = getelementptr inbounds i32, i32* %A, i64 75
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc
- store i32 %tmp0.1, i32* %B, align 4
- %arrayidx64 = getelementptr inbounds i32, i32* %B, i64 1
- store i32 %tmp1.1, i32* %arrayidx64, align 4
- %arrayidx65 = getelementptr inbounds i32, i32* %B, i64 2
- store i32 %tmp2.1, i32* %arrayidx65, align 4
- %arrayidx66 = getelementptr inbounds i32, i32* %B, i64 3
- store i32 %tmp3.1, i32* %arrayidx66, align 4
- ret void
-
-for.body: ; preds = %for.inc, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
- %tmp3.0111 = phi i32 [ undef, %entry ], [ %tmp3.1, %for.inc ]
- %tmp2.0110 = phi i32 [ undef, %entry ], [ %tmp2.1, %for.inc ]
- %tmp1.0109 = phi i32 [ undef, %entry ], [ %tmp1.1, %for.inc ]
- %tmp0.0108 = phi i32 [ undef, %entry ], [ %tmp0.1, %for.inc ]
- br i1 %cmp1, label %if.then, label %if.else
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %1 = load i32, i32* %arrayidx2, align 4
- %2 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %2
- %3 = load i32, i32* %arrayidx5, align 4
- %4 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %4
- %5 = load i32, i32* %arrayidx8, align 4
- %6 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %6
- %7 = load i32, i32* %arrayidx11, align 4
- br label %for.inc
-
-if.else: ; preds = %for.body
- %8 = load i32, i32* %arrayidx12, align 4
- %cmp13 = icmp eq i32 %8, 0
- br i1 %cmp13, label %if.then14, label %if.else27
-
-if.then14: ; preds = %if.else
- %arrayidx17 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %9 = load i32, i32* %arrayidx17, align 4
- %10 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %10
- %11 = load i32, i32* %arrayidx20, align 4
- %12 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx23 = getelementptr inbounds i32, i32* %A, i64 %12
- %13 = load i32, i32* %arrayidx23, align 4
- %14 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx26 = getelementptr inbounds i32, i32* %A, i64 %14
- %15 = load i32, i32* %arrayidx26, align 4
- br label %for.inc
-
-if.else27: ; preds = %if.else
- %16 = load i32, i32* %arrayidx28, align 4
- %cmp29 = icmp eq i32 %16, 0
- br i1 %cmp29, label %if.then30, label %if.else43
-
-if.then30: ; preds = %if.else27
- %arrayidx33 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %17 = load i32, i32* %arrayidx33, align 4
- %18 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx36 = getelementptr inbounds i32, i32* %A, i64 %18
- %19 = load i32, i32* %arrayidx36, align 4
- %20 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx39 = getelementptr inbounds i32, i32* %A, i64 %20
- %21 = load i32, i32* %arrayidx39, align 4
- %22 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx42 = getelementptr inbounds i32, i32* %A, i64 %22
- %23 = load i32, i32* %arrayidx42, align 4
- br label %for.inc
-
-if.else43: ; preds = %if.else27
- %24 = load i32, i32* %arrayidx44, align 4
- %cmp45 = icmp eq i32 %24, 0
- br i1 %cmp45, label %if.then46, label %for.inc
-
-if.then46: ; preds = %if.else43
- %arrayidx49 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %25 = load i32, i32* %arrayidx49, align 4
- %26 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx52 = getelementptr inbounds i32, i32* %A, i64 %26
- %27 = load i32, i32* %arrayidx52, align 4
- %28 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx55 = getelementptr inbounds i32, i32* %A, i64 %28
- %29 = load i32, i32* %arrayidx55, align 4
- %30 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx58 = getelementptr inbounds i32, i32* %A, i64 %30
- %31 = load i32, i32* %arrayidx58, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.then, %if.then30, %if.else43, %if.then46, %if.then14
- %tmp0.1 = phi i32 [ %1, %if.then ], [ %9, %if.then14 ], [ %17, %if.then30 ], [ %25, %if.then46 ], [ %tmp0.0108, %if.else43 ]
- %tmp1.1 = phi i32 [ %3, %if.then ], [ %11, %if.then14 ], [ %19, %if.then30 ], [ %27, %if.then46 ], [ %tmp1.0109, %if.else43 ]
- %tmp2.1 = phi i32 [ %5, %if.then ], [ %13, %if.then14 ], [ %21, %if.then30 ], [ %29, %if.then46 ], [ %tmp2.0110, %if.else43 ]
- %tmp3.1 = phi i32 [ %7, %if.then ], [ %15, %if.then14 ], [ %23, %if.then30 ], [ %31, %if.then46 ], [ %tmp3.0111, %if.else43 ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 100
- br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
index be58521ed898..06e051a90b0d 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -5,27 +5,34 @@
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
-; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
+; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
+; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
-; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
+; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_7]], align 4
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
+; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_8]], align 4
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
+; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_9]], align 4
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
+; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_10]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
index 6ae763520013..1b2c76384e0b 100644
--- a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
+++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -6,26 +6,33 @@
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_9]], align 4
+; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_7]], align 4
+; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_10]], align 4
+; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_8]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SampleProfile/entry_counts.ll b/test/Transforms/SampleProfile/entry_counts.ll
index 6137a6908cf5..cab7c87e0493 100644
--- a/test/Transforms/SampleProfile/entry_counts.ll
+++ b/test/Transforms/SampleProfile/entry_counts.ll
@@ -9,8 +9,8 @@ entry:
ret void, !dbg !9
}
-; This function does not have profile, check if function_entry_count is 0
-; CHECK: {{.*}} = !{!"function_entry_count", i64 0}
+; This function does not have a profile; check that function_entry_count is -1
+; CHECK: {{.*}} = !{!"function_entry_count", i64 -1}
define void @no_profile() {
entry:
ret void
diff --git a/test/Transforms/SimplifyCFG/X86/if-conversion.ll b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
new file mode 100644
index 000000000000..28702572d480
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,231 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if-conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The first several cases test that FindLongDependenceChain returns true, so
+; if-conversion is blocked.
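+;
+; A hypothetical C-level view of the blocked pattern in @test1 (names are
+; illustrative only):
+;   long v = **pp;                         /* load feeding the branch           */
+;   long a = (v < 0) ? (long)p + 8 : (long)p | 16;
+;   return *(long *)a;                     /* converting the branch to a select
+;                                             chains this load behind **pp      */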
+
+define i64 @test1(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-NOT: select
+}
+
+define i64 @test2(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = add i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: select
+}
+
+; The following cases test that FindLongDependenceChain returns false, so
+; if-conversion will proceed.
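+;
+; For contrast, a short chain such as @test4 below (one level of indirection
+; removed) is cheap enough to convert (hypothetical C-level view):
+;   long v = *pp;                          /* only one load before the branch */
+;   long a = (v < 0) ? (long)p + 8 : (long)p | 16;
+;   return *(long *)a;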
+
+; Non-trivial LatencyAdjustment.
+define i64 @test3(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %p4 = add i64 %p3, %1
+ %ptr = inttoptr i64 %p4 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-LABEL: @test3
+; CHECK: select
+}
+
+; Short dependence chain.
+define i64 @test4(i64* %pp, i64* %p) {
+entry:
+ %0 = load i64, i64* %pp, align 8
+ %cmp = icmp slt i64 %0, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-LABEL: @test4
+; CHECK: select
+}
+
+; High IPC.
+define i64 @test5(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ %2 = add i64 %pint, 2
+ %3 = add i64 %pint, 3
+ %4 = or i64 %pint, 16
+ %5 = and i64 %pint, 255
+
+ %6 = or i64 %2, 9
+ %7 = and i64 %3, 255
+ %8 = add i64 %4, 4
+ %9 = add i64 %5, 5
+
+ %10 = add i64 %6, 2
+ %11 = add i64 %7, 3
+ %12 = add i64 %8, 4
+ %13 = add i64 %9, 5
+
+ %14 = add i64 %10, 6
+ %15 = add i64 %11, 7
+ %16 = add i64 %12, 8
+ %17 = add i64 %13, 9
+
+ %18 = add i64 %14, 10
+ %19 = add i64 %15, 11
+ %20 = add i64 %16, 12
+ %21 = add i64 %17, 13
+
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+
+ ret i64 %val
+
+; CHECK-LABEL: @test5
+; CHECK: select
+}
+
+; Large BB size.
+define i64 @test6(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ %2 = add i64 %pint, 2
+ %3 = add i64 %pint, 3
+ %4 = add i64 %2, 4
+ %5 = add i64 %3, 5
+ %6 = add i64 %4, 6
+ %7 = add i64 %5, 7
+ %8 = add i64 %6, 6
+ %9 = add i64 %7, 7
+ %10 = add i64 %8, 6
+ %11 = add i64 %9, 7
+ %12 = add i64 %10, 6
+ %13 = add i64 %11, 7
+ %14 = add i64 %12, 6
+ %15 = add i64 %13, 7
+ %16 = add i64 %14, 6
+ %17 = add i64 %15, 7
+ %18 = add i64 %16, 6
+ %19 = add i64 %17, 7
+ %20 = add i64 %18, 6
+ %21 = add i64 %19, 7
+ %22 = add i64 %20, 6
+ %23 = add i64 %21, 7
+ %24 = add i64 %22, 6
+ %25 = add i64 %23, 7
+ %26 = add i64 %24, 6
+ %27 = add i64 %25, 7
+ %28 = add i64 %26, 6
+ %29 = add i64 %27, 7
+ %30 = add i64 %28, 6
+ %31 = add i64 %29, 7
+ %32 = add i64 %30, 8
+ %33 = add i64 %31, 9
+ %34 = add i64 %32, %33
+ %35 = and i64 %34, 255
+ %res = add i64 %val, %35
+
+ ret i64 %res
+
+; CHECK-LABEL: @test6
+; CHECK: select
+}
diff --git a/test/tools/llvm-cvtres/machine.test b/test/tools/llvm-cvtres/machine.test
index cac36062f301..5ce6cbaf697d 100644
--- a/test/tools/llvm-cvtres/machine.test
+++ b/test/tools/llvm-cvtres/machine.test
@@ -29,46 +29,46 @@ X86: Machine: IMAGE_FILE_MACHINE_I386 (0x14C)
X86-DAG: Relocations [
X86-DAG: .rsrc$01 {
X86-NEXT: 0x1E8 IMAGE_REL_I386_DIR32NB $R000000
-X86-NEXT: 0x198 IMAGE_REL_I386_DIR32NB $R000018
-X86-NEXT: 0x1A8 IMAGE_REL_I386_DIR32NB $R000340
-X86-NEXT: 0x1C8 IMAGE_REL_I386_DIR32NB $R000668
-X86-NEXT: 0x1D8 IMAGE_REL_I386_DIR32NB $R000698
-X86-NEXT: 0x1F8 IMAGE_REL_I386_DIR32NB $R000708
-X86-NEXT: 0x1B8 IMAGE_REL_I386_DIR32NB $R000720
-X86-NEXT: 0x188 IMAGE_REL_I386_DIR32NB $R000750
+X86-NEXT: 0x198 IMAGE_REL_I386_DIR32NB $R000001
+X86-NEXT: 0x1A8 IMAGE_REL_I386_DIR32NB $R000002
+X86-NEXT: 0x1C8 IMAGE_REL_I386_DIR32NB $R000003
+X86-NEXT: 0x1D8 IMAGE_REL_I386_DIR32NB $R000004
+X86-NEXT: 0x1F8 IMAGE_REL_I386_DIR32NB $R000005
+X86-NEXT: 0x1B8 IMAGE_REL_I386_DIR32NB $R000006
+X86-NEXT: 0x188 IMAGE_REL_I386_DIR32NB $R000007
X64: Machine: IMAGE_FILE_MACHINE_AMD64 (0x8664)
X64-DAG: Relocations [
X64-DAG: .rsrc$01 {
X64-NEXT: 0x1E8 IMAGE_REL_AMD64_ADDR32NB $R000000
-X64-NEXT: 0x198 IMAGE_REL_AMD64_ADDR32NB $R000018
-X64-NEXT: 0x1A8 IMAGE_REL_AMD64_ADDR32NB $R000340
-X64-NEXT: 0x1C8 IMAGE_REL_AMD64_ADDR32NB $R000668
-X64-NEXT: 0x1D8 IMAGE_REL_AMD64_ADDR32NB $R000698
-X64-NEXT: 0x1F8 IMAGE_REL_AMD64_ADDR32NB $R000708
-X64-NEXT: 0x1B8 IMAGE_REL_AMD64_ADDR32NB $R000720
-X64-NEXT: 0x188 IMAGE_REL_AMD64_ADDR32NB $R000750
+X64-NEXT: 0x198 IMAGE_REL_AMD64_ADDR32NB $R000001
+X64-NEXT: 0x1A8 IMAGE_REL_AMD64_ADDR32NB $R000002
+X64-NEXT: 0x1C8 IMAGE_REL_AMD64_ADDR32NB $R000003
+X64-NEXT: 0x1D8 IMAGE_REL_AMD64_ADDR32NB $R000004
+X64-NEXT: 0x1F8 IMAGE_REL_AMD64_ADDR32NB $R000005
+X64-NEXT: 0x1B8 IMAGE_REL_AMD64_ADDR32NB $R000006
+X64-NEXT: 0x188 IMAGE_REL_AMD64_ADDR32NB $R000007
ARM: Machine: IMAGE_FILE_MACHINE_ARMNT (0x1C4)
ARM-DAG: Relocations [
ARM-DAG: .rsrc$01 {
ARM-NEXT: 0x1E8 IMAGE_REL_ARM_ADDR32NB $R000000
-ARM-NEXT: 0x198 IMAGE_REL_ARM_ADDR32NB $R000018
-ARM-NEXT: 0x1A8 IMAGE_REL_ARM_ADDR32NB $R000340
-ARM-NEXT: 0x1C8 IMAGE_REL_ARM_ADDR32NB $R000668
-ARM-NEXT: 0x1D8 IMAGE_REL_ARM_ADDR32NB $R000698
-ARM-NEXT: 0x1F8 IMAGE_REL_ARM_ADDR32NB $R000708
-ARM-NEXT: 0x1B8 IMAGE_REL_ARM_ADDR32NB $R000720
-ARM-NEXT: 0x188 IMAGE_REL_ARM_ADDR32NB $R000750
+ARM-NEXT: 0x198 IMAGE_REL_ARM_ADDR32NB $R000001
+ARM-NEXT: 0x1A8 IMAGE_REL_ARM_ADDR32NB $R000002
+ARM-NEXT: 0x1C8 IMAGE_REL_ARM_ADDR32NB $R000003
+ARM-NEXT: 0x1D8 IMAGE_REL_ARM_ADDR32NB $R000004
+ARM-NEXT: 0x1F8 IMAGE_REL_ARM_ADDR32NB $R000005
+ARM-NEXT: 0x1B8 IMAGE_REL_ARM_ADDR32NB $R000006
+ARM-NEXT: 0x188 IMAGE_REL_ARM_ADDR32NB $R000007
ARM64: Machine: IMAGE_FILE_MACHINE_ARM64 (0xAA64)
ARM64-DAG: Relocations [
ARM64-DAG: .rsrc$01 {
ARM64-NEXT: 0x1E8 IMAGE_REL_ARM64_ADDR32NB $R000000
-ARM64-NEXT: 0x198 IMAGE_REL_ARM64_ADDR32NB $R000018
-ARM64-NEXT: 0x1A8 IMAGE_REL_ARM64_ADDR32NB $R000340
-ARM64-NEXT: 0x1C8 IMAGE_REL_ARM64_ADDR32NB $R000668
-ARM64-NEXT: 0x1D8 IMAGE_REL_ARM64_ADDR32NB $R000698
-ARM64-NEXT: 0x1F8 IMAGE_REL_ARM64_ADDR32NB $R000708
-ARM64-NEXT: 0x1B8 IMAGE_REL_ARM64_ADDR32NB $R000720
-ARM64-NEXT: 0x188 IMAGE_REL_ARM64_ADDR32NB $R000750
+ARM64-NEXT: 0x198 IMAGE_REL_ARM64_ADDR32NB $R000001
+ARM64-NEXT: 0x1A8 IMAGE_REL_ARM64_ADDR32NB $R000002
+ARM64-NEXT: 0x1C8 IMAGE_REL_ARM64_ADDR32NB $R000003
+ARM64-NEXT: 0x1D8 IMAGE_REL_ARM64_ADDR32NB $R000004
+ARM64-NEXT: 0x1F8 IMAGE_REL_ARM64_ADDR32NB $R000005
+ARM64-NEXT: 0x1B8 IMAGE_REL_ARM64_ADDR32NB $R000006
+ARM64-NEXT: 0x188 IMAGE_REL_ARM64_ADDR32NB $R000007
diff --git a/test/tools/llvm-cvtres/symbols.test b/test/tools/llvm-cvtres/symbols.test
index 2ca3a193ac40..14f5c360d454 100644
--- a/test/tools/llvm-cvtres/symbols.test
+++ b/test/tools/llvm-cvtres/symbols.test
@@ -13,21 +13,21 @@ RUN: llvm-readobj -symbols %t | FileCheck %s
CHECK: Name: $R000000
CHECK-NEXT: Value: 0
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000018
+CHECK: Name: $R000001
CHECK-NEXT: Value: 24
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000340
+CHECK: Name: $R000002
CHECK-NEXT: Value: 832
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000668
+CHECK: Name: $R000003
CHECK-NEXT: Value: 1640
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000698
+CHECK: Name: $R000004
CHECK-NEXT: Value: 1688
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000720
+CHECK: Name: $R000006
CHECK-NEXT: Value: 1824
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000750
+CHECK: Name: $R000007
CHECK-NEXT: Value: 1872
CHECK-NEXT: Section: .rsrc$02
diff --git a/test/tools/llvm-dwarfdump/X86/lookup.s b/test/tools/llvm-dwarfdump/X86/lookup.s
index 652844447c11..d09528d667fa 100644
--- a/test/tools/llvm-dwarfdump/X86/lookup.s
+++ b/test/tools/llvm-dwarfdump/X86/lookup.s
@@ -1,5 +1,10 @@
# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \
-# RUN: | llvm-dwarfdump -lookup=0x7fffffff - | \
+# RUN: | llvm-dwarfdump -lookup=0xffffffff - | \
+# RUN: FileCheck %s --check-prefix=EMPTY --allow-empty
+# EMPTY: {{^$}}
+
+# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \
+# RUN: | llvm-dwarfdump -lookup=0xffffffffffffffff - | \
# RUN: FileCheck %s --check-prefix=EMPTY --allow-empty
# EMPTY: {{^$}}
diff --git a/test/tools/llvm-objcopy/add-section-remove.test b/test/tools/llvm-objcopy/add-section-remove.test
new file mode 100644
index 000000000000..0dee1182a28d
--- /dev/null
+++ b/test/tools/llvm-objcopy/add-section-remove.test
@@ -0,0 +1,36 @@
+# RUN: yaml2obj %s > %t
+# RUN: echo 0000 > %t.sec
+# RUN: llvm-objcopy -R=.test2 -add-section=.test2=%t.sec %t %t2
+# RUN: llvm-readobj -file-headers -sections -section-data %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .test1
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "c3c3c3c3"
+ - Name: .test2
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "DEADBEEF"
+ - Name: .test3
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "32323232"
+
+# CHECK: SectionHeaderCount: 7
+
+# CHECK: Name: .test1
+# CHECK: Name: .test3
+# CHECK: Name: .symtab
+# CHECK: Name: .strtab
+# CHECK: Name: .shstrtab
+# CHECK: Name: .test2
+# CHECK: SectionData (
+# CHECK-NEXT: 0000: 30303030
+# CHECK-NEXT: )
diff --git a/test/tools/llvm-objcopy/add-section.test b/test/tools/llvm-objcopy/add-section.test
new file mode 100644
index 000000000000..048edcba227f
--- /dev/null
+++ b/test/tools/llvm-objcopy/add-section.test
@@ -0,0 +1,37 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objcopy -O binary -j .test2 %t %t.sec
+# RUN: llvm-objcopy -R=.test2 %t %t2
+# RUN: llvm-objcopy -add-section=.test2=%t.sec %t2 %t3
+# RUN: llvm-readobj -file-headers -sections -section-data %t3 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .test1
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "c3c3c3c3"
+ - Name: .test2
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "DEADBEEF"
+ - Name: .test3
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "32323232"
+
+# CHECK: SectionHeaderCount: 7
+
+# CHECK: Name: .test1
+# CHECK: Name: .test3
+# CHECK: Name: .symtab
+# CHECK: Name: .strtab
+# CHECK: Name: .shstrtab
+# CHECK: Name: .test2
+# CHECK: SectionData (
+# CHECK-NEXT: 0000: DEADBEEF
+# CHECK-NEXT: )
diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test
index 65ccf13f2b4c..a5c15fdfe230 100644
--- a/test/tools/llvm-readobj/mips-got.test
+++ b/test/tools/llvm-readobj/mips-got.test
@@ -1,4 +1,4 @@
-RUN: llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips | \
+RUN: not llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \
RUN: FileCheck %s -check-prefix GOT-OBJ
RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips | \
RUN: FileCheck %s -check-prefix GOT-EXE
@@ -11,7 +11,26 @@ RUN: FileCheck %s -check-prefix GOT-EMPTY
RUN: llvm-readobj -mips-plt-got %p/Inputs/got-static.exe.mips | \
RUN: FileCheck %s -check-prefix GOT-STATIC
-GOT-OBJ: Cannot find .got section
+RUN: not llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips \
+RUN: --elf-output-style=GNU 2>&1 | \
+RUN: FileCheck %s -check-prefix GNU-GOT-OBJ
+RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-EXE
+RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-so.mips \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-SO
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-tls.so.elf-mips64el \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-TLS
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-empty.exe.mipsel \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-EMPTY
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-static.exe.mips \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-STATIC
+
+GOT-OBJ: Error reading file: Cannot find .got section.
GOT-EXE: Primary GOT {
GOT-EXE-NEXT: Canonical gp value: 0x418880
@@ -366,3 +385,116 @@ GOT-STATIC-NEXT: Initial: 0x400104
GOT-STATIC-NEXT: }
GOT-STATIC-NEXT: ]
GOT-STATIC-NEXT: }
+
+GNU-GOT-OBJ: Error reading file: Cannot find .got section.
+
+GNU-GOT-EXE: Primary GOT:
+GNU-GOT-EXE-NEXT: Canonical gp value: 00418880
+
+GNU-GOT-EXE: Reserved entries:
+GNU-GOT-EXE-NEXT: Address Access Initial Purpose
+GNU-GOT-EXE-NEXT: 00410890 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-EXE-NEXT: 00410894 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-EXE: Local entries:
+GNU-GOT-EXE-NEXT: Address Access Initial
+GNU-GOT-EXE-NEXT: 00410898 -32744(gp) 00400418
+GNU-GOT-EXE-NEXT: 0041089c -32740(gp) 00410840
+GNU-GOT-EXE-NEXT: 004108a0 -32736(gp) 00000000
+
+GNU-GOT-EXE: Global entries:
+GNU-GOT-EXE-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-GOT-EXE-NEXT: 004108a4 -32732(gp) 00000000 00000000 FUNC UND __gmon_start__
+
+GNU-GOT-EXE: PLT GOT:
+
+GNU-GOT-EXE: Reserved entries:
+GNU-GOT-EXE-NEXT: Address Initial Purpose
+GNU-GOT-EXE-NEXT: 00410854 00000000 PLT lazy resolver
+GNU-GOT-EXE-NEXT: 00410894 80000000 Module pointer
+
+GNU-GOT-EXE: Entries:
+GNU-GOT-EXE-NEXT: Address Initial Sym.Val. Type Ndx Name
+GNU-GOT-EXE-NEXT: 0041085c 00400800 00000000 FUNC UND puts
+GNU-GOT-EXE-NEXT: 00410860 00400800 00000000 FUNC UND __libc_start_main
+
+GNU-GOT-SO: Primary GOT:
+GNU-GOT-SO-NEXT: Canonical gp value: 000188d0
+
+GNU-GOT-SO: Reserved entries:
+GNU-GOT-SO-NEXT: Address Access Initial Purpose
+GNU-GOT-SO-NEXT: 000108e0 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-SO-NEXT: 000108e4 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-SO: Local entries:
+GNU-GOT-SO-NEXT: Address Access Initial
+GNU-GOT-SO-NEXT: 000108e8 -32744(gp) 000108e0
+GNU-GOT-SO-NEXT: 000108ec -32740(gp) 00010000
+GNU-GOT-SO-NEXT: 000108f0 -32736(gp) 00010920
+GNU-GOT-SO-NEXT: 000108f4 -32732(gp) 000108cc
+GNU-GOT-SO-NEXT: 000108f8 -32728(gp) 00000000
+GNU-GOT-SO-NEXT: 000108fc -32724(gp) 00000000
+GNU-GOT-SO-NEXT: 00010900 -32720(gp) 00000000
+GNU-GOT-SO-NEXT: 00010904 -32716(gp) 00000000
+
+GNU-GOT-SO: Global entries:
+GNU-GOT-SO-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-GOT-SO-NEXT: 00010908 -32712(gp) 00000000 00000000 NOTYPE UND _ITM_registerTMCloneTable
+GNU-GOT-SO-NEXT: 0001090c -32708(gp) 00000000 00000000 NOTYPE UND _Jv_RegisterClasses
+GNU-GOT-SO-NEXT: 00010910 -32704(gp) 00000000 00000000 FUNC UND __gmon_start__
+GNU-GOT-SO-NEXT: 00010914 -32700(gp) 00000840 00000840 FUNC UND puts
+GNU-GOT-SO-NEXT: 00010918 -32696(gp) 00000000 00000000 NOTYPE UND _ITM_deregisterTMCloneTable
+GNU-GOT-SO-NEXT: 0001091c -32692(gp) 00000000 00000000 FUNC UND __cxa_finalize
+
+GNU-GOT-TLS: Primary GOT:
+GNU-GOT-TLS-NEXT: Canonical gp value: 0000000000018bf0
+
+GNU-GOT-TLS: Reserved entries:
+GNU-GOT-TLS-NEXT: Address Access Initial Purpose
+GNU-GOT-TLS-NEXT: 0000000000010c00 -32752(gp) 0000000000000000 Lazy resolver
+GNU-GOT-TLS-NEXT: 0000000000010c08 -32744(gp) 8000000000000000 Module pointer (GNU extension)
+
+GNU-GOT-TLS: Local entries:
+GNU-GOT-TLS-NEXT: Address Access Initial
+GNU-GOT-TLS-NEXT: 0000000000010c10 -32736(gp) 0000000000010000
+GNU-GOT-TLS-NEXT: 0000000000010c18 -32728(gp) 0000000000010c00
+GNU-GOT-TLS-NEXT: 0000000000010c20 -32720(gp) 0000000000010cb8
+GNU-GOT-TLS-NEXT: 0000000000010c28 -32712(gp) 0000000000010bf0
+GNU-GOT-TLS-NEXT: 0000000000010c30 -32704(gp) 0000000000000000
+GNU-GOT-TLS-NEXT: 0000000000010c38 -32696(gp) 0000000000000948
+GNU-GOT-TLS-NEXT: 0000000000010c40 -32688(gp) 0000000000000a20
+GNU-GOT-TLS-NEXT: 0000000000010c48 -32680(gp) 0000000000000af0
+GNU-GOT-TLS-NEXT: 0000000000010c50 -32672(gp) 0000000000000000
+GNU-GOT-TLS-NEXT: 0000000000010c58 -32664(gp) 0000000000000000
+GNU-GOT-TLS-NEXT: 0000000000010c60 -32656(gp) 0000000000000000
+
+GNU-GOT-TLS: Global entries:
+GNU-GOT-TLS-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-GOT-TLS-NEXT: 0000000000010c68 -32648(gp) 0000000000000000 0000000000000000 NOTYPE UND _ITM_registerTMCloneTable
+GNU-GOT-TLS-NEXT: 0000000000010c70 -32640(gp) 0000000000000000 0000000000000000 NOTYPE UND _Jv_RegisterClasses
+GNU-GOT-TLS-NEXT: 0000000000010c78 -32632(gp) 0000000000000000 0000000000000000 FUNC UND __gmon_start__
+GNU-GOT-TLS-NEXT: 0000000000010c80 -32624(gp) 0000000000000b60 0000000000000b60 FUNC UND __tls_get_addr
+GNU-GOT-TLS-NEXT: 0000000000010c88 -32616(gp) 0000000000000000 0000000000000000 NOTYPE UND _ITM_deregisterTMCloneTable
+GNU-GOT-TLS-NEXT: 0000000000010c90 -32608(gp) 0000000000000000 0000000000000000 FUNC UND __cxa_finalize
+
+GNU-GOT-EMPTY: Primary GOT:
+GNU-GOT-EMPTY: Canonical gp value: 00409ff0
+
+GNU-GOT-EMPTY: Reserved entries:
+GNU-GOT-EMPTY: Address Access Initial Purpose
+GNU-GOT-EMPTY: 00402000 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-EMPTY: 00402004 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-STATIC: Static GOT:
+GNU-GOT-STATIC-NEXT: Canonical gp value: 00418100
+
+GNU-GOT-STATIC: Reserved entries:
+GNU-GOT-STATIC-NEXT: Address Access Initial Purpose
+GNU-GOT-STATIC-NEXT: 00410110 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-STATIC-NEXT: 00410114 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-STATIC: Local entries:
+GNU-GOT-STATIC-NEXT: Address Access Initial
+GNU-GOT-STATIC-NEXT: 00410118 -32744(gp) 00400000
+GNU-GOT-STATIC-NEXT: 0041011c -32740(gp) 00400100
+GNU-GOT-STATIC-NEXT: 00410120 -32736(gp) 00400104
diff --git a/test/tools/llvm-readobj/mips-plt.test b/test/tools/llvm-readobj/mips-plt.test
index ab0824b0be68..f41940c9cf34 100644
--- a/test/tools/llvm-readobj/mips-plt.test
+++ b/test/tools/llvm-readobj/mips-plt.test
@@ -1,4 +1,7 @@
RUN: llvm-readobj -mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s
+RUN: llvm-readobj -mips-plt-got --elf-output-style=GNU \
+RUN: %p/Inputs/got-plt.exe.elf-mipsel \
+RUN: | FileCheck --check-prefix=GNU %s
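+# The GNU-prefixed checks below cover the same PLT GOT in the readelf-style
+# output selected by --elf-output-style=GNU.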
CHECK: PLT GOT {
CHECK-NEXT: Reserved entries [
@@ -32,3 +35,32 @@ CHECK-NEXT: Name: __libc_start_main@GLIBC_2.0 (53)
CHECK-NEXT: }
CHECK-NEXT: ]
CHECK-NEXT: }
+
+GNU: Primary GOT:
+GNU-NEXT: Canonical gp value: 00418840
+
+GNU: Reserved entries:
+GNU-NEXT: Address Access Initial Purpose
+GNU-NEXT: 00410850 -32752(gp) 00000000 Lazy resolver
+GNU-NEXT: 00410854 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU: Local entries:
+GNU-NEXT: Address Access Initial
+GNU-NEXT: 00410858 -32744(gp) 004003d4
+GNU-NEXT: 0041085c -32740(gp) 00410800
+GNU-NEXT: 00410860 -32736(gp) 00000000
+
+GNU: Global entries:
+GNU-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-NEXT: 00410864 -32732(gp) 00000000 00000000 FUNC UND __gmon_start__
+GNU-NEXT: PLT GOT:
+
+GNU: Reserved entries:
+GNU-NEXT: Address Initial Purpose
+GNU-NEXT: 00410814 00000000 PLT lazy resolver
+GNU-NEXT: 00410854 80000000 Module pointer
+
+GNU: Entries:
+GNU-NEXT: Address Initial Sym.Val. Type Ndx Name
+GNU-NEXT: 0041081c 004007c0 00000000 FUNC UND puts
+GNU-NEXT: 00410820 004007c0 00000000 FUNC UND __libc_start_main